Commit 60e4ecde authored by numeroteca's avatar numeroteca
Browse files

process all telediarios + adds as background lines + calcualtes prominence based in Watt 1993

parent 71c3bf19
......@@ -5,7 +5,31 @@
library(jsonlite)
library(tidyverse)
# path to files
# functions -------
# function for double axis
# via https://dmitrijskass.netlify.app/2019/06/30/multi-level-labels-with-ggplot2/
# Set locale to english with
Sys.setlocale("LC_TIME", "C")
format_dates <- function(x) {
months <- strftime(x, format = "%m") # Abbreviated name of the month.
years <- lubridate::year(x) # Year as a 4-digit number.
if_else(is.na(lag(years)) | lag(years) != years, # Conditions for pasting.
true = paste(months, years, sep = "\n"),
false = months)
}
format_dates_days <- function(x) {
days <- strftime(x, format = "%d") # day.
months <- strftime(x, format = "%b") # Year as a 4-digit number.
if_else(is.na(lag(months)) | lag(months) != months, # Conditions for pasting.
true = paste(days, months, sep = "\n"),
false = days)
}
# path to files---------
path <- "~/data/verba/data/original/cooked/"
# load 1 file -----
......@@ -45,18 +69,31 @@ for ( i in 1:length(files) ) {
}
# save data
saveRDS(telediarios, file = "~/data/verba/data/output/telediarios.rds")
# load data -----
telediarios <- readRDS(file = "~/data/verba/data/output/telediarios.rds")
# process data -----------
telediarios <- telediarios %>% mutate(
hour = substring(start_time,1,2) %>% as.numeric(),
min = substring( start_time, 4,5) %>% as.numeric(),
sec = substring( start_time, 7,8) %>% as.numeric(),
date = as.Date(substring( datetime,1,10), "%d-%m-%Y"),
telediario = paste0("T",substring(datetime,12,13))
)
) %>% mutate(
# simplifies times when times are not the standard T15 and T21.
# Transforms T22 and T20 to T21, and T14 to T15.
# T15 is 15:00h program and T21 is 21:00h program
telediario = ifelse( telediario == "T14", "T15",telediario),
telediario = ifelse( telediario == "T16", "T15",telediario),
telediario = ifelse( telediario == "T20", "T21",telediario),
telediario = ifelse( telediario == "T22", "T21",telediario),
telediario = ifelse( telediario == "T15", "Telediario 15h",telediario),
telediario = ifelse( telediario == "T21", "Telediario 21h",telediario),
) %>% filter ( telediario != "T08" & telediario != "T07")
table(telediarios$hour)
telediarios <- telediarios %>% mutate(
......@@ -64,31 +101,233 @@ telediarios <- telediarios %>% mutate(
seconds = hour*3600 + min*60 + sec
)
# calcualte duration of each Telediario
duration <- telediarios %>% group_by(date, telediario) %>%
summarise(
time = max(seconds)
)
time = max(seconds),
sentences = n()
) %>% mutate(
# simplifies times when times are not the standard T15 and T21.
# Transforms T22 and T20 to T21, and T14 to T15.
# T15 is 15:00h program and T21 is 21:00h program
telediario = ifelse( telediario == "T14", "T15",telediario),
telediario = ifelse( telediario == "T16", "T15",telediario),
telediario = ifelse( telediario == "T20", "T21",telediario),
telediario = ifelse( telediario == "T22", "T21",telediario),
telediario = ifelse( telediario == "T15", "Telediario 15h",telediario),
telediario = ifelse( telediario == "T21", "Telediario 21h",telediario),
) %>% filter ( telediario != "T08" & telediario != "T07")
hist(duration$time)
plot(duration$time)
duration <- duration %>% mutate(
weekday = weekdays( date) %>% as.factor()
)
table(duration$weekday)
# us if not set locale to English
# levels(duration$weekday) <- c("Monday","Thrusday","Friday","Saturday","Sunday","Tuesday","Wednesday" )
duration <- duration %>% mutate(
weekend = ifelse( weekday %in% c("Saturday", "Sunday"), "Saturday and Sunday","Other days")
)
# hist(duration$time)
# plot(duration$time)
# duration all the Teledairios -----
duration %>%
ggplot() +
geom_col( aes( date,time/60, fill=weekend), width=1) +
scale_fill_manual( values=c("#000000","darkred"))+
geom_hline( yintercept = 3600) +
facet_wrap(~telediario, ncol = 1) +
theme_minimal( base_family = "Roboto condensed") +
theme(
legend.position = "top",
panel.grid.minor.x = element_blank(),
# panel.grid.minor.y = element_blank(),
panel.grid.major.x = element_blank(),
# panel.grid.major.y = element_blank(),
axis.ticks = element_line(color="#888888"),
plot.background = element_rect(fill="white",color="white")
) +
labs(
title = paste("Verba dataset",sep = ""),
subtitle = paste("Availability and length of television newscasts: Telediarios on La 1. RTVE"),
x="",
y="minutes",
caption="Verba. (2022). [HTML]. Fundación Ciudadana Civio. https://github.com/civio/verba (Original work published 2018)"
)+
scale_x_date(
date_breaks = "6 month",
# date_labels = "%m/%y",
labels = format_dates,
# limits = c(as.Date("2013-07-01"),as.Date("2020-01-01")),
limits = c(as.Date("2014-01-01"),as.Date("2020-01-01")),
# limits = c(as.Date("2017-01-10"),as.Date("2018-12-01")),
expand=c(0,0)
) +
coord_cartesian(
ylim= c(0,80)
)
ggsave("img/telediarios/all-telediarios-duration.png",
width=12,
height=6,
dpi=260)
duration %>% mutate(
# simplifies times when times are not the standard T15 and T21.
# Transforms T22 and T20 to T21, and T14 to T15.
# T15 is 15:00h program and T21 is 21:00h program
telediario = ifelse( telediario == "T14", "T15",telediario),
telediario = ifelse( telediario == "T16", "T15",telediario),
telediario = ifelse( telediario == "T20", "T21",telediario),
telediario = ifelse( telediario == "T22", "T21",telediario),
telediario = ifelse( telediario == "T15", "Telediario 15h",telediario),
telediario = ifelse( telediario == "T21", "Telediario 21h",telediario),
) %>% filter ( telediario != "T08" & telediario != "T07") %>%
# duration all the Teledairios lines avg -----
duration %>%
ggplot() +
geom_col( aes( date,time)) +
geom_line( data = . %>% filter( weekday =="Sunday") , aes( date,time/60), size=1, color="#bb3355") +
geom_line( data = . %>% filter( weekday =="Saturday") , aes( date,time/60), size=1, color="orange") +
geom_line( data = . %>% ungroup() %>% filter( ! weekday %in% c("Sunday","Saturday")) %>% group_by(telediario) %>% mutate( time_avg = zoo::rollmean(time,k=10,fill=NA)) ,
aes( date,time_avg/60), color="black", size=1) +
scale_color_manual( values=c("darkred","#000000"))+
geom_hline( yintercept = 3600) +
facet_wrap(~telediario, ncol = 1) +
theme_minimal( base_family = "Roboto condensed") +
theme(
legend.position = "top",
panel.grid.minor.x = element_blank(),
# panel.grid.minor.y = element_blank(),
# panel.grid.major.x = element_blank(),
# panel.grid.major.y = element_blank(),
axis.ticks = element_line(color="#888888"),
plot.background = element_rect(fill="white",color="white")
) +
labs(
title = paste("Duration of television newscasts by day of the week",sep = ""),
subtitle = paste(" 🟥 Sunday 🟧 Saturday ⬛Monday to Friday"),
x="",
y="minutes",
caption="Verba dataset. Monday to Friday are averaged with a rolling window of 14 days"
)+
scale_x_date(
date_breaks = "6 month",
# date_labels = "%m/%y",
labels = format_dates,
# limits = c(as.Date("2013-07-01"),as.Date("2020-01-01")),
limits = c(as.Date("2014-01-01"),as.Date("2020-01-01")),
# limits = c(as.Date("2017-01-10"),as.Date("2018-12-01")),
expand=c(0,0)
) +
coord_cartesian(
ylim= c(0,80)
)
ggsave("img/telediarios/all-telediarios-duration_line-avg.png",
width=12,
height=6,
dpi=260)
# duration Teledairios Cifuentes -----
duration %>% mutate(
) %>%
ggplot() +
geom_col( aes( date,time/60, fill=weekend), width=1) +
scale_fill_manual( values=c("#000000","#999999"))+
# geom_hline( yintercept = 3600) +
facet_wrap(~telediario, ncol = 1) +
theme_minimal( base_family = "Roboto condensed", base_size= 8) +
theme(
legend.position = "top",
panel.grid.minor.x = element_blank(),
# panel.grid.minor.y = element_blank(),
panel.grid.major.x = element_blank(),
# panel.grid.major.y = element_blank(),
axis.ticks = element_line(color="#888888"),
plot.background = element_rect(fill="white",color="white")
) +
labs(
title = paste("Verba dataset",sep = ""),
subtitle = paste("Availability and length of television newscasts: Telediarios on La 1. RTVE"),
x="2018",
y="minutes",
caption="Verba. (2022). [HTML]. Fundación Ciudadana Civio. https://github.com/civio/verba (Original work published 2018)"
)+
scale_x_date(
limits = c(as.Date("2018-03-10"),as.Date("2018-05-01"))
date_breaks = "1 day",
# date_labels = "%m/%y",
labels = format_dates_days,
expand=c(0,0),
limits = c(as.Date("2018-03-15"),as.Date("2018-05-01"))
) +
coord_cartesian(
# ylim= c(0,70)
)
ggsave("img/telediarios/all-telediarios-duration_cifuentes.png",
width=8,
height=4,
dpi=260)
# When they speak plot Sports--------------
telediarios %>%
filter(
date > "2018-02-10"
& date < "2018-06-01"
# date == "2018-04-01"
# & telediario =="Telediario 15h"
) %>%
ggplot( ) +
# primer minuto y medio (90 segundos)
# geom_rect(aes( xmin=min(data$date), xmax=max(data$date),
# ymin=0,ymax=set_minutes*60/60), alpha = 0.2, fill = "lightgrey"
# ) +
# annotate("text", x= min(data$date)+5, y= set_minutes/2, label=paste("First minutes",set_minutes), family = "Roboto Condensed") +
geom_hline(aes(yintercept=4), size=0.8, color="#bbbbbb") +
geom_segment( aes(x = date, xend = date, y = seconds/60, yend = (seconds+4)/60), alpha = 0.2, size=3, color= "#999999") + #, color=caso
geom_segment( data = . %>% filter( grepl("deportes|Real Madrid|champions|tenis|Camp Nou|Bernabeu|Messi|Cristiano|Simeone|Atlético|fútbol|liga",text) ),
aes(x = date, xend = date, y = seconds/60, yend = (seconds+20)/60), alpha = 0.7, size=3, color= "red") + #, color=caso
# geom_text( aes(x = date,y = seconds/60, label=text), alpha = 0.7, size=3, color= "#000000") + #, color=caso
# geom_text(data = . %>% filter( grepl("deportes",text) ) , aes(x = date,y = seconds/60, label=text), alpha = 0.7, size=3, color= "red") + #, color=caso
geom_hline(aes(yintercept=0), size=0.1) +
# anotate
# geom_curve(aes(x = as.POSIXct("2018-03-24"), y = 0.5, xend = as.POSIXct("2018-03-26"), yend = 7),
# color="#999999", data =data, curvature = -0.2, size = 0.1) +
# annotate(geom = "text", x = as.POSIXct("2018-03-26"), y = 7, label = "Portada (1:30 minutos)",
# family = "Roboto Condensed", hjust = 0,size=6,size=0.6) +
theme_minimal(base_family = "Roboto Condensed", base_size = 9) +
scale_y_continuous(
limits = c(0,75)
) +
scale_x_date(
date_breaks = "2 day",
# minor_breaks = "1 day",
expand = c(0,0),
# limits = c(as.Date("2018-03-10"),as.Date("2018-05-01")),
# limits = c(as.Date("2018-01-10"),as.Date("2018-05-01")),
# labels = format_dates
labels = format_dates_days
) +
labs(title = paste("When they speak about Sports... to find duration of news summary",sep = ""),
subtitle = paste("🟥 Sports related words",subtitle_text),
x = time_text,
y = "minutos since the start",
caption = paste(caption_text, "Highlighed in red are words related to sports" ) ) +
theme(
panel.grid.minor.x = element_blank(),
panel.grid.minor.y = element_blank(),
panel.grid.major.x = element_blank(),
# panel.grid.major.y = element_blank(),
plot.background = element_rect(fill="white",color="white"),
axis.ticks = element_line(color="#888888")
) +
facet_wrap( ~telediario, ncol=1) +
# anotate
geom_curve(aes(x = as.Date("2018-03-24"), y = 4, xend = as.Date("2018-03-26"), yend = 12),
color="#000000", data =data, curvature = -0.2, size = 0.35) +
annotate(geom = "text", x = as.Date("2018-03-26"), y = 12, label = "Sport news at the end of summary (4 minutes)",
family = "Roboto Condensed", hjust = 0, size=4, size=0.6)
ggsave("img/telediarios/all-telediarios-when-they-speak-sports.png",
width=12,
height=7,
dpi=260)
......@@ -67,9 +67,11 @@ if (file.exists( paste0("img/",el_caso) )){
# Load data ---------------
# load online
data <- read.delim( paste0("https://verba.civio.es/api/search.csv?q=", search,"&size=10000"),sep = ",")
# data <- read.delim( paste0("https://verba.civio.es/api/search.csv?q=", search,"&size=10000"),sep = ",")
# load local data
data <- read.delim("data/verba/180320_180430_cifuentes-y-palabras-relacionadas_clasificado.csv",sep = ",")
# data <- read.delim("data/verba/180320_180430_cifuentes-y-palabras-relacionadas_2.csv",sep = ",")
# data <- read.delim("data/verba/180320_180430_cifuentes-y-palabras-relacionadas.csv",sep = ",")
# data <- read.delim("data/verba/180320_180430_cifuentes.csv",sep = ",")
# data <- read.delim("data/verba/140104_140228_barcenas-palabras-relacionadas_01.csv",sep = ",")
......@@ -78,7 +80,6 @@ data <- read.delim("data/verba/180320_180430_cifuentes-y-palabras-relacionadas_c
# Preprocess data -------
data <- data %>% mutate(
# Preprocess data -------
# Transform raw verba data into date format
date = as.Date(programme_date),
# Transforms in to date-time format. TODO: loses hour!
......@@ -98,13 +99,59 @@ data <- data %>% mutate(
) %>% filter ( telediario != "T08" & telediario != "T07")
# Use duration df from all_telediarios.R
data <- left_join(
data %>% mutate( dunique = paste0(telediario,date ) ),
duration %>% mutate( dunique = paste0(telediario,date ) ) %>% ungroup() %>%
select(dunique,time,sentences,weekday,weekend),
by ="dunique"
)
# Set summary threshold
set_minutes <- 3
data <- data %>% mutate(
first_minutes = ifelse( start_time/60 < set_minutes, "yes", "no")
first_minutes = ifelse( start_time/60 <= set_minutes, "yes", "no")
)
# si existe clasificación manual de las noticias -------------------
data <- data %>% mutate(
caso = ifelse( caso == "dimision", "resignation",caso),
caso = ifelse( caso == "duda", "other",caso),
caso = ifelse( caso == "doubt", "other",caso),
caso = ifelse( caso == "otro", "other",caso),
caso = ifelse( caso == "robo", "robbery video scandal",caso),
caso = ifelse( caso == "master", "master scandal",caso)
)
# setting
mycolors <- c("master scandal"="#991122",
# "resignation"="#cc9999",
"robbery video scandal"="#666699",
"resignation"="#559933",
# "doubt"= "#999999",
"other"= "#444444" )
# COunt sentences per day
sentences_count <- data %>% mutate(
# prominence calculation 1
# position / total time + 1
prom = (time - start_time) / time +1,
# prominence 2nd formula
# If it is in first_minutes multiply by 2
in_intro = ifelse( first_minutes == "yes", 1, 0),
prom2 = ( (time - start_time) / time + 1 ) * (1 + in_intro)
) %>% group_by(date,telediario, caso,sentences,time) %>% summarise(
count = n(),
prominence = sum( prom),
prominence2 = sum( prom2)
) %>% mutate(
sentences_per = round( 100* count / sentences, digits=2)
)
# Plots ----------------------
# Plots ----------------------
# 1. Número de frases por telediario ------
png(filename = paste0("img/",caso_path ,"/apariciones-n-telediarios-",caso_path,"_01.png"), width = 1200,height = 700)
data %>%
......@@ -126,7 +173,7 @@ ggplot() +
# labs(title = paste("Número de frases sobre ",el_caso,sep = ""),
labs(title = paste("Number of sentences about ",el_caso,sep = ""),
subtitle = subtitle_text,
x = NULL,
x = time_text,
y = "number of sentences",
fill ="in first minutes?",
caption = caption_text) +
......@@ -142,14 +189,27 @@ ggplot() +
dev.off()
# 2. Cuando hablan del caso dentro del Telediario ----------
png(filename=paste0("img/",caso_path ,"/apariciones-cuando-telediarios_",caso_path,"_01.png"),width = 1200,height = 800)
png(filename=paste0("img/",caso_path ,"/apariciones-cuando-telediarios_",caso_path,"_01_c.png"),width = 1200,height = 800)
ggplot(data = data ) + #%>% filter( date> "2018-04-01" & date < "2018-04-05" )
# primer minuto y medio (90 segundos)
geom_rect(aes( xmin=min(data$date), xmax=max(data$date),
ymin=0,ymax=set_minutes*60/60), alpha = 0.2, fill = "lightgrey"
) +
annotate("text", x= min(data$date)+5, y= set_minutes/2, label=paste("First minutes",set_minutes), family = "Roboto Condensed") +
# geom_rect(aes( xmin=min(data$date), xmax=max(data$date),
# ymin=0,ymax=set_minutes*60/60), fill = "#3344bbee"
# ) +
geom_hline( yintercept = 3, color ="#3344bb", size=0.9) +
# background all sentences .............
geom_segment(data =telediarios %>% filter( date> "2018-03-18" & date < "2018-05-01" ),
aes(x = date, xend = date, y = seconds/60, yend = (seconds+30)/60),
alpha = 0.07, size=12, color="#bbbbbb") + #, color=caso
scale_y_continuous( limits=c(0,65)) +
# ...............
geom_segment( aes(x = date, xend = date, y = start_time/60, yend = (start_time+30)/60), alpha = 0.8, size=12) + #, color=caso
annotate("text", x= min(data$date)+5, y= set_minutes/2+4, label=paste("First minutes",set_minutes),
family = "Roboto Condensed", color ="#3344bb", size=5) +
geom_hline(aes(yintercept=0), size=0.1) +
# anotate
# geom_curve(aes(x = as.POSIXct("2018-03-24"), y = 0.5, xend = as.POSIXct("2018-03-26"), yend = 7),
......@@ -169,7 +229,7 @@ ggplot(data = data ) + #%>% filter( date> "2018-04-01" & date < "2018-04-05" )
) +
labs(title = paste("When they speak about ",el_caso, " in the news cast ",sep = ""),
subtitle = paste(subtitle_text),
x = NULL,
x = time_text,
y = "minutos since the start",
caption = caption_text) +
theme(
......@@ -182,26 +242,18 @@ ggplot(data = data ) + #%>% filter( date> "2018-04-01" & date < "2018-04-05" )
facet_wrap( ~telediario, ncol=1)
dev.off()
# si existe clasificación manual de las noticias -------------------
data <- data %>% mutate(
caso = ifelse( caso == "dimision", "resignation",caso),
caso = ifelse( caso == "duda", "doubt",caso),
caso = ifelse( caso == "otro", "other",caso),
caso = ifelse( caso == "robo", "robbery video scandal",caso),
caso = ifelse( caso == "master", "master scandal",caso)
)
mycolors <- c( "#999999","#991122","#cc9999","#559933","#666699" )
# 3. Número de frases por telediario: clasificadas ---------
# 3.A Número de frases por telediario: clasificadas ---------
png(filename=paste0("img/",caso_path ,"/apariciones-n-telediarios-",caso_path,"_clasificado_02.png"),width = 1200,height = 700)
data %>% filter(
# caso == "other"
caso != "doubt"
) %>% group_by(date, telediario, caso) %>% summarise(
count = n()
) %>%
ggplot( ) +
geom_col(aes( x=date, y=count, fill=caso )) + # si hay clasificación por caso: fill=caso
scale_fill_manual(values=mycolors) +
theme_minimal(base_family = "Roboto Condensed", base_size = 22) +
# scale_x_datetime(date_breaks = "1 day", date_labels = "%d") +
scale_x_date(
......@@ -214,11 +266,10 @@ ggplot( ) +
# sec.axis = sec_axis(~ .,
# labels = scales::time_format("%b/%y"))
) +
scale_fill_manual(values=mycolors) +
# labs(title = paste("Número de frases sobre ",el_caso,sep = ""),
labs(title = paste("Number of sentences about ",el_caso,sep = ""),
subtitle = subtitle_text,
x = NULL,
x = time_text,
# y = "nº de frases",
y = "number of sentences",
fill = "",
......@@ -234,53 +285,189 @@ ggplot( ) +
facet_wrap( ~telediario, ncol=1)
dev.off()
# 3.B % Número de frases por telediario: clasificadas ---------
png(filename=paste0("img/",caso_path ,"/apariciones-n-telediarios-",caso_path,"_clasificado_02_percentage.png"),width = 1200,height = 700)
sentences_count %>% filter(
# caso != "doubt"
) %>%
ggplot( ) +
geom_col(aes( x=date, y=sentences_per, fill=caso )) + # si hay clasificación por caso: fill=caso
# geom_col(aes( x=date+0.5, y=prominence, fill=caso ), width=0.1) +
scale_fill_manual(values=mycolors) +
theme_minimal(base_family = "Roboto Condensed", base_size = 22) +
# scale_x_datetime(date_breaks = "1 day", date_labels = "%d") +
scale_x_date(
date_breaks = "2 day",
minor_breaks = "1 day",
expand = c(0,0),
limits = date_limits,
labels = format_dates_days
#secondary axis to add months
# sec.axis = sec_axis(~ .,
# labels = scales::time_format("%b/%y"))
) +
# labs(title = paste("Número de frases sobre ",el_caso,sep = ""),
labs(title = paste("Percentage of sentences about ",el_caso,sep = ""),
subtitle = subtitle_text,
x = time_text,
# y = "nº de frases",
y = "% of sentences",
fill = "",
caption = caption_text) +
theme(
legend.position = "top",
panel.grid.minor.x = element_blank(),
panel.grid.minor.y = element_blank(),
panel.grid.major.x = element_blank(),
# panel.grid.major.y = element_blank(),
axis.ticks = element_line(color="#888888")
) +
facet_wrap( ~telediario, ncol=1)
dev.off()
# 3.C Prominence 1 por telediario: clasificadas ---------
png(filename=paste0("img/",caso_path ,"/apariciones-n-telediarios-",caso_path,"_clasificado_02_prominence1.png"),width = 1200,height = 700)
sentences_count %>% filter(
# caso != "doubt"
) %>%
ggplot( ) +
geom_col(aes( x=date, y=prominence, fill=caso )) +
scale_fill_manual(values=mycolors) +
theme_minimal(base_family = "Roboto Condensed", base_size = 22) +
# scale_x_datetime(date_breaks = "1 day", date_labels = "%d") +
scale_x_date(
date_breaks = "2 day",
minor_breaks = "1 day",
expand = c(0,0),
limits = date_limits,
labels = format_dates_days
) +
labs(title = paste("Prominence method 1 of news about ",el_caso,sep = ""),
subtitle = subtitle_text,
x = time_text,
# y = "nº de frases",
y = "prominence",
fill = "",
caption = paste("Prominence for each sentence is calculated with: (total news program time - start_time) / total news program time + 1") ) +
theme(
legend.position = "top",
panel.grid.minor.x = element_blank(),
panel.grid.minor.y = element_blank(),
panel.grid.major.x = element_blank(),
# panel.grid.major.y = element_blank(),
axis.ticks = element_line(color="#888888")
) +
facet_wrap( ~telediario, ncol=1)
dev.off()
# 3.d Prominence 1 por telediario: clasificadas ---------
png(filename=paste0("img/",caso_path ,"/apariciones-n-telediarios-",caso_path,"_clasificado_02_prominence2.png"),width = 1200,height = 700)
sentences_count %>% filter(
# caso != "doubt"
) %>%
ggplot( ) +
geom_col(aes( x=date, y=prominence2, fill=caso )) +
scale_fill_manual(values=mycolors) +
theme_minimal(base_family = "Roboto Condensed", base_size = 22) +
# scale_x_datetime(date_breaks = "1 day", date_labels = "%d") +