Commit effee153 authored by numeroteca

add La razon+elespanol. Add cleaner plots % and nº

parent abf1e601
......@@ -6,7 +6,7 @@ library(gsubfn) # select text in the parenthesis with regex
library(tidyverse) # for ggplot
# select word in titles
word <- "Cifuentes|Enrique Álvarez Conde"
word <- "Cifuentes|Javier Ramos|Enrique Álvarez Conde|Pablo Chico|María Teresa Feito|Alicia López de los Cobos|Cecilia Rosado|Clara Souto|Amalia Calonge|Universidad Rey Juan Carlos"
# open compressed file
# gunzip("eldiario/http!www.eldiario.es!!!!@2018-04-06T23:01:07.888847+00:00.gz",remove=FALSE)
......@@ -55,10 +55,11 @@ percent <- n_select_news / n_news * 100
# ------- elmundo ----------------------
# reads html and stores it
pageelmundo <- read_html("eldiario/http!www.elmundo.es!!!!@2018-04-07T19:01:02.620498+00:00.html")
pageelmundo <- read_html("eldiario/http!www.elmundo.es!!!!@2018-04-07T19:01:02.620498+00:00_formated.html")
# gets all the text in article titles. All articles are in h2 except the comics.
titles <- pageelmundo %>% html_nodes("article h3 a") %>% html_text() %>% data.frame() #TODO NO FUNCIONA
titles <- pageelmundo %>% html_nodes("main article h3 a") %>% html_text() %>% data.frame() #TODO NO FUNCIONA
titles
colnames(titles) <- "title"
# total of articles with link
......@@ -93,6 +94,27 @@ n_select_news<- nrow(select_news)
# Percentage of articles that contain words
percent <- n_select_news / n_news * 100
# ------- La Razón ----------------------
# Read one archived front page. `encoding` is the read_html() argument that
# declares the input encoding; the earlier `to = "UTF-8"` is not a
# read_html() parameter.
# TODO confirm the archived page really is UTF-8 encoded.
pagelarazon <- read_html(
  "data/http!www.larazon.es!|!!!@2018-03-30T07:01:03.998265+00:00.gz",
  encoding = "UTF-8"
)
# Text of every link matched by "article h2 a".
# TODO the original note says this selector does not work for this page.
titles <- pagelarazon %>%
  html_nodes("article h2 a") %>%
  html_text() %>%
  data.frame()
colnames(titles) <- "title"
titles
# total number of titles found
n_news <- nrow(titles)
# keep the titles that match the `word` pattern
select_news <- data.frame(titles[grepl(word, titles$title), ])
# Results
# number of titles that match the pattern
n_select_news <- nrow(select_news)
# percentage of titles that match the pattern (NaN when no title was found)
percent <- n_select_news / n_news * 100
# ---------For a list of pages---------------
# Read list of files. the list has been generated with this bash script:
......@@ -105,6 +127,8 @@ list <- read.delim("data/mylist.txt")
# strapplyc(as.character(test), "[a-z]*!([a-z]{1,61}.[a-zA-Z]{2,})", simplify = TRUE)
# extract name of newspaper
# The patterns below are literal text, so match them with fixed = TRUE;
# as regular expressions the "." would match any character.
list$urls <- sub("file './", "", list$urls, fixed = TRUE)
# drop the first remaining single quote
list$urls <- sub("'", "", list$urls, fixed = TRUE)
list$newspaper <- strapplyc( as.character(list$urls), "[a-z]*!([a-z]{1,61}.[a-zA-Z]{2,})", simplify = TRUE)
# list$newspaper <- as.factor(list$newspaper) # converting to factor does not work
list$newspaper <- sub("www.", "", list$newspaper, fixed = TRUE)
......@@ -119,10 +143,13 @@ list$hour <- as.numeric(strapplyc( as.character(list$urls), ".*@[0-9]*-[0-9]*-[0
list$date <- as.Date( paste(list$day,"/",list$month,"/",list$year,sep = "" ), "%d/%m/%Y")
list$timestamp <- as.POSIXlt( paste(list$year,"-",list$month,"-",list$day," ",list$hour,":00:00", sep = "" ))
save(list,file="data/list.Rda")
load("data/list.Rda")
# Create list of selected pages. Select timeframe, newspapers
selected <- list[(list$newspaper == "eldiario" | list$newspaper == "elconfidencial" |
list$newspaper == "elpais"| list$newspaper == "larazon") &
list$date > "2018-3-20", ]
list$newspaper == "elpais"| list$newspaper == "larazon" | list$newspaper == "elespanol") &
list$date > "2018-03-20", ]
results <- ""
results <- data.frame(matrix(ncol = 1,nrow = nrow(selected) ))
......@@ -131,11 +158,11 @@ names(results) <- c("newspaper")
for (i in 1:nrow(selected)) {
page <- read_html( paste("data/",selected$urls[i], sep = "") )
if ( selected$newspaper[i] == "eldiario" | selected$newspaper[i] == "elpais") {
if ( selected$newspaper[i] == "eldiario" | selected$newspaper[i] == "elpais" | selected$newspaper[i] == "larazon") {
# eldiario
# gets all the text in article titles. All articles are in h2 except the comics.
titles <- page %>% html_nodes("article h2 a") %>% html_text() %>% data.frame()
} else if ( selected$newspaper[i] == "elconfidencial" | selected$newspaper[i] == "elmundo") {
} else if ( selected$newspaper[i] == "elconfidencial" | selected$newspaper[i] == "elmundo" | selected$newspaper[i] == "elespanol") {
titles <- page %>% html_nodes("article h3 a") %>% html_text() %>% data.frame()
}
colnames(titles) <- "title"
......@@ -143,7 +170,7 @@ for (i in 1:nrow(selected)) {
# total of articles with link
n_news <- nrow(titles)
print(n_news)
print(paste("nº noticias:",n_news))
# select news that contain cerating word
selected_news <- data.frame(titles[grepl(word, titles$title),])
......@@ -151,7 +178,8 @@ for (i in 1:nrow(selected)) {
# Results
# number of articles that contain words
n_selected_news<- nrow(selected_news)
print(n_selected_news)
print(paste("nº noticias Cifuentes:",n_selected_news))
# print(selected_news)
# Percentage of articles that contain words
percent <- n_selected_news / n_news * 100
......@@ -171,15 +199,118 @@ for (i in 1:nrow(selected)) {
# creates time stampt
results$date <- as.POSIXlt( paste(results$year,"-",results$month,"-",results$day," ",results$hour,":00:00", sep = "" ))
save(results,file="data/results.Rda")
# -----------Plots------------
# Single-newspaper plots: eldiario rows of `results`, n_news against date.
# First version: default x axis.
ggplot(
  results[results$newspaper == "eldiario", ],
  aes(x = date, y = n_news)
) +
  geom_line(color = "#000000") +
  # geom_line(aes(y = n_selected_news), color = "#FF0000") +
  # geom_line(aes(y = percent), color = "#0000DD") +
  scale_y_continuous(limits = c(0, 100)) +
  labs(title = "eldiario.es: noticias sobre Cifuentes en portada (total, total selected, %)")

# Second version: one x-axis break per day inside a fixed date window.
ggplot(
  results[results$newspaper == "eldiario", ],
  aes(x = date, y = n_news)
) +
  geom_line(color = "#000000") +
  # geom_line(aes(y = n_selected_news), color = "#FF0000") +
  # geom_line(aes(y = percent), color = "#0000DD") +
  scale_y_continuous(limits = c(0, 100)) +
  scale_x_datetime(
    date_breaks = "1 day",
    date_labels = "%d",
    limits = c(
      as.POSIXct("2018-03-21 00:00:01"),
      as.POSIXct("2018-04-09 00:00:01")
    )
  ) +
  labs(title = "eldiario.es: nº noticias en portada")
# eldiario: n_news (black) and n_selected_news (red) against date.
ggplot(data = results[results$newspaper == "eldiario", ]) +
  ylim(c(0, 100)) +
  geom_line(aes(x = date, y = n_news), color = "#000000") +
  geom_line(aes(x = date, y = n_selected_news), color = "#FF0000") +
  # geom_line(aes(x = date, y = percent), color = "#0000DD") +
  scale_x_datetime(
    date_breaks = "1 day", date_labels = "%d",
    limits = c(as.POSIXct("2018-03-21 00:00:01"), as.POSIXct("2018-04-09 00:00:01"))
  ) +
  theme_minimal(base_family = "Roboto Condensed", base_size = 14) +
  # annotate() draws each label exactly once. geom_text() with constant
  # aesthetics inherited the plot data and repeated the label for every row,
  # which stacked the semi-transparent text until it looked opaque.
  annotate(
    "text",
    x = as.POSIXct("2018-03-25 00:00:00"), y = 19,
    label = "nº noticias sobre Cifuentes", family = "Roboto Condensed",
    color = "#FF0000", alpha = 0.6, hjust = 0
  ) +
  annotate(
    "text",
    x = as.POSIXct("2018-03-25 00:00:00"), y = 45,
    label = "nº noticias en portada", family = "Roboto Condensed",
    color = "#000000", alpha = 0.6, hjust = 0
  ) +
  labs(
    title = "eldiario.es: nº noticias en portada - noticias sobre Cifuentes",
    subtitle = "21 marzo - 9 abril 2018. numeroteca.org",
    x = NULL,
    y = NULL,
    caption = ""
  )
# eldiario: n_selected_news (red) and percent (blue) against date.
ggplot(data = results[results$newspaper == "eldiario", ]) +
  ylim(c(0, 30)) +
  # geom_line(aes(x = date, y = n_news), color = "#000000") +
  geom_line(aes(x = date, y = n_selected_news), color = "#FF0000") +
  geom_line(aes(x = date, y = percent), color = "#0000DD") +
  scale_x_datetime(
    date_breaks = "1 day", date_labels = "%d",
    limits = c(as.POSIXct("2018-03-21 00:00:01"), as.POSIXct("2018-04-09 00:00:01"))
  ) +
  theme_minimal(base_family = "Roboto Condensed", base_size = 14) +
  # annotate() draws each label exactly once. geom_text() with constant
  # aesthetics inherited the plot data and repeated the label for every row,
  # which stacked the semi-transparent text until it looked opaque.
  annotate(
    "text",
    x = as.POSIXct("2018-03-27 00:00:00"), y = 1,
    label = "nº noticias sobre Cifuentes", family = "Roboto Condensed",
    color = "#FF0000", alpha = 0.6, hjust = 0
  ) +
  annotate(
    "text",
    x = as.POSIXct("2018-03-27 00:00:00"), y = 15,
    label = "Porcentaje de noticias en portada", family = "Roboto Condensed",
    color = "#0000DD", alpha = 0.6, hjust = 0
  ) +
  labs(
    title = "eldiario.es: porcentaje noticias y nº noticias en portada sobre Cifuentes",
    subtitle = "21 marzo - 9 abril 2018. Datos: numeroteca.org",
    x = NULL,
    y = NULL,
    caption = ""
  )
# elconfidencial: n_selected_news (red) and percent (blue) against date.
ggplot(data = results[results$newspaper == "elconfidencial", ]) +
  ylim(c(0, 10)) +
  # geom_line(aes(x = date, y = n_news), color = "#000000") +
  geom_line(aes(x = date, y = n_selected_news), color = "#FF0000") +
  geom_line(aes(x = date, y = percent), color = "#0000DD") +
  scale_x_datetime(
    date_breaks = "1 day", date_labels = "%d",
    limits = c(as.POSIXct("2018-03-21 00:00:01"), as.POSIXct("2018-04-09 00:00:01"))
  ) +
  theme_minimal(base_family = "Roboto Condensed", base_size = 14) +
  # annotate() draws each label exactly once. geom_text() with constant
  # aesthetics inherited the plot data and repeated the label for every row,
  # which stacked the semi-transparent text until it looked opaque.
  annotate(
    "text",
    x = as.POSIXct("2018-03-27 00:00:00"), y = 5,
    label = "nº noticias sobre Cifuentes", family = "Roboto Condensed",
    color = "#FF0000", alpha = 0.6, hjust = 0
  ) +
  annotate(
    "text",
    x = as.POSIXct("2018-03-27 00:00:00"), y = 3,
    label = "Porcentaje de noticias en portada", family = "Roboto Condensed",
    color = "#0000DD", alpha = 0.6, hjust = 0
  ) +
  labs(
    title = "elconfidencial.es: porcentaje noticias y nº noticias en portada sobre Cifuentes",
    subtitle = "21 marzo - 9 abril 2018. Datos: numeroteca.org",
    x = "Días",
    y = NULL,
    caption = ""
  )
# elconfidencial: n_selected_news (red) and percent (blue) against date.
# A stray labs(title = "elDiario.es: ...") call used to sit in the middle of
# this chain without a trailing `+`. That ended the plot early and left the
# scale, theme and label layers below as a detached expression. It is removed
# here; the labs() call at the end sets the title.
ggplot(data = results[results$newspaper == "elconfidencial", ]) +
  ylim(c(0, 10)) +
  # geom_line(aes(x = date, y = n_news), color = "#000000") +
  geom_line(aes(x = date, y = n_selected_news), color = "#FF0000") +
  geom_line(aes(x = date, y = percent), color = "#0000DD") +
  scale_x_datetime(
    date_breaks = "1 day", date_labels = "%d",
    limits = c(as.POSIXct("2018-03-21 00:00:01"), as.POSIXct("2018-04-09 00:00:01"))
  ) +
  theme_minimal(base_family = "Roboto Condensed", base_size = 14) +
  # annotate() draws each label exactly once. geom_text() with constant
  # aesthetics inherited the plot data and repeated the label for every row,
  # which stacked the semi-transparent text until it looked opaque.
  annotate(
    "text",
    x = as.POSIXct("2018-03-27 00:00:00"), y = 5,
    label = "nº noticias sobre Cifuentes", family = "Roboto Condensed",
    color = "#FF0000", alpha = 0.6, hjust = 0
  ) +
  annotate(
    "text",
    x = as.POSIXct("2018-03-27 00:00:00"), y = 3,
    label = "Porcentaje de noticias en portada", family = "Roboto Condensed",
    color = "#0000DD", alpha = 0.6, hjust = 0
  ) +
  labs(
    title = "elconfidencial.es: porcentaje noticias y nº noticias en portada sobre Cifuentes",
    subtitle = "21 marzo - 9 abril 2018. Datos: numeroteca.org",
    x = "Días",
    y = NULL,
    caption = ""
  )
# elconfidencial: n_news (black) and n_selected_news (red) against date.
ggplot(data = results[results$newspaper == "elconfidencial", ]) +
  ylim(c(0, 120)) +
  geom_line(aes(x = date, y = n_news), color = "#000000") +
  geom_line(aes(x = date, y = n_selected_news), color = "#FF0000") +
  # geom_line(aes(x = date, y = percent), color = "#0000DD") +
  scale_x_datetime(
    date_breaks = "1 day", date_labels = "%d",
    limits = c(as.POSIXct("2018-03-21 00:00:01"), as.POSIXct("2018-04-09 00:00:01"))
  ) +
  theme_minimal(base_family = "Roboto Condensed", base_size = 14) +
  # annotate() draws each label exactly once. geom_text() with constant
  # aesthetics inherited the plot data and repeated the label for every row,
  # which stacked the semi-transparent text until it looked opaque.
  annotate(
    "text",
    x = as.POSIXct("2018-03-25 00:00:00"), y = 19,
    label = "nº noticias sobre Cifuentes", family = "Roboto Condensed",
    color = "#FF0000", alpha = 0.6, hjust = 0
  ) +
  annotate(
    "text",
    x = as.POSIXct("2018-03-25 00:00:00"), y = 45,
    label = "nº noticias en portada", family = "Roboto Condensed",
    color = "#000000", alpha = 0.6, hjust = 0
  ) +
  labs(
    title = "elconfidencial.es: nº noticias en portada - noticias sobre Cifuentes",
    subtitle = "21 marzo - 9 abril 2018. Datos: numeroteca.org",
    x = NULL,
    y = NULL,
    caption = ""
  )
# Plot para un único periódico
ggplot(data=results[results$newspaper=="elconfidencial",]) + ylim(c(0,130)) +
......@@ -197,15 +328,21 @@ ggplot(data=results[results$newspaper=="elpais",]) + ylim(c(0,130)) +
# Plot para un único periódico
ggplot(data=results[results$newspaper=="larazon",]) + ylim(c(0,130)) +
geom_line(aes(x=date, y=n_news),color="#000000") +
geom_line(aes(x=date, y=n_news),color="#000000",size=0.1) +
geom_line(aes(x=date, y=n_selected_news),color="#FF0000") +
geom_line(aes(x=date, y=percent),color="#0000DD") +
labs(title = "laRazon: noticias sobre Cifuentes en portada (total, total selected, %)")
# Single-newspaper plot: elespanol rows of `results`, three series on a
# shared date axis (n_news black, n_selected_news red, percent blue).
ggplot(
  results[results$newspaper == "elespanol", ],
  aes(x = date)
) +
  geom_line(aes(y = n_news), color = "#000000") +
  geom_line(aes(y = n_selected_news), color = "#FF0000") +
  geom_line(aes(y = percent), color = "#0000DD") +
  scale_y_continuous(limits = c(0, 140)) +
  labs(title = "elespanol: noticias sobre Cifuentes en portada (total, total selected, %)")
# Plot para varios periódico
ggplot(data=results ) + ylim(c(0,130)) +
geom_line(aes(x=date, y=n_news, group=newspaper),color="#000000") +
ggplot(data=results ) + ylim(c(0,140)) +
geom_line(aes(x=date, y=n_news, group=newspaper),color="#000000",size=0.2) +
geom_line(aes(x=date, y=n_selected_news, group=newspaper),color="#FF0000") +
geom_line(aes(x=date, y=percent, group=newspaper),color="#0000DD") +
labs(title = "elDiario - elconfidencial: noticias sobre Cifuentes en portada")
......@@ -214,8 +351,139 @@ ggplot(data=results ) + ylim(c(0,130)) +
ggplot(data=results ) + ylim(c(0,30)) +
# geom_line(aes(x=date, y=n_news, group=newspaper),color="#000000") +
# geom_line(aes(x=date, y=n_selected_news, group=newspaper),color="#FF0000") +
geom_line(aes(x=date, y=percent, group=newspaper),color="#0000DD") +
geom_line(aes(x=date, y=percent, group=newspaper),color="#0000DD",size=0.4,alpha=0.6) +
labs(title = "elDiario-elConfidencial-ElPais: % noticias sobre Cifuentes en portada")
# geom_text(aes(x = "2018-03-21", y = 500, label = "Gros"), color = "#9846dd", alpha=1) +
# geom_text(aes(x = "2007", y = 900, label = "Media"), color = "#000000", alpha=1) +
# geom_text(aes(x = "2007", y = 700, label = "Altza"), color = "#568ba5", alpha=1) +
# Plot for several newspapers: `percent` against date, one coloured line per
# newspaper. Palette: '#66c2a5', '#fc8d62', '#8da0cb', '#e78ac3', '#a6d854'.
ggplot() +
  theme_minimal(base_family = "Roboto Condensed", base_size = 14) +
  geom_line(data = results[results$newspaper == "eldiario", ], aes(x = date, y = percent, group = newspaper), color = "#66c2a5", size = 0.7) +
  geom_line(data = results[results$newspaper == "elconfidencial", ], aes(x = date, y = percent, group = newspaper), color = "#fc8d62", size = 0.7) +
  geom_line(data = results[results$newspaper == "elpais", ], aes(x = date, y = percent, group = newspaper), color = "#8da0cb", size = 0.7) +
  geom_line(data = results[results$newspaper == "larazon", ], aes(x = date, y = percent, group = newspaper), color = "#e78ac3", size = 0.7) +
  geom_line(data = results[results$newspaper == "elespanol", ], aes(x = date, y = percent, group = newspaper), color = "#a6d854", size = 0.7) +
  # theme(axis.text.y = element_text(size=10),
  #       # axis.title.y=element_blank(),
  #       axis.ticks.y =element_blank(),
  #       # axis.ticks.x =element_blank(),
  #       axis.text.x=element_text(size=9),
  #       axis.title.x=element_text(size=11),
  #       panel.grid.minor = element_blank(),
  #       panel.background = element_rect(fill="white"),
  #       panel.grid.major.y = element_line( size=.1, color="grey" ),
  #       # legend.position = "bottom",
  #       legend.text = element_text(size=15) ) +
  # Limits and breaks are set on one y scale. A separate ylim() followed by
  # scale_y_continuous() had its scale replaced, so the 0-30 limits were lost.
  scale_y_continuous(limits = c(0, 30), breaks = seq(0, 30, 5)) +
  scale_x_datetime(
    date_breaks = "1 day", date_labels = "%d",
    limits = c(as.POSIXct("2018-03-21 00:00:01"), as.POSIXct("2018-04-12 12:00:01"))
  ) +
  # Newspaper labels placed to the right of the lines.
  annotate("text", x = as.POSIXct("2018-04-10 00:00:00"), y = 11, label = "eldiario.es", color = "#66c2a5", alpha = 1, hjust = 0) +
  annotate("text", x = as.POSIXct("2018-04-10 00:00:00"), y = 1, label = "larazon.es", color = "#e78ac3", alpha = 1, hjust = 0) +
  annotate("text", x = as.POSIXct("2018-04-10 00:00:00"), y = 3, label = "elConfidencial.es", color = "#fc8d62", alpha = 1, hjust = 0) +
  annotate("text", x = as.POSIXct("2018-04-10 00:00:00"), y = 5, label = "elespanol.com", color = "#a6d854", alpha = 1, hjust = 0) +
  annotate("text", x = as.POSIXct("2018-04-10 00:00:00"), y = 2, label = "elpais.com", color = "#8da0cb", alpha = 1, hjust = 0) +
  # One labs() call holds the labels that took effect. The earlier
  # labs(title = ...), xlab() and ylab() calls were all overridden by it,
  # including the y title, which y = NULL removes.
  labs(
    title = "Porcentaje de noticias sobre Cifuentes en portada periódicos digitales",
    subtitle = "21 marzo - 9 abril 2018. Datos: numeroteca.org",
    x = "Días",
    y = NULL,
    caption = ""
  )
# Plot for several newspapers: n_selected_news against date, one coloured
# line per newspaper, with the newspaper names written next to the lines.
ggplot() +
  ylim(c(0, 24)) +
  # One geom_line layer per newspaper, built in the same order and with the
  # same colours as a hand-written sequence of five calls.
  Map(
    function(paper, line_colour) {
      geom_line(
        data = results[results$newspaper == paper, ],
        aes(x = date, y = n_selected_news, group = newspaper),
        color = line_colour,
        size = 0.7
      )
    },
    c("eldiario", "elconfidencial", "elpais", "larazon", "elespanol"),
    c("#66c2a5", "#fc8d62", "#8da0cb", "#e78ac3", "#a6d854")
  ) +
  labs(
    title = "Número de noticias sobre Cifuentes en portada (cada hora)",
    x = "Días",
    y = "nº noticias en portadas"
  ) +
  theme(
    axis.text.y = element_text(size = 10),
    # axis.title.y = element_blank(),
    axis.ticks.y = element_blank(),
    axis.ticks.x = element_blank(),
    axis.text.x = element_text(size = 9),
    axis.title.x = element_text(size = 11),
    panel.grid.minor = element_blank(),
    panel.background = element_rect(fill = "white"),
    panel.grid.major.y = element_line(size = .1, color = "grey"),
    # legend.position = "bottom",
    legend.text = element_text(size = 15)
  ) +
  # scale_y_continuous(breaks = seq(0, 30, 5)) +
  scale_x_datetime(
    date_breaks = "1 day",
    date_labels = "%d",
    limits = c(
      as.POSIXct("2018-03-21 00:00:01"),
      as.POSIXct("2018-04-12 12:00:01")
    )
  ) +
  geom_text(
    aes(x = as.POSIXlt("2018-04-10 00:00:00"), y = 8, label = "eldiario.es"),
    color = "#66c2a5", alpha = 1, hjust = 0
  ) +
  geom_text(
    aes(x = as.POSIXlt("2018-04-10 00:00:00"), y = 1, label = "larazon.es"),
    color = "#e78ac3", alpha = 1, hjust = 0
  ) +
  geom_text(
    aes(x = as.POSIXlt("2018-04-10 00:00:00"), y = 3, label = "elConfidencial.es"),
    color = "#fc8d62", alpha = 1, hjust = 0
  ) +
  geom_text(
    aes(x = as.POSIXlt("2018-04-10 00:00:00"), y = 5, label = "elespanol.com"),
    color = "#a6d854", alpha = 1, hjust = 0
  ) +
  geom_text(
    aes(x = as.POSIXlt("2018-04-10 00:00:00"), y = 2, label = "elpais.com"),
    color = "#8da0cb", alpha = 1, hjust = 0
  )
#
#
#
# Diagnostic pass over the "larazon" rows of `selected`: re-reads each saved
# page and prints the number of titles, the titles matching `word`, their
# count, and the day/hour of the capture.
# seq_len() keeps the loop from running when `selected` has no rows
# (1:nrow(selected) would iterate over c(1, 0)).
for (i in seq_len(nrow(selected))) {
  if (selected$newspaper[i] != "larazon") {
    next
  }
  page <- read_html(paste0("data/", selected$urls[i]))
  titles <- page %>% html_nodes("article h2 a") %>% html_text() %>% data.frame()
  colnames(titles) <- "title"
  # titles$title <- as.character(titles$title)

  # total number of titles found
  n_news <- nrow(titles)
  print(paste("nº noticias:", n_news))

  # titles that match the `word` pattern
  selected_news <- data.frame(titles[grepl(word, titles$title), ])
  print(selected_news)

  # Results
  # number of matching titles (printed once, with its label; the bare
  # duplicate print of the same number was dropped)
  n_selected_news <- nrow(selected_news)
  print(paste("nº noticias Cifuentes:", n_selected_news))

  # capture being reported
  print(paste("day:", selected$day[i], "hour:", selected$hour[i], selected$newspaper[i]))
}
# Minimal ggplot2 theme.
#
# Ported from the defunct opts()/theme_text()/theme_blank()/theme_segment()/
# theme_rect()/theme_line() API to theme() and element_*(), which current
# ggplot2 provides.
#
# Arguments:
#   size       text size used for every text element.
#   font       font family for every text element; NA (default) uses the
#              device default family.
#   face       font face for every text element.
#   panelColor panel background fill. The default reads `backgroundColor`,
#              which must exist in the calling environment.
#   axisColor  colour of the axis ticks.
#   gridColor  colour of the major grid lines. The default reads
#              `gridLinesColor`, which must exist in the calling environment.
#   textColor  colour of every text element.
# Returns a theme object to add to a plot with `+`.
theme_min <- function(size = 10, font = NA, face = "plain",
                      panelColor = backgroundColor, axisColor = "#999999",
                      gridColor = gridLinesColor, textColor = "black") {
  # "" asks for the device default family, which is what "no font given" means
  text_family <- if (is.na(font)) "" else font

  # Text element carrying the shared family/face/colour/size settings
  themed_text <- function(...) {
    element_text(
      family = text_family, face = face, colour = textColor,
      size = size, ...
    )
  }

  theme(
    axis.text.x = themed_text(),
    axis.text.y = themed_text(),
    axis.line = element_blank(),
    axis.ticks = element_line(colour = axisColor, size = 0.25),
    # fill = NA keeps the border from painting over the panel contents.
    # NOTE(review): this reads the global `backgroundColor`, not `panelColor`,
    # exactly as the original did - confirm that is intended.
    panel.border = element_rect(fill = NA, colour = backgroundColor),
    legend.background = element_blank(),
    legend.key = element_blank(),
    legend.key.size = unit(1.5, "lines"),
    legend.text = themed_text(hjust = 0),
    legend.title = themed_text(hjust = 0),
    panel.background = element_rect(fill = panelColor, colour = NA),
    panel.grid.major = element_line(colour = gridColor, size = 0.33),
    panel.grid.minor = element_blank(),
    strip.background = element_rect(fill = NA, colour = NA),
    strip.text.x = themed_text(hjust = 0),
    strip.text.y = themed_text(angle = -90),
    plot.title = themed_text(hjust = 0),
    plot.margin = unit(c(0.1, 0.1, 0.1, 0.1), "lines")
  )
}
## Create a custom font type. The name could be 'F', 'TEST', whatever.
## windowsFonts() is only available on Windows, so skip it elsewhere.
if (.Platform$OS.type == "windows") {
  windowsFonts(F = windowsFont('Wide Latin'))
}
## To use the font, append the following to a ggplot call. It is kept as a
## comment because a line starting with `+` is not a valid statement alone:
# + theme_min(font='F', size=10)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment