Commit abf1e601 authored by numeroteca's avatar numeroteca

adds multiple newspapers: elpais + elconfidencial

parent 57c5399f
# Load libraries
library(rvest)
library(stringr)
library(R.utils) # opens gzip compresed file
# library(R.utils) # opens gzip compresed file
library(gsubfn) # select text in the parenthesis with regex
library(tidyverse) # for ggplot
......@@ -9,10 +9,12 @@ library(tidyverse) # for ggplot
# Pattern used further down with grepl() to flag headlines; "|" separates the two search terms.
word <- "Cifuentes|Enrique Álvarez Conde"
# open compressed file
# NOTE(review): the comment two lines below says decompressing is unnecessary, and the same call
# appears commented out right after this one - confirm whether this active gunzip() should remain.
gunzip("eldiario/http!www.eldiario.es!!!!@2018-04-06T23:01:07.888847+00:00.gz",remove=FALSE)
# gunzip("eldiario/http!www.eldiario.es!!!!@2018-04-06T23:01:07.888847+00:00.gz",remove=FALSE)
# not needed: read_html reads the .gz file without having to decompress it
# ------- eldiario.es ----------------------
# reads html and stores it
# NOTE(review): `page` is assigned twice in a row (first from "eldiario/", then from "data/");
# the second assignment overwrites the first - confirm which path is the intended one.
page <- read_html("eldiario/http!www.eldiario.es!!!!@2018-04-06T23:01:07.888847+00:00.gz")
page <- read_html("data/http!www.eldiario.es!!!!@2018-04-06T23:01:07.888847+00:00.gz")
# gets all the text in article titles. All articles are in h2 except the comics.
# The result is a one-column data frame with one row per matched "article h2 a" node.
titles <- page %>% html_nodes("article h2 a") %>% html_text() %>% data.frame()
......@@ -22,11 +24,25 @@ titles$title <- as.character(titles$title)
# total number of headline links found on the page
n_news <- nrow(titles)
# Second eldiario.es snapshot (file stamped 2018-04-05T10:01), processed with the same
# selector and column naming as the first one.
# NOTE(review): pagex / titlesx / n_newsx are not referenced again in the visible part of
# this script - confirm they are still needed.
pagex <- read_html( paste("data/","http!www.eldiario.es!!!!@2018-04-05T10:01:07.216603+00:00.gz", sep = "") )
titlesx <- pagex %>% html_nodes("article h2 a") %>% html_text() %>% data.frame()
colnames(titlesx) <- "title"
titlesx$title <- as.character(titlesx$title)
n_newsx <- nrow(titlesx)
# select the news whose title matches the search pattern in `word`
select_news <- data.frame(titles[grepl(word, titles$title),])
# Results
# number of articles that contain words
n_select_news<- nrow(select_news)
# Percentage of articles that contain words
percent <- n_select_news / n_news * 100
# ------- elconfidencial ----------------------
# reads html and stores it
# NOTE(review): from here on page, titles, n_news and select_news are reused, so the
# eldiario.es values computed above are overwritten.
page <- read_html("data/http!www.elconfidencial.com!|!!!@2018-03-22T09:01:11.318750+00:00.gz")
# gets all the text in article titles. For this newspaper the selector is "article h3 a".
titles <- page %>% html_nodes("article h3 a") %>% html_text() %>% data.frame()
colnames(titles) <- "title"
# total number of headline links found on the page
n_news <- nrow(titles)
# select the news whose title matches the search pattern in `word`
select_news <- data.frame(titles[grepl(word, titles$title),])
......@@ -37,7 +53,47 @@ n_select_news<- nrow(select_news)
# Percentage of articles that contain words
percent <- n_select_news / n_news * 100
# For a list of pages
# ------- elmundo ----------------------
# reads html and stores it
# NOTE(review): this elmundo snapshot is read from the "eldiario/" folder and is an .html
# file, unlike the other snapshots which are .gz files under "data/" - confirm the path.
pageelmundo <- read_html("eldiario/http!www.elmundo.es!!!!@2018-04-07T19:01:02.620498+00:00.html")
# gets all the text in article titles. The selector used here is "article h3 a".
titles <- pageelmundo %>% html_nodes("article h3 a") %>% html_text() %>% data.frame() # TODO: does not work
colnames(titles) <- "title"
# total number of headline links found on the page
n_news <- nrow(titles)
# select the news whose title matches the search pattern in `word`
select_news <- data.frame(titles[grepl(word, titles$title),])
# Results
# number of articles that contain words
n_select_news<- nrow(select_news)
# Percentage of articles that contain words
percent <- n_select_news / n_news * 100
# ------- elpais ----------------------
# reads html and stores it
pageelpais <- read_html("data/http!www.elpais.com!!!!@2017-07-04T13:51:08.133418+00:00.gz")
# gets all the text in article titles. The selector used here is "article h2 a".
titles <- pageelpais %>% html_nodes("article h2 a") %>% html_text() %>% data.frame() # TODO: does not work
colnames(titles) <- "title"
# total number of headline links found on the page
n_news <- nrow(titles)
# select the news whose title matches the search pattern in `word`
select_news <- data.frame(titles[grepl(word, titles$title),])
# Results
# number of articles that contain words
n_select_news<- nrow(select_news)
# Percentage of articles that contain words
percent <- n_select_news / n_news * 100
# ---------For a list of pages---------------
# Read the list of files. The list has been generated with this bash script:
# for f in *.gz; do echo "$f" >> mylist.txt; done
......@@ -61,9 +117,12 @@ list$hour <- as.numeric(strapplyc( as.character(list$urls), ".*@[0-9]*-[0-9]*-[0
# create date: day/month/year columns are pasted together and parsed as a Date
list$date <- as.Date( paste(list$day,"/",list$month,"/",list$year,sep = "" ), "%d/%m/%Y")
# timestamp at hour resolution (minutes and seconds are fixed to 00)
list$timestamp <- as.POSIXlt( paste(list$year,"-",list$month,"-",list$day," ",list$hour,":00:00", sep = "" ))
# Create list of selected pages
# NOTE(review): `selected` is assigned twice; this first, eldiario-only filter is
# overwritten by the multi-newspaper filter right below - confirm it can be dropped.
selected <- list[list$newspaper == "eldiario" & list$date > "2018-3-20", ]
# Create list of selected pages. Select timeframe, newspapers
# Keeps rows for eldiario, elconfidencial, elpais or larazon dated after 2018-3-20.
selected <- list[(list$newspaper == "eldiario" | list$newspaper == "elconfidencial" |
list$newspaper == "elpais"| list$newspaper == "larazon") &
list$date > "2018-3-20", ]
# NOTE(review): `results` is set to "" and then immediately replaced by the data frame
# on the next line, so the first assignment has no lasting effect.
results <- ""
# one-column data frame with one row per selected page; the loop below fills it in
results <- data.frame(matrix(ncol = 1,nrow = nrow(selected) ))
......@@ -72,13 +131,19 @@ names(results) <- c("newspaper")
# For every selected snapshot: read the page, extract the headline links, count how many
# match `word`, and store the counts and percentage in row i of `results`.
# NOTE(review): 1:nrow(selected) iterates over c(1, 0) when `selected` has no rows;
# seq_len(nrow(selected)) would skip the loop in that case.
for (i in 1:nrow(selected)) {
page <- read_html( paste("data/",selected$urls[i], sep = "") )
# gets all the text in article titles. All articles are in h2 except the comics.
titles <- page %>% html_nodes("article h2 a") %>% html_text() %>% data.frame()
# Pick the CSS selector according to the newspaper of this row.
# NOTE(review): the filter that builds `selected` also admits "larazon", but no branch
# here matches it; for those rows `titles` keeps whatever was assigned before this if.
if ( selected$newspaper[i] == "eldiario" | selected$newspaper[i] == "elpais") {
# eldiario and elpais
# gets all the text in article titles. All articles are in h2 except the comics.
titles <- page %>% html_nodes("article h2 a") %>% html_text() %>% data.frame()
} else if ( selected$newspaper[i] == "elconfidencial" | selected$newspaper[i] == "elmundo") {
titles <- page %>% html_nodes("article h3 a") %>% html_text() %>% data.frame()
}
colnames(titles) <- "title"
titles$title <- as.character(titles$title)
# titles$title <- as.character(titles$title)
# total number of headline links found on the page
# NOTE(review): n_news is assigned twice with the same expression; one line is redundant.
n_news <- nrow(titles)
n_news <- nrow(titles)
print(n_news)
# select the news whose title matches the search pattern in `word`
selected_news <- data.frame(titles[grepl(word, titles$title),])
# NOTE(review): the next line is a diff hunk header, not R code; lines hidden by it are not visible here.
......@@ -86,6 +151,7 @@ for (i in 1:nrow(selected)) {
# Results
# number of articles that contain words
n_selected_news<- nrow(selected_news)
print(n_selected_news)
# Percentage of articles that contain words
# NOTE(review): when a page yields no titles this is 0 / 0 * 100, i.e. NaN.
percent <- n_selected_news / n_news * 100
# NOTE(review): the next line is a diff hunk header, not R code; lines hidden by it are not visible here.
......@@ -100,14 +166,56 @@ for (i in 1:nrow(selected)) {
# store the results for this snapshot in row i
results$n_selected_news[i] <- as.integer(n_selected_news)
results$percent[i] <- percent
# progress output: day alone, then day, hour and newspaper of the processed snapshot
print(selected$day[i])
print(paste("day:",selected$day[i],"hour:",selected$hour[i],selected$newspaper[i]))
}
# creates the timestamp used as x axis in the plots (hour resolution, minutes/seconds fixed to 00)
# presumably results$year/month/day/hour are filled inside the loop above, in lines not visible here - TODO confirm
results$date <- as.POSIXlt( paste(results$year,"-",results$month,"-",results$day," ",results$hour,":00:00", sep = "" ))
# NOTE(review): the line below ends with `+`, so as written it is chained onto the
# ggplot() call that follows the two comment lines; it looks like a leftover from an
# earlier revision - confirm and remove.
ggplot(data=results ) + ylim(c(0,100)) +
# -----------Plots------------
# Plot for a single newspaper (eldiario).
# Line colours in all plots: black = n_news, red = n_selected_news, blue = percent.
ggplot(data=results[results$newspaper=="eldiario",]) + ylim(c(0,100)) +
geom_line(aes(x=date, y=n_news),color="#000000") +
geom_line(aes(x=date, y=n_selected_news),color="#FF0000") +
geom_line(aes(x=date, y=percent),color="#0000DD") +
labs(title = "elDiario.es: noticias sobre Cifuentes en portada (total, total selected, %)")
# Plot for a single newspaper (elconfidencial); y axis limited to 0-130
ggplot(data=results[results$newspaper=="elconfidencial",]) + ylim(c(0,130)) +
geom_line(aes(x=date, y=n_news),color="#000000") +
geom_line(aes(x=date, y=n_selected_news),color="#FF0000") +
geom_line(aes(x=date, y=percent),color="#0000DD") +
labs(title = "elconfidencial: noticias sobre Cifuentes en portada (total, total selected, %)")
# Plot for a single newspaper (elpais); y axis limited to 0-130
ggplot(data=results[results$newspaper=="elpais",]) + ylim(c(0,130)) +
geom_line(aes(x=date, y=n_news),color="#000000") +
geom_line(aes(x=date, y=n_selected_news),color="#FF0000") +
geom_line(aes(x=date, y=percent),color="#0000DD") +
labs(title = "elPais: noticias sobre Cifuentes en portada (total, total selected, %)")
# Plot for a single newspaper (larazon); y axis limited to 0-130
ggplot(data=results[results$newspaper=="larazon",]) + ylim(c(0,130)) +
geom_line(aes(x=date, y=n_news),color="#000000") +
geom_line(aes(x=date, y=n_selected_news),color="#FF0000") +
geom_line(aes(x=date, y=percent),color="#0000DD") +
labs(title = "laRazon: noticias sobre Cifuentes en portada (total, total selected, %)")
# Plot for several newspapers: one line per newspaper (group=newspaper) for each of the
# three measures (black = n_news, red = n_selected_news, blue = percent).
# NOTE(review): the title names only elDiario and elconfidencial, while the data is the
# whole `results` frame - confirm the title matches the newspapers actually plotted.
ggplot(data=results ) + ylim(c(0,130)) +
geom_line(aes(x=date, y=n_news, group=newspaper),color="#000000") +
geom_line(aes(x=date, y=n_selected_news, group=newspaper),color="#FF0000") +
geom_line(aes(x=date, y=percent, group=newspaper),color="#0000DD") +
labs(title = "elDiario - elconfidencial: noticias sobre Cifuentes en portada")
# Plot for several newspapers: percentage only, y axis limited to 0-30.
# The two count layers below are commented out and therefore not drawn.
ggplot(data=results ) + ylim(c(0,30)) +
# geom_line(aes(x=date, y=n_news, group=newspaper),color="#000000") +
# geom_line(aes(x=date, y=n_selected_news, group=newspaper),color="#FF0000") +
geom_line(aes(x=date, y=percent, group=newspaper),color="#0000DD") +
labs(title = "elDiario-elConfidencial-ElPais: % noticias sobre Cifuentes en portada")
# Disabled geom_text annotation layers (not executed):
# geom_text(aes(x = "2018-03-21", y = 500, label = "Gros"), color = "#9846dd", alpha=1) +
# geom_text(aes(x = "2007", y = 900, label = "Media"), color = "#000000", alpha=1) +
# geom_text(aes(x = "2007", y = 700, label = "Altza"), color = "#568ba5", alpha=1) +
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment