Commit 384f06e1 authored by numeroteca

fix parser for elmundo, add infolibre. SER and ABC fail. Add test html

parent a7db199a
@@ -10,16 +10,26 @@ library(gsubfn) # select text in the parenthesis with regex
library(tidyverse) # for ggplot
# Set search variables: words and datelimits -----
word <- "Cifuentes|Javier Ramos|Enrique Álvarez Conde|Pablo Chico|María Teresa Feito|Alicia López de los Cobos|Cecilia Rosado|Clara Souto|Amalia Calonge|Universidad Rey Juan Carlos|URJC"
word <- "fondos reservados|caja b|Jorge Fernández Díaz|Bárcenas|destruir pruebas|Kitchen|Fernández Díaz"
word$explain <- "Fondos reservados para defender al PP"
# word <- "Cifuentes|Javier Ramos|Enrique Álvarez Conde|Pablo Chico|María Teresa Feito|Alicia López de los Cobos|Cecilia Rosado|Clara Souto|Amalia Calonge|Universidad Rey Juan Carlos|URJC"
# word <- "vox|Vox|VOX|Santiago Abascal|ortega smith|francisco serrano"
# Caso Fondos reservados para defender al PP
word1 <- "fondos reservados|caja b|Jorge Fernández Díaz|Bárcenas|destruir pruebas|Kitchen|Fernández Díaz"
word2 <- "PP"
word3 <- "caja b"
word4 <- "Bárcenas"
word5 <- "Kitchen"
word6 <- "Jorge Fernández Díaz|Fernández Díaz"
# Caso Master Cifuentes
word1 <- "Cifuentes|Javier Ramos|Enrique Álvarez Conde|Pablo Chico|María Teresa Feito|Alicia López de los Cobos|Cecilia Rosado|Clara Souto|Amalia Calonge|Universidad Rey Juan Carlos|URJC"
word2 <- "Cifuentes"
word3 <- "Cristina Cifuentes"
word4 <- "Universidad Rey Juan Carlos"
word5 <- "URJC"
word6 <- "Enrique Álvarez Conde"
# word1 <- "Cifuentes|Javier Ramos|Enrique Álvarez Conde|Pablo Chico|María Teresa Feito|Alicia López de los Cobos|Cecilia Rosado|Clara Souto|Amalia Calonge|Universidad Rey Juan Carlos|URJC"
# word2 <- "Cifuentes"
# word3 <- "Cristina Cifuentes"
# word4 <- "Universidad Rey Juan Carlos"
# word5 <- "URJC"
# word6 <- "Enrique Álvarez Conde"
# parties and leaders
# word1 <- "vox|Vox|VOX|Santiago Abascal|Abascal"
@@ -48,13 +58,22 @@ word$word[4] <- word4
word$word[5] <- word5
word$word[6] <- word6
# Select word to be displayed in plots
word1_explain <- "Caso Master"
word2_explain <- "Cifuentes"
word3_explain <- "Cristina Cifuentes"
word4_explain <- "Universidad Rey Juan Carlos"
word5_explain <- "URJC"
word6_explain <- "Enrique Álvarez Conde"
word1_explain <- "PPgate"
word2_explain <- "PP"
word3_explain <- "cajab"
word4_explain <- "Barcenas"
word5_explain <- "kitchen"
word6_explain <- "Jorge_Fdez_diaz"
# Select word to be displayed in plots
# word1_explain <- "Caso Master"
# word2_explain <- "Cifuentes"
# word3_explain <- "Cristina Cifuentes"
# word4_explain <- "Universidad Rey Juan Carlos"
# word5_explain <- "URJC"
# word6_explain <- "Enrique Álvarez Conde"
# # Select word to be displayed in plots
# word1_explain <- "VOX" #
@@ -79,10 +98,6 @@ word$explain[4] <- word4_explain
word$explain[5] <- word5_explain
word$explain[6] <- word6_explain
# Set time limits
my_limit <- c(as.POSIXct("2018-03-20 00:00:01"), as.POSIXct("2018-04-30 00:23:08"))
my_init <- as.POSIXlt("2018-03-20 00:00:00")
# open compressed file
# gunzip("eldiario/http!www.eldiario.es!!!!@2018-04-06T23:01:07.888847+00:00.gz",remove=FALSE)
# not needed: read_html() reads the .gz directly, no need to decompress
@@ -131,11 +146,10 @@ percent <- n_select_news / n_news * 100
# ------- elmundo ----------------------
# reads html and stores it
pageelmundo <- read_html("eldiario/http!www.elmundo.es!!!!@2018-04-07T19:01:02.620498+00:00_formated.html")
pageelmundo <- read_html("homepages_test/http!www.elmundo.es!!!!@2020-02-04T21:01:03.395289+00:00.gz")
# gets all the text in article titles. All articles are in h2 except the comics.
titles <- pageelmundo %>% html_nodes("main article h3 a") %>% html_text() %>% data.frame() #TODO NOT WORKING
titles
titles <- pageelmundo %>% html_nodes("main article h2") %>% html_text() %>% data.frame() #TODO NOT WORKING
colnames(titles) <- "title"
# total of articles with link
@@ -172,10 +186,10 @@ percent <- n_select_news / n_news * 100
# ------- La Razón ----------------------
# reads html and stores it
pagelarazon <- read_html("data/http!www.larazon.es!|!!!@2018-03-30T07:01:03.998265+00:00.gz", to="UTF-8") #TODO correct encoding
pagelarazon <- read_html("homepages_test/http!www.larazon.es!|!!!@2020-02-04T21:01:04.579808+00:00.gz", to="UTF-8") #TODO correct encoding
# gets all the text in article titles. All articles are in h2 except the comics.
titles <- pagelarazon %>% html_nodes("article h2 a") %>% html_text() %>% data.frame() #TODO NOT WORKING
titles <- pagelarazon %>% html_nodes("h2") %>% html_text() %>% data.frame() #TODO NOT WORKING
colnames(titles) <- "title"
titles
@@ -217,6 +231,76 @@ percent <- n_select_news / n_news * 100
# }
# }
# ------- Cadenaser ----------------------
# reads html and stores it
pageser <- read_html("homepages_test/http!cadenaser.com!|!!!@2020-02-04T21:01:45.200707+00:00.gz")
# gets all the text in article titles. All articles are in h2 except the comics.
titles <- pageser %>% html_nodes("article h2 a") %>% html_text() %>% data.frame() # This stopped working; they seem to use some JS and embed article data as JSON-LD:
# {"@context":"https:\/\/schema.org","@type":"NewsArticle",
# "mainEntityOfPage":{"@type":"WebPage","@id":"\/\/cadenaser.com\/ser\/2020\/02\/03\/ciencia\/1580724778_294079.html"},
# "url":"\/\/cadenaser.com\/ser\/2020\/02\/03\/ciencia\/1580724778_294079.html",
# "headline":"Google te conoce mejor que nadie: descubre cu\u00e1les son tus intereses seg\u00fan el buscador",
# "author":{"@type":"Person","name":"David Justo"},
# "contentLocation":{"@type":"Place","name":"Madrid"},
# "datePublished":"2020-02-03T12:23:58+01:00","dateModified":"2020-02-03T12:23:58+01:00",
# "image":{"@type":"ImageObject","url":"https:\/\/cadenaser00.epimg.net\/\/cadenaser00.epimg.net\/ser\/imagenes\/2020\/02\/03\/ciencia\/1580724778_294079_1580728804_portada_normal.jpg",
# "width":"720","height":"720"},
# "publisher":{"@type":"Organization","url":"https:\/\/cadenaser.com","name":"Cadena SER",
# "logo":{"@type":"ImageObject","url":"https:\/\/cadenaser00.epimg.net\/ser\/iconos\/v1.x\/v1.0\/logos\/logo_ser_cabecera_rss.png",
# "width":"600","height":"60"}}}
colnames(titles) <- "title"
# total of articles with link
n_news <- nrow(titles)
# select news that contain certain word
select_news <- data.frame(titles[grepl(word, titles$title),])
# Results
# number of articles that contain words
n_select_news<- nrow(select_news)
# Percentage of articles that contain words
percent <- n_select_news / n_news * 100
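A possible fallback, assuming cadenaser.com embeds the metadata shown above in <script type="application/ld+json"> tags (an assumption, not verified against the snapshot): parse those blocks with jsonlite and keep the NewsArticle headlines.
# Sketch: extract headlines from JSON-LD blocks instead of <article> markup.
library(jsonlite)
ld_blocks <- pageser %>%
  html_nodes(xpath = '//script[@type="application/ld+json"]') %>%
  html_text()
headlines <- unlist(lapply(ld_blocks, function(x) {
  obj <- tryCatch(fromJSON(x), error = function(e) NULL)
  if (!is.null(obj) && identical(obj[["@type"]], "NewsArticle")) obj$headline else NULL
}))
titles <- data.frame(title = headlines)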
# ------- ABC ----------------------
# reads html and stores it
pageabc <- read_html("homepages_test/http!www.abc.es!|!!!@2020-02-04T21:01:03.836719+00:00.gz")
# gets all the text in article titles. All articles are in h2 except the comics.
titles <- pageabc %>% html_nodes("article h3 a") %>% html_text() %>% data.frame() #TODO NOT WORKING
colnames(titles) <- "title"
# total of articles with link
n_news <- nrow(titles)
# select news that contain certain word
select_news <- data.frame(titles[grepl(word, titles$title),])
# Results
# number of articles that contain words
n_select_news<- nrow(select_news)
# Percentage of articles that contain words
percent <- n_select_news / n_news * 100
# ------- Infolibre ----------------------
# reads html and stores it
pageinfolibre <- read_html("homepages_test/https!www.infolibre.es!|!!!@2020-02-04T21:01:11.035956+00:00.gz")
# gets all the text in article titles. All articles are in h2 except the comics.
titles <- pageinfolibre %>% html_nodes("#ctd h2 a") %>% html_text() %>% data.frame() #TODO NOT WORKING
colnames(titles) <- "title"
# total of articles with link
n_news <- nrow(titles)
# select news that contain certain word
select_news <- data.frame(titles[grepl(word, titles$title),])
# Results
# number of articles that contain words
n_select_news<- nrow(select_news)
# Percentage of articles that contain words
percent <- n_select_news / n_news * 100
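Since several selectors above are flagged as not working, a small hypothetical helper (count_nodes is not part of the original script) can compare candidate selectors before settling on one:
# Hypothetical helper: how many nodes does each candidate selector match?
count_nodes <- function(page, selectors) {
  sapply(selectors, function(s) length(html_nodes(page, s)))
}
count_nodes(pageabc, c("article h3 a", "article h2 a", "h2 a", "h3 a"))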
# --------- Create data frame with all the newspapers + time and date---------------
# Read list of files. The list has been generated with this bash script,
@@ -255,13 +339,19 @@ load("data/list.Rda")
ggplot(list[list$newspaper == "larazon", ]) +
geom_point(aes(x=date,y=month), alpha = 0.005)
# which newspapers are stored, how many homepages each?
table(list$newspaper)
# Process home pages ---------------------------
# Create list of selected pages. Select timeframe, newspapers
# Create list of selected pages. Select timeframe and newspapers
selected <- list[(list$newspaper == "eldiario" | list$newspaper == "elconfidencial" |
list$newspaper == "elpais"| list$newspaper == "larazon" |
list$newspaper == "elespanol" | list$newspaper == "cadenaser.com") &
list$date > "2018-03-20" & list$date < "2018-04-30", ]
list$newspaper == "elpais"|
# list$newspaper == "larazon" |
list$newspaper == "elespanol" |
# list$newspaper == "cadenaser.com" |
list$newspaper == "abc" | list$newspaper == "elmundo" | list$newspaper == "infolibre") &
list$date > "2020-01-20" & list$date < "2020-02-05", ]
# Create results dataframe
results <- ""
@@ -271,16 +361,31 @@ names(results) <- c("newspaper")
# Loop to count how many titles per homepages have certain words
for (i in 1:nrow(selected)) {
page <- read_html( paste("../storytracker/data-homepages/",selected$urls[i], sep = "") )
# page <- read_html("homepages_test/http!www.elmundo.es!!!!@2020-02-04T21:01:03.395289+00:00.gz")
if ( selected$newspaper[i] == "eldiario" | selected$newspaper[i] == "elpais" | selected$newspaper[i] == "larazon") {
# Depending on the newspaper defines the parser to select the news
if ( selected$newspaper[i] == "eldiario" | selected$newspaper[i] == "elpais" ) {
# eldiario
# gets all the text in article titles. All articles are in h2 except the comics.
titles <- page %>% html_nodes("article h2 a") %>% html_text() %>% data.frame()
} else if ( selected$newspaper[i] == "elmundo" ) {
print("elmundo")
titles <- page %>% html_nodes("main article h2") %>% html_text() %>% data.frame()
} else if ( selected$newspaper[i] == "larazon" ) {
print("larazon")
titles <- page %>% html_nodes("h2") %>% html_text() %>% data.frame()
} else if ( selected$newspaper[i] == "cadenaser.com") {
print("cadenaser")
titles <- page %>% html_nodes("h2 a") %>% html_text() %>% data.frame()
} else if ( selected$newspaper[i] == "elconfidencial" | selected$newspaper[i] == "elmundo" | selected$newspaper[i] == "elespanol") {
titles <- page %>% html_nodes("article h3 a") %>% html_text() %>% data.frame()
} else if ( selected$newspaper[i] == "infolibre" ) {
titles <- page %>% html_nodes("#ctd h2 a") %>% html_text() %>% data.frame()
} else if ( selected$newspaper[i] == "abc" ) {
titles <- page %>% html_nodes("article h3 a") %>% html_text() %>% data.frame()
}
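The if/else chain could also be driven by a lookup table, so each outlet's selector lives in one place; a sketch assuming a single CSS selector per newspaper suffices:
# Sketch: selector lookup table (values taken from the branches above)
selectors <- c("eldiario"       = "article h2 a",
               "elpais"         = "article h2 a",
               "elmundo"        = "main article h2",
               "larazon"        = "h2",
               "cadenaser.com"  = "h2 a",
               "elconfidencial" = "article h3 a",
               "elespanol"      = "article h3 a",
               "infolibre"      = "#ctd h2 a",
               "abc"            = "article h3 a")
titles <- page %>% html_nodes(selectors[[selected$newspaper[i]]]) %>% html_text() %>% data.frame()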
# renames the column to title
colnames(titles) <- "title"
# titles$title <- as.character(titles$title)
@@ -295,43 +400,51 @@ for (i in 1:nrow(selected)) {
# total of articles with link
n_news <- nrow(titles)
print(paste("nº noticias:",n_news))
print(paste("------------- nº noticias:",n_news,"in",selected$newspaper[i]))
results$n_news[i] <- n_news
# select news that contain certain word
for (j in 1:nrow(word)) {
# for (j in 1:5) {
selected_news <- data.frame(titles[grepl(word$word[j], titles$title),])
# select news that contain certain words
# for (j in 1:1) { # for one word
# for (j in 1:5) { # for 5 words
for (j in 1:nrow(word)) { # for multiple words
selected_news <- data.frame(titles[grepl(word$word[j], titles$title),]) # multiple words
# selected_news <- data.frame(titles[grepl(word, titles$title),]) # one word
# Results
# number of articles that contain words
n_selected_news <- nrow(selected_news)
print(paste("1. nº noticias con las palabras",word$explain[j],n_selected_news))
print(paste("1. nº noticias con las palabras",word$explain[j],n_selected_news)) # multiple words
# print(paste("1. nº noticias con las palabras",word$explain[j],n_selected_news)) # one word
# print(selected_news)
# Percentage of articles that contain words
percent <- round(n_selected_news / n_news * 100, digits = 2)
print(paste("2. percent ",word$explain[j],percent))
# results$a <- 0
# results$b <- 0
# results$c <- 0
# results$d <- 0
# results$e <- 0
# results$f <- 0
# results$aa <- 0
# results$bb <- 0
# results$cc <- 0
# results$dd <- 0
# results$ee <- 0
# results$ff <- 0
# results$aaa <- 0
# results$bbb <- 0
# results$ccc <- 0
# results$ddd <- 0
# results$eee <- 0
# results$fff <- 0
# creates columns that will be renamed afterwards
print("creates fake columns")
results$a <- 0
results$b <- 0
results$c <- 0
results$d <- 0
results$e <- 0
results$f <- 0
results$aa <- 0
results$bb <- 0
results$cc <- 0
results$dd <- 0
results$ee <- 0
results$ff <- 0
results$aaa <- 0
results$bbb <- 0
results$ccc <- 0
results$ddd <- 0
results$eee <- 0
results$fff <- 0
# create column with variable names
# TODO: this is cool feature but a pain when it comes to producing the plots
names(results)[6+j] <- paste0("n_selected_news_",j,"_",word$explain[j])
names(results)[7+j+nrow(word)] <- paste0("percent_",j,"_",word$explain[j])
names(results)[8+j+2*nrow(word)] <- paste0("titles_",j,"_",word$explain[j])
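A worked check of the index arithmetic, with nrow(word) == 6:
# With nrow(word) == 6:
#   counts   go to columns 6+j           -> 7..12
#   percents go to columns 7+j+6  = 13+j -> 14..19
#   titles   go to columns 8+j+12 = 20+j -> 21..26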
@@ -346,7 +459,11 @@ for (i in 1:nrow(selected)) {
print(paste("year:",selected$year[i], "month:",selected$month[i], "day:",selected$day[i],"hour:",selected$hour[i],selected$newspaper[i]))
}
# results <- results %>% select(-a,-b,-c,-d,-e,-f,-aa,-bb,-cc,-dd,-ee,-ff,-aaa,-bbb,-ccc,-ddd,-eee,-fff)
# removes unused columns
results <- results %>% select(-a,-b,-c,-d,-e,-f,-aa,-bb,-cc,-dd,-ee,-ff,-aaa,-bbb,-ccc,-ddd,-eee,-fff)
# creates timestamp
results$date <- as.POSIXct( paste(results$year,"-",results$month,"-",results$day," ",results$hour,":00:00", sep = "" ))
# results5 <- ""
# results5 <- data.frame(matrix(ncol = 1,nrow = 5 ))
@@ -358,12 +475,10 @@ for (i in 1:nrow(selected)) {
# writeLines(iconv(readLines("tmp.html"), from = "ANSI_X3.4-1986", to = "UTF8"), "tmp2.html")
# iconv(selected_news2[,1], 'utf-8', 'ascii', sub='')
# creates timestamp
results$date <- as.POSIXct( paste(results$year,"-",results$month,"-",results$day," ",results$hour,":00:00", sep = "" ))
# Save results
save(results,file="data/results-ppgate_01.Rda")
# save(results,file="data/results-vox-01.Rda")
save(results,file="data/results-generales2019-6-partidos-1-25abril.Rda")
# save(results,file="data/results-generales2019-6-partidos-1-25abril.Rda")
# save(results,file="data/results-vox-diferencias-busqueda.Rda")
# Load other results
# load("data/results-cifuentes-01.Rda")