Commit 57c5399f authored by numeroteca's avatar numeroteca

calculates total and % for 1 newspaper in period of time

parents
# Load libraries
library(rvest)
library(stringr)
library(R.utils) # opens gzip compresed file
library(gsubfn) # select text in the parenthesis with regex
library(tidyverse) # for ggplot
# Regular expression matched against article titles; "|" separates the
# alternative terms to look for.
word <- "Cifuentes|Enrique Álvarez Conde"

# ---- Worked example on individual front-page snapshots ----

# Decompress the snapshot next to the archive. `remove = FALSE` keeps the .gz
# file; `skip = TRUE` returns the existing output instead of stopping with an
# error when the decompressed copy is already there, so the script can be
# re-run.
gunzip(
  "eldiario/http!www.eldiario.es!!!!@2018-04-06T23:01:07.888847+00:00.gz",
  remove = FALSE,
  skip = TRUE
)

# Parse the HTML. read_html() is given the .gz path directly.
page <- read_html(
  "eldiario/http!www.eldiario.es!!!!@2018-04-06T23:01:07.888847+00:00.gz"
)

# Text of every article title link. All articles are in h2 except the comics.
titles <- data.frame(
  title = as.character(html_text(html_nodes(page, "article h2 a"))),
  stringsAsFactors = FALSE
)

# Total number of articles with a link
n_news <- nrow(titles)

# Second snapshot, read the same way from the data/ directory. Its objects
# (pagex, titlesx, n_newsx) are not used again in the rest of this section.
pagex <- read_html(
  paste0("data/", "http!www.eldiario.es!!!!@2018-04-05T10:01:07.216603+00:00.gz")
)
titlesx <- data.frame(
  title = as.character(html_text(html_nodes(pagex, "article h2 a"))),
  stringsAsFactors = FALSE
)
n_newsx <- nrow(titlesx)

# Keep the titles that contain any of the search terms. `drop = FALSE` keeps
# the result a one-column data frame (column `title`) even for 0 or 1 matches.
select_news <- titles[grepl(word, titles$title), , drop = FALSE]

# Results
# Number of articles whose title matches
n_select_news <- nrow(select_news)
# Percentage of articles whose title matches (NaN when the page has no titles)
percent <- n_select_news / n_news * 100
# ---- Index of all archived front pages ----

# Read the list of snapshot files. The list has been generated with this bash
# script:
# for f in *.gz; do echo "$f" >> mylist.txt; done
# NOTE(review): the code below reads the column `urls`, and read.delim() treats
# the first line as a header by default, so mylist.txt presumably starts with a
# "urls" line -- confirm.
list <- read.delim("data/mylist.txt")

# Earlier regex experiments, kept for reference:
# strapplyc("http!cadenaser.com!|!!!@2017-06-14T12:01:22.057046+00:00.gz", "http!([a-z]{1,61}.[a-zA-Z]{2,})", simplify = TRUE)
# test <- "https!www.cadenaser.com!|!!!@2017-06-14T12:01:22.057046+00:00.gz"
# strapplyc(test, "[a-z]*!(.*)", simplify = TRUE)
# strapplyc(as.character(test), "[a-z]*!([a-z]{1,61}.[a-zA-Z]{2,})", simplify = TRUE)

# File names as plain character strings (converted once, reused below)
file_names <- as.character(list$urls)

# Extract the name of the newspaper: the first two dot-separated parts after
# the "<scheme>!" prefix, e.g. "www.eldiario" or "cadenaser.com".
list$newspaper <- strapplyc(
  file_names,
  "[a-z]*!([a-z]{1,61}.[a-zA-Z]{2,})",
  simplify = TRUE
)
# list$newspaper <- as.factor(list$newspaper)  # converting to factor does not work
# Strip a leading "www." only. The dot is escaped and the pattern anchored so
# that names which merely contain "www" plus another character are not mangled.
list$newspaper <- sub("^www\\.", "", list$newspaper)

# Extract year, month, day and hour from the timestamp that follows the "@"
list$year <- as.numeric(
  strapplyc(file_names, ".*@([0-9]*)", simplify = TRUE)
)
list$month <- as.numeric(
  strapplyc(file_names, ".*@[0-9]*-([0-9]*)", simplify = TRUE)
)
list$day <- as.numeric(
  strapplyc(file_names, ".*@[0-9]*-[0-9]*-([0-9]*)", simplify = TRUE)
)
list$hour <- as.numeric(
  strapplyc(file_names, ".*@[0-9]*-[0-9]*-[0-9]*T([0-9]*)", simplify = TRUE)
)

# Calendar date of each snapshot
list$date <- as.Date(
  paste(list$year, list$month, list$day, sep = "-"),
  format = "%Y-%m-%d"
)

# Snapshots to analyse: eldiario front pages dated after 2018-03-20. The cutoff
# is an explicit Date rather than a string coerced during the comparison.
selected <- list[
  list$newspaper == "eldiario" & list$date > as.Date("2018-03-20"),
]
# ---- Per-snapshot counts for the selected pages ----

# One row per selected snapshot. Every column is preallocated with its final
# type and filled with NA, so a row that has not been processed is visibly
# missing instead of holding a recycled value.
n_pages <- nrow(selected)
results <- data.frame(
  newspaper = rep(NA_character_, n_pages),
  day = rep(NA_real_, n_pages),
  month = rep(NA_real_, n_pages),
  year = rep(NA_real_, n_pages),
  hour = rep(NA_real_, n_pages),
  n_news = rep(NA_integer_, n_pages),
  n_selected_news = rep(NA_integer_, n_pages),
  percent = rep(NA_real_, n_pages),
  stringsAsFactors = FALSE
)

# seq_len() gives an empty sequence when nothing was selected; `1:nrow(...)`
# would iterate over c(1, 0) and try to read a non-existent file.
for (i in seq_len(n_pages)) {
  page <- read_html(paste0("data/", as.character(selected$urls[i])))

  # Text of every article title link. All articles are in h2 except the comics.
  titles <- data.frame(
    title = as.character(html_text(html_nodes(page, "article h2 a"))),
    stringsAsFactors = FALSE
  )

  # Total number of articles with a link
  n_news <- nrow(titles)

  # Titles that contain any of the search terms in `word`
  selected_news <- titles[grepl(word, titles$title), , drop = FALSE]

  # Number of matching articles and their share of the page
  # (percent is NaN when the page has no titles at all)
  n_selected_news <- nrow(selected_news)
  percent <- n_selected_news / n_news * 100

  results$newspaper[i] <- selected$newspaper[i]
  # results$date[i] <- paste(selected$day[i],"/",selected$month[i],"/",selected$year[i],sep = "" )
  # results$date2[i] <- as.Date(selected$date[i])
  results$day[i] <- selected$day[i]
  results$month[i] <- selected$month[i]
  results$year[i] <- selected$year[i]
  results$hour[i] <- selected$hour[i]
  results$n_news[i] <- n_news
  results$n_selected_news[i] <- as.integer(n_selected_news)
  results$percent[i] <- percent

  # Progress indicator: day of the snapshot just processed
  print(selected$day[i])
}
# ---- Time series plot ----

# Build the timestamp of each snapshot (on the hour). It is stored as POSIXct,
# the date-time class meant for data-frame columns, rather than the list-based
# POSIXlt. The snapshot file names carry a +00:00 offset, so the components
# are interpreted as UTC instead of the machine's local time zone.
results$date <- ISOdatetime(
  results$year, results$month, results$day,
  results$hour, 0, 0,
  tz = "UTC"
)

# Three lines over time: total titles (black), matching titles (red) and
# percentage of matching titles (blue), all on one y axis.
# NOTE(review): the y axis is fixed to 0-100; article counts above 100 fall
# outside these limits and would not be drawn -- confirm this is intended.
ggplot(data = results) +
  ylim(c(0, 100)) +
  geom_line(aes(x = date, y = n_news), color = "#000000") +
  geom_line(aes(x = date, y = n_selected_news), color = "#FF0000") +
  geom_line(aes(x = date, y = percent), color = "#0000DD") +
  labs(title = "elDiario.es: noticias sobre Cifuentes en portada (total, total selected, %)")
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment