Commit 84f1314e authored by numeroteca

creates README, reorder/clean script, add example results data

parent e1258b8d
Homenewscounter
==============
R script that counts how many titles per hour contain certain words in an archive of online newspaper home pages.
# How to use Homenewscounter
## Get your copy of the home pages
Ask @numeroteca / http://numeroteca.org.
## Create the list of newspaper home pages
Run this in the directory where your downloaded files are stored:
`for f in *.gz; do echo "$f" >> mylist.txt; done`
The `mylist.txt` file lists the names of the `.gz` files that contain the HTML of the home pages.
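If you prefer to build the list from R instead of bash, a minimal sketch (it assumes the `.gz` files are in the current working directory):

```r
# collect the names of all .gz home page snapshots, one per line
gz_files <- list.files(pattern = "\\.gz$")
writeLines(gz_files, "mylist.txt")
```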
## Create a data frame with all the newspaper names, times and dates
Based on `mylist.txt`, the `html-parser.R` script creates a `results.Rda` file with:
+ number of titles in home page
+ number of selected titles in home page that have certain selected words
+ percentage of titles that contain the words, relative to the total number of titles on that home page.
You can load existing results files like `data/results-cifuentes-01.Rda`.
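For example, to load and inspect a saved results file (a sketch; `html-parser.R` stores the data frame under the name `results`):

```r
load("data/results-cifuentes-01.Rda")  # restores a data frame named `results`
str(results)  # newspaper, year, month, day, hour, n_news, n_selected_news, percent, date
# front pages where the selected words took the largest share of titles
head(results[order(-results$percent), ])
```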
## Plot visualizations based on the results
A series of visualizations to explore the results obtained; a minimal example is sketched below.
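A minimal sketch of one such plot, using the columns produced by `html-parser.R`:

```r
library(tidyverse)
load("data/results-cifuentes-01.Rda")
# hourly share of front page titles that match the selected words, for one newspaper
ggplot(results[results$newspaper == "eldiario", ]) +
  geom_line(aes(x = as.POSIXct(date), y = percent)) +
  labs(x = "fecha", y = "% de noticias en portada")
```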
# FAQ
## Where does the home page HTML come from?
We use Storytracker (http://storytracker.pastpages.org/en/latest/) to store a copy of each newspaper home page every hour.
## Which newspapers are you storing?
### Spanish media
1. http://www.elpais.com
1. http://www.elmundo.es
1. http://www.abc.es/
1. http://www.larazon.es/
1. http://www.lavanguardia.com/
1. http://www.elperiodico.com/es/
1. http://www.ara.cat/
1. http://www.eldiario.es
1. http://www.elespanol.com
1. http://www.publico.es/
1. http://www.20minutos.es/
1. http://www.huffingtonpost.es/
1. https://www.infolibre.es/
1. http://www.elconfidencial.com/
1. http://www.rtve.es/
1. http://cadenaser.com/
1. http://www.cope.es/
1. http://www.ondacero.es/
1. http://www.efe.com/
1. http://esradio.libertaddigital.com/
1. http://www.libertaddigital.com/
1. http://www.vozpopuli.com/
1. http://www.lavozdegalicia.es/
1. http://www.elcorreo.com/
1. http://www.ccma.cat/tv3/
1. http://www.telemadrid.es/
1. http://elprogreso.galiciae.com/
1. https://okdiario.com
1. https://www.lamarea.com/
1. https://www.elsaltodiario.com/
1. https://www.naiz.eus/hemeroteca/gara
1. https://www.berria.eus/
1. https://www.naiz.eus/
1. http://www.diariovasco.com/
1. http://www.deia.com/
### US and UK media
1. https://www.nytimes.com/
1. https://www.theguardian.com
# Script to count how many titles per hour contain certain words in newspaper home pages.
# It generates some plots to view the data
# Ask @numeroteca or info@montera34.com for questions, suggestions and collaborations
# Load libraries -----
library(rvest)
library(stringr)
# library(R.utils) # opens gzip compressed file
library(gsubfn) # select text in the parenthesis with regex
library(tidyverse) # for ggplot
# Set search variables: words and datelimits -----
# word <- "Cifuentes|Javier Ramos|Enrique Álvarez Conde|Pablo Chico|María Teresa Feito|Alicia López de los Cobos|Cecilia Rosado|Clara Souto|Amalia Calonge|Universidad Rey Juan Carlos"
word <- "vox|Vox|VOX|Santiago Abascal|ortega smith|francisco serrano"
# Select word to be displayed in plots
word_explain <- "VOX"
# Set time limits
my_limit <- c(as.POSIXct("2018-10-01 00:00:01"), as.POSIXct("2019-01-18 00:00:01"))
my_init <- as.POSIXlt("2018-10-10 00:00:00")
# open compressed file
# gunzip("eldiario/http!www.eldiario.es!!!!@2018-04-06T23:01:07.888847+00:00.gz",remove=FALSE)
# not needed: read_html reads the .gz without decompressing it first
# ---------For a list of pages---------------
# Read list of files. The list has been generated with this bash script,
# when you are located in the directory with all the .gz files:
# for f in *.gz; do echo "$f" >> mylist.txt; done
# CHANGE THIS: write the path where your mylist.txt file is located
list <- read.delim("data/mylist.txt", header = FALSE) # header = FALSE so the first file name is not consumed as a column header
# strapplyc("http!cadenaser.com!|!!!@2017-06-14T12:01:22.057046+00:00.gz", "http!([a-z]{1,61}.[a-zA-Z]{2,})", simplify = TRUE)
# test <- "https!www.cadenaser.com!|!!!@2017-06-14T12:01:22.057046+00:00.gz"
# strapplyc(test, "[a-z]*!(.*)", simplify = TRUE)
# strapplyc(as.character(test), "[a-z]*!([a-z]{1,61}.[a-zA-Z]{2,})", simplify = TRUE)
# name the single column
names(list) <- "urls"
# extract name of newspaper
list$urls <- sub("file './", "", list$urls )
list$urls <- sub("'", "", list$urls)
list$newspaper <- strapplyc( as.character(list$urls), "[a-z]*!([a-z]{1,61}.[a-zA-Z]{2,})", simplify = TRUE)
# list$newspaper <- as.factor(list$newspaper) # converting to factor doesn't work
list$newspaper <- sub("www.", "", list$newspaper)
# extract the hour from the file name (year, month and day are extracted with analogous regexes)
list$hour <- as.numeric(strapplyc( as.character(list$urls), ".*@[0-9]*-[0-9]*-[0-9]*T([0-9]*)", simplify = TRUE))
list$date <- as.Date( paste(list$day,"/",list$month,"/",list$year,sep = "" ), "%d/%m/%Y")
list$timestamp <- as.POSIXlt( paste(list$year,"-",list$month,"-",list$day," ",list$hour,":00:00", sep = "" ))
# Save created list of front pages
save(list,file="data/list.Rda")
# You can avoid all of the above and just load the existing file
load("data/list.Rda")
# Check visually if files exist
ggplot(list[list$newspaper == "larazon", ]) +
geom_point(aes(x=date,y=month), alpha = 0.005)
# Process home pages ---------------------------
# Create list of selected pages. Select timeframe, newspapers
selected <- list[(list$newspaper == "eldiario" | list$newspaper == "elconfidencial" |
list$newspaper == "elpais"| list$newspaper == "larazon" | list$newspaper == "elespanol") &
list$date > "2018-03-20", ]
list$date > "2018-10-01", ]
# Create results data frame
results <- data.frame(matrix(ncol = 1,nrow = nrow(selected) ))
names(results) <- c("newspaper")
# Loop to count how many titles per home page contain certain words
for (i in 1:nrow(selected)) {
page <- read_html( paste("data/",selected$urls[i], sep = "") )
page <- read_html( paste("../storytracker/data/",selected$urls[i], sep = "") )
if ( selected$newspaper[i] == "eldiario" | selected$newspaper[i] == "elpais" | selected$newspaper[i] == "larazon") {
# eldiario
titles <- page %>% html_nodes("article h2 a") %>% html_text() %>% data.frame()
} else {
# assumed: elconfidencial and elespanol use h3 headlines, as in the elconfidencial test block below
titles <- page %>% html_nodes("article h3 a") %>% html_text() %>% data.frame()
}
colnames(titles) <- "title"
# total of articles with link
n_news <- nrow(titles)
# select news that contain certain words
selected_news <- data.frame(titles[grepl(word, titles$title),])
# Results
# number of articles that contain words
n_selected_news<- nrow(selected_news)
print(paste("nº noticias Cifuentes:",n_selected_news))
print(paste("nº noticias con las palabras:",n_selected_news))
# print(selected_news)
# Percentage of articles that contain words
percent <- round(n_selected_news / n_news * 100, digits = 2)
results$newspaper[i] <- selected$newspaper[i]
# results$date[i] <- paste(selected$day[i],"/",selected$month[i],"/",selected$year[i],sep = "" )
# assumed: the elided lines store the date parts and totals used below
results$year[i] <- selected$year[i]
results$month[i] <- selected$month[i]
results$day[i] <- selected$day[i]
results$hour[i] <- selected$hour[i]
results$n_news[i] <- as.integer(n_news)
results$n_selected_news[i] <- as.integer(n_selected_news)
results$percent[i] <- percent
print(paste("day:",selected$day[i],"hour:",selected$hour[i],selected$newspaper[i]))
print(paste("year:",selected$year[i], "month:",selected$month[i], "day:",selected$day[i],"hour:",selected$hour[i],selected$newspaper[i]))
}
# create timestamp
results$date <- as.POSIXlt( paste(results$year,"-",results$month,"-",results$day," ",results$hour,":00:00", sep = "" ))
save(results,file="data/results.Rda")
load("data/results.Rda")
# -----------Plots------------
# Save results
save(results,file="data/results-vox-01.Rda")
# Load other results
load("data/results-cifuentes-01.Rda")
# -----------Plot results------------
# Plot for a single newspaper
ggplot(data=results[results$newspaper=="eldiario",]) + ylim(c(0,100)) +
geom_line(aes(x=date, y=n_news),color="#000000") +
# geom_line(aes(x=date, y=n_selected_news),color="#FF0000") +
# geom_line(aes(x=date, y=percent),color="#0000DD") +
labs(title = "eldiario.es: noticias sobre Cifuentes en portada (total, total selected, %)")
labs(title = paste("eldiario.es: noticias en portada.",sep = ""))
ggplot(data=results[results$newspaper=="eldiario",]) + ylim(c(0,100)) +
geom_line(aes(x=date, y=n_selected_news),color="#000000") +
# geom_line(aes(x=date, y=n_selected_news),color="#FF0000") +
# geom_line(aes(x=date, y=percent),color="#0000DD") +
labs(title = paste("eldiario.es: noticias sobre ",word_explain,"Cifuentes en portada (total, total selected, %)",sep = ""))
ggplot(data=results[results$newspaper=="eldiario",]) + ylim(c(0,100)) +
geom_line(aes(x=date, y=n_news),color="#000000") +
# geom_line(aes(x=date, y=n_selected_news),color="#FF0000") +
# geom_line(aes(x=date, y=percent),color="#0000DD") +
labs(title = "eldiario.es: nº noticias en portada") +
scale_x_datetime(date_breaks = "1 day", date_labels = "%d", limits = my_limit)
ggplot(data=results[results$newspaper=="eldiario",]) + ylim(c(0,100)) +
geom_line(aes(x=date, y=n_news),color="#000000") +
geom_line(aes(x=date, y=n_selected_news),color="#FF0000") +
# geom_line(aes(x=date, y=percent),color="#0000DD") +
scale_x_datetime(date_breaks = "1 day", date_labels = "%d", limits = my_limit) +
theme_minimal(base_family = "Roboto Condensed", base_size = 14) +
geom_text(aes(x = as.POSIXlt("2018-03-25 00:00:00"),
y = 19, label = "nº noticias sobre Cifuentes"), family = "Roboto Condensed",
hjust = 0) # assumed close of the geom_text call whose remaining arguments are elided
ggplot(data=results[results$newspaper=="eldiario",]) + ylim(c(0,30)) +
geom_line(aes(x=date, y=n_selected_news),color="#FF0000") +
geom_line(aes(x=date, y=percent),color="#0000DD") +
scale_x_datetime(date_breaks = "1 day", date_labels = "%d",
limits = my_limit) +
theme_minimal(base_family = "Roboto Condensed", base_size = 14) +
geom_text(aes(x = as.POSIXlt("2018-03-27 00:00:00"),
y = 1, label = "nº noticias sobre Cifuentes"), family = "Roboto Condensed",
hjust = 0) # assumed close of the geom_text call whose remaining arguments are elided
ggplot(data=results[results$newspaper=="elconfidencial",]) + ylim(c(0,10)) +
geom_line(aes(x=date, y=n_selected_news),color="#FF0000") +
geom_line(aes(x=date, y=percent),color="#0000DD") +
scale_x_datetime(date_breaks = "1 day", date_labels = "%d",
limits = my_limit) +
theme_minimal(base_family = "Roboto Condensed", base_size = 14) +
geom_text(aes(x = as.POSIXlt("2018-03-27 00:00:00"),
y = 5, label = "nº noticias sobre Cifuentes"), family = "Roboto Condensed",
hjust = 0) # assumed close of the geom_text call whose remaining arguments are elided
ggplot(data=results[results$newspaper=="elconfidencial",]) + ylim(c(0,10)) +
geom_line(aes(x=date, y=n_selected_news),color="#FF0000") +
geom_line(aes(x=date, y=percent),color="#0000DD") +
scale_x_datetime(date_breaks = "1 day", date_labels = "%d",
limits = my_limit) +
theme_minimal(base_family = "Roboto Condensed", base_size = 14) +
geom_text(aes(x = as.POSIXlt("2018-03-27 00:00:00"),
y = 5, label = "nº noticias sobre Cifuentes"), family = "Roboto Condensed",
hjust = 0) # assumed close of the geom_text call whose remaining arguments are elided
ggplot(data=results[results$newspaper=="elconfidencial",]) + ylim(c(0,120)) +
geom_line(aes(x=date, y=n_news),color="#000000") +
geom_line(aes(x=date, y=n_selected_news),color="#FF0000") +
# geom_line(aes(x=date, y=percent),color="#0000DD") +
scale_x_datetime(date_breaks = "1 day", date_labels = "%d", limits = my_limit) +
theme_minimal(base_family = "Roboto Condensed", base_size = 14) +
geom_text(aes(x = as.POSIXlt("2018-03-25 00:00:00"),
y = 19, label = "nº noticias sobre Cifuentes"), family = "Roboto Condensed",
hjust = 0) # assumed close of the geom_text call whose remaining arguments are elided
ggplot( ) + ylim(c(0,30)) +
# # legend.position = "bottom",
# legend.text = element_text(size=15) ) +
scale_y_continuous(breaks=seq(0,30,5)) +
scale_x_datetime(date_breaks = "1 month", date_labels = "%m", limits = my_limit) +
geom_text(aes(x = my_init, y = 11, label = "eldiario.es"), color = "#66c2a5", alpha=1, hjust = 0) +
geom_text(aes(x = my_init, y = 13, label = "larazon.es"), color = "#e78ac3", alpha=1, hjust = 0) +
geom_text(aes(x = my_init, y = 15, label = "elConfidencial.es"), color = "#fc8d62", alpha=1, hjust = 0) +
geom_text(aes(x = my_init, y = 17, label = "elespanol.com"), color = "#a6d854", alpha=1, hjust = 0) +
geom_text(aes(x = as.POSIXlt("2018-10-10 00:00:00"), y = 19, label = "elpais.com"), color = "#8da0cb", alpha=1, hjust = 0) +
ylab ("% de noticias en portada") +
labs(title = "Porcentaje de noticias sobre Cifuentes en portada periódicos digitales",
subtitle = "21 marzo - 9 abril 2018. Datos y visualización: numeroteca.org",
labs(title = paste("Porcentaje de noticias sobre",word_explain,"en portada periódicos digitales"),
subtitle = "01 oct 2018 - 18 enero 2019. Datos y visualización: numeroteca.org",
x = "Días",
y = "%",
caption = "")
ggplot( ) + ylim(c(0,24)) +
# # legend.position = "bottom",
# legend.text = element_text(size=15) ) +
# scale_y_continuous(breaks=seq(0,30,5)) +
scale_x_datetime(date_breaks = "1 month", date_labels = "%m", limits = my_limit) +
geom_text(aes(x = my_init, y = 23, label = "eldiario.es"), color = "#66c2a5", alpha=1, hjust = 0) +
geom_text(aes(x = my_init, y = 11, label = "larazon.es"), color = "#e78ac3", alpha=1, hjust = 0) +
geom_text(aes(x = my_init, y = 17, label = "elConfidencial.es"), color = "#fc8d62", alpha=1, hjust = 0) +
geom_text(aes(x = my_init, y = 20, label = "elespanol.com"), color = "#a6d854", alpha=1, hjust = 0) +
geom_text(aes(x = my_init, y = 14, label = "elpais.com"), color = "#8da0cb", alpha=1, hjust = 0) +
labs(title = paste("Número de noticias sobre",word_explain,"en portada periódicos digitales"),
subtitle = "01 oct 2018 - 18 enero 2019. Datos y visualización: numeroteca.org",
x = "Días",
y = "nº noticias en portada",
caption = "")
# -------- Analysis and comparison with Pageonex.com paper front pages data ---------
library("rjson")
# Get json from pageonex.com
json_file <- "http://pageonex.com/numeroteca/tfm-cifuentes/export.json"
json_data <- fromJSON(paste(readLines(json_file), collapse=""))
# Create empty data frame
df <- data.frame(matrix( ncol = 10,nrow = length(json_data$dates) ))
# fill data frame with pageonex data
portadas$timestamp <- as.POSIXlt( paste(portadas$date," ","00:00:00", sep = "" ))
summary(portadas)
class(portadas$timestamp)
# Testing parsing news in different newspapers ---------------------------
# ------- eldiario.es ----------------------
# reads html and stores it
page <- read_html("data/http!www.eldiario.es!!!!@2018-04-06T23:01:07.888847+00:00.gz")
# gets all the text in article titles. All articles are in h2 except the comics.
titles <- page %>% html_nodes("article h2 a") %>% html_text() %>% data.frame()
colnames(titles) <- "title"
titles$title <- as.character(titles$title)
# total of articles with link
n_news <- nrow(titles)
# select news that contain certain words
select_news <- data.frame(titles[grepl(word, titles$title),])
# Results
# number of articles that contain words
n_select_news<- nrow(select_news)
# Percentage of articles that contain words
percent <- n_select_news / n_news * 100
# ------- elconfidencial ----------------------
# reads html and stores it
page <- read_html("data/http!www.elconfidencial.com!|!!!@2018-03-22T09:01:11.318750+00:00.gz")
# gets all the text in article titles; elconfidencial marks its headlines with h3
titles <- page %>% html_nodes("article h3 a") %>% html_text() %>% data.frame()
colnames(titles) <- "title"
# total of articles with link
n_news <- nrow(titles)
# select news that contain certain words
select_news <- data.frame(titles[grepl(word, titles$title),])
# Results
# number of articles that contain words
n_select_news<- nrow(select_news)
# Percentage of articles that contain words
percent <- n_select_news / n_news * 100
# ------- elmundo ----------------------
# reads html and stores it
pageelmundo <- read_html("eldiario/http!www.elmundo.es!!!!@2018-04-07T19:01:02.620498+00:00_formated.html")
# gets all the text in article titles; elmundo marks its headlines with h3
titles <- pageelmundo %>% html_nodes("main article h3 a") %>% html_text() %>% data.frame() #TODO: not working
titles
colnames(titles) <- "title"
# total of articles with link
n_news <- nrow(titles)
# select news that contain certain words
select_news <- data.frame(titles[grepl(word, titles$title),])
# Results
# number of articles that contain words
n_select_news<- nrow(select_news)
# Percentage of articles that contain words
percent <- n_select_news / n_news * 100
# ------- elpais ----------------------
# reads html and stores it
pageelpais <- read_html("data/http!www.elpais.com!!!!@2017-07-04T13:51:08.133418+00:00.gz")
# gets all the text in article titles. All articles are in h2 except the comics.
titles <- pageelpais %>% html_nodes("article h2 a") %>% html_text() %>% data.frame() #TODO: not working
colnames(titles) <- "title"
# total of articles with link
n_news <- nrow(titles)
# select news that contain certain words
select_news <- data.frame(titles[grepl(word, titles$title),])
# Results
# number of articles that contain words
n_select_news<- nrow(select_news)
# Percentage of articles that contain words
percent <- n_select_news / n_news * 100
# ------- La Razón ----------------------
# reads html and stores it
pagelarazon <- read_html("data/http!www.larazon.es!|!!!@2018-03-30T07:01:03.998265+00:00.gz", encoding = "UTF-8") #TODO check that the encoding is right
# gets all the text in article titles. All articles are in h2 except the comics.
titles <- pagelarazon %>% html_nodes("article h2 a") %>% html_text() %>% data.frame() #TODO: not working
colnames(titles) <- "title"
titles
# total of articles with link
n_news <- nrow(titles)
# select news that contain certain words
select_news <- data.frame(titles[grepl(word, titles$title),])
# Results
# number of articles that contain words
n_select_news<- nrow(select_news)
# Percentage of articles that contain words
percent <- n_select_news / n_news * 100
# -------- La razón test ----------
# for (i in 1:nrow(selected)) {
# if ( selected$newspaper[i] == "larazon") {
# page <- read_html( paste("data/",selected$urls[i], sep = "") )
#
# titles <- page %>% html_nodes("article h2 a") %>% html_text() %>% data.frame()
#
# colnames(titles) <- "title"
# # titles$title <- as.character(titles$title)
# # total of articles with link
# n_news <- nrow(titles)
# print(paste("nº noticias:",n_news))
#
# # select news that contain certain words
# selected_news <- data.frame(titles[grepl(word, titles$title),])
# print(selected_news)
# # Results
# # number of articles that contain words
# n_selected_news<- nrow(selected_news)
# print(n_selected_news)
# print(paste("nº noticias Cifuentes:",n_selected_news))
# # Percentage of articles that contain words
#
# print(paste("day:",selected$day[i],"hour:",selected$hour[i],selected$newspaper[i]))
# }
# }