Commit 9b3b9207 authored by numeroteca

add analysis to two sets of words (generales 2019, vox and Podemos)

parent 447ada82
This source diff could not be displayed because it is too large. You can view the blob instead.
......@@ -11,13 +11,16 @@ library(tidyverse) # for ggplot
# Set search variables: words and datelimits -----
# word <- "Cifuentes|Javier Ramos|Enrique Álvarez Conde|Pablo Chico|María Teresa Feito|Alicia López de los Cobos|Cecilia Rosado|Clara Souto|Amalia Calonge|Universidad Rey Juan Carlos"
word <- "vox|Vox|VOX|Santiago Abascal|ortega smith|francisco serrano"
# word <- "vox|Vox|VOX|Santiago Abascal|ortega smith|francisco serrano"
word1 <- "vox|Vox|VOX|Santiago Abascal|Abascal"
word2 <- "Podemos|Pablo Iglesias|Iglesias"
# Select word to be displayed in plots
word_explain <- "VOX" #
word1_explain <- "VOX" #
word2_explain <- "Podemos" #
# Set time limits
my_limit <- c(as.POSIXct("2018-10-01 00:00:01"), as.POSIXct("2019-01-18 00:00:01"))
my_init <- as.POSIXlt("2018-10-10 00:00:00")
my_limit <- c(as.POSIXct("2019-04-12 00:00:01"), as.POSIXct("2019-04-24 00:00:01"))
my_init <- as.POSIXlt("2019-04-12 00:00:00")
# open compressed file
# gunzip("eldiario/http!www.eldiario.es!!!!@2018-04-06T23:01:07.888847+00:00.gz",remove=FALSE)
......@@ -29,7 +32,7 @@ my_init <- as.POSIXlt("2018-10-10 00:00:00")
# when you are located in the directory with all the .gz files:
# for f in *.gz; do echo "$f" >> mylist.txt; done
# CHANGE THIS: write path where your mylist.file is located
list <- read.delim("data/mylist.txt")
list <- read_csv("data/mylist.txt")
# add name to the one column file
names(list) <- "urls"
......@@ -66,7 +69,8 @@ ggplot(list[list$newspaper == "larazon", ]) +
# Create list of selected pages. Select timeframe, newspapers
selected <- list[(list$newspaper == "eldiario" | list$newspaper == "elconfidencial" |
list$newspaper == "elpais"| list$newspaper == "larazon" | list$newspaper == "elespanol") &
list$date > "2018-10-01", ]
# list$date > "2018-10-01", ]
list$date > "2019-04-11", ]
# Create results dataframe
results <- ""
......@@ -75,7 +79,7 @@ names(results) <- c("newspaper")
# Loop to count how many titles on each homepage contain certain words
for (i in 1:nrow(selected)) {
page <- read_html( paste("../storytracker/data/",selected$urls[i], sep = "") )
page <- read_html( paste("../storytracker/data-homepages/",selected$urls[i], sep = "") )
if ( selected$newspaper[i] == "eldiario" | selected$newspaper[i] == "elpais" | selected$newspaper[i] == "larazon") {
# eldiario
......@@ -92,15 +96,19 @@ for (i in 1:nrow(selected)) {
print(paste("nº noticias:",n_news))
# select news whose title contains a certain word
selected_news <- data.frame(titles[grepl(word, titles$title),])
selected_news1 <- data.frame(titles[grepl(word1, titles$title),])
selected_news2 <- data.frame(titles[grepl(word2, titles$title),])
# Results
# number of articles that contain words
n_selected_news<- nrow(selected_news)
print(paste("nº noticias con las palabras:",n_selected_news))
n_selected_news1<- nrow(selected_news1)
n_selected_news2<- nrow(selected_news2)
print(paste("nº noticias con las palabras",word1_explain,n_selected_news1))
print(paste("nº noticias con las palabras",word2_explain,n_selected_news2))
# print(selected_news)
# Percentage of articles that contain words
percent <- round(n_selected_news / n_news * 100, digits = 2)
percent1 <- round(n_selected_news1 / n_news * 100, digits = 2)
percent2 <- round(n_selected_news2 / n_news * 100, digits = 2)
results$newspaper[i] <- selected$newspaper[i]
# results$date[i] <- paste(selected$day[i],"/",selected$month[i],"/",selected$year[i],sep = "" )
......@@ -110,19 +118,36 @@ for (i in 1:nrow(selected)) {
results$year[i] <- selected$year[i]
results$hour[i] <- selected$hour[i]
results$n_news[i] <- n_news
results$n_selected_news[i] <- as.integer(n_selected_news)
results$percent[i] <- percent
results$n_selected_news1[i] <- as.integer(n_selected_news1)
results$n_selected_news2[i] <- as.integer(n_selected_news2)
results$percent1[i] <- percent1
results$percent2[i] <- percent2
results$titles1[i] <- sapply(as.list(selected_news1), paste0, collapse=", ")
results$titles2[i] <- sapply(as.list(selected_news2), paste0, collapse=", ")
print(paste("year:",selected$year[i], "month:",selected$month[i], "day:",selected$day[i],"hour:",selected$hour[i],selected$newspaper[i]))
}
results5 <- ""
results5 <- data.frame(matrix(ncol = 1,nrow = 5 ))
names(results5) <- c("newspaper")
x <- as.list(selected_news2)
results5$titles2[1] <- sapply(x, paste0, collapse="; ")
y <- as.character(selected_news1[1,1])
writeLines(iconv(readLines("tmp.html"), from = "ANSI_X3.4-1986", to = "UTF8"), "tmp2.html")
iconv(selected_news2[,1], 'utf-8', 'ascii', sub='')
# create timestamp
results$date <- as.POSIXlt( paste(results$year,"-",results$month,"-",results$day," ",results$hour,":00:00", sep = "" ))
results$date <- as.POSIXct( paste(results$year,"-",results$month,"-",results$day," ",results$hour,":00:00", sep = "" ))
# Save results
save(results,file="data/results-vox-01.Rda")
# save(results,file="data/results-vox-01.Rda")
save(results,file="data/results-vox'podemos-generales2019-a.Rda")
# Load other results
load("data/results-cifuentes-01.Rda")
# load("data/results-cifuentes-01.Rda")
# -----------Plot results------------
......@@ -133,57 +158,79 @@ ggplot(data=results[results$newspaper=="eldiario",]) + ylim(c(0,100)) +
# geom_line(aes(x=date, y=percent),color="#0000DD") +
labs(title = paste("eldiario.es: noticias en portada.",sep = ""))
ggplot(data=results[results$newspaper=="eldiario",]) + ylim(c(0,100)) +
geom_line(aes(x=date, y=n_selected_news),color="#000000") +
ggplot(data=results[results$newspaper=="eldiario",]) + # ylim(c(0,100)) +
geom_line(aes(x=date, y=n_selected_news1),color="#000000") +
geom_line(aes(x=date, y=n_selected_news2),color="#FF0000") +
# geom_line(aes(x=date, y=n_selected_news),color="#FF0000") +
# geom_line(aes(x=date, y=percent),color="#0000DD") +
labs(title = paste("eldiario.es: noticias sobre ",word_explain,"Cifuentes en portada (total, total selected, %)",sep = ""))
labs(title = paste("eldiario.es: noticias sobre ",word1_explain,sep = ""))
ggplot(data=results[results$newspaper=="eldiario",]) + ylim(c(0,100)) +
ggplot(data=results[results$newspaper=="eldiario",]) + # ylim(c(0,100)) +
geom_line(aes(x=date, y=n_news),color="#000000") +
# geom_line(aes(x=date, y=n_selected_news),color="#FF0000") +
# geom_line(aes(x=date, y=percent),color="#0000DD") +
geom_line(aes(x=date, y=n_selected_news1),color="#FF0000") +
geom_line(aes(x=date, y=percent1),color="#0000DD") +
labs(title = "eldiario.es: nº noticias en portada") +
scale_x_datetime(date_breaks = "1 day", date_labels = "%d", limits = my_limit)
ggplot(data=results[results$newspaper=="eldiario",]) + ylim(c(0,100)) +
ggplot(data=results[results$newspaper=="eldiario",]) + # ylim(c(0,100)) +
geom_line(aes(x=date, y=n_news),color="#000000") +
geom_line(aes(x=date, y=n_selected_news),color="#FF0000") +
geom_line(aes(x=date, y=n_selected_news1),color="#FF0000") +
# geom_line(aes(x=date, y=percent),color="#0000DD") +
scale_x_datetime(date_breaks = "1 day", date_labels = "%d", limits = my_limit) +
theme_minimal(base_family = "Roboto Condensed", base_size = 14) +
geom_text(aes(x = as.POSIXlt("2018-03-25 00:00:00"),
geom_text(aes(x = as.POSIXct("2018-03-25 00:00:00"),
y = 19, label = "nº noticias sobre Cifuentes"), family = "Roboto Condensed",
color = "#FF0000", alpha=0.6, hjust = 0) +
geom_text(aes(x = as.POSIXlt("2018-03-25 00:00:00"),
geom_text(aes(x = as.POSIXct("2018-03-25 00:00:00"),
y = 45, label = "nº noticias en portada", family = "Roboto Condensed"),
color = "#000000", alpha=0.6, hjust = 0) +
labs(title = "eldiario.es: nº noticias en portada - noticias sobre Cifuentes",
labs(title = "eldiario.es: nº noticias en portada",
subtitle = "21 marzo - 9 abril 2018. numeroteca.org",
x = NULL,
y = NULL,
caption = "")
ggplot(data=results[results$newspaper=="eldiario",]) + ylim(c(0,30)) +
ggplot(data=results[results$newspaper=="eldiario",]) + #ylim(c(0,30)) +
# geom_line(aes(x=date, y=n_news),color="#000000") +
geom_line(aes(x=date, y=n_selected_news),color="#FF0000") +
geom_line(aes(x=date, y=percent),color="#0000DD") +
geom_line(aes(x=date, y=n_selected_news1),color="#FF0000") +
geom_line(aes(x=date, y=percent1),color="#0000DD") +
scale_x_datetime(date_breaks = "1 day", date_labels = "%d",
limits = my_limit) +
theme_minimal(base_family = "Roboto Condensed", base_size = 14) +
geom_text(aes(x = as.POSIXlt("2018-03-27 00:00:00"),
y = 1, label = "nº noticias sobre Cifuentes"), family = "Roboto Condensed",
color = "#FF0000", alpha=0.6, hjust = 0) +
geom_text(aes(x = as.POSIXlt("2018-03-27 00:00:00"),
y = 15, label = "Porcentaje de noticias en portada", family = "Roboto Condensed"),
color = "#0000DD", alpha=0.6, hjust = 0) +
labs(title = "eldiario.es: porcentaje noticias y nº noticias en portada sobre Cifuentes",
# geom_text(aes(x = as.POSIXct("2018-03-27 00:00:00"),
# y = 1, label = "nº noticias sobre Cifuentes"), family = "Roboto Condensed",
# color = "#FF0000", alpha=0.6, hjust = 0) +
# geom_text(aes(x = as.POSIXct("2018-03-27 00:00:00"),
# y = 15, label = "Porcentaje de noticias en portada", family = "Roboto Condensed"),
# color = "#0000DD", alpha=0.6, hjust = 0) +
labs(title = paste("eldiario.es: porcentaje noticias y nº noticias",word1_explain),
subtitle = "21 marzo - 9 abril 2018. Datos: numeroteca.org",
x = NULL,
y = NULL,
caption = "")
ggplot(data=results[results$newspaper=="eldiario",]) + #ylim(c(0,30)) +
# geom_line(aes(x=date, y=n_news),color="#000000") +
# geom_line(aes(x=date, y=n_selected_news1),color="#FF0000") +
geom_line(aes(x=date, y=percent1),color="#AA0000") +
# geom_line(aes(x=date, y=n_selected_news2),color="#00FF00") +
geom_line(aes(x=date, y=percent2),color="#00AA00") +
scale_x_datetime(date_breaks = "1 day", date_labels = "%d",
limits = my_limit) +
theme_minimal(base_family = "Roboto Condensed", base_size = 14) +
# geom_text(aes(x = as.POSIXct("2018-03-27 00:00:00"),
# y = 1, label = "nº noticias sobre Cifuentes"), family = "Roboto Condensed",
# color = "#FF0000", alpha=0.6, hjust = 0) +
# geom_text(aes(x = as.POSIXct("2018-03-27 00:00:00"),
# y = 15, label = "Porcentaje de noticias en portada", family = "Roboto Condensed"),
# color = "#0000DD", alpha=0.6, hjust = 0) +
labs(title = paste("eldiario.es: porcentaje noticias y nº noticias",word1_explain,"vs",word2_explain),
subtitle = "21 marzo - 9 abril 2018. Datos: numeroteca.org",
x = NULL,
y = NULL,
caption = "")
ggplot(data=results[results$newspaper=="elconfidencial",]) + ylim(c(0,10)) +
# geom_line(aes(x=date, y=n_news),color="#000000") +
geom_line(aes(x=date, y=n_selected_news),color="#FF0000") +
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment