Commit e7d6bcdf authored by numeroteca's avatar numeroteca

adds 6 searches (partidos), converts results to long format, facet plots by newspaper and partido

parent 9b3b9207
File mode changed from 100644 to 100755
......@@ -14,9 +14,36 @@ library(tidyverse) # for ggplot
# word <- "vox|Vox|VOX|Santiago Abascal|ortega smith|francisco serrano"
# Search patterns, one per party. Each one is a regular expression that is
# matched against the headlines with grepl(), so every alternative separated
# by "|" counts as a hit.
word1 <- "vox|Vox|VOX|Santiago Abascal|Abascal"
word2 <- "Podemos|Pablo Iglesias|Iglesias"
word3 <- "PP|Pablo Casado|Casado"
word4 <- "PSOE|Pedro Sánchez|Sánchez"
word5 <- "C's|Ciudadanos|Albert Rivera|Rivera"
word6 <- "PACMA|Silvia Barquero|Barquero"

# Short label of each search, used in printed messages, column names and plots
word1_explain <- "VOX"
word2_explain <- "Podemos"
word3_explain <- "PP"
word4_explain <- "PSOE"
word5_explain <- "Cs"
word6_explain <- "PACMA"

# Lookup table with one row per search: the pattern (`word`) and its label
# (`explain`). Built in a single call instead of filling a placeholder data
# frame cell by cell; stringsAsFactors = FALSE keeps both columns as character
# on every R version.
word <- data.frame(
  word = c(word1, word2, word3, word4, word5, word6),
  explain = c(
    word1_explain, word2_explain, word3_explain,
    word4_explain, word5_explain, word6_explain
  ),
  stringsAsFactors = FALSE
)

# Set time limits
my_limit <- c(as.POSIXct("2019-04-12 00:00:01"), as.POSIXct("2019-04-24 00:00:01"))
......@@ -91,65 +118,91 @@ for (i in 1:nrow(selected)) {
colnames(titles) <- "title"
# titles$title <- as.character(titles$title)
# total of articles with link
n_news <- nrow(titles)
print(paste("nº noticias:",n_news))
# select news that contain a certain word
selected_news1 <- data.frame(titles[grepl(word1, titles$title),])
selected_news2 <- data.frame(titles[grepl(word2, titles$title),])
# Results
# number of articles that contain words
n_selected_news1<- nrow(selected_news1)
n_selected_news2<- nrow(selected_news2)
print(paste("nº noticias con las palabras",word1_explain,n_selected_news1))
print(paste("nº noticias con las palabras",word2_explain,n_selected_news2))
# print(selected_news)
# Percentage of articles that contain words
percent1 <- round(n_selected_news1 / n_news * 100, digits = 2)
percent2 <- round(n_selected_news2 / n_news * 100, digits = 2)
# newspaper name
results$newspaper[i] <- selected$newspaper[i]
# results$date[i] <- paste(selected$day[i],"/",selected$month[i],"/",selected$year[i],sep = "" )
# results$date2[i] <- as.Date(selected$date[i])
# insert date and time
results$day[i] <- selected$day[i]
results$month[i] <- selected$month[i]
results$year[i] <- selected$year[i]
results$hour[i] <- selected$hour[i]
results$hour[i] <- selected$hour[i]
# total of articles with link
n_news <- nrow(titles)
print(paste("nº noticias:",n_news))
results$n_news[i] <- n_news
results$n_selected_news1[i] <- as.integer(n_selected_news1)
results$n_selected_news2[i] <- as.integer(n_selected_news2)
results$percent1[i] <- percent1
results$percent2[i] <- percent2
results$titles1[i] <- sapply(as.list(selected_news1), paste0, collapse=", ")
results$titles2[i] <- sapply(as.list(selected_news2), paste0, collapse=", ")
# For every search pattern in `word`: select the titles that match it, count
# them, compute their percentage over n_news and store count, percentage and
# the matching titles in row i of `results`.
# NOTE(review): 1:nrow(word) evaluates to c(1, 0) when `word` has no rows.
for (j in 1:nrow(word)) {
# for (j in 1:5) {
# rows of `titles` whose title matches the j-th regular expression
selected_news <- data.frame(titles[grepl(word$word[j], titles$title),])
# Results
# number of articles that contain words
n_selected_news <- nrow(selected_news)
print(paste("1 nº noticias con las palabras",word$explain[j],n_selected_news))
# print(selected_news)
# Percentage of articles that contain words
percent <- round(n_selected_news / n_news * 100, digits = 2)
print(paste("2 percent ",word$explain[j],percent))
# Assign 0 to eighteen placeholder columns (a..f, aa..ff, aaa..fff). Each
# assignment sets the whole column and creates the column when `results` has
# no column of that name. Three columns are renamed per pattern further below.
results$a <- 0
results$b <- 0
results$c <- 0
results$d <- 0
results$e <- 0
results$f <- 0
results$aa <- 0
results$bb <- 0
results$cc <- 0
results$dd <- 0
results$ee <- 0
results$ff <- 0
results$aaa <- 0
results$bbb <- 0
results$ccc <- 0
results$ddd <- 0
results$eee <- 0
results$fff <- 0
# create column with variable names
# Columns are addressed by position (6+j, 7+j+nrow(word), 8+j+2*nrow(word)),
# not by name.
# NOTE(review): presumably these offsets rely on the column layout `results`
# has before this loop — confirm where `results` is created.
names(results)[6+j] <- paste0("n_selected_news_",j,"_",word$explain[j])
names(results)[7+j+nrow(word)] <- paste0("percent_",j,"_",word$explain[j])
names(results)[8+j+2*nrow(word)] <- paste0("titles_",j,"_",word$explain[j])
# insert value in results$n_selected_news_word_explain
results[i,6+j] <- as.integer(n_selected_news)
results[i,7+j+nrow(word)] <- percent
# each column of selected_news is collapsed into one "; "-separated string
results[i,8+j+2*nrow(word)] <- sapply(as.list(selected_news), paste0, collapse="; ")
}
print(paste("year:",selected$year[i], "month:",selected$month[i], "day:",selected$day[i],"hour:",selected$hour[i],selected$newspaper[i]))
}
# Remove the columns that still carry one of the placeholder names
# (a..f, aa..ff, aaa..fff).
results <- select(
  results,
  -a, -b, -c, -d, -e, -f,
  -aa, -bb, -cc, -dd, -ee, -ff,
  -aaa, -bbb, -ccc, -ddd, -eee, -fff
)
# Scratch code that experiments with collapsing and re-encoding the titles
# held in selected_news1 / selected_news2. A commented-out copy of the same
# statements follows it.
results5 <- ""
results5 <- data.frame(matrix(ncol = 1,nrow = 5 ))
names(results5) <- c("newspaper")
# collapse each column of selected_news2 into one "; "-separated string
x <- as.list(selected_news2)
results5$titles2[1] <- sapply(x, paste0, collapse="; ")
y <- as.character(selected_news1[1,1])
# re-encode the lines of tmp.html and write them to tmp2.html
writeLines(iconv(readLines("tmp.html"), from = "ANSI_X3.4-1986", to = "UTF8"), "tmp2.html")
# drops what cannot be converted to ASCII (sub=''); the result is not stored
iconv(selected_news2[,1], 'utf-8', 'ascii', sub='')
# results5 <- ""
# results5 <- data.frame(matrix(ncol = 1,nrow = 5 ))
# names(results5) <- c("newspaper")
# x <- as.list(selected_news2)
# results5$titles2[1] <- sapply(x, paste0, collapse="; ")
#
# y <- as.character(selected_news1[1,1])
# writeLines(iconv(readLines("tmp.html"), from = "ANSI_X3.4-1986", to = "UTF8"), "tmp2.html")
# iconv(selected_news2[,1], 'utf-8', 'ascii', sub='')
# Create the time stamp: one POSIXct value per row, parsed from the text
# "year-month-day hour:00:00".
results$date <- as.POSIXct(
  paste0(
    results$year, "-", results$month, "-", results$day,
    " ", results$hour, ":00:00"
  )
)
# Save results
# save(results,file="data/results-vox-01.Rda")
# The same `results` object is written to two .Rda files under data/.
save(results,file="data/results-vox'podemos-generales2019-a.Rda")
save(results,file="data/results-vox'podemos-generales2019-b-6-partidos.Rda")
# Load other results
# load("data/results-cifuentes-01.Rda")
# -----------Plot results------------
# -----------Plot results para 1 o dos words buscadas ------------
# Plot para un único periódico
ggplot(data=results[results$newspaper=="eldiario",]) + ylim(c(0,100)) +
......@@ -404,6 +457,111 @@ ggplot( ) + ylim(c(0,24)) +
y = "nº noticias en portada",
caption = "")
# Plot 6 research words ---------------------
# first method -----
# One line per party for a single newspaper (eldiario), drawn straight from
# the wide `results` table: x is `date`, y is that party's percent_* column.
ggplot(data = results[results$newspaper == "eldiario", ]) + # ylim(c(0,30)) +
  geom_line(aes(x = date, y = percent_1_VOX), color = "#00AA00") +
  geom_line(aes(x = date, y = percent_2_Podemos), color = "#AAAA00") +
  geom_line(aes(x = date, y = percent_3_PP), color = "#0000BB") +
  geom_line(aes(x = date, y = percent_4_PSOE), color = "#DD0022") +
  geom_line(aes(x = date, y = percent_5_Cs), color = "#FFAA44") +
  geom_line(aes(x = date, y = percent_6_PACMA), color = "#555555") +
  scale_x_datetime(
    date_breaks = "1 day", date_labels = "%d",
    limits = my_limit
  ) +
  theme_minimal(base_family = "Roboto Condensed", base_size = 14) +
  # geom_text(aes(x = as.POSIXct("2018-03-27 00:00:00"),
  #     y = 1, label = "nº noticias sobre Cifuentes"), family = "Roboto Condensed",
  #     color = "#FF0000", alpha=0.6, hjust = 0) +
  # geom_text(aes(x = as.POSIXct("2018-03-27 00:00:00"),
  #     y = 15, label = "Porcentaje de noticias en portada", family = "Roboto Condensed"),
  #     color = "#0000DD", alpha=0.6, hjust = 0) +
  # The title lists every plotted party (it used to name only the first two)
  # and the subtitle states the period set in my_limit (it used to show the
  # dates of an earlier study).
  labs(
    title = paste(
      "eldiario.es: porcentaje noticias",
      paste(word$explain, collapse = ", ")
    ),
    subtitle = "12 abril - 24 abril 2019. Datos: numeroteca.org",
    x = NULL,
    y = NULL,
    caption = ""
  )
# converts to long format -----
# Quick look at the available columns and at the number of rows per newspaper
names(results)
table(results$newspaper)
# The columns are picked by name instead of by position, so the reshaping
# keeps working when columns are added to, removed from or reordered in
# `results`.
# Percentages: one row per newspaper, date and percent_* column
results_long_per <- results %>%
  select(newspaper, date, starts_with("percent_")) %>%
  gather(key, value, -newspaper, -date)
# Number of news items: one row per newspaper, date and n_selected_news_* column
results_long_n <- results %>%
  select(newspaper, date, starts_with("n_selected_news_")) %>%
  gather(key, value, -newspaper, -date)
# creates color palette: six colours, one per search
paltidos <- c("#7bb135", "#93336b", "#2a7db7", "#d01f1f", "#fa8619", "#999999")
# For one newspaper -----
# Number of matching news items per search for eldiario, from the long table:
# a thin line with the raw values plus a smoothed trend, coloured by `key`.
ggplot(data = results_long_n[results_long_n$newspaper == "eldiario", ]) + # ylim(c(0,30)) +
  geom_line(aes(x = date, y = value, color = key), size = 0.4) +
  geom_smooth(aes(x = date, y = value, color = key), size = 1) +
  scale_x_datetime(
    date_breaks = "1 day", date_labels = "%d",
    limits = my_limit
  ) +
  theme_minimal(base_family = "Roboto Condensed", base_size = 14) +
  scale_color_manual(values = paltidos) +
  # The title describes what is drawn (counts for every searched party; it
  # used to announce percentages for the first two only) and the subtitle
  # states the period set in my_limit.
  labs(
    title = paste(
      "eldiario.es: nº noticias",
      paste(word$explain, collapse = ", ")
    ),
    subtitle = "12 abril - 24 abril 2019. Datos: numeroteca.org",
    x = NULL,
    y = NULL,
    caption = ""
  )
# For all newspapers -------
# Percentage of matching news items over time, one panel per newspaper and
# one colour per search: thin raw line plus a smoothed trend without
# confidence band.
ggplot(results_long_per, aes(x = date, y = value, color = key)) +
  geom_line(size = 0.1) +
  geom_smooth(size = 1, se = FALSE) + # ,span=0.3
  facet_wrap(~newspaper) +
  scale_color_manual(values = paltidos) +
  scale_x_datetime(
    limits = my_limit,
    date_breaks = "1 day",
    date_labels = "%d"
  ) +
  labs(
    title = "Porcentaje de noticias en página de inicio",
    subtitle = "12 abril - 24 abril 2019",
    caption = "Datos: HomePageX. numeroteca.org",
    x = NULL,
    y = NULL
  ) +
  theme_minimal(base_family = "Roboto Condensed", base_size = 14) +
  # hide every grid line
  theme(
    panel.grid.major.x = element_blank(),
    panel.grid.major.y = element_blank(),
    panel.grid.minor.x = element_blank(),
    panel.grid.minor.y = element_blank()
    # legend.position = "bottom",
    # axis.text.x = element_text(angle = 90, vjust = 0.4)
  )
# For all parties -------
# Number of matching news items over time, one panel per search (`key`) and
# one colour per newspaper: thin raw line plus a smoothed trend without
# confidence band.
ggplot(results_long_n, aes(x = date, y = value, color = newspaper)) +
  geom_line(size = 0.2) +
  geom_smooth(size = 1.3, se = FALSE) + # ,span=0.3
  facet_wrap(~key) +
  scale_colour_brewer(palette = "Set2") +
  scale_x_datetime(
    limits = my_limit,
    date_breaks = "1 day",
    date_labels = "%d"
  ) +
  labs(
    title = "Número de noticias en página de inicio",
    subtitle = "12 abril - 24 abril 2019",
    caption = "Datos: HomePageX. numeroteca.org",
    x = NULL,
    y = NULL
  ) +
  theme_minimal(base_family = "Roboto Condensed", base_size = 14) +
  # hide every grid line
  theme(
    panel.grid.major.x = element_blank(),
    panel.grid.major.y = element_blank(),
    panel.grid.minor.x = element_blank(),
    panel.grid.minor.y = element_blank()
    # legend.position = "bottom",
    # axis.text.x = element_text(angle = 90, vjust = 0.4)
  )
# -------- Analysis and comparision with Pageonex.com paper front pages data ---------
library("rjson")
# Get json from pageonex.com
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment