Commit b2d1d843 authored by numeroteca's avatar numeroteca

add paper front page data from PageOneX

parent effee153
......@@ -200,6 +200,7 @@ for (i in 1:nrow(selected)) {
# Build a timestamp for every row from its year, month, day and hour columns
results$date <- as.POSIXlt(
  paste0(
    results$year, "-", results$month, "-", results$day,
    " ", results$hour, ":00:00"
  )
)
# Store the results on disk and read them back from the same file
save(results, file = "data/results.Rda")
load("data/results.Rda")
# -----------Plots------------
......@@ -388,102 +389,142 @@ ggplot( ) + ylim(c(0,30)) +
geom_text(aes(x = as.POSIXlt("2018-04-10 00:00:00"), y = 2, label = "elpais.com"), color = "#8da0cb", alpha=1, hjust = 0) +
ylab ("% de noticias en portada") +
labs(title = "Porcentaje de noticias sobre Cifuentes en portada periódicos digitales",
subtitle = "21 marzo - 9 abril 2018. Datos: numeroteca.org",
subtitle = "21 marzo - 9 abril 2018. Datos y visualización: numeroteca.org",
x = "Días",
y = NULL,
y = "%",
caption = "")
# Plot for several newspapers: hourly number of selected front-page news items
# (n_selected_news) over time, one line per newspaper, each in a fixed colour.
ggplot( ) + ylim(c(0,24)) +
theme_minimal(base_family = "Roboto Condensed", base_size = 14) +
geom_line(data=results[results$newspaper=="eldiario",], aes(x=date, y=n_selected_news, group=newspaper),color="#66c2a5",size=0.7) +
geom_line(data=results[results$newspaper=="elconfidencial",], aes(x=date, y=n_selected_news, group=newspaper),color="#fc8d62",size=0.7) +
geom_line(data=results[results$newspaper=="elpais",], aes(x=date, y=n_selected_news, group=newspaper),color="#8da0cb",size=0.7) +
geom_line(data=results[results$newspaper=="larazon",], aes(x=date, y=n_selected_news, group=newspaper),color="#e78ac3",size=0.7) +
geom_line(data=results[results$newspaper=="elespanol",], aes(x=date, y=n_selected_news, group=newspaper),color="#a6d854",size=0.7) +
labs(title = "Número de noticias sobre Cifuentes en portada (cada hora)") +
xlab("Días") +
ylab("nº noticias en portadas") +
# Manual theme tweaks applied on top of theme_minimal()
theme(axis.text.y = element_text(size=10),
# axis.title.y=element_blank(),
axis.ticks.y =element_blank(),
axis.ticks.x =element_blank(),
axis.text.x=element_text(size=9),
axis.title.x=element_text(size=11),
panel.grid.minor = element_blank(),
panel.background = element_rect(fill="white"),
panel.grid.major.y = element_line( size=.1, color="grey" ),
# legend.position = "bottom",
legend.text = element_text(size=15) ) +
# NOTE(review): the commented-out lines below duplicate the active labs()/xlab()/
# ylab()/theme() calls above; confirm which of the two versions should remain.
# labs(title = "Número de noticias sobre Cifuentes en portada (cada hora)") +
# xlab("Días") +
# ylab("nº noticias en portadas") +
# theme(axis.text.y = element_text(size=10),
# # axis.title.y=element_blank(),
# axis.ticks.y =element_blank(),
# axis.ticks.x =element_blank(),
# axis.text.x=element_text(size=9),
# axis.title.x=element_text(size=11),
# panel.grid.minor = element_blank(),
# panel.background = element_rect(fill="white"),
# panel.grid.major.y = element_line( size=.1, color="grey" ),
# # legend.position = "bottom",
# legend.text = element_text(size=15) ) +
# scale_y_continuous(breaks=seq(0,30,5)) +
# One x-axis break per day, labelled with the day of the month
scale_x_datetime(date_breaks = "1 day", date_labels = "%d", limits = c(as.POSIXct("2018-03-21 00:00:01"), as.POSIXct("2018-04-12 12:00:01"))) +
# Text labels in each newspaper's line colour, all placed at 2018-04-10
geom_text(aes(x = as.POSIXlt("2018-04-10 00:00:00"), y = 8, label = "eldiario.es"), color = "#66c2a5", alpha=1, hjust = 0) +
geom_text(aes(x = as.POSIXlt("2018-04-10 00:00:00"), y = 1, label = "larazon.es"), color = "#e78ac3", alpha=1, hjust = 0) +
geom_text(aes(x = as.POSIXlt("2018-04-10 00:00:00"), y = 3, label = "elConfidencial.es"), color = "#fc8d62", alpha=1, hjust = 0) +
geom_text(aes(x = as.POSIXlt("2018-04-10 00:00:00"), y = 5, label = "elespanol.com"), color = "#a6d854", alpha=1, hjust = 0) +
geom_text(aes(x = as.POSIXlt("2018-04-10 00:00:00"), y = 2, label = "elpais.com"), color = "#8da0cb", alpha=1, hjust = 0)
# NOTE(review): the line above has no trailing `+`, so the plot expression ends
# there. The geom_text() + labs() below is evaluated as a separate expression and
# is not added to the plot above; it repeats the elpais.com label. This looks like
# two versions of the same code side by side -- confirm which one is intended.
geom_text(aes(x = as.POSIXlt("2018-04-10 00:00:00"), y = 2, label = "elpais.com"), color = "#8da0cb", alpha=1, hjust = 0) +
labs(title = "Número de noticias sobre Cifuentes en portada periódicos digitales",
subtitle = "21 marzo - 9 abril 2018. Datos y visualización: numeroteca.org",
x = "Días",
y = "nº noticias en portada",
caption = "")
# -------- La Razón test ----------
# Debugging pass over the La Razón rows of `selected` only. For each one it
# reads the saved page from data/, then prints the total number of headline
# links, the headlines matching `word`, their count, and the day/hour of the
# row. `selected` and `word` must already be defined when this runs.
# seq_len() is used instead of 1:nrow() so that an empty `selected` yields no
# iterations (1:0 would iterate over c(1, 0)).
for (i in seq_len(nrow(selected))) {
  if (selected$newspaper[i] == "larazon") {
    page <- read_html(paste0("data/", selected$urls[i]))

    # Text of every link matched by the "article h2 a" selector
    titles <- page %>% html_nodes("article h2 a") %>% html_text() %>% data.frame()
    colnames(titles) <- "title"

    # Total number of headlines with a link
    n_news <- nrow(titles)
    print(paste("nº noticias:", n_news))

    # Headlines whose text matches `word`
    selected_news <- data.frame(titles[grepl(word, titles$title), ])
    print(selected_news)

    # Number of matching headlines
    n_selected_news <- nrow(selected_news)
    print(n_selected_news)
    print(paste("nº noticias Cifuentes:", n_selected_news))

    # Which row was just processed
    print(paste("day:", selected$day[i], "hour:", selected$hour[i], selected$newspaper[i]))
  }
}
theme_min = function (size=10, font=NA, face='plain',
panelColor=backgroundColor, axisColor='#999999',
gridColor=gridLinesColor, textColor='black')
{
theme_text = function(...)
ggplot2::theme_text(family=font, face=face, colour=textColor,
size=size, ...)
opts(
axis.text.x = theme_text(),
axis.text.y = theme_text(),
axis.line = theme_blank(),
axis.ticks = theme_segment(colour=axisColor, size=0.25),
panel.border = theme_rect(colour=backgroundColor),
legend.background = theme_blank(),
legend.key = theme_blank(),
legend.key.size = unit(1.5, 'lines'),
legend.text = theme_text(hjust=0),
legend.title = theme_text(hjust=0),
panel.background = theme_rect(fill=panelColor, colour=NA),
panel.grid.major = theme_line(colour=gridColor, size=0.33),
panel.grid.minor = theme_blank(),
strip.background = theme_rect(fill=NA, colour=NA),
strip.text.x = theme_text(hjust=0),
strip.text.y = theme_text(angle=-90),
plot.title = theme_text(hjust=0),
plot.margin = unit(c(0.1, 0.1, 0.1, 0.1), 'lines'))
# -------- Paper newspaper front pages ---------
library("rjson")
# Get the JSON export from pageonex.com
json_file <- "http://pageonex.com/numeroteca/tfm-cifuentes/export.json"
json_data <- fromJSON(paste(readLines(json_file), collapse = ""))
# Create an empty data frame: one row per date in the export, 10 columns
df <- data.frame(matrix(ncol = 10, nrow = length(json_data$dates)))
# Fill the data frame row by row with the PageOneX data.
# seq_along() makes the loop a no-op when the export contains no dates;
# 1:length(x) would iterate over c(1, 0) and index a non-existent element.
for (i in seq_along(json_data$dates)) {
  df[i, ] <- data.frame(json_data$data[[i]])
}
##Create a custom font type. Could be 'F', 'TEST', whatever
windowsFonts(F = windowsFont('Wide Latin'))
# Label the columns with the names from the export (the 10th column is named
# "media") and label the rows with the dates
colnames(df) <- json_data$media
colnames(df)[10] <- "media"
rownames(df) <- json_data$dates
# Keep the dates as a regular column as well
df$date <- rownames(df)
library(reshape)
# Long format: one row per (date, column) pair, values scaled by 100
portadas <- melt(df, id = "date")
portadas$value <- 100 * portadas$value
portadas$date <- as.Date(portadas$date, format = "%Y-%m-%d")
class(portadas$date)
# Line plot with one line per column of the paper front-page data
ggplot(portadas, aes(x = date, y = value, group = variable)) +
  geom_line(color = "#000000", size = 0.2) +
  labs(title = "% portadas dedicada a escándalo Cifuentes en portada")

# Bars per date for the "media" column only
ggplot(portadas[portadas$variable == "media", ], aes(x = date, weight = value)) +
  geom_bar() +
  labs(title = "Media % portada dedicado a escándalo Cifuentes")

# Bars per date for every column except "media", filled by column
ggplot(
  portadas[portadas$variable != "media", ],
  aes(x = date, weight = value, fill = variable)
) +
  geom_bar() +
  labs(title = "% portada dedicado a escándalo Cifuentes")

# Bars per date for La Razón only
ggplot(portadas[portadas$variable == "La Razón", ], aes(x = date, weight = value)) +
  geom_bar() +
  labs(title = "La Razón: % portada dedicado a escándalo Cifuentes")
# The two plots below put the paper data on the same date-time x axis as
# `results$date`, so `portadas` needs a date-time column (midnight of each
# date). It is created here, before its first use, so the script runs from
# top to bottom; the expression is the same one the script uses further down.
portadas$timestamp <- as.POSIXlt(paste0(portadas$date, " ", "00:00:00"))

# La Razón: paper front page (bars) against the digital edition (line)
ggplot() +
  geom_bar(
    data = portadas[portadas$variable == "La Razón", ],
    aes(x = timestamp, weight = value), fill = "#A74a83"
  ) +
  geom_line(
    data = results[results$newspaper == "larazon", ],
    aes(x = date, y = percent, group = newspaper), color = "#e78ac3", size = 0.7
  ) +
  labs(title = "La Razón: % portada papel vs % noticias en digital dedicado a escándalo Cifuentes")

# NOTE(review): this plot filters the paper data on "El Pa" while its title and
# line layer still refer to La Razón. It looks like an unfinished copy of the
# plot above (possibly meant for El País) -- confirm the intended newspaper,
# label and colour before relying on it.
ggplot() +
  geom_bar(
    data = portadas[portadas$variable == "El Pa", ],
    aes(x = timestamp, weight = value), fill = "#A74a83"
  ) +
  geom_line(
    data = results[results$newspaper == "larazon", ],
    aes(x = date, y = percent, group = newspaper), color = "#e78ac3", size = 0.7
  ) +
  labs(title = "La Razón: % portada papel vs % noticias en digital dedicado a escándalo Cifuentes")
# Horizontal bars: one bar per column, each value weighted by 1 / nrow(df)
ggplot(portadas, aes(x = variable, weight = value / nrow(df))) +
  geom_bar() +
  labs(title = "% dedicado a escándalo Cifuentes en portada") +
  coord_flip()
##and insert this line of code into the original code I list above:
+ theme_min(font='F', size=10)
# Line plot of the "media" column only
ggplot(
  portadas[portadas$variable == "media", ],
  aes(x = date, y = value, group = variable)
) +
  geom_line(color = "#000000", size = 0.2) +
  labs(title = "% portadas dedicada a escándalo Cifuentes en portada")

# Date-time version of the date column (midnight of each day), then a quick
# look at the resulting data frame and the new column's class
portadas$timestamp <- as.POSIXlt(paste0(portadas$date, " ", "00:00:00"))
summary(portadas)
class(portadas$timestamp)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment