Commit 81217408 authored by numeroteca's avatar numeroteca
Browse files

add opinion data and compare with Spanish policy agenda news data

parent ab274fe5
......@@ -522,7 +522,7 @@ pes <- evol %>% group_by(p) %>% summarise(
) %>% arrange( desc(count) )
# Aggregate the 3-problems barometer to whole-Spain level: one row per
# (date, problem). Diff artifact fix: the old `count_p = sum(count_p)` line
# was left in alongside its replacement; keep only the NA-safe version so a
# CCAA with a missing count does not void the national sum.
evol_spain <- evol %>% group_by(date,p) %>% summarise(
  count_p = sum( replace_na(count_p,0) ),
  count_total = sum(count_total),
  # share of respondents naming problem p on that date (1 decimal)
  perc = round( count_p / count_total * 100 , digits = 1)
)
......@@ -545,6 +545,10 @@ evol_spain %>% filter( p == "NS/NC consolidated") %>% mutate(
legend.position = "top"
)
# Persist the whole-Spain series for reuse by the news-comparison script.
saveRDS(evol_spain, file = "data/output/barometro-ccaa-3problems_spain.rds")
# BUG fix: the CSV must export the same object as the RDS. The original wrote
# `xx`, which is only defined further down and holds the questions-per-CCAA
# table, not the Spain series the filename promises.
write.csv(evol_spain, file = "data/output/barometro-ccaa-3problems_spain.csv", row.names = FALSE)
# number of questions per barometro per CCAA-----
# TODO: why is it filtering by previous to 2017-12-01 ?
xx <- evol %>% select( date, CCAA, count_total) %>% distinct() %>% group_by(CCAA) %>% filter ( date < as.Date( "2017-12-01")) %>% summarise(
......
......@@ -23,7 +23,7 @@ elmundo <- read_excel("~/data/spanish-policy-agendas/MeidaElMundoSPAPWeb.xlsx",
# Quick sanity check: number of El País records per year and per month.
table(elpais$year)
table(elpais$month)
# codebook
# codebook
# Loads codebok and add Major topic to generic categories like "Others" and "General"
codebook <- read.delim("~/data/spanish-policy-agendas/codebook-spanish-policy-agendas.csv",sep = ",") %>% mutate(
subtopic_name = ifelse( subtopic_name== "Others", paste("Others.",topic_name), subtopic_name),
......@@ -106,8 +106,16 @@ by_year_corruption %>%
) +
facet_wrap(~newspaper)
# C2. News by year month and topic ------------
# number of news by month and newspaper
# Total number of news items per month and newspaper — the denominator used
# below to turn per-topic counts into percentages.
news_by_month <- news %>%
  group_by(year, month, newspaper) %>% summarise(
    total_news = n()
  ) %>% mutate(
    # First day of the month as a proper Date. Fix: the original pasted the
    # numeric literal `01` (which renders as "1"); an explicit "-01" suffix
    # yields the same Date but is unambiguous to read.
    date = as.Date(paste0(year, "-", month, "-01"))
  )
# number of news by month and topic
by_month <- news %>%
# filter ( newspaper == "El País") %>%
group_by(year,month,newspaper,topic_name) %>% summarise(
......@@ -117,7 +125,17 @@ by_month <- news %>%
date = as.Date(paste0(year,"-",month,"-",01))
)
# Calculates % of news by month and topic.
# Improvement: join directly on (date, newspaper) instead of building and
# dropping a synthetic `dunique = paste0(date, newspaper)` key — equivalent
# matching, fewer moving parts.
by_month <- left_join(
  by_month,
  news_by_month %>% ungroup() %>% select(date, newspaper, total_news),
  by = c("date", "newspaper")
) %>% mutate(
  # % of news on this topic among all news of that month/newspaper
  perc = round( count / total_news * 100, digits = 1)
)
# Plot ------
by_month %>%
# filter( newspaper=="El País") %>%
ggplot( ) +
......@@ -156,45 +174,63 @@ by_month %>%
facet_wrap(~topic_name)
# C3. News by year month and subtopic ------------
# number of news by month and newspaper
# Monthly totals per newspaper, anchored on day 28 of each month.
# The day MUST match the one used when building the subtopic table below,
# otherwise the subsequent left_join finds no matches.
news_by_month <- news %>%
  group_by(year, month, newspaper) %>% summarise(
    total_news = n()
  ) %>% mutate(
    date = as.Date(paste0(year, "-", month, "-28"))
  )
# Number of news items by month, newspaper, topic and subtopic.
# Diff artifact fix: the span contained BOTH the old and the new `group_by`
# lines and BOTH the day-01 and day-28 `date =` lines (a syntax error as-is);
# keep the post-commit versions.
by_month <- news %>%
  # filter ( newspaper == "El País") %>%
  group_by(year, month, newspaper, topic_name, subtopic_name) %>% summarise(
    count = n()
  ) %>% mutate(
    # Day of month is 28 so it matches the date built in news_by_month above;
    # if changed, the next left_join will not work unless both use the same day.
    date = as.Date(paste0(year, "-", month, "-", 28))
  )
# Calculates % of news by month and subtopic.
# Same improvement as the topic-level join: match directly on
# (date, newspaper) instead of a pasted `dunique` key — equivalent result.
by_month <- left_join(
  by_month,
  news_by_month %>% ungroup() %>% select(date, newspaper, total_news),
  by = c("date", "newspaper")
) %>% mutate(
  # % of news on this subtopic among the month's total for that newspaper
  perc = round( count / total_news * 100, digits = 1)
)
# PLOTS --------
# Monthly share of news on the corruption subtopic, one colored line per
# newspaper, 1996-2009.
# NOTE(review): this span contained interleaved old/new diff lines (orphaned
# geom_text_repel arguments, a stray `le` fragment, duplicated scale limits);
# reconstructed here as the post-commit version — verify against the repo.
by_month %>%
  filter( subtopic_name == "Government procurement, contracts and corruption" ) %>%
  ggplot( ) +
  geom_line(
    aes(date, perc, group = newspaper, color = newspaper),
    size = .7
  ) +
  geom_point(
    aes(date, perc, color = newspaper),
    size = 1.1
  ) +
  scale_x_date(
    limits = c(as.Date("1996-01-01"), as.Date("2009-12-31") ),
    expand = c(0, 0)) +
  theme(
    panel.grid.minor.x = element_blank(),
    panel.grid.major.x = element_blank(),
    # panel.grid.minor.y = element_blank(),
    axis.ticks.x = element_line(color = "#777777"),
    legend.position = "bottom"
  )
# +
# facet_wrap(~newspaper,ncol = 1)
# Plot
by_month %>%
filter( newspaper=="El País") %>%
ggplot( ) +
......@@ -206,5 +242,183 @@ by_month %>%
facet_wrap(~subtopic_name)
# D. Load opinion data ------
# Diff artifact fix: the old two-line loader was left in above its
# replacement; keep only the new version (which coerces date to Date).
# TODO: it can also be used the original data calculated by CIS www.cis.es/cis/export/sites/default/-Archivos/Indicadores/documentos_html/TresProblemas.html
# Whole-Spain series (built and saved by the barometro script above)
opinion_spain <- readRDS(file = "data/output/barometro-ccaa-3problems_spain.rds")
# Per-CCAA series; dates arrive as strings from the CSV, so coerce to Date.
opinion <- read.delim("data/output/barometro-ccaa-3problems.csv",sep = ",") %>% mutate(
  date = as.Date(date)
)
# plots ----
# Whole-Spain % naming corruption as a top-3 problem, 1995-2009.
# Fix: removed the no-op empty `mutate( )` at the head of the chain.
opinion_spain %>%
  filter(p == "La corrupción y el fraude") %>% ungroup() %>% # CCAA == "Madrid" & date > as.Date("2017-01-01"
  filter(date > as.Date("1994-12-31") & date < as.Date("2009-12-31") ) %>%
  ggplot() +
  geom_line( aes( date, perc, group=p) ) +
  geom_point( aes( date, perc), size = 0.5 ) +
  scale_x_date(
    date_breaks = "2 years",
    date_labels = "%Y",
    limits= c(as.Date("1995-01-01"),as.Date("2009-12-31")))
# Same corruption series, faceted by CCAA.
# Fix: removed the no-op empty `mutate( )` at the head of the chain.
opinion %>%
  filter(p == "La corrupción y el fraude") %>% ungroup() %>% # CCAA == "Madrid" & date > as.Date("2017-01-01"
  filter(date > as.Date("1994-12-31") & date < as.Date("2009-12-31") ) %>%
  ggplot() +
  geom_line( aes( date, perc, group=p) ) +
  facet_wrap(~CCAA) +
  scale_x_date(
    date_breaks = "5 years",
    date_labels = "%Y",
    limits= c(as.Date("1995-01-01"),as.Date("2009-12-31")))
# E. Join News and Opinion data------
# Join both datasets
# Left-join the (newspaper-averaged) monthly news share onto the whole-Spain
# opinion series, matched on a YYYYMM key built from each table's date.
data <- left_join(
# 1. news data
by_month %>%
filter(
# only subtopic corruption
subtopic_name == "Government procurement, contracts and corruption"
) %>%
rename(
perc_news = perc
) %>%
# NOTE(review): the rename above is immediately discarded — the summarise
# below drops `perc_news` and recomputes it from the summed counts.
# calculate average of both newspapers
arrange( date ) %>% group_by(date) %>% summarise(
count = sum(count),
total_news =sum(total_news)
) %>% ungroup( ) %>% mutate(
perc_news = round( count / total_news * 100, digits = 1)
) %>%
# create unique id to match
mutate( dunique = paste0( format(date,"%Y"), format(date, "%m") )),
# 2. opinion data
opinion_spain %>%
# only corruption
filter(p == "La corrupción y el fraude") %>% ungroup() %>%
# create unique id to match.
# TODO: We are matching by news coverage and opinion survey taking place the same month!
# is there a better way? Because this method is far from perfect.
mutate(
# the key is built BEFORE the +5 day shift, so it reflects the survey month
dunique = paste0( format(date,"%Y"), format(date, "%m") ),
# sum 5 days to survey date, as it usually last (TODO: research) 10 days
date = date + 5
) %>%
rename(
date_survey = date,
perc_opinion = perc
),
by = "dunique"
) %>% ungroup() %>% select( -dunique)
# process data and find closer relationships date of survey and date of news month
# For each survey, decide whether the same-month news share or the previous
# month's (lag 1) is the better temporal match.
data <- data %>% filter(
# remove empty date survey data
!is.na(date_survey)
) %>%
# group_by(newspaper) %>%
arrange(date) %>%
mutate(
# days between the survey date and the news-month anchor date (a difftime)
diff = date_survey - date,
date_lag1 = lag(date,1),
perc_news_lag1 = lag(perc_news,1),
diff_lag1 = date_survey - date_lag1,
# date_lag2 = lag(date,2),
# perc_news_lag2 = lag(perc_news,2)
) %>% mutate(
# The goal here is to make a better match between a month of news data and its survey
# if it is negative difference (survey is BEFORE news) take the lag date
diff_ok = ifelse( diff > 0, diff, diff_lag1),
# if corrected diff is less than 32 days and the diff is not negative use the corrected perc_news
# NOTE(review): taking the LAG value when diff > 0 looks inconsistent with
# the comment above (which implies the lag is for surveys that PRECEDE the
# news month) — verify this condition. Also note ifelse() on a difftime
# drops the class and returns plain numeric days.
perc_news_ok = ifelse( diff_ok < 32 & diff > 0, perc_news_lag1, perc_news),
)
# F. Calculate correlations ----
# Keep only rows where a matched news share exists.
data <- data %>% filter( !is.na(perc_news_ok) )
# Normality checks: if either series departs from normality, the Spearman
# (rank) correlation below is the safer statistic.
shapiro.test(data$perc_news_ok)
shapiro.test(data$perc_opinion)
# calculate correlation
# https://econometricstutors.com/pearson-and-spearman-correlations-in-r/
cor(data$perc_news_ok, data$perc_opinion, method = "pearson")
cor(data$perc_news_ok, data$perc_opinion, method = "spearman")
library(GGally)
# Correlation heatmaps: first over all columns, then dropping count/helper
# columns and adding numeric labels.
ggcorr(data)
ggcorr(data %>% select(-total_news,-count,-count_p,-count_total,-diff_ok,-perc_news),
nbreaks = 6,
label = TRUE,
label_size = 3,
color = "grey50"
)
# (commented-out earlier attempt at re-averaging the two newspapers; kept
# for reference)
# data_avg <- data %>% filter(
# !is.na(perc_opinion) &
# diff_ok < 30
# ) %>% arrange( date ) %>% group_by(date) %>% summarise(
# count = sum(count),
# total_news =sum(total_news)
# ) %>% ungroup( ) %>%
# mutate(
# perc_news = round( count / total_news * 100, digits = 1),
# # lag_news = lag(perc_news), # use the next day front page to see correlations
# # lag_news_2 = lag(perc_news,2), # use the next day front page to see correlations
# # lag_news_3 = lag(perc_news,3) # use the next day front page to see correlations
# )
# calculate average newspaper
# Scatter matrix of matched news share, its lag, and opinion share,
# restricted to surveys from 2004 on with a close (< 32 days) news match.
ggpairs( data %>% filter(
!is.na(perc_opinion) &
diff_ok < 32
) %>% filter(
date > as.Date("2004-01-01")
# year == 2009
) %>% arrange( date ) %>%
mutate(
lag_news = lag(perc_news_ok,1), # use the next day front page to see correlations
),
# add regresion line (https://www.guru99.com/r-pearson-spearman-correlation.html)
lower = list(continuous = wrap("smooth",
alpha = 0.3
# size = 0.1
)
),
# NOTE(review): columns are selected by POSITION — fragile if the data frame
# gains or loses columns; confirm 15/16/9 are perc_news_ok, lag_news,
# perc_opinion — TODO verify.
columns = c(15,16,9),
title = "News coverage on corruption and public opinion. 1997-2011. Spain",
aes( alpha = 0.4)
)
# Scatter matrix for El País only, with 1-3 survey lags of the news share.
# NOTE(review): `data` was averaged across newspapers earlier (group_by(date)
# + summarise), so it no longer has a `newspaper` column — this block looks
# like stale pre-averaging code and would fail as-is; verify before running.
ggpairs(data %>% filter(
newspaper == "El País" & !is.na(perc_opinion)
) %>% filter(
date > as.Date("2008-01-01")
) %>% arrange( date ) %>% group_by(newspaper) %>%
mutate(
lag_news = lag(perc_news), # use the next day front page to see correlations
lag_news_2 = lag(perc_news,2), # use the next day front page to see correlations
lag_news_3 = lag(perc_news,3) # use the next day front page to see correlations
),
# NOTE(review): positional column selection — verify these indices against
# the actual column order.
columns = c(9,15,16,17,18),
title = "buscando correlaciones",
aes( alpha = 0.4)
)
# H. TODO -----
# Things to do to make a better experiment:
# * try using the official % opinion data as calculated by CIS (with their methodology)
# * try using only certain years and see how the correlation evolves
# * check whether very low coverage or large peaks distort the calculation
# * try only Madrid and/or Barcelona data to see if there is a better fit
# * look for a better match between dates: how to make it more accurate?
# * calculate the monthly average not over natural months but over rolling weeks: 1st month weeks 1,2,3,4; 2nd month weeks 2,3,4,5; 3rd month weeks 3,4,5,6.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment