analizando-CIS.R 25.1 KB
Newer Older
1 2
# Analysis of CIS barometro data http://bilbaodatalab.wikitoki.org/wiki/barometro-del-cis/

3
# 1. Load libraries --------
4 5 6 7
# install.packages("foreign")
library(foreign)
library(tidyverse)

8
# Load data for single barometro------------------------
9 10 11
# 201801
# df <- read.spss("DA3203.sav", use.value.label=TRUE, to.data.frame=TRUE)
# 201905
12 13 14
# df <- read.spss("../../data/CIS/barometro/1905/3247.sav", use.value.label=TRUE, to.data.frame=TRUE)
# df <- read.spss("~/data/CIS/barometro/almacen/3203.sav", use.value.label=TRUE, to.data.frame=TRUE)
# df <- read.spss("~/data/CIS/barometro/por-email/3164.sav", use.value.label=TRUE, to.data.frame=TRUE)
15 16
# df <- read.spss("../../data/CIS/barometro/fid/FID_2293.sav", use.value.label=TRUE, to.data.frame=TRUE)

17
# 2. load CIS barometros, prepare list of files --------
18
# list of barometros with Id sent by email by CIS
19
cis_id <-  read.delim("data/original/barometros-cis.csv",sep = ",") %>% mutate(
20 21 22
  date = as.Date(estudio, "%d-%m-%Y"),
  id = id %>% as.character()
)
23
# files have been previously downloaded in zips and expanded in the path directory
24
cis_files <-  read.delim("data/original/files.csv",sep = ",", header = F) %>% rename ( name = V1 ) %>% mutate(
25 26 27
  id = name %>% str_replace_all(".sav","") %>% as.character()
)

28
# join the two datasets to have the name of the file
29 30 31 32
cis_files <- left_join(cis_files,
                       cis_id,
                       by = "id") %>% arrange( date)

33 34
# Download and load manual notation of files, that has been done manually in a spreadsheet
# where we indicate in a column the variable names of the chosen questions (three problems)
35 36 37 38 39 40
download.file("https://docs.google.com/spreadsheets/d/1xxlt8FnWanVzYkIQdU2yaWlE8-HUvnzVXSiE2QvNJRU/gviz/tq?tqx=out:csv&sheet=files", 
              "data/original/files_with_notes.csv")
files_notes <- read.delim("data/original/files_with_notes.csv",sep = ",") %>% mutate(
  date = as.Date(estudio, "%d-%m-%Y"),
  id = id %>% as.character()
)
41
# add the chosen variable names to each file
42 43 44 45 46 47
cis_files <- left_join(cis_files,
                       files_notes %>% select(id,p1,p2,p3),
                       by = "id") %>% arrange( date)

write.csv(cis_files, file = "data/output/cis_files.csv", row.names = FALSE)

48
# remove files without date
49 50 51
cis_files <- cis_files %>% filter( !is.na(date))


52
# 3. Select and load multiple barometro files ------------
53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103

# where are files stored
path <- "~/data/CIS/barometro/almacen/tmp/"

# remove if it hasn't got the questions
cis_files <- cis_files %>% filter( p1 != "" )

# iterate through all the files
for ( i in 1:nrow(cis_files) ) {
# for ( i in 1:8 ) {
  print("--------------------")
  print(paste(i,cis_files$name[i],cis_files$date[i]  ) )
  # create path to file
  data <- paste0(path, cis_files$name[i])
  # load data in the file
  df <- read.spss(data, use.value.label=TRUE, to.data.frame=TRUE)
  
  # chec if variable ESTU exists
  if ( "ESTU" %in% colnames(df) ) {
    df <- df %>% mutate(
      ESTU = as.character(ESTU)
    )
  } else {
    # if ESTU is not in the variables, insert the ID of the barometer
    df <- df %>% mutate(
      ESTU = cis_files$id[i],
      ESTU = as.character(ESTU)
    )
  }
    
  # if REGION exixts, rename it as CCAA 
  if ( "REGION" %in% colnames(df) ) {
      df <- df %>% rename(
        CCAA =  REGION
      )
    }
  
  # add date to data by taking it gtom cis_id dataframe
  df <- left_join(df,
                  cis_id %>% select(id,date),
                  by = c("ESTU"="id")
                  # ) %>% select( date, ESTU, CCAA, PROV, MUN, P701, P702, P703 )
  )
  # select the basic columns and the 3 questions
  # the true name of the question is specified in the online document https://docs.google.com/spreadsheets/d/1xxlt8FnWanVzYkIQdU2yaWlE8-HUvnzVXSiE2QvNJRU/edit#gid=0
  selected <- c( "date", "ESTU", "CCAA", "PROV", "MUN", cis_files$p1[i], cis_files$p2[i], cis_files$p3[i])
  
  df <- df %>% select(selected) %>% rename(
    p1 = cis_files$p1[i],
    p2 = cis_files$p2[i],
    p3 = cis_files$p3[i],
104 105 106 107
  ) %>% mutate(
    p1 = as.character(p1),
    p2 = as.character(p2),
    p3 = as.character(p3)
108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128
  )
  
  # For the first file
  if ( i == 1) {
    print("opt 1")

    # loads df in the final exportdataframe "barometros"
    barometros <- df
    
    print(df$date[1])
    print(df$ESTU[1])
    
  } else {
    print("not i==1")

    barometros <- rbind( df, barometros)
  }
  
}


129
# 4.  check results --------
130 131 132 133 134 135 136 137 138 139 140

barplot ( table(barometros$date) , horiz = T, las = 1 ) 
barplot ( table(barometros$p1), horiz = T, las = 1 ) 

table(barometros$p1)

punos <- barometros %>% group_by(p1) %>% summarise(
  count = n()
) %>% arrange( desc(count) )


141 142
# barometros_test <- barometros

143 144 145
# 5. limipar nombres de CCAA ---------------
#  Investigar por qué hay que pasar dos veces esta parte del script para que funcione bien la limpieza

146 147 148 149 150 151 152 153 154 155 156 157 158 159
barometros <- barometros %>% mutate(
  CCAA = as.character(CCAA),
  CCAA =  CCAA %>% str_replace("\\{",""),
  CCAA =  CCAA %>% str_replace("\\}",""),
  CCAA =  CCAA %>% str_replace("\\(",""),
  CCAA =  CCAA %>% str_replace("\\)",""),
  CCAA =  CCAA %>% str_replace("  "," "),
  CCAA =  CCAA %>% str_replace("Euskadi","País Vasco"),
  CCAA =  CCAA %>% str_replace("País Vasco ","País Vasco"),
  CCAA =  CCAA %>% str_replace("Pais Vasco","País Vasco"),
  CCAA =  CCAA %>% str_replace("País vasco","País Vasco"),
  CCAA =  CCAA %>% str_replace("País vascoPV","País Vasco"),
  CCAA =  CCAA %>% str_replace("País VascoPV","País Vasco"),
  CCAA =  CCAA %>% str_replace("Comunidad Valenciana","Valencia"),
160 161
  CCAA =  CCAA %>% str_replace("Valencia CVal","Valencia"),
  CCAA =  CCAA %>% str_replace("Valencia ","Valencia"),
162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203
  CCAA =  CCAA %>% str_replace("Comunitat Valenciana","Valencia"),
  CCAA =  CCAA %>% str_replace("Comunidad Valenciana CVal","Valencia"),
  CCAA =  CCAA %>% str_replace("Comunidad Valenciana ","Valencia"),
  CCAA =  CCAA %>% str_replace("Castilla y León ","Castilla y León"),
  CCAA =  CCAA %>% str_replace("Castilla y León CyL","Castilla y León"),
  CCAA =  CCAA %>% str_replace("Castilla-Leon","Castilla y León"),
  CCAA =  CCAA %>% str_replace("Castilla Y León","Castilla y León"),
  CCAA =  CCAA %>% str_replace("Castilla-león","Castilla y León"),
  CCAA =  CCAA %>% str_replace("Castilla-León","Castilla y León"),
  CCAA =  CCAA %>% str_replace("Castilla y LeónCyL","Castilla y León"),
  CCAA =  CCAA %>% str_replace("Castilla-Len","Castilla y León"),
  CCAA =  CCAA %>% str_replace(" Castilla - La Mancha","Castilla-La Mancha"),
  CCAA =  CCAA %>% str_replace("Castilla La Mancha","Castilla-La Mancha"),
  CCAA =  CCAA %>% str_replace("Castilla-la mancha","Castilla-La Mancha"),
  CCAA =  CCAA %>% str_replace("Castilla-la Mancha","Castilla-La Mancha"),
  CCAA =  CCAA %>% str_replace("Castilla-La Mancha C-M","Castilla-La Mancha"),
  CCAA =  CCAA %>% str_replace("Castilla-La Mancha ","Castilla-La Mancha"),
  CCAA =  CCAA %>% str_replace("Castilla-La Mancha","Castilla-La Mancha"),
  CCAA =  CCAA %>% str_replace("Castilla - La Mancha","Castilla-La Mancha"),
  CCAA =  CCAA %>% str_replace("RiojaLa","Rioja"),
  CCAA =  CCAA %>% str_replace("RiojaLa ","Rioja"),
  CCAA =  CCAA %>% str_replace("RiojaLa Rio","Rioja"),
  CCAA =  CCAA %>% str_replace("Rioja ","Rioja"),
  CCAA =  CCAA %>% str_replace("RiojaRio","Rioja"),
  CCAA =  CCAA %>% str_replace("La Rioja","Rioja"),
  CCAA =  CCAA %>% str_replace("NavarraCFNav","Navarra"),
  CCAA =  CCAA %>% str_replace("Navarra (Comunidad Foral de) ","Navarra"),
  CCAA =  CCAA %>% str_replace("Navarra (Comunidad Foral de)","Navarra"),
  CCAA =  CCAA %>% str_replace("Navarra ","Navarra"),
  CCAA =  CCAA %>% str_replace("Murcia Región de RMur","Murcia"),
  CCAA =  CCAA %>% str_replace("Murcia Región de ","Murcia"),
  CCAA =  CCAA %>% str_replace("Murcia Región de","Murcia"),
  CCAA =  CCAA %>% str_replace("Melilla ","Melilla"),
  CCAA =  CCAA %>% str_replace("MelillaCiudad Autónoma de ","Melilla"),
  CCAA =  CCAA %>% str_replace("MelillaCiudad Autónoma de","Melilla"),
  CCAA =  CCAA %>% str_replace("MelillaCiudad autónoma de","Melilla"),
  CCAA =  CCAA %>% str_replace("Melilla Ciudad Autónoma de  ","Melilla"),
  CCAA =  CCAA %>% str_replace("Melilla Ciudad Autónoma de","Melilla"),
  CCAA =  CCAA %>% str_replace("Melilla Ciudad autónoma de","Melilla"),
  CCAA =  CCAA %>% str_replace("Ceuta Ciudad Autónoma de ","Ceuta"),
  CCAA =  CCAA %>% str_replace("Ceuta Ciudad Autónoma de","Ceuta"),
  CCAA =  CCAA %>% str_replace("Ceuta Ciudad autónoma de","Ceuta"),
204 205
  CCAA =  CCAA %>% str_replace("Ceuta ","Ceuta"),
  CCAA =  CCAA %>% str_replace("MadridCMad","Madrid"),
206
  CCAA =  CCAA %>% str_replace("MadridCMad","Madrid"),
207
  CCAA =  CCAA %>% str_replace("MadridCMad","Madrid"),
208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249
  CCAA =  CCAA %>% str_replace("Madrid ","Madrid"),
  CCAA =  CCAA %>% str_replace("MadridComunidad de","Madrid"),
  CCAA =  CCAA %>% str_replace("MadridComunidad de ","Madrid"),
  CCAA =  CCAA %>% str_replace("MadridComunidad de CMad","Madrid"),
  CCAA =  CCAA %>% str_replace("Madrid Comunidad de CMad","Madrid"),
  CCAA =  CCAA %>% str_replace("Madrid Comunidad de","Madrid"),
  CCAA =  CCAA %>% str_replace("E5remadura","Extremadura"),
  CCAA =  CCAA %>% str_replace("Extremadura Ext","Extremadura"),
  CCAA =  CCAA %>% str_replace("Extremadura ","Extremadura"),
  CCAA =  CCAA %>% str_replace("Galicia Gal","Galicia"),
  CCAA =  CCAA %>% str_replace("Galicia ","Galicia"),
  CCAA =  CCAA %>% str_replace("Andalucía ","Andalucía"),
  CCAA =  CCAA %>% str_replace("Andalucía ","Andalucía"),
  CCAA =  CCAA %>% str_replace("Andalucía And","Andalucía"),
  CCAA =  CCAA %>% str_replace("AndalucíaAnd","Andalucía"),
  CCAA =  CCAA %>% str_replace("Aragón ","Aragón"),
  CCAA =  CCAA %>% str_replace("Aragón Ara","Aragón"),
  CCAA =  CCAA %>% str_replace("AragónAra","Aragón"),
  CCAA =  CCAA %>% str_replace("Asturias ","Asturias"),
  CCAA =  CCAA %>% str_replace("Asturias Ast","Asturias"),
  CCAA =  CCAA %>% str_replace("AsturiasAst","Asturias"),
  CCAA =  CCAA %>% str_replace("AsturiasPrincipado de","Asturias"),
  CCAA =  CCAA %>% str_replace("AsturiasPrincipado de ","Asturias"),
  CCAA =  CCAA %>% str_replace("AsturiasPrincipado de Ast","Asturias"),
  CCAA =  CCAA %>% str_replace("Asturias Principado de","Asturias"),
  CCAA =  CCAA %>% str_replace("Asturias Principado de ","Asturias"),
  CCAA =  CCAA %>% str_replace("Asturias Principado de Ast","Asturias"),
  CCAA =  CCAA %>% str_replace("Baleares Islas","Baleares"),
  CCAA =  CCAA %>% str_replace("Cantabria Cant","Cantabria"),
  CCAA =  CCAA %>% str_replace("Cantabria Can","Cantabria"),
  CCAA =  CCAA %>% str_replace("Cantabria ","Cantabria"),
  CCAA =  CCAA %>% str_replace("Baleares ","Baleares"),
  CCAA =  CCAA %>% str_replace("Baleares Bal.","Baleares"),
  CCAA =  CCAA %>% str_replace("Balears Iles","Baleares"),
  CCAA =  CCAA %>% str_replace("Balears Illes","Baleares"),
  CCAA =  CCAA %>% str_replace("Baleares Bal.","Baleares"),
  CCAA =  CCAA %>% str_replace("BalearesBal.","Baleares"),
  CCAA =  CCAA %>% str_replace("Canarias ","Canarias"),
  CCAA =  CCAA %>% str_replace("CanariasCan","Canarias"),
  CCAA =  CCAA %>% str_replace("Canarias can","Canarias"),
  CCAA =  CCAA %>% str_replace("Cataluña Cat","Cataluña"),
  CCAA =  CCAA %>% str_replace("Cataluña ","Cataluña"),
250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269
  # el estudi 3164 tiene cifras en vez de nombres de CCAA, pero ¿tiene resultados de Murcia?
  CCAA =  CCAA %>% str_replace("19","Melilla"),
  CCAA =  CCAA %>% str_replace("18","Ceuta"),
  CCAA =  CCAA %>% str_replace("17","Rioja"),
  CCAA =  CCAA %>% str_replace("16","País Vasco"),
  CCAA =  CCAA %>% str_replace("15","Navarra"),
  CCAA =  CCAA %>% str_replace("14","Murcia"),
  CCAA =  CCAA %>% str_replace("13","Madrid"),
  CCAA =  CCAA %>% str_replace("12","Galicia"),
  CCAA =  CCAA %>% str_replace("11","Extremadura"),
  CCAA =  CCAA %>% str_replace("10","Valencia"),
  CCAA =  CCAA %>% str_replace("9","Cataluña"),
  CCAA =  CCAA %>% str_replace("8","Castilla y León"),
  CCAA =  CCAA %>% str_replace("7","Castilla-La Mancha"),
  CCAA =  CCAA %>% str_replace("6","Cantabria"),
  CCAA =  CCAA %>% str_replace("5","Canarias"),
  CCAA =  CCAA %>% str_replace("4","Baleares"),
  CCAA =  CCAA %>% str_replace("3","Asturias"),
  CCAA =  CCAA %>% str_replace("2","Aragón"),
  CCAA =  CCAA %>% str_replace("1","Andalucía"),
270 271 272 273
  
  CCAA = as.factor(CCAA)
)

274 275 276 277 278 279 280
barometros <- barometros %>% mutate(
  CCAA = as.character(CCAA),
  CCAA =  CCAA %>% str_replace("MadridCMad","Madrid"),
  CCAA = as.factor(CCAA)
)

# Check results 2
281 282
# table(barometros$CCAA)
# barplot ( table(barometros$CCAA), horiz = T, las = 1 )
283 284


285
# load coding of problems from estudio 3164 that only shows numbers as the answers
286
problems_code <-  read.delim("data/original/estudio3164_codificacion_3problemas.csv",sep = ",") 
287

288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306
for (i in 1:nrow(barometros) ) {
  # if the problem is in the list of coded numbers replace it
  if( barometros$p1[i] %in% problems_code$id) { 
    # if 
    barometros$p1[i] <- problems_code[ problems_code$id == barometros$p1[i], ]$problema
    
    }
  if( barometros$p2[i] %in% problems_code$id) { 
    # if 
    barometros$p2[i] <- problems_code[ problems_code$id == barometros$p2[i], ]$problema
    
  }
  if( barometros$p3[i] %in% problems_code$id) { 
    # if 
    barometros$p3[i] <- problems_code[ problems_code$id == barometros$p3[i], ]$problema
    
  }
}
# TODO: revisar por qué sigue habiendo respuestas numéricas: puede que 96 y 97 correpondan con 98 y 99 de los códigos de los problemas?
307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351
# (hay 1.023 con 99 y una con 98)


# Respuestas con O ----------------
# un estudio (y otros) como el 2960, donde hay respuestas como "0" en las variables P702 y P703
# Según email del CIS "Suelen ser No respuesta, es decir, personas que responden solo 1 o 2 respuestas".

barometros <- barometros %>% mutate(
  p2 = ifelse( p2 == "0", "N.C.",p2),
  p3 = ifelse( p3 == "0", "N.C.",p3),
)

# recodify NA (no answers) in p1, P2 and P· as N.C.
barometros <- barometros %>% mutate(
  p1 = ifelse(is.na(p1), "N.C.",p1),
  p2 = ifelse(is.na(p2), "N.C.",p2),
  p3 = ifelse(is.na(p3), "N.C.",p3)
)
# table(barometros$p3) %>% head()



write.csv(barometros, file = "data/output/barometros-microdatos-ccaa-3problems_dirty.csv", row.names = FALSE)

# 6. process and clean files -----
#  in OpenRefine with the extract history at analysis/openrefine/process-3-problems.json
# Pero ojo, he pasado por las 3 columnas p1, p2 y p3, no como está el script que solamente tiene columna "p"

# 7. Load clean data
barometros <- read.delim("data/output/barometros-microdatos-ccaa-3problems_clean.csv",sep = ",") %>% mutate(
  date = as.Date(date),
  CCAA = as.factor(CCAA),
  PROV = as.factor(PROV),
  MUN = as.factor(MUN),
  # p1 = as.factor(p1),
  # p2 = as.factor(p2),
  # p3 = as.factor(p3)
  p1 = ifelse( p1 == "N.C." | p1 == "N.S.", "NS/NC", p1),
  p2 = ifelse( p2 == "N.C." | p2 == "N.S.", "NS/NC", p2),
  p3 = ifelse( p3 == "N.C." | p3 == "N.S.", "NS/NC", p3),
  # algunas p1 estaban vacías de respuestas
  # les asigno NS/NC (no sabe no contesta) para facilitar el cálculo
  p1 = ifelse( is.na(p1), "NS/NC",p1)
 
)
352

353
# Group by date and CCAA ----------------------
354
evol_count <- barometros %>% group_by(CCAA,date) %>% summarise(
355
  # counts number of elements by barometro and CCAA
356 357 358
  count_total = n()
) %>% ungroup()

359 360 361 362 363 364
evol_count_nsnc <- barometros %>% group_by(CCAA,date) %>% filter(p1 == "NS/NC" & p2 == "NS/NC" & p3 == "NS/NC") %>% summarise(
  # counts number of elements by barometro and CCAA
  count_nsnc = n()
) %>% ungroup()

evol_p1 <- barometros %>% group_by(CCAA,ESTU,date,p1) %>% summarise(
365 366
  # counts number of answers for each type for question 1 by barometro and CCAA
  count_p1 = as.numeric( n() )
367
  )
368
evol_p2 <- barometros %>% group_by(CCAA,ESTU,date,p2) %>% summarise(
369 370 371
  # counts number of answers for each type for question 1 by barometro and CCAA
  count_p2 = as.numeric( n() )
)
372
evol_p3 <- barometros %>% group_by(CCAA,ESTU,date,p3) %>% summarise(
373 374 375
  # counts number of answers for each type for question 1 by barometro and CCAA
  count_p3 = as.numeric( n() )
)
376

377 378 379
# joins p1 and p2
evol <- full_join(
  evol_p1 %>% mutate(dunique = paste0(date,CCAA,p1)) ,
380
  evol_p2 %>% mutate(dunique = paste0(date,CCAA,p2)) %>% ungroup() %>% rename( date_p2 = date, CCAA_p2 = CCAA, ESTU_p2 = ESTU),
381 382 383 384 385 386 387 388 389 390 391 392 393
  by = "dunique"
) %>% mutate (
  # perc_p2 = round( count_p2 / count_total * 100, digits = 1)
) 

# fills the dates and CCAA that were empty
evol <- evol %>% mutate(
  date = as.character(date),
  date = ifelse( is.na(date) , as.character(date_p2), date),
  date = as.Date(date),
  
  CCAA = as.character(CCAA),
  CCAA = ifelse( is.na(CCAA), as.character(CCAA_p2), CCAA),
394 395 396 397 398
  CCAA = as.factor(CCAA),
  
  ESTU = as.character(ESTU),
  ESTU = ifelse( is.na(ESTU), as.character(ESTU_p2), ESTU),
  ESTU = as.factor(ESTU)
399 400 401 402 403
)

# joins p1-p2 with p3
evol <- full_join(
  evol,
404
  evol_p3 %>% mutate(dunique = paste0(date,CCAA,p3)) %>% ungroup() %>% rename( date_p3 = date, CCAA_p3 = CCAA, ESTU_p3 = ESTU),
405 406 407 408 409 410 411 412 413 414 415
  by = "dunique"
) 

# fills the dates and CCAA that were empty
evol <- evol %>% mutate(
  date = as.character(date),
  date = ifelse( is.na(date) , as.character(date_p3), date),
  date = as.Date(date),
  
  CCAA = as.character(CCAA),
  CCAA = ifelse( is.na(CCAA), as.character(CCAA_p3), CCAA),
416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433
  CCAA = as.factor(CCAA),
  
  ESTU = as.character(ESTU),
  ESTU = ifelse( is.na(ESTU), as.character(ESTU_p3), ESTU),
  ESTU = as.factor(ESTU)
) %>% select ( date, CCAA, everything(), -dunique, -date_p2, -date_p3, -CCAA_p2, -CCAA_p3, -ESTU_p2, -ESTU_p3 )


evol <- rbind(
  evol,
  evol_count_nsnc %>% mutate(
    p1 = NA,
    count_p1 = NA,
    p2 = NA,
    count_p2 = NA,
    p3 = NA,
    count_p3 = NA
  ) %>% select ( date, CCAA, everything() )
434 435 436
)


437

438
# add number of answers per barometer and CCAA
439 440
evol <- left_join(
  evol %>% mutate(dunique = paste0(date,CCAA)),
441
  evol_count %>% mutate(dunique = paste0(date,CCAA)) %>% select(-date,-CCAA),
442
  by = "dunique"
443
) %>% mutate (
444
  count_p = replace_na(count_p1,0) + replace_na(count_p2,0) +  replace_na(count_p3,0),
445 446 447
  perc =  round( replace_na(count_p,0) / count_total * 100, digits = 1)
) %>% select ( date, CCAA, everything(), -dunique ) %>% mutate(
  # rellena p si está vacío. primero con p1, si está vacío p, rellena con p1, y si está vacío con p3
448 449 450
  p = p1,
  p = ifelse( is.na(p),p2,p),
  p = ifelse( is.na(p),p3,p),
451 452 453 454 455 456
  # si tiene datos NS/NC introducevalor en p
  p = ifelse( !is.na(count_nsnc), "NS/NC consolidated", p ),
  # Ojo porque "NS/NC consolidated" corresponde cuando las 3 respuestas son NS/NC
  # recalcula percentaje para esos casos especiales
  perc = ifelse( !is.na(count_nsnc), round( replace_na(count_nsnc,0) / count_total * 100, digits = 1), perc),
  count_p = ifelse( !is.na(count_nsnc), count_nsnc, count_p)
457
)
458

459 460 461 462 463 464 465
evol_p <- evol %>% group_by(CCAA,date,p) %>% summarise(
  # counts number of answers for each type for question 1 by barometro and CCAA
  count_p = as.numeric( n() )
)

# Remove problematic rows that count NS/NC
evol <- evol %>% filter( p != "NS/NC")
466

467 468
# Save file with the summary per date, CCAA and answers.
write.csv(evol, file = "data/output/barometro-ccaa-3problems.csv", row.names = FALSE)
469

470 471 472
# process and clean files -----
# No I do this in an earlier stage!
#  in OpenRefine with the extract history at analysis/openrefine/process-3-problems.json
473

474 475

# load cleaned data -----
476 477 478
# evol <- read.delim("data/output/barometro-ccaa-3problems_clean.csv",sep = ",") %>% mutate(
#   date = as.Date(date)
# )
479

480 481 482 483 484 485 486
# subsitute changes in p1, p2 and p3 variables from the p cleaned variable
# No sé si es necesario!?!
# evol <- evol %>% mutate (
#   p1 = p,
#   p2 = p,
#   p3 = p
# )
487

488 489

# answers per Spain
490 491 492 493
pes <- evol %>% group_by(p) %>% summarise(
  count = n()
) %>% arrange( desc(count) )

494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524
evol_spain <- evol %>% group_by(date,p) %>% summarise(
  count_p = sum(count_p),
  count_total = sum(count_total),
  perc = round( count_p / count_total * 100 , digits = 1)
  
)

# evolución NS/NC respuestas en el tiempo
evol_spain %>% filter( p == "NS/NC consolidated") %>% mutate( 
) %>% # CCAA == "Madrid" &  date > as.Date("2017-01-01"
  ggplot() + 
  geom_line( aes( date, perc, group=p, color = p) ) +
  # facet_wrap(~CCAA) +
  scale_x_date(
    date_breaks = "5 years",
    date_labels = "%Y") + 
  expand_limits(y = 0)+
  theme(
    #       panel.grid.minor.x = element_blank(),
    #       panel.grid.major.x = element_blank(),
    #       # panel.grid.minor.y = element_blank(),
    #       axis.ticks.x = element_line(color = "#777777"),
    legend.position =  "top"
  )

# nuber of questions per barometro per CCAA-----
xx <- evol %>% select( date, CCAA, count_total) %>% distinct() %>% group_by(CCAA) %>% filter ( date < as.Date( "2017-12-01")) %>% summarise(
  media = round( mean(count_total), digits = 1),
  mediana = median(count_total)
)
write.csv(xx, file = "data/output/barometro-ccaa-3problems_n-answers.csv", row.names = FALSE)
525 526


527
# PLOTs --------------
528
evol %>% mutate( 
529
) %>% # CCAA == "Madrid" &  date > as.Date("2017-01-01"
530 531
  ggplot() + 
  geom_line( aes( date, perc, group=p, color = p) ) +
532 533 534
  scale_y_continuous(
    breaks = c(0,25,50,75,100)
  ) + 
535 536 537
  facet_wrap(~CCAA) +
  scale_x_date(
    date_breaks = "5 years",
538 539 540 541 542 543 544 545
    date_labels = "%Y") + 
  theme(
    #       panel.grid.minor.x = element_blank(),
    #       panel.grid.major.x = element_blank(),
    #       # panel.grid.minor.y = element_blank(),
    #       axis.ticks.x = element_line(color = "#777777"),
          legend.position =  "none"
        )
546

547

548
evol %>% mutate( 
549
  ) %>% filter(p == "NS/NC consolidated") %>% ungroup() %>% # CCAA == "Madrid" &  date > as.Date("2017-01-01"
550
  ggplot() + 
551 552
  geom_line( aes( date, perc, group=p, color = p) ) +
  facet_wrap(~CCAA) +
553 554 555 556 557
  scale_x_date(
    date_breaks = "5 years",
    date_labels = "%Y")

evol %>% mutate( 
558
) %>% filter( p == "La corrupción y el fraude" | p == "El paro" | p == "NS/NC consolidated") %>% ungroup() %>% # CCAA == "Madrid" &  date > as.Date("2017-01-01"
559 560 561 562 563 564 565 566 567
  ggplot() + 
  geom_line( aes( date, count_p, group=p, color = p) ) +
  # geom_line( aes( date, count_total, group=p) ) +
  # geom_point( aes( date, count_total) ) +
  facet_wrap(~CCAA) +
  scale_x_date(
    date_breaks = "5 years",
    date_labels = "%Y")

568

569 570 571 572 573 574 575 576 577 578

evol %>% select( date, CCAA, count_total) %>% distinct() %>% filter ( date < as.Date( "2019-12-01")) %>% ggplot() + 
  geom_line( aes( date, count_total, group=CCAA, color=CCAA))  + #, color = "orange" 
  geom_text( data = evol %>% filter ( date < as.Date( "2019-12-01")) %>% group_by(CCAA) %>% top_n(1, date), aes(date,count_total, label=CCAA, color=CCAA), hjust =0) +
  scale_x_date(
    date_breaks = "5 years",
    date_labels = "%Y",
    limits = c( min(evol$date), max(evol$date) + 100 )
    )

579

580 581 582 583 584 585 586 587 588 589 590 591 592
# Cómo se reparte la posción de una respuesta entre los 3 problemas 1º, 2º y 3º?
evol %>% mutate( 
) %>% filter( p == "La corrupción y el fraude" & CCAA == "Madrid" & date > as.Date("2013-01-01")) %>% ungroup() %>% # CCAA == "Madrid" &  date > as.Date("2017-01-01"
  ggplot() + 
  geom_ribbon( aes( date, ymin=0, ymax = replace_na(count_p1,0) / count_p * 100), fill = "#000000" ) +
  geom_ribbon( aes( date, ymin= replace_na(count_p1,0) / count_p * 100, ymax = (replace_na(count_p1,0) / count_p * 100) + (replace_na(count_p2,0) / count_p * 100) ), fill = "#666666" ) +
  geom_ribbon( aes( date,
                    ymin= (replace_na(count_p1,0) / count_p * 100) + (count_p2 / count_p * 100),
                    ymax = (count_p1 / count_p * 100) + (count_p2 / count_p * 100) + (count_p3 / count_p * 100) ), fill = "#AAAAAA" ) +
  facet_wrap(~CCAA) +
  scale_x_date(
    date_breaks = "1 years",
    date_labels = "%Y")
593

594 595 596 597 598 599 600 601 602 603 604 605 606 607 608
evol %>% mutate( 
# ) %>% filter( p == "La corrupción y el fraude" & CCAA == "Madrid" ) %>% ungroup() %>% # CCAA == "Madrid" &  date > as.Date("2017-01-01"
) %>% filter( p == "El paro" & CCAA == "Madrid" ) %>% ungroup() %>% # CCAA == "Madrid" &  date > as.Date("2017-01-01"
  ggplot() + 
  geom_ribbon( aes( date, ymin=0, 
                    ymax = replace_na(count_p1,0) / count_total * 100), fill = "#000000" ) +
  geom_ribbon( aes( date, ymin= replace_na(count_p1,0) / count_total * 100, 
                    ymax = (replace_na(count_p1,0) / count_total * 100) + (replace_na(count_p2,0) / count_total * 100) ), fill = "#666666" ) +
  geom_ribbon( aes( date,
                    ymin= (replace_na(count_p1,0) / count_total * 100) + (count_p2 / count_total * 100),
                    ymax = (count_p1 / count_total * 100) + (count_p2 / count_total * 100) + (count_p3 / count_total * 100) ), fill = "#AAAAAA" ) +
  facet_wrap(~CCAA) +
  scale_x_date(
    date_breaks = "1 years",
    date_labels = "%Y")
609 610


611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626
# para buscar respeustas que contienen tal o cual palabra
# al ver que hay varias respuestas no limpiadas para terrorismo
evol %>% mutate( 
) %>% filter( str_detect(str_to_lower(p), "terrorismo")  &  CCAA == "Madrid" ) %>% ungroup() %>% # CCAA == "Madrid" &  date > as.Date("2017-01-01"
  ggplot() + 
  geom_ribbon( aes( date, ymin=0, 
                    ymax = replace_na(count_p1,0) / count_total * 100), fill = "#000000" ) +
  geom_ribbon( aes( date, ymin= replace_na(count_p1,0) / count_total * 100, 
                    ymax = (replace_na(count_p1,0) / count_total * 100) + (replace_na(count_p2,0) / count_total * 100) ), fill = "#666666" ) +
  geom_ribbon( aes( date,
                    ymin= (replace_na(count_p1,0) / count_total * 100) + (count_p2 / count_total * 100),
                    ymax = (count_p1 / count_total * 100) + (count_p2 / count_total * 100) + (count_p3 / count_total * 100) ), fill = "#AAAAAA" ) +
  facet_wrap(~CCAA) +
  scale_x_date(
    date_breaks = "1 years",
    date_labels = "%Y")