Commit 8fc3f4d0 authored by numeroteca's avatar numeroteca
Browse files

Clean URLs of parameters and unify elpais.com subdomains, for headline URL comparison

parent 48f3a976
......@@ -288,6 +288,9 @@ simplify_classyfy_domains <- function(urls_total) {
domain = ifelse( domain== "blogs.elconfidencial.com", "elconfidencial.com",domain ),
domain = ifelse( domain== "vanitatis.elconfidencial.com", "elconfidencial.com",domain ),
domain = ifelse( domain== "politica.elpais.com", "elpais.com",domain ),
domain = ifelse( domain== "verne.elpais.com", "elpais.com",domain ),
domain = ifelse( domain== "brasil.elpais.com", "elpais.com",domain ),
domain = ifelse( domain== "cincodias.elpais.com", "elpais.com",domain ),
domain = ifelse( domain== "ver.20m.es", "20minutos.es",domain ),
domain = ifelse( domain== "20m.es", "20minutos.es",domain ),
domain = ifelse( domain== "m.20minutos.es", "20minutos.es",domain ),
......@@ -310,8 +313,6 @@ simplify_classyfy_domains <- function(urls_total) {
domain = ifelse( domain== "heral.do", "heraldo.es",domain ),
domain = ifelse( domain== "ecodiario.eleconomista.es", "eleconomista.es",domain ),
domain = ifelse( domain== "informalia.eleconomista.es", "eleconomista.es",domain ),
domain = ifelse( domain== "verne.elpais.com", "elpais.com",domain ),
domain = ifelse( domain== "cincodias.elpais.com", "elpais.com",domain ),
domain = ifelse( domain== "mediterraneo.diario16.com", "diario16.com",domain ),
domain = ifelse( domain== "lavoz.gal", "lavozdegalicia.es",domain ),
domain = ifelse( domain== "ww.cope.es", "cope.es",domain ),
......@@ -427,10 +428,45 @@ urls_total <- urls_total %>% mutate(
# Normalise the `domain` column through simplify_classyfy_domains()
# (defined earlier in this file) before the URLs themselves are cleaned.
urls_total <- urls_total %>% simplify_classyfy_domains()
# Clean and unify URL ----------------------
# We need to unify the URLs that point to the same content.
# They may differ by subdomain:
# https://brasil.elpais.com/brasil/2018/04/25/internacional/1524643078_623889.html
# https://elpais.com/politica/2018/04/25/actualidad/1524643078_623889.html
# or by query parameters / fragments:
# https://elpais.com/politica/2018/04/25/actualidad/1524643078_623889.html?id_externo_rsoc=FB_CM
# https://elpais.com/politica/2018/04/25/actualidad/1524643078_623889.html?id_externo_rsoc=FB_CM_ESP&id_externo_rsoc=TW_CM_ESP
# https://elpais.com/politica/2018/04/25/actualidad/1524643078_623889.html?id_externo_rsoc=TW_CM_ESP
# https://elpais.com/politica/2018/04/25/actualidad/1524643078_623889.html?utm_source=dlvr.it&utm_medium=twitter#?ref=rss&format=simple&link=link
# https://elpais.com/politica/2018/04/25/actualidad/1524643078_623889.html#?ref=rss&format=simple&link=link
# gsub("\\?.*","","https://elpais.com/politica/2018/04/28/actualidad/1524937871_495640.html#?ref=rss&format=simple&link=link")
# gsub("#.*","","https://elpais.com/politica/2018/04/28/actualidad/1524937871_495640.html#?ref=rss&format=simple&link=link")
# Remove parameters
urls_total <- urls_total %>% mutate(
  # Drop everything from the first "#" (fragment) or "?" (query string)
  # onwards for news_media URLs, so that links to the same article compare
  # equal regardless of tracking parameters.
  #
  # `%in%` is used instead of `==` because it never returns NA: with `==`,
  # a row whose domain_type is NA makes ifelse() return NA and the original
  # url of that row would be lost.
  #
  # TODO: can be done for twitter and blogs. Don't do it for BOE or Youtube!!
  url = ifelse(domain_type %in% "news_media", sub("[#?].*", "", url), url)
)
# unify subdomains
# gsub("brasil.elpais.com","elpais.com","https://brasil.elpais.com/brasil/2018/04/25/internacional/1524643078_623889.html")
urls_total <- urls_total %>% mutate(
  # Rewrite elpais.com subdomains to the bare domain so that URLs pointing
  # to the same article can be compared.
  #
  # fixed = TRUE makes gsub() treat the pattern as a literal string; as a
  # regular expression the unescaped "." would match any character (e.g.
  # "politicaXelpais.com").
  #
  # TODO: for other news media
  url = gsub("politica.elpais.com", "elpais.com", url, fixed = TRUE),
  url = gsub("brasil.elpais.com", "elpais.com", url, fixed = TRUE),
  url = gsub("verne.elpais.com", "elpais.com", url, fixed = TRUE),
  url = gsub("cincodias.elpais.com", "elpais.com", url, fixed = TRUE)
)
# Save data -----------
# Persist the cleaned table as
# data/output/<case_path>/<case_path>_<period_path>_urls_total.rds
saveRDS(
  urls_total,
  file = paste0(
    "data/output/", case_path, "/",
    case_path, "_", period_path, "_urls_total.rds"
  )
)
# counts by domain type
# Number of rows per domain_type, largest first. Rows whose domain is "0"
# or NA are left out of the count.
domain_type <- urls_total %>%
  filter(domain != "0", !is.na(domain)) %>%
  group_by(domain_type) %>%
  summarise(count = n()) %>%
  arrange(desc(count))
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment