Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Sign in
Toggle navigation
Menu
Open sidebar
numeroteca
tuits-analysis
Commits
8fc3f4d0
Commit
8fc3f4d0
authored
Oct 17, 2021
by
numeroteca
Browse files
clean url from parameters, and unify elapis.com subdomains: for headline url comparison
parent
48f3a976
Changes
1
Show whitespace changes
Inline
Side-by-side
analysis/links-in-tweets.R
View file @
8fc3f4d0
...
...
@@ -288,6 +288,9 @@ simplify_classyfy_domains <- function(urls_total) {
domain
=
ifelse
(
domain
==
"blogs.elconfidencial.com"
,
"elconfidencial.com"
,
domain
),
domain
=
ifelse
(
domain
==
"vanitatis.elconfidencial.com"
,
"elconfidencial.com"
,
domain
),
domain
=
ifelse
(
domain
==
"politica.elpais.com"
,
"elpais.com"
,
domain
),
domain
=
ifelse
(
domain
==
"verne.elpais.com"
,
"elpais.com"
,
domain
),
domain
=
ifelse
(
domain
==
"brasil.elpais.com"
,
"elpais.com"
,
domain
),
domain
=
ifelse
(
domain
==
"cincodias.elpais.com"
,
"elpais.com"
,
domain
),
domain
=
ifelse
(
domain
==
"ver.20m.es"
,
"20minutos.es"
,
domain
),
domain
=
ifelse
(
domain
==
"20m.es"
,
"20minutos.es"
,
domain
),
domain
=
ifelse
(
domain
==
"m.20minutos.es"
,
"20minutos.es"
,
domain
),
...
...
@@ -310,8 +313,6 @@ simplify_classyfy_domains <- function(urls_total) {
domain
=
ifelse
(
domain
==
"heral.do"
,
"heraldo.es"
,
domain
),
domain
=
ifelse
(
domain
==
"ecodiario.eleconomista.es"
,
"eleconomista.es"
,
domain
),
domain
=
ifelse
(
domain
==
"informalia.eleconomista.es"
,
"eleconomista.es"
,
domain
),
domain
=
ifelse
(
domain
==
"verne.elpais.com"
,
"elpais.com"
,
domain
),
domain
=
ifelse
(
domain
==
"cincodias.elpais.com"
,
"elpais.com"
,
domain
),
domain
=
ifelse
(
domain
==
"mediterraneo.diario16.com"
,
"diario16.com"
,
domain
),
domain
=
ifelse
(
domain
==
"lavoz.gal"
,
"lavozdegalicia.es"
,
domain
),
domain
=
ifelse
(
domain
==
"ww.cope.es"
,
"cope.es"
,
domain
),
...
...
@@ -427,10 +428,45 @@ urls_total <- urls_total %>% mutate(
# Run the domain clean-up helper over the full URL table and keep the result.
urls_total <- urls_total %>%
  simplify_classyfy_domains()
# Clean and unify URL ----------------------
# We need to unify the URLs that point to the same content.
# They can differ in their subdomains
# https://brasil.elpais.com/brasil/2018/04/25/internacional/1524643078_623889.html
# https://elpais.com/politica/2018/04/25/actualidad/1524643078_623889.html
# or in their parameters
# https://elpais.com/politica/2018/04/25/actualidad/1524643078_623889.html?id_externo_rsoc=FB_CM
# https://elpais.com/politica/2018/04/25/actualidad/1524643078_623889.html?id_externo_rsoc=FB_CM_ESP&id_externo_rsoc=TW_CM_ESP
# https://elpais.com/politica/2018/04/25/actualidad/1524643078_623889.html?id_externo_rsoc=TW_CM_ESP
# https://elpais.com/politica/2018/04/25/actualidad/1524643078_623889.html?utm_source=dlvr.it&utm_medium=twitter#?ref=rss&format=simple&link=link
# https://elpais.com/politica/2018/04/25/actualidad/1524643078_623889.html#?ref=rss&format=simple&link=link
# gsub("\\?.*","","https://elpais.com/politica/2018/04/28/actualidad/1524937871_495640.html#?ref=rss&format=simple&link=link")
# gsub("#.*","","https://elpais.com/politica/2018/04/28/actualidad/1524937871_495640.html#?ref=rss&format=simple&link=link")
# Remove parameters
urls_total <- urls_total %>%
  mutate(
    # Remove everything from the first "#" or "?" onwards for news_media rows,
    # so that links to the same article compare equal regardless of tracking
    # parameters or fragments. Rows of any other domain_type keep their url.
    #
    # `%in%` is used instead of `==` because it never returns NA: with `==`,
    # a row whose domain_type is NA makes ifelse() return NA and the original
    # url would be lost.
    #
    # TODO: can be done for twitter and blogs. Don't do it for BOE or Youtube!!
    url = ifelse(
      domain_type %in% "news_media",
      gsub("[#?].*", "", url),
      url
    )
  )
# unify subdomains
# gsub("brasil.elpais.com","elpais.com","https://brasil.elpais.com/brasil/2018/04/25/internacional/1524643078_623889.html")
urls_total <- urls_total %>%
  mutate(
    # Rewrite elpais.com subdomains to the bare host so that the same article
    # published under different subdomains ends up with the same url.
    #
    # fixed = TRUE makes gsub() treat the host name as a literal string.
    # Without it the pattern is a regular expression and every "." matches any
    # character, so unrelated text could be rewritten.
    #
    # TODO: for other news media
    url = gsub("politica.elpais.com", "elpais.com", url, fixed = TRUE),
    url = gsub("brasil.elpais.com", "elpais.com", url, fixed = TRUE),
    url = gsub("verne.elpais.com", "elpais.com", url, fixed = TRUE),
    url = gsub("cincodias.elpais.com", "elpais.com", url, fixed = TRUE)
  )
# Save data -----------
# Write the URL table to
# data/output/<case_path>/<case_path>_<period_path>_urls_total.rds
urls_total %>%
  saveRDS(
    file = paste0(
      "data/output/", case_path, "/",
      case_path, "_", period_path, "_urls_total.rds"
    )
  )
# counts by domain type
# Number of rows per domain_type, most frequent first.
# Rows whose domain is "0" or NA are left out of the count.
domain_type <- urls_total %>%
  filter(
    domain != "0",
    !is.na(domain)
  ) %>%
  group_by(domain_type) %>%
  summarise(count = n()) %>%
  arrange(desc(count))
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment