...
 
Commits (9)
......@@ -6,6 +6,7 @@ library("raster")
library("sp")
# restart R .rs.restartR() if rgdal does not work
library(rgdal)
library(tidyverse)
# check ogr drivers to see if GeoJSON is loaded:
# ogrDrivers()
......@@ -20,16 +21,24 @@ distritos <- readOGR("data/original/shapes/distritos.valencia.wgs84.geojson")
# proj4string(menores) # check CRS
# points
airbnb <- read_csv("data/output/190302_viviendas-turisticas-comunidad-valenciana_valencia_geocoded.csv")
summary(airbnb)
# airbnb <- read_csv("data/output/190302_viviendas-turisticas-comunidad-valenciana_valencia_geocoded_barrio-distrito.csv")
# summary(airbnb)
# airbnb <- airbnb %>% filter(!is.na(barrio))
# airbnb <- read_csv("data/output/190302_viviendas-turisticas-comunidad-valenciana_valencia_por-geocodificar_geocoded-photon2_por-geocodificar.csv")
# airbnb <- read_csv("data/output/190302_viviendas-turisticas-comunidad-valenciana_valencia_por-geocodificar_geocoded-photon2.csv")
airbnb <- read_csv("data/output/190302_viviendas-turisticas-comunidad-valenciana_valencia_geocoded-photon3.csv")
airbnb$id.row <- rownames(airbnb)
## Get long and lat from your data.frame. Make sure that the order is in lon/lat.
# source: https://stackoverflow.com/questions/29736577/how-to-convert-data-frame-to-spatial-coordinates#29736844
xy <- airbnb[,c("lon","lat")]
# removes NA values
airbnb <- airbnb[!is.na(airbnb$lon),]
airbnb.temp <- airbnb[!is.na(airbnb$lon),]
xy <- xy[!is.na(xy$lon),]
airbnbSp <- SpatialPointsDataFrame(coords = xy, data = airbnb, proj4string = CRS("+proj=longlat +datum=WGS84 +no_defs +ellps=WGS84 +towgs84=0,0,0"))
# creates spatial points data frame
airbnbSp <- SpatialPointsDataFrame(coords = xy, data = airbnb.temp, proj4string = CRS("+proj=longlat +datum=WGS84 +no_defs +ellps=WGS84 +towgs84=0,0,0"))
# class(airbnbSp)
# proj4string(airbnbSp)
......@@ -52,21 +61,49 @@ airbnbSp$coddistrit <- countDistritos$coddistrit
# Where are the points without a barrio?
library(ggmap)
qmplot(longitude, latitude, data = airbnb[is.na(airbnbSp$barrio =="no location"),], maptype = "toner-lite",
qmplot(lon, lat, data = airbnb[is.na(airbnbSp$barrio =="no location"),], maptype = "toner-lite",
color = I("red"),alpha = I(.2)) + labs(title= "Points without barrio" )
qmplot(longitude, latitude, data = airbnb, maptype = "toner-lite",
qmplot(lon, lat, data = airbnb, maptype = "toner-lite",
color = I("red"),alpha = I(.2)) + labs(title= "Points without barrio" )
airbnb <- as.data.frame(airbnbSp) # convert spatial data to regular data frame
airbnb.temp <- as.data.frame(airbnbSp) # convert spatial data to regular data frame
# removes duplicated columns with lat and long
drops <- c("lat.1","lon.1")
airbnb <- airbnb[ , !(names(airbnb) %in% drops)]
airbnb.temp <- airbnb.temp[ , !(names(airbnb.temp) %in% drops)]
airbnb <- left_join(airbnb,select(airbnb.temp,Signatura,barrio,distrito,coddistbar,coddistrit,id.row),by="id.row")
# Some points will be outside polygons and have Barrio variable fixed
airbnb[is.na(airbnbSp$barrio),]$name
# There are n points that have no Barrio assigned
length(airbnb[is.na(airbnbSp$barrio),]$name)
# table(airbnb$barrio)
# table(airbnb$distrito)
# Map of the geocoded VUT points drawn over the municipality and barrio
# outlines, with one colour per district.
ggplot() +
  geom_polygon(data = municipios,
               aes(x = long, y = lat, group = group),
               color = "grey", fill = "white", size = 0.1) +
  geom_polygon(data = barrios,
               aes(x = long, y = lat, group = group),
               color = "grey", fill = "white", size = 0.5) +
  # `distrito` is mapped inside aes() so it is resolved as a column of
  # `airbnb`; given as a fixed `color =` parameter it would be looked up as
  # a free-standing object and no colour legend would be built.
  geom_point(data = airbnb,
             aes(x = lon, y = lat, color = distrito),
             alpha = 0.9, size = 0.1) +
  # coord_fixed(xlim= c(-0.5, -0.2),ylim=c(39.0,39.75),ratio=1.3 )
  coord_fixed(xlim = c(-0.49, -0.3), ylim = c(39.40, 39.65), ratio = 1.3) +
  # theme_nothing(legend = TRUE) +
  theme_minimal(base_family = "Roboto Condensed", base_size = 12) +
  theme(
    panel.grid = element_blank(),
    axis.title = element_blank(),
    axis.text = element_blank(),
    panel.background = element_rect(fill = "#EEEEFF", color = "grey", size = 0.25),
    legend.position = "top"
  ) +
  labs(title = "VUT en Valencia") +
  # Points are drawn at size 0.1; enlarge the legend keys so they are legible.
  guides(colour = guide_legend(override.aes = list(size = 3)))
# library(dplyr)
# airbnb$barrio <- lapply(airbnb$barrio, as.character)
......@@ -83,4 +120,4 @@ length(airbnb[is.na(airbnbSp$barrio),]$name)
# write.csv(airbnb, file = "data/output/vut-donostia/censo-viviendas-turisticas-donostia-180301_barrio-umenor.csv", row.names = FALSE)
# write.csv(airbnb, file = "data/output/vut-donostia/censo-viviendas-turisticas-donostia-20180914_barrio-umenor.csv", row.names = FALSE)
# write.csv(airbnb, file = "data/output/180604_listings-airbnb-donostia_datahippo_with-last-review-20180912-reviewed_barrio-umenor.csv", row.names = FALSE)
write.csv(airbnb, file = "data/output/190302_viviendas-turisticas-comunidad-valenciana_valencia_geocoded_barrio-distrito.csv", row.names = FALSE)
write.csv(airbnb, file = "data/output/190302_viviendas-turisticas-comunidad-valenciana_valencia_geocoded-photon3_barrio-distrito.csv", row.names = FALSE)
# script para analizar las viviendas turísticas de la comunidad Valenciana
# Load libraries
# Load libraries ----------------------------
library(tidyverse)
# for maps and theme nothing
library(ggmap)
......@@ -10,12 +10,12 @@ library(gsubfn)
# load data ------
vut_valenciana <- read.csv("data/original/190302_viviendas-turisticas-comunidad-valenciana.csv",stringsAsFactors = FALSE)
vut_valencia <- read.csv("data/original/190302_viviendas-turisticas-comunidad-valenciana_valencia.csv",stringsAsFactors = FALSE)
vut_valencia <- read.csv("data/output/190302_viviendas-turisticas-comunidad-valenciana_valencia_geocoded-photon3_barrio-distrito.csv",stringsAsFactors = FALSE)
vut <- read_csv("data/output/190302_viviendas-turisticas-comunidad-valenciana_valencia_geocoded_barrio-distrito.csv")
# vut <- read_csv("data/output/190302_viviendas-turisticas-comunidad-valenciana_valencia_geocoded_barrio-distrito.csv")
export.to.geocode <- vut %>% filter(is.na(barrio))
write.csv(airbnb, file = "data/output/190302_viviendas-turisticas-comunidad-valenciana_valencia_por-geocodificar.csv", row.names = FALSE)
# export.to.geocode <- vut %>% filter(is.na(barrio))
# write.csv(export.to.geocode, file = "data/output/190302_viviendas-turisticas-comunidad-valenciana_valencia_por-geocodificar.csv", row.names = FALSE)
# shapes
barrios <- readOGR("data/original/shapes/barrios.valencia.wgs84.geojson")
......@@ -23,31 +23,35 @@ distritos <- readOGR("data/original/shapes/distritos.valencia.wgs84.geojson")
# municipios <- distritos
municipios <- readOGR("data/original/shapes/municipios.provincia.valencia.geojson")
# Analisis comunidad valenciana -------------
# Analisis comunidad valenciana por municipios -------------
vut.municipio <- group_by(vut_valenciana,Municipio) %>% summarise( n= n() ) %>% arrange(desc(n))
# Horizontal bar chart of the 25 municipalities with the most VUT in the
# official register, written to a PNG file.
png(filename = "images/vut/vut-municipios-top25-comunidad-valenciana-201903.png", width = 600, height = 500)
vut.municipio %>%
  head(25) %>%
  ggplot(aes(x = reorder(Municipio, n), y = n)) +
  # Thousands separator "." on the count axis.
  scale_y_continuous(labels = function(x) format(x, big.mark = ".", scientific = FALSE)) +
  geom_col() +
  coord_flip() +
  theme_minimal(base_family = "Roboto Condensed", base_size = 14) +
  theme(
    panel.grid.minor.y = element_blank(),
    panel.grid.major.y = element_blank(),
    legend.position = "bottom"
  ) +
  # A single labs() call: a second, unchained labs() would end the plot
  # expression early and leave the remaining layers dangling.
  labs(title = "Viviendas turísticas en registro oficial por municipio: top 25",
       subtitle = "Comunidad Valenciana. Marzo 2019.",
       y = "Número de VUT",
       x = "",
       caption = "Datos: Generalitat Valenciana. Gráfico: lab.montera34.com/airbnb") +
  # Count label placed just past the end of each bar (x is inherited from
  # the ggplot() call; +50 offsets the label along the count axis).
  geom_text(aes(label = format(n, big.mark = ".", scientific = FALSE), y = n + 50),
            size = 3, color = "#888888", hjust = 0)
dev.off()
# analisis Valencia -----------------
# names(vut) <- c("signatura","municipio","provincia","addres","tlf","lat","lon")
ggplot(data=vut)+
geom_bar(stat='identity', aes(x = signatura, y = tlf))
ntlf<- group_by(vut,Teléfono) %>% summarise( n= n() ) %>% arrange(desc(n))
# por teléfono --------------
ntlf<- group_by(vut_valencia,Teléfono) %>% summarise( n= n() ) %>% arrange(desc(n))
png(filename="images/vut/vut-telefonos-top25-valencia-201903.png",width = 600,height = 500)
ntlf[!is.na(ntlf$Teléfono),] %>% head(25) %>%
ggplot(aes(x = reorder(Teléfono,n), y = n)) +
geom_col() + coord_flip() +
......@@ -59,8 +63,12 @@ theme_minimal(base_family = "Roboto Condensed", base_size = 14) +
labs(title = "Número de viviendas turísticas por teléfono: top 25",
subtitle = "Valencia. Marzo 2019.",
y = "nº anuncios",
x = "tlf",
caption = "Datos: Comunidad Valenciana. Gráfico: lab.montera34.com/airbnb")
x = "",
caption = "Datos: Comunidad Valenciana. Gráfico: lab.montera34.com/airbnb") +
geom_text(aes(label = format(n, big.mark = ".", scientific = FALSE), x=reorder(Teléfono,n), y = n+2),
position = "dodge",
size=3,color="#888888", hjust=0)
dev.off()
# select(vut,Teléfono=="963356793")
#
......@@ -69,7 +77,7 @@ theme_minimal(base_family = "Roboto Condensed", base_size = 14) +
#
# tlf=="963356793"
vut$Teléfono <- as.factor(vut$Teléfono)
# vut$Teléfono <- as.factor(vut$Teléfono)
# extract VUT ID --------
# vut_valencia$registro.number <- str_extract(vut_valencia$Signatura,"[:punctuation:]?[:blank:]?-\\d{5}")
......@@ -135,8 +143,11 @@ ggplot() +
guides(colour = guide_legend(override.aes = list(size=3)))
# por distrito en Valencia------------
vut.distrito <- group_by(vut,distrito) %>% summarise( n= n() ) %>% arrange(desc(n))
# Number of VUT per district, largest first. The stray bare `na` argument
# that was passed to summarise() is removed: it referred to no object.
vut.distrito <- vut_valencia %>%
  group_by(distrito) %>%
  summarise(n = n()) %>%
  arrange(desc(n)) %>%
  ungroup()
# Relabel the rows whose district is missing; indexing the column vector
# directly avoids the subset-then-assign round trip through the data frame.
vut.distrito$distrito[is.na(vut.distrito$distrito)] <- "POR CLASIFICAR"
png(filename="images/vut/vut-distritos-valencia-201903.png",width = 600,height = 500)
vut.distrito %>% #filter(!is.na(distrito)) %>%
ggplot(aes(x = reorder(distrito,n), y = n)) +
geom_col()+
......@@ -145,29 +156,39 @@ ggplot(aes(x = reorder(distrito,n), y = n)) +
hjust = 0,
size=3,color="#000000") +
coord_flip() +
theme_minimal(base_family = "Roboto Condensed", base_size = 10) +
theme_minimal(base_family = "Roboto Condensed", base_size = 14) +
theme(
panel.grid.minor.y = element_blank(), panel.grid.major.y = element_blank(),
legend.position = "bottom"
) +
labs(title = "Número de viviendas turísticas por distrito",
labs(title = "Número de viviendas turísticas por distrito en registro oficial",
subtitle = "Valencia. Marzo 2019.",
y = "nº anuncios",
x = "tlf",
x = "",
caption = "Datos: Comunidad Valenciana. Gráfico: lab.montera34.com/airbnb")
dev.off()
# por barrio
# por barrio -----------------------------
# Number of VUT per barrio, largest first; rows with a missing barrio are
# relabelled "POR CLASIFICAR".
# NOTE(review): this aggregates `vut`, while the district and phone
# summaries aggregate `vut_valencia` -- confirm which data frame is intended.
vut.barrio <- group_by(vut, barrio) %>%
  summarise(n = n()) %>%
  arrange(desc(n))
vut.barrio$barrio[is.na(vut.barrio$barrio)] <- "POR CLASIFICAR"

# Horizontal bar chart of VUT per barrio, written to a PNG file. The
# unclassified rows are left out of the bars and only reported in the subtitle.
png(filename = "images/vut/vut-barrios-valencia-201903.png", width = 600, height = 1100)
vut.barrio %>%
  filter(!barrio == "POR CLASIFICAR") %>%
  ggplot(aes(x = reorder(barrio, n), y = n)) +
  geom_col() +
  coord_flip() +
  # Count label just past the end of each bar; the layer inherits the
  # filtered data and the x mapping from the ggplot() call.
  geom_text(aes(label = n, y = n + 1),
            hjust = 0,
            size = 3, color = "#000000") +
  theme_minimal(base_family = "Roboto Condensed", base_size = 14) +
  theme(
    panel.grid.minor.y = element_blank(),
    panel.grid.major.y = element_blank(),
    legend.position = "bottom"
  ) +
  labs(title = "Número de viviendas turísticas por barrio en registro oficial",
       subtitle = "Valencia. Marzo 2019. (1.202 por clasificar).",
       y = "nº anuncios",
       x = "",
       caption = "Datos: Comunidad Valenciana. Gráfico: lab.montera34.com/airbnb")
dev.off()
......@@ -9,23 +9,45 @@ library(tidyverse)
# Load Valencia's official street names from:
# http://gobiernoabierto.valencia.es/va/dataset/?id=listado-de-calles
# Creates variables to store type of highway in Spanish and Catalan
# Official street list. From the road-type code (codtipovia) derive the
# full road-type word in Spanish (tipovia_es) and Catalan (tipovia_ca),
# then normalise the code itself and build the upper-case names used for
# matching.
calles_valencia <- read.csv("data/original/vias-valencia.csv") %>%
  # Spanish road type. These expansions must run while codtipovia still
  # holds the raw codes: once "C/" has been rewritten to "C" further down,
  # the "C/" patterns here would no longer match.
  mutate(tipovia_es = codtipovia) %>%
  mutate(tipovia_es = sub("AV", "Avenida", tipovia_es)) %>%
  mutate(tipovia_es = sub("C/", "Calle", tipovia_es)) %>%
  mutate(tipovia_es = sub("GV", "Gran Vía", tipovia_es)) %>%
  mutate(tipovia_es = sub("PG", "Paseo", tipovia_es)) %>%
  mutate(tipovia_es = sub("PL", "Plaza", tipovia_es)) %>%
  mutate(tipovia_es = sub("PTGE", "Pasaje", tipovia_es)) %>%
  # Catalan road type, expanded from the same raw codes.
  mutate(tipovia_ca = codtipovia) %>%
  mutate(tipovia_ca = sub("AV", "Avinguda", tipovia_ca)) %>%
  mutate(tipovia_ca = sub("C/", "Carrer", tipovia_ca)) %>%
  mutate(tipovia_ca = sub("GV", "Gran Via", tipovia_ca)) %>%
  mutate(tipovia_ca = sub("PG", "Passeig", tipovia_ca)) %>%
  mutate(tipovia_ca = sub("PL", "Plaça", tipovia_ca)) %>%
  mutate(tipovia_ca = sub("PTGE", "Passatge", tipovia_ca)) %>%
  # Only now rewrite the code column itself.
  # NOTE(review): presumably these are the abbreviations used in the VUT
  # register addresses -- confirm against the raw address data.
  mutate(codtipovia = sub("C/", "C", codtipovia)) %>%
  mutate(codtipovia = sub("CAMI", "CM", codtipovia)) %>%
  mutate(codtipovia = sub("PG", "PA", codtipovia)) %>%
  # Abbreviated Spanish name: normalised code + unofficial translation.
  mutate(nombre_es = paste(codtipovia, traducnooficial, sep = " ")) %>%
  mutate(nombre_es = as.factor(toupper(nombre_es))) %>%
  # Full Spanish name: expanded road type + unofficial translation.
  mutate(nombre_es_full = paste(tipovia_es, traducnooficial, sep = " ")) %>%
  mutate(nombre_es_full = as.factor(toupper(nombre_es_full))) %>%
  # Full Catalan name: expanded road type + official name.
  mutate(nombre_ca_full = paste(tipovia_ca, nomoficial, sep = " ")) %>%
  mutate(nombre_ca_full = as.factor(toupper(nombre_ca_full)))
# Load dataset and manipulate addresses.
# We only use the street name, as there are not many street numbers in the Valencia OSM data
# df = read.csv("data/output/190302_viviendas-turisticas-comunidad-valenciana_valencia_por-geocodificar.csv") %>%
df = read.csv("data/original/190302_viviendas-turisticas-comunidad-valenciana_valencia.csv") %>%
# df = read.csv("data/output/190302_viviendas-turisticas-comunidad-valenciana_valencia_por-geocodificar_geocoded-photon2_por-geocodificar.csv") %>%
# select(Signatura,Municipio,Provincia,Address,Teléfono) %>%
separate(Address, c("nombre_es_raw", "num"), extra = "merge",
sep = ", nº", remove = FALSE) %>%
separate(num, c("num", "puerta"), extra = "merge", sep = ", ", remove = FALSE) %>%
mutate(nombre_es_raw = as.factor(toupper(nombre_es_raw)))
# Basic Record linkage.
# Basic Record linkage
# Create function
record.linkage.names = function(names,
officialnames = calles_valencia$nombre_es) {
# Matches a street's name with the most similar official one.
......@@ -40,27 +62,75 @@ record.linkage.names = function(names,
return(names)
}
df = df %>%
mutate(nombre_es_raw = gsub("C LA REINA", "C REINA", nombre_es_raw)) %>%
mutate(nombre_es_raw = gsub("C DE LA REINA", "C REINA", nombre_es_raw)) %>%
mutate(nombre_es_raw = gsub("DR. WAKSMAN", "DOCTOR WAKSMAN", nombre_es_raw)) %>%
mutate(nombre_es_raw = gsub("DR MONSERRAT", "MONSERRAT", nombre_es_raw)) %>%
mutate(nombre_es_raw = gsub("GOS-GUSS", "GOS GUSS", nombre_es_raw)) %>%
mutate(nombre_es_raw = gsub("L'ALGUER", "ALGUER", nombre_es_raw)) %>%
mutate(nombre_es_raw = gsub("HIGINIO NOJA PROFESOR", "HIGINIO NOJA (PROFESOR)", nombre_es_raw)) %>%
mutate(nombre_es_raw = gsub("C MARIANO BENLLIURE", "PL MARIANO BENLLIURE", nombre_es_raw)) %>%
mutate(nombre_es_raw = gsub("LA SAFOR", "SAFOR", nombre_es_raw)) %>%
mutate(nombre_es_raw = gsub("DELS XIPRERS", "XIPRERS", nombre_es_raw)) %>%
mutate(nombre_es_raw = gsub("DR. J.J. DOMINE", "DOCTOR J.J. DOMINE", nombre_es_raw)) %>%
mutate(nombre_es_raw = gsub("FERNANDO ABRIL MARTO", "FERNANDO ABRIL MARTORELL", nombre_es_raw)) %>%
mutate(nombre_es = record.linkage.names(nombre_es_raw)) %>%
left_join(calles_valencia)
left_join(calles_valencia, by = c("nombre_es" = "nombre_es"))
# Geocoding with Photon ---------------------------------------------------
# Build a new dataframe with desired information.
df2 = df %>%
select(Signatura, Municipio, tipovia_ca, nomoficial, num) %>%
mutate(full_address_ca = paste(tipovia_ca, nomoficial, num, Municipio,
sep = ", "))
select(Address, Signatura, Municipio, tipovia_ca, nomoficial, num) %>%
mutate(full_address_ca = paste(tipovia_ca, nomoficial, Municipio, "Spain",
sep = " ")) %>%
filter(!is.na(nomoficial))
df <- df %>%
mutate(full_address_ca = paste(tipovia_ca, nomoficial, Municipio, "Spain",
sep = " "))
geocoded.df = photon::geocode(head(df2$full_address_ca), limit = 1,
geocoded.df = photon::geocode(unique(df2$full_address_ca), limit = 1,
# lang = "es",
key = "highway",
locbias = c(-0.3766, 39.4665))
# TODO explore if extending search to three makes it possible later to select the one solution in Valencia City
# geocoded.df3 = photon::geocode(head(df2$full_address_ca), limit = 3,
# # lang = "es",
# key = "highway",
# locbias = c(-0.3766, 39.4665))
# Check locations ----------
table(geocoded.df$country)
table(geocoded.df$city)
# plot results in a map
ggplot() +
# geom_polygon(data = municipios,
# aes(x = long, y = lat, group = group),
# color = "grey", fill="white", size = 0.1) +
geom_polygon(data = barrios,
aes(x = long, y = lat, group = group),
color = "grey", fill="white", size = 0.1) +
geom_point(data= geocoded.df,
aes(x=lon, y=lat),alpha=1,size = 0.1) +
geom_text(data= geocoded.df,
aes(x=lon, y=lat, label=name),alpha=1,size = 0.1)
# Combine geocoded dataframe with original one.---------------
# geocoded.df$row.id <- rownames(geocoded.df)
# df$row.id <- rownames(df)
# df.combined <- full_join(df, geocoded.df, by = "row.id")
# add coordinates of streets to VUT
df.combined <- left_join(df,geocoded.df, by = c("full_address_ca" = "location"))
# Combine geocoded dataframe with original one.
df.combined = df2 %>%
left_join(geocoded.df, by = c("full_address_ca" = "location"))
# Check locations ----------
table(df.combined$country)
table(df.combined$city)
nrow(df.combined %>% filter(city == "Valencia"))
nrow(df.combined %>% filter(!city == "Valencia"))
write.csv(df.combined, file = "data/output/filename.csv")
write.csv(df.combined, file = "data/output/190302_viviendas-turisticas-comunidad-valenciana_valencia_geocoded-photon3.csv", row.names = FALSE)