Commit 62a5c50f authored by numeroteca's avatar numeroteca

geocodes 1200+ wrongly php geocoded locations with photon script. Calculates...

Geocodes 1200+ locations that the PHP geocoder had placed wrongly, using the photon script. Calculates the barrio/district of each VUT found. Merges the result with the correctly geocoded locations and recalculates VUT by district and barrio in Valencia.
parent 5f267803
......@@ -27,16 +27,22 @@ distritos <- readOGR("data/original/shapes/distritos.valencia.wgs84.geojson")
# airbnb <- airbnb %>% filter(!is.na(barrio))
# airbnb <- read_csv("data/output/190302_viviendas-turisticas-comunidad-valenciana_valencia_por-geocodificar_geocoded-photon2_por-geocodificar.csv")
# airbnb <- read_csv("data/output/190302_viviendas-turisticas-comunidad-valenciana_valencia_por-geocodificar_geocoded-photon2.csv")
airbnb <- read_csv("data/output/190302_viviendas-turisticas-comunidad-valenciana_valencia_geocoded-photon3.csv")
# airbnb <- read_csv("data/output/190302_viviendas-turisticas-comunidad-valenciana_valencia_geocoded-photon3.csv")
# airbnb <- read.delim("data/original/airbnb/190227/listings_valencia_insideairbnb.csv",sep = ",")
# Load the Valencia VUT records that were pending geocoding and have now been
# geocoded with photon. The file name matches the one written by the photon
# geocoding script (prefix "190302"; the previous path had a stray leading "1").
airbnb <- read.delim("data/output/190302_viviendas-turisticas-comunidad-valenciana_valencia_por-geocodificar_geocoded-photon.csv", sep = ",")
# Row identifier: the join key ("id.row") used to bring barrio/district back
# onto the full table.
airbnb$id.row <- rownames(airbnb)
## Get long and lat from your data.frame. Make sure that the order is in lon/lat.
# source: https://stackoverflow.com/questions/29736577/how-to-convert-data-frame-to-spatial-coordinates#29736844
# (alternative inputs kept as commented-out options in this script use the
# columns "longitude"/"latitude" instead of "lon"/"lat")
# Drop rows missing either coordinate: SpatialPointsDataFrame() fails on NA
# coordinates, and checking only lon lets a row with NA lat through.
has.coords <- !is.na(airbnb$lon) & !is.na(airbnb$lat)
airbnb.temp <- airbnb[has.coords, ]
# Derive the coordinate table from the filtered data so both always have the
# same rows in the same order.
xy <- airbnb.temp[, c("lon", "lat")]
# creates spatial points data frame
airbnbSp <- SpatialPointsDataFrame(coords = xy, data = airbnb.temp, proj4string = CRS("+proj=longlat +datum=WGS84 +no_defs +ellps=WGS84 +towgs84=0,0,0"))
# class(airbnbSp)
......@@ -60,18 +66,21 @@ airbnbSp$coddistbar <- countBarrios$coddistbar
airbnbSp$coddistrit <- countDistritos$coddistrit
# Where are those points without barrio
library(ggmap)
qmplot(lon, lat, data = airbnb[is.na(airbnbSp$barrio =="no location"),], maptype = "toner-lite",
color = I("red"),alpha = I(.2)) + labs(title= "Points without barrio" )
qmplot(lon, lat, data = airbnb, maptype = "toner-lite",
color = I("red"),alpha = I(.2)) + labs(title= "Points without barrio" )
# library(ggmap)
# qmplot(lon, lat, data = airbnb[is.na(airbnbSp$barrio =="no location"),], maptype = "toner-lite",
# color = I("red"),alpha = I(.2)) + labs(title= "Points without barrio" )
# qmplot(lon, lat, data = airbnb, maptype = "toner-lite",
# color = I("red"),alpha = I(.2)) + labs(title= "Points without barrio" )
airbnb.temp <- as.data.frame(airbnbSp) # convert spatial data to regular data frame
# removes duplicated columns with lat and long
drops <- c("lat.1","lon.1")
# drops <- c("latitude.1","longitude.1")
airbnb.temp <- airbnb.temp[ , !(names(airbnb.temp) %in% drops)]
airbnb <- left_join(airbnb,select(airbnb.temp,Signatura,barrio,distrito,coddistbar,coddistrit,id.row),by="id.row")
# airbnb <- left_join(airbnb, select(airbnb.temp, barrio,distrito,coddistbar,coddistrit,id.row),by="id.row")
airbnb <- left_join(select(airbnb,-barrio,-distrito,-coddistbar,-coddistrit), # tiene la variable creada pero es erronea
select(airbnb.temp, barrio,distrito,coddistbar,coddistrit,id.row),by="id.row")
# Some points will be outside polygons and have Barrio variable fixed
airbnb[is.na(airbnbSp$barrio),]$name
......@@ -88,9 +97,10 @@ ggplot() +
geom_polygon(data = barrios,
aes(x = long, y = lat, group = group),
color = "grey", fill="white", size = 0.5) +
geom_point(data= airbnb,
aes(x=lon, y=lat),
color=distrito,alpha=0.9,size = 0.1) +
geom_point(data= filter(airbnb,distrito=="POBLATS MARITIMS"),
# aes(x=lon, y=lat,color=distrito),
aes(x=longitude, y=latitude),
alpha=0.9,size = 0.1) +
# coord_fixed(xlim= c(-0.5, -0.2),ylim=c(39.0,39.75),ratio=1.3 )
coord_fixed(xlim= c(-0.49, -0.3),ylim=c(39.40,39.65),ratio=1.3 ) +
# theme_nothing(legend = TRUE) +
......@@ -105,6 +115,8 @@ ggplot() +
labs(title=paste("VUT en Valencia", sep = "")) +
guides(colour = guide_legend(override.aes = list(size=3)))
# xx.barrios.dist <- select(airbnb, barrio,neighbourhood_cleansed,distrito,neighbourhood,neighbourhood_group_cleansed)
# library(dplyr)
# airbnb$barrio <- lapply(airbnb$barrio, as.character)
# airbnb$umenores <- lapply(airbnb$umenores, as.character)
......@@ -114,10 +126,15 @@ ggplot() +
# airbnb[is.na(airbnbSp$umenores),]$umenores <- "no location"
# airbnb[airbnb$umenores == "Sag�es" ,]$umenores <- "Sagües"
table(airbnb$distrito)
table(airbnb$barrio)
# saves file
# save(airbnb,file="data/output/180423_listings-airbnb-donostia_datahippo_barrio-umenor.Rda")
# write.csv(airbnb, file = "data/output/180423_listings-airbnb-donostia_datahippo_barrio-umenor.csv", row.names = FALSE)
# write.csv(airbnb, file = "data/output/vut-donostia/censo-viviendas-turisticas-donostia-180301_barrio-umenor.csv", row.names = FALSE)
# write.csv(airbnb, file = "data/output/vut-donostia/censo-viviendas-turisticas-donostia-20180914_barrio-umenor.csv", row.names = FALSE)
# write.csv(airbnb, file = "data/output/180604_listings-airbnb-donostia_datahippo_with-last-review-20180912-reviewed_barrio-umenor.csv", row.names = FALSE)
write.csv(airbnb, file = "data/output/190302_viviendas-turisticas-comunidad-valenciana_valencia_geocoded-photon3_barrio-distrito.csv", row.names = FALSE)
# write.csv(airbnb, file = "data/output/190302_viviendas-turisticas-comunidad-valenciana_valencia_geocoded-photon3_barrio-distrito.csv", row.names = FALSE)
# write.csv(airbnb, file = "data/output/airbnb/190227/listings_valencia_insideairbnb_barrio-distrito.csv", row.names = FALSE)
write.csv(airbnb, file = "data/output/190302_viviendas-turisticas-comunidad-valenciana_valencia_por-geocodificar_geocoded-photon_barrio-distrito.csv", row.names = FALSE)
......@@ -9,8 +9,20 @@ library(rgdal)
library(gsubfn)
# load data ------
# Comunidad Valenciana: original file for the whole region
vut_valenciana <- read.csv("data/original/190302_viviendas-turisticas-comunidad-valenciana.csv",stringsAsFactors = FALSE)
# City of Valencia: photon-geocoded records with barrio/district columns
vut_valencia <- read.csv("data/output/190302_viviendas-turisticas-comunidad-valenciana_valencia_geocoded-photon3_barrio-distrito.csv",stringsAsFactors = FALSE)
# quick look at the number of records per district
table(vut_valencia$distrito)
# keep only the records that have a barrio assigned
vut_valencia <- vut_valencia %>% filter(!is.na(barrio))
# the ~1200 Valencia records that were still pending geocoding
vut_valencia2 <- read.csv("data/output/190302_viviendas-turisticas-comunidad-valenciana_valencia_por-geocodificar_geocoded-photon_barrio-distrito.csv",stringsAsFactors = FALSE)
# append them to the records kept above; rbind() on data frames requires both
# to have the same column names
# NOTE(review): both CSVs come from different runs of the barrio/district script — confirm their columns match
vut_valencia <- rbind(vut_valencia,vut_valencia2)
# `vut` is the name used by the per-barrio summary
vut <- vut_valencia
# vut <- read_csv("data/output/190302_viviendas-turisticas-comunidad-valenciana_valencia_geocoded_barrio-distrito.csv")
......@@ -163,7 +175,7 @@ ggplot() +
guides(colour = guide_legend(override.aes = list(size=3)))
# por distrito en Valencia------------
vut.distrito <- group_by(vut_valencia,distrito) %>% summarise( n= n(), na ) %>% arrange(desc(n)) %>% ungroup()
vut.distrito <- group_by(vut_valencia,distrito) %>% summarise( n= n()) %>% arrange(desc(n)) %>% ungroup()
vut.distrito[is.na(vut.distrito$distrito),]$distrito <- "POR CLASIFICAR"
......@@ -188,6 +200,16 @@ ggplot(aes(x = reorder(distrito,n), y = n)) +
caption = "Datos: Generalitat Valenciana. Gráfico: lab.montera34.com/airbnb")
dev.off()
# Count VUT per district in Valencia and normalise the district labels
# (presumably so they line up with datos2.distrito$neighbourhood — confirm).
vut.distrito <- group_by(vut_valencia, distrito) %>% summarise(n = n())
# Assign through the column vector rather than `df[mask, ]$col <- value`:
# on a data.frame the latter stops with "replacement has 1 row, data has 0"
# whenever no row matches the mask.
vut.distrito$distrito[is.na(vut.distrito$distrito)] <- "POR CLASIFICAR"
vut.distrito$distrito[vut.distrito$distrito == "EL PLA DEL REAL"] <- "PLA DEL REAL"
# as.character() first, so the new label can be assigned even if the column
# is a factor
datos2.distrito$neighbourhood <- as.character(datos2.distrito$neighbourhood)
datos2.distrito$neighbourhood[is.na(datos2.distrito$neighbourhood)] <- "POR CLASIFICAR"
# print both label sets for a visual comparison
vut.distrito$distrito
datos2.distrito$neighbourhood
# por barrio -----------------------------
vut.barrio <- group_by(vut,barrio) %>% summarise( n= n() ) %>% arrange(desc(n))
......@@ -207,7 +229,27 @@ vut.barrio %>% filter(!barrio == "POR CLASIFICAR") %>%
legend.position = "bottom"
) +
labs(title = "Número de viviendas turísticas por barrioen registro oficial",
subtitle = "Valencia. Marzo 2019. (1.202 por clasificar).",
subtitle = "Valencia. Marzo 2019. (209 por clasificar).",
y = "nº anuncios",
x = "",
caption = "Datos: Generalitat Valenciana. Gráfico: lab.montera34.com/airbnb")
dev.off()
# Horizontal bar chart of the first 25 barrios in vut.barrio (excluding the
# "POR CLASIFICAR" bucket), written to a PNG file.
# head(25) is a "top 25" only because vut.barrio is sorted by descending n
# where it is built.
# NOTE(review): the matching dev.off() is not visible in this hunk — confirm
# the PNG device is closed after the plot.
png(filename="images/vut/vut-top25-barrios-valencia-201903.png",width = 600,height = 1100)
vut.barrio %>%
  filter(!barrio == "POR CLASIFICAR") %>%
  head(25) %>%
  ggplot(aes(x = reorder(barrio,n), y = n)) +
  geom_col() +
  coord_flip() +
  # value label placed just past the end of each bar
  geom_text(
    aes(label = n,y = n+1),
    hjust = 0,
    size=3,color="#000000") +
  theme_minimal(base_family = "Roboto Condensed", base_size = 14) +
  theme(
    panel.grid.minor.y = element_blank(), panel.grid.major.y = element_blank(),
    legend.position = "bottom"
  ) +
  # title typo fixed: "barrioen" -> "barrio en"
  labs(title = "Número de viviendas turísticas por barrio en registro oficial. Top 25",
       subtitle = "Valencia. Marzo 2019. (209 por clasificar).",
       y = "nº anuncios",
       x = "",
       caption = "Datos: Generalitat Valenciana. Gráfico: lab.montera34.com/airbnb")
......
This source diff could not be displayed because it is too large. You can view the blob instead.
......@@ -37,8 +37,8 @@ calles_valencia = read.csv("data/original/vias-valencia.csv") %>%
# Load dataset and manipulate addresses.
# we only use street name, as there are no many street numbers in Valencia OSM data
# df = read.csv("data/output/190302_viviendas-turisticas-comunidad-valenciana_valencia_por-geocodificar.csv") %>%
df = read.csv("data/original/190302_viviendas-turisticas-comunidad-valenciana_valencia.csv") %>%
df = read.csv("data/output/190302_viviendas-turisticas-comunidad-valenciana_valencia_por-geocodificar.csv") %>%
# df = read.csv("data/original/190302_viviendas-turisticas-comunidad-valenciana_valencia.csv") %>%
# df = read.csv("data/output/190302_viviendas-turisticas-comunidad-valenciana_valencia_por-geocodificar_geocoded-photon2_por-geocodificar.csv") %>%
# select(Signatura,Municipio,Provincia,Address,Teléfono) %>%
separate(Address, c("nombre_es_raw", "num"), extra = "merge",
......@@ -81,15 +81,18 @@ df = df %>%
# Geocoding with Photon ---------------------------------------------------
# Build a new dataframe with desired information.
df2 = df %>%
df2 <- df %>%
select(Address, Signatura, Municipio, tipovia_ca, nomoficial, num) %>%
mutate(full_address_ca = paste(tipovia_ca, nomoficial, Municipio, "Spain",
sep = " ")) %>%
mutate(full_address_ca = paste(tipovia_ca, " ", nomoficial, ", ", Municipio, ", Spain",
sep = "")) %>%
filter(!is.na(nomoficial))
df <- df %>%
mutate(full_address_ca = paste(tipovia_ca, nomoficial, Municipio, "Spain",
sep = " "))
mutate(full_address_ca = paste(tipovia_ca, " ", nomoficial, ", ", Municipio, ", Spain",
sep = ""))
# ubicaciones unicas a geocodificar
length(unique(df2$full_address_ca))
geocoded.df = photon::geocode(unique(df2$full_address_ca), limit = 1,
# lang = "es",
......@@ -125,7 +128,8 @@ ggplot() +
# df.combined <- full_join(df, geocoded.df, by = "row.id")
# add coordinates of streets to VUT
df.combined <- left_join(df,geocoded.df, by = c("full_address_ca" = "location"))
# df.combined <- left_join(df, geocoded.df, by = c("full_address_ca" = "location"))
df.combined <- left_join( select(df,-lat,-lon), geocoded.df, by = c("full_address_ca" = "location"))
# Check locations ----------
table(df.combined$country)
......@@ -133,4 +137,5 @@ table(df.combined$city)
nrow(df.combined %>% filter(city == "Valencia"))
nrow(df.combined %>% filter(!city == "Valencia"))
write.csv(df.combined, file = "data/output/190302_viviendas-turisticas-comunidad-valenciana_valencia_geocoded-photon3.csv", row.names = FALSE)
# write.csv(df.combined, file = "data/output/190302_viviendas-turisticas-comunidad-valenciana_valencia_geocoded-photon3.csv", row.names = FALSE)
write.csv(df.combined, file = "data/output/190302_viviendas-turisticas-comunidad-valenciana_valencia_por-geocodificar_geocoded-photon.csv", row.names = FALSE)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment