Commit 76489b49 authored by numeroteca's avatar numeroteca
parents 3891de6d d663090e
library(tidyverse)
# Load the Inside Airbnb listings for Valencia.
datos <- read.csv("./data/original/airbnb/190227/listings_valencia_insideairbnb.csv")
# Flag every listing by whether its `license` field is filled in. A missing
# (NA) licence is counted as "Sin licencia" as well, so `legal` is never NA.
datos <- datos %>%
  mutate(legal = ifelse(is.na(license) | license == "",
                        "Sin licencia", "Con licencia"))
# Keep only entire homes/apartments.
datos2 <- datos %>% filter(room_type == "Entire home/apt")
# Stacked bar chart: number of listings per neighbourhood, split by licence
# status. `neighbourhood` is the x aesthetic and the bar height is the listing
# count, so the axis titles name those; the licence split is the fill legend.
ggplot(datos2, aes(neighbourhood, fill = legal)) +
  geom_bar() +
  labs(title = "Distribución de licencias por barrio",
       x = "Neighbourhood",
       y = "Number of listings",
       fill = "Licence") +
  # Label every stacked segment with its count.
  # NOTE(review): `..count..` is the older ggplot2 spelling of
  # after_stat(count); kept so the script still runs on older installs.
  geom_text(stat = 'count', aes(label = ..count..),
            position = "stack",
            vjust = 1,
            size = 3,
            color = "black") +
  # Rotate the neighbourhood names on the x axis.
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
# Same stacked chart with the axes flipped. labs() still addresses the
# aesthetics, not the screen position: `x` is the neighbourhood and `y` the
# listing count; the licence split is the fill legend.
ggplot(datos2, aes(neighbourhood, fill = legal)) +
  geom_bar() +
  labs(title = "Distribución de licencias por barrio",
       x = "Neighbourhood",
       y = "Number of listings",
       fill = "Licence") +
  # Label every stacked segment with its count.
  # NOTE(review): `..count..` is the older ggplot2 spelling of
  # after_stat(count); kept so the script still runs on older installs.
  geom_text(stat = 'count', aes(label = ..count..),
            position = "stack",
            vjust = 1,
            hjust = 1,
            size = 3,
            color = "black") +
  coord_flip()
# Same chart with every bar scaled to 100% (position = "fill"), so the value
# axis shows the share of listings rather than a count.
ggplot(datos2, aes(neighbourhood)) +
  geom_bar(aes(fill = legal), position = "fill") +
  labs(title = "Distribución de licencias por barrio",
       x = "Neighbourhood",
       y = "Share of listings",
       fill = "Licence") +
  coord_flip()
# Look for something that resembles a licence number in the free-text fields
# of each listing. The pattern accepts a run of 3 to 5 digits, optionally
# preceded by one punctuation character and/or one blank; since both prefixes
# are optional, any run of at least three digits counts as a match.
library(stringr)
licence_pattern <- "[:punctuation:]?[:blank:]?\\d{3,5}"
# Returns "Con licencia" where `text` contains a match and "" otherwise
# (a missing text also gives "").
flag_licence <- function(text) {
  ifelse(is.na(str_extract(text, licence_pattern)), "", "Con licencia")
}
# Keep only entire homes/apartments.
datos2 <- datos %>% filter(room_type == "Entire home/apt")
# One flag per text field that was searched.
datos2 <- datos2 %>%
  mutate(legal_note1 = flag_licence(notes),
         legal_note2 = flag_licence(name),
         legal_note3 = flag_licence(summary),
         legal_note4 = flag_licence(description),
         legal_note5 = flag_licence(space))
# A listing counts as licensed when the `license` field or any of the text
# fields says so. `%in%` never returns NA, so a missing `legal` value cannot
# turn `legal_note` into NA.
datos2 <- datos2 %>%
  mutate(legal_note = ifelse(legal %in% "Con licencia" |
                               legal_note1 == "Con licencia" |
                               legal_note2 == "Con licencia" |
                               legal_note3 == "Con licencia" |
                               legal_note4 == "Con licencia" |
                               legal_note5 == "Con licencia",
                             "Con licencia", "Sin licencia"))
# Compare the counts before and after the text search.
datos2 %>% group_by(legal) %>% count()
datos2 %>% group_by(legal_note) %>% count()
write.csv(datos2,"datos_licencias.csv")
# Flipped stacked chart of the revised licence flag (`legal_note`).
# `x` is the neighbourhood aesthetic and `y` the listing count; the revised
# licence split is the fill legend.
ggplot(datos2, aes(neighbourhood, fill = legal_note)) +
  geom_bar() +
  labs(title = "Distribución de licencias revisadas por barrio",
       x = "Neighbourhood",
       y = "Number of listings",
       fill = "Licencias revisadas") +
  # Label every stacked segment with its count.
  # NOTE(review): `..count..` is the older ggplot2 spelling of
  # after_stat(count); kept so the script still runs on older installs.
  geom_text(stat = 'count', aes(label = ..count..),
            position = "stack",
            vjust = 1,
            hjust = 1,
            size = 3,
            color = "black") +
  coord_flip()
# Revised licence flag with every bar scaled to 100% (position = "fill"), so
# the value axis shows the share of listings rather than a count.
ggplot(datos2, aes(neighbourhood)) +
  geom_bar(aes(fill = legal_note), position = "fill") +
  labs(title = "Distribución de licencias revisadas por barrio",
       x = "Neighbourhood",
       y = "Share of listings",
       fill = "Licencias revisadas") +
  coord_flip()
# Keep the entire homes/apartments that look active: at least one review and
# a host response rate other than zero (zero reviews or a zero response rate
# are taken as signs of an inactive listing).
# NOTE(review): the extra comparison with "0%" covers the rate being stored as
# percentage text, which is presumably how Inside Airbnb exports it; confirm
# against the data. For a numeric column it is always TRUE and changes nothing.
datos3 <- datos2 %>%
  filter(number_of_reviews != 0 &
           host_response_rate != 0 &
           host_response_rate != "0%")
# legal_note1..legal_note5 and legal_note are per-row values already present
# in datos2; filtering rows does not change them, so they are not recomputed.
datos3 %>% group_by(legal_note) %>% count()
write.csv(datos3,"datos_licencias_activos.csv")
# Flipped stacked chart of the revised licence flag, active listings only.
# `x` is the neighbourhood aesthetic and `y` the listing count; the revised
# licence split is the fill legend.
ggplot(datos3, aes(neighbourhood, fill = legal_note)) +
  geom_bar() +
  labs(title = "Distribución de licencias activos revisadas por barrio",
       x = "Neighbourhood",
       y = "Number of listings",
       fill = "Licencias revisadas") +
  # Label every stacked segment with its count.
  # NOTE(review): `..count..` is the older ggplot2 spelling of
  # after_stat(count); kept so the script still runs on older installs.
  geom_text(stat = 'count', aes(label = ..count..),
            position = "stack",
            vjust = 1,
            hjust = 1,
            size = 3,
            color = "black") +
  coord_flip()
# Revised licence flag for active listings with every bar scaled to 100%
# (position = "fill"), so the value axis shows the share of listings.
ggplot(datos3, aes(neighbourhood)) +
  geom_bar(aes(fill = legal_note), position = "fill") +
  labs(title = "Distribución de licencias activos revisadas por barrio",
       x = "Neighbourhood",
       y = "Share of listings",
       fill = "Licencias revisadas") +
  coord_flip()
#### Scraping of the tourist-dwelling licences
#- source: Generalitat register, http://comunitatvalenciana.com/viaje/alojamiento/viviendas-turisticas
library("rvest")
library("tidyverse")
adress <- "http://comunitatvalenciana.com/viaje/alojamiento/viviendas-turisticas?page="
#- the register has 3642 pages
n_pags <- 3642

# Download one results page and return the first HTML table found in it.
read_licence_page <- function(url) {
  tables <- read_html(url) %>%
    html_nodes('body') %>%
    html_nodes('table') %>%
    html_table(dec = ",")
  tables[[1]]
}

# One slot per page; the pages are bound together once at the end instead of
# growing `my_table` with rbind() on every iteration.
pages <- vector("list", n_pags)
pages[[1]] <- read_licence_page(paste0(adress, 1))
# Column names of the first page are forced onto every later page: some pages
# (e.g. page 76) otherwise make rbind() stop with
# "names do not match previous names".
my_names <- names(pages[[1]])
# Pages that could not be downloaded or parsed, so they can be retried later.
failed_pages <- integer(0)

# seq_len(n)[-1] is 2..n and, unlike 2:n, is empty when n < 2.
for (ii in seq_len(n_pags)[-1]) {
  page_n <- tryCatch({
    tbl <- read_licence_page(paste0(adress, ii))
    names(tbl) <- my_names
    tbl
  }, error = function(e) {
    # One bad page must not throw away the pages already downloaded.
    message("page ", ii, " failed: ", conditionMessage(e))
    NULL
  })
  if (is.null(page_n)) {
    failed_pages <- c(failed_pages, ii)
  } else {
    pages[[ii]] <- page_n
  }
  print(ii)
  # Wait a while (3 minutes) every 500 requests
  if (ii %% 500 == 0) {
    cat(ii, 'esperando...')
    Sys.sleep(180)
  }
}

# Bind every page that was fetched, in page order.
my_table <- do.call(rbind, pages[!vapply(pages, is.null, logical(1))])
<?php
// Input CSV to geocode and the CSV that receives the rows plus coordinates.
$csv_filename = '../data/original/190302_viviendas-turisticas-comunidad-valenciana_valencia.csv';
$out_filename = '../data/output/190302_viviendas-turisticas-comunidad-valenciana_valencia_geocoded.csv';
$line_length = 4096; // max line length passed to fgetcsv() (increase it if the input has longer lines)
$delimiter = ","; // field delimiter character
$enclosure = '"'; // field enclosure character
// Geocoding script using Nominatim http://nominatim.openstreetmap.org/
// to get coordinates using City, Country, street name, house number and Postal Code

/**
 * Send one GET request and decode the JSON answer.
 *
 * @param string $url     Full request URL.
 * @param array  $headers HTTP header lines to send.
 * @return array Decoded answer as an associative array; empty array when the
 *               transfer failed or the body was not valid JSON.
 */
function geocode_request( $url, $headers ) {
	$ch = curl_init();
	curl_setopt($ch, CURLOPT_URL,$url);
	curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
	curl_setopt($ch, CURLOPT_HTTPHEADER, $headers);
	// An empty string lets cURL advertise every encoding it supports and
	// decompress the body itself, so json_decode() never sees gzip bytes.
	curl_setopt($ch, CURLOPT_ENCODING, '');
	$curl_response = curl_exec ($ch);
	curl_close ($ch);

	// Nominatim's usage policy allows at most one request per second.
	sleep(1);

	if ( $curl_response === false )
		return array();

	$results = json_decode($curl_response,TRUE); // TRUE: decode into an associative array
	return is_array($results) ? $results : array();
}

/**
 * Geocode one address with Nominatim.
 *
 * Tries a structured search (country/state/city/street) first and, when that
 * gives no hit, a free-form search built from the same parts.
 *
 * @param string $country
 * @param string $state
 * @param string $city
 * @param string $street_name
 * @param string $house_number
 * @return array|null First result returned by Nominatim, or null when neither
 *                    query produced one.
 */
function geocode_it( $country='',$state='',$city='',$street_name='',$house_number='') {

	// we use curl with browser-like headers instead of file_get_contents
	// for not being blocked by nominatim.
	// No "Host" header here: cURL derives the right one from the URL, and a
	// hard-coded foreign host would be sent to Nominatim otherwise.
	// No "Accept-Encoding" either: it is handled through CURLOPT_ENCODING.
	$headers = [
		'X-Apple-Tz: 0',
		'X-Apple-Store-Front: 143444,12',
		'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
		'Accept-Language: en-US,en;q=0.5',
		'Cache-Control: no-cache',
		'Content-Type: application/x-www-form-urlencoded; charset=utf-8',
		'Referer: http://www.example.com/index.php', //Your referrer address
		'User-Agent: Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:28.0) Gecko/20100101 Firefox/28.0',
		'X-MicrosoftAjax: Delta=true'
	];

	// first attempt: structured query
	$q = "https://nominatim.openstreetmap.org/search?format=json&country=".urlencode($country)."&city=".urlencode($city)."&state=".urlencode($state)."&street=".urlencode($house_number." ".$street_name)."&limit=1";
	$results = geocode_request($q,$headers);

	// second attempt: free-form query with the same parts
	if ( !isset($results[0]) ) {
		$q = "https://nominatim.openstreetmap.org/search?format=json&q=".urlencode($country." ".$city." ".$street_name. " " .$house_number). "&limit=1";
		$results = geocode_request($q,$headers);
	}

	if ( isset($results[0]) )
		return $results[0];

	return;
}
// Wrap one value as an output CSV field: always enclosed in double quotes,
// with any embedded double quote doubled so the field stays parseable.
$csv_field = function( $value ) {
	return '"'.str_replace('"','""',(string) $value).'"';
};

// open the original file
$fp = fopen($csv_filename,'r');
// open the output file before geocoding starts, so an unwritable destination
// is noticed before any request is sent; rows are written as they are
// produced, so an interrupted run keeps what was already geocoded
$fp_out = ( $fp !== FALSE ) ? fopen($out_filename,'w') : FALSE;

if ( $fp === FALSE ) {
	echo 'Could not open '.$csv_filename.' for reading.';
	echo "\r\n";
}
elseif ( $fp_out === FALSE ) {
	fclose($fp);
	echo 'Could not open '.$out_filename.' for writing.';
	echo "\r\n";
}
else {
	echo 'Writing geodata to '.$out_filename;
	echo "\r\n";

	// header row of the output file
	fwrite($fp_out,'"Signatura","Municipio","Provincia","Address","Teléfono","lat","lon"'.PHP_EOL);

	$line = -1;
	while ( ($fp_csv = fgetcsv($fp,$line_length,$delimiter,$enclosure)) !== FALSE ) { // begin main loop
		$line++;
		// the first row of the input is its header
		if ( $line == 0 )
			continue;
		// fgetcsv() returns array(null) for a blank line: nothing to geocode
		if ( $fp_csv === array(null) )
			continue;
		// make sure the five input columns exist even in a short row
		$fp_csv = array_pad($fp_csv,5,'');

		// debug
		//if ( $line == 6 )
		//	break;

		echo "Geocoding register ".$line."...";
		echo "\r\n";

		$country = 'Spain';
		$state = 'Valencia';
		$city = 'Valencia';
		$address = $fp_csv[3];

		// street name: the text between the first word and the first comma;
		// when the address does not have that shape the whole address is used
		$pattern = '/^[a-zA-Z]* ([^,]*), .*/i';
		$rep = '${1}';
		$street_name = preg_replace($pattern, $rep, $address);

		// house number: the first run of digits after the first comma;
		// empty when the address has no such part
		$pattern = '/^[^,]*,[^0-9]*([0-9]*).*/i';
		$street_number = preg_match($pattern, $address, $matches) ? $matches[1] : '';

		$geo = geocode_it($country,$state,$city,$street_name,$street_number);

		// geocode_it() returns nothing when no result was found
		$lat = isset($geo['lat']) ? $geo['lat'] : '';
		$lon = isset($geo['lon']) ? $geo['lon'] : '';

		if ( $lat == '' || $lon == '' )
			echo 'Geocoding failed for record '.$line.'.';
		else
			echo 'Geocoding succeded for record '.$line.'.';
		echo "\r\n";

		$fields = array($fp_csv[0],$fp_csv[1],$fp_csv[2],$fp_csv[3],$fp_csv[4],$lat,$lon);
		fwrite($fp_out,implode(',',array_map($csv_field,$fields)).PHP_EOL);
	}
	fclose($fp);
	fclose($fp_out);

	echo 'Geocoding ended.';
	echo "\r\n";
}
?>
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment