Python script to download all datasets for a city from insideairbnb.com

parent f080c721
#!/usr/bin/python2
# this script downloads all the dataset for a city from insideairbnb.com
# this script must be run from scraping folder
# this script has one mandatory parameter: the name of the city
import sys
import urllib2
from bs4 import BeautifulSoup
import time
from datetime import datetime
import re
import os, errno
## vars
today = datetime.now().date().strftime("%Y%m%d")
url = "http://insideairbnb.com/get-the-data.html"
folders = ['../data','../data/original','../data/original/airbnb']
city = sys.argv[1].lower()
outputpath = today+'-ia.urls.log'
hdr = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
'Accept-Encoding': 'none',
'Accept-Language': 'en-US,en;q=0.8', #cambia idioma 'Accept-Language': 'es-ES,es;q=0.8'
'Connection': 'keep-alive'}
## create folders
## https://stackoverflow.com/questions/273192/how-can-i-safely-create-a-nested-directory
for f in folders:
if not os.path.exists(f):
os.makedirs(f)
## measure scraping duration
start = time.time()
## scraping
count=0
with open(outputpath, "w") as logfile:
try:
# Get URL
response = urllib2.Request(url, headers=hdr)
pagedata = urllib2.urlopen(response)
if pagedata.geturl() == url:
html = pagedata.read()
# Get page code
soup = BeautifulSoup(html, "html.parser")
try:
# select city part of the code
cityTable = soup.select("table."+city+" tbody")[0]
for a in cityTable.find_all("a", href=True):
# uncomment the following two lines
# to debug with first 5 urls
#if count == 5:
# break
count +=1
time.sleep(2)
print ""
print count
try:
fileurl = a['href']
# logging url in log file
logfile.write(fileurl+'\n')
filedate = re.sub(r'^.*\d{2}(\d{2})-(\d{2})-(\d{2}).*$',r'\g<1>\g<2>\g<3>',fileurl)
filename = re.sub(r'^.*\/([^\/]*)$',r'\g<1>',fileurl)
filepath = '../data/original/airbnb/'+filedate
filelocal = filepath+'/'+filename
print "Processing "+filelocal+"..."
# checking if this date folder exists
if not os.path.exists(filepath):
os.makedirs(filepath)
print 'Folder '+filepath+' created'
else:
print 'Folder '+filepath+' already exists.'
# checking if this file exists
if not os.path.isfile(filelocal):
# download file
# https://pythonspot.com/http-download-file-with-python/
print 'Trying to download '+fileurl
filedata = urllib2.urlopen(fileurl)
data = filedata.read()
# Write data to file
file_ = open(filelocal, 'w')
file_.write(data)
file_.close()
else:
print 'File '+filelocal+' already exists. Nothing new to download'
except:
print "There was an error while downloading "+fileurl
except:
print "City "+city+" not found in "+url
except:
print "URL "+url+" not found :("
logfile.close()
## measure scraping duration
end = time.time()
print("Scraping duration in seconds:")
print(end - start)
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment