Commit cbdf94a2 authored by numeroteca's avatar numeroteca

fix errors (merged file) in scraping script

parent 082db27c
......@@ -5,13 +5,8 @@ import time
from datetime import datetime
import re
<<<<<<< HEAD
inputfilename = "listings_valencia_datahippo.csv" # nombre del archivo con los ids de los listings, uno por linea
inputfilename = "listings_valencia_datahippo_test.csv" # nombre del archivo con los ids de los listings, uno por linea
inputpath = "../data/original/airbnb/180925/"+inputfilename # nombre del archivo con los ids de los listings, uno por linea
=======
inputfilename = "listings-airbnb_donostia_datahippo.csv" # nombre del archivo con los ids de los listings, uno por linea
inputpath = "../data/original/donostia/datahippo/180926/"+inputfilename # nombre del archivo con los ids de los listings, uno por linea
>>>>>>> 138a9e6f18a8faacc169063f06b347a482588277
today = datetime.now().date().strftime("%Y%m%d")
outputfilename = inputfilename.replace(".csv","")+"_with-last-review-"+today+".csv"
outputpath = "../data/output/"+outputfilename
......@@ -31,11 +26,7 @@ count=0
with open(outputpath, "w") as outfile:
writer = csv.writer(outfile)
<<<<<<< HEAD
writer.writerow(['id','url','longitude','latitude','found','revised','host-id','room_type','bedrooms','capacity','reviews','min_nights','price','reviews_'+today,'lastreview_'+today,'exists_'+today,'host_name','listing_title'])
=======
writer.writerow(['id','url','longitude','latitude','found','revised','host-id','room_type','bedrooms','capacity','reviews','min_nights','price','reviews_'+today,'lastreview_'+today,'exists_'+today])
>>>>>>> 138a9e6f18a8faacc169063f06b347a482588277
with open(inputpath, "r") as f:
csvf = csv.DictReader(f, delimiter=',')
for row in csvf:
......@@ -63,7 +54,6 @@ with open(outputpath, "w") as outfile:
# Get page code
soup = BeautifulSoup(html, "html.parser")
<<<<<<< HEAD
hostName = soup.select("#summary ._a2vnrn0")[0].get_text().encode('utf-8').strip()
listingTitle = soup.select("._15vkz2jm")[0].get_text().encode('utf-8').strip()
print hostName
......@@ -74,6 +64,7 @@ with open(outputpath, "w") as outfile:
nreviews = soup.select("._1dl27thl")[0].get_text().encode('utf-8').strip()
nreviews = nreviews.translate(None, ' Reviews').encode('utf-8').strip()
lastreview = 0
print nreviews
# lastreview = 0
# lastreviews = soup.select("#reviews ._17oldnte")
# lrdates = []
......@@ -86,26 +77,6 @@ with open(outputpath, "w") as outfile:
print "Reviews data found"
except:
=======
try:
nreviewsRaw = soup.select("#reviews ._fecoyn4")[0].get_text().encode('utf-8').strip()
nreviewsArray = re.match('\d{1,}',nreviewsRaw)
nreviews = nreviewsArray.group()
#nreviews = nreviews.translate(None, ' Reviews').encode('utf-8').strip()
lastreviews = soup.select("#reviews ._17oldnte")
lrdates = []
for lr in lastreviews:
lrstr = lr.get_text().encode('utf-8').strip()
lrdate = datetime.strptime(lrstr,'%B %Y')
lrdates.append(lrdate)
lastreview = max(lrdates)
print "Reviews data found"
except:
nreviews = 0
>>>>>>> 138a9e6f18a8faacc169063f06b347a482588277
lastreview = 0
print "No reviews for this listing"
......@@ -113,11 +84,8 @@ with open(outputpath, "w") as outfile:
exists = 0
nreviews = 0
lastreview = 0
<<<<<<< HEAD
hostName = ""
listingTitle = ""
=======
>>>>>>> 138a9e6f18a8faacc169063f06b347a482588277
print "URL not found :("
except:
......@@ -126,11 +94,7 @@ with open(outputpath, "w") as outfile:
lastreview = 0
print "URL not found :("
<<<<<<< HEAD
writer.writerow([row['id'],url,row['longitude'],row['latitude'],row['found'],row['revised'],row['host-id'],row['room_type'],row['bedrooms'],row['capacity'],row['reviews'],row['min_nights'],row['price'],nreviews,lastreview,exists,hostName,listingTitle])
=======
writer.writerow([row['id'],url,row['longitude'],row['latitude'],row['found'],row['revised'],row['host-id'],row['room_type'],row['bedrooms'],row['capacity'],row['reviews'],row['min_nights'],row['price'],nreviews,lastreview,exists])
>>>>>>> 138a9e6f18a8faacc169063f06b347a482588277
print "Data saved."
pagedata.close()
......@@ -144,11 +108,7 @@ count=0
with open(outputpathrev, "w") as outfile:
writer = csv.writer(outfile)
<<<<<<< HEAD
writer.writerow(['id','url','longitude','latitude','found','revised','host-id','room_type','bedrooms','capacity','reviews','min_nights','price','reviews_'+today,'lastreview_'+today,'exists_'+today,'host_name','listing_title'])
=======
writer.writerow(['id','url','longitude','latitude','found','revised','host-id','room_type','bedrooms','capacity','reviews','min_nights','price','reviews_'+today,'lastreview_'+today,'exists_'+today])
>>>>>>> 138a9e6f18a8faacc169063f06b347a482588277
with open(outputpath, "r") as f:
csvf = csv.DictReader(f, delimiter=',')
for row in csvf:
......@@ -178,7 +138,6 @@ with open(outputpathrev, "w") as outfile:
html = pagedata.read()
# Get page code
soup = BeautifulSoup(html, "html.parser")
<<<<<<< HEAD
hostName = soup.select("#summary ._a2vnrn0")[0].get_text().encode('utf-8').strip()
listingTitle = soup.select("._15vkz2jm")[0].get_text().encode('utf-8').strip()
......@@ -200,25 +159,6 @@ with open(outputpathrev, "w") as outfile:
# lastreview = max(lrdates)
print "Reviews data found"
=======
try:
nreviewsRaw = soup.select("#reviews ._fecoyn4")[0].get_text().encode('utf-8').strip()
nreviewsArray = re.match('\d{1,}',nreviewsRaw)
nreviews = nreviewsArray.group()
#nreviews = nreviews.translate(None, ' Reviews').encode('utf-8').strip()
lastreviews = soup.select("#reviews ._17oldnte")
lrdates = []
for lr in lastreviews:
lrstr = lr.get_text().encode('utf-8').strip()
lrdate = datetime.strptime(lrstr,'%B %Y')
lrdates.append(lrdate)
lastreview = max(lrdates)
print "Reviews data found"
>>>>>>> 138a9e6f18a8faacc169063f06b347a482588277
except:
nreviews = row['reviews_'+today]
lastreview = row['lastreview_'+today]
......@@ -243,11 +183,7 @@ with open(outputpathrev, "w") as outfile:
lastreview = row['lastreview_'+today]
<<<<<<< HEAD
writer.writerow([row['id'],url,row['longitude'],row['latitude'],row['found'],row['revised'],row['host-id'],row['room_type'],row['bedrooms'],row['capacity'],row['reviews'],row['min_nights'],row['price'],nreviews,lastreview,exists,hostName,listingTitle])
=======
writer.writerow([row['id'],url,row['longitude'],row['latitude'],row['found'],row['revised'],row['host-id'],row['room_type'],row['bedrooms'],row['capacity'],row['reviews'],row['min_nights'],row['price'],nreviews,lastreview,exists])
>>>>>>> 138a9e6f18a8faacc169063f06b347a482588277
print "Data saved."
## measure scraping duration
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment