satyaki30

dineline scraper 28-11

Nov 28th, 2016
51
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 1.57 KB | None | 0 0
  1. import urllib2
  2. from bs4 import BeautifulSoup
  3. import re
  4.  
  5.  
  6. def extract_link(url):
  7.     page = urllib2.urlopen(url)
  8.     return BeautifulSoup(page, 'html.parser')
  9.  
  10.  
  11. def dineline(link):
  12.     page = urllib2.urlopen(link)
  13.     page_text = page.read()
  14.     new_text = re.sub('<br>', '', page_text)
  15.     new_text = re.sub('</br>', '', new_text)
  16.     soup = BeautifulSoup(new_text, 'html.parser')
  17.     resto = soup.find_all('div', class_='ui segment brtop')
  18.     # print resto[2]
  19.     # resto name, time working fine...
  20.     # rating is not...
  21.     one_resto(soup, resto[5]) # index is the entry..
  22.  
  23.  
  24. def one_resto(soup, resto):
  25.     body = resto.find('div', class_='content')
  26.     resto_block = body.find('a')
  27.     name = resto_block.string # name of the restaurant
  28.     link = resto_block.attrs[u'href']
  29.     entity_id = resto_block.attrs[u'data-entity_id']
  30.  
  31.     rev_time = resto.find('time').attrs[u'datetime']
  32.  
  33.     review_block = resto.find('div', class_='act-body mbot0')
  34.     review_block = review_block.find('div', class_='rev-text')
  35.     class_name = 'left bold zdhl2 tooltip icon-font-level-5'
  36.     # rating = float(review_block.find('div', class_=class_name).attrs[u'aria-label'].split()[-1])
  37.     rev_text_block = review_block.find('div', class_='clear')
  38.     review_text = rev_text_block.previous # actual review text
  39.  
  40.     print 'Restaurant: {}, Rating: , Review Date: {}, ' \
  41.           '\n Review Text: {}'.format(name, rev_time, review_text)
  42.  
  43. def main():
  44.     dineline(r"file:\\C:\\Users\\cereal_killer\\PycharmProjects\\zomato\\dl.html")
  45.  
  46.  
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
Add Comment
Please, Sign In to add comment