Advertisement
Guest User

Untitled

a guest
Jul 22nd, 2017
60
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 3.40 KB | None | 0 0
  1.     elements = (
  2.             # pure html processing
  3.             ('title', '/html/body/div/table/tbody/tr[5]/td/table/tbody/tr/td[3]/table/tbody/tr/td/form/table/tbody/tr/td/span'.replace('tbody','').replace('//','/'), 'a span div p table td'),
  4.             ('img_url', "/html/body/div/table[@class='text']/tbody/tr[5]/td/table[@class='text']/tbody/tr[1]/td[3]/table[@class='text']/tbody/tr/td/form/table[@class='text']/tbody/tr[2]/td/table[@class='text']/tbody/tr[3]/td[1]/table[@class='text']/tbody/tr[1]/td/a/img[@class='border-orange']/@src".replace('tbody','').replace('//','/'), ''),
  5.             ('description', "/html/body/div/table[@class='text']/tbody/tr[5]/td/table[@class='text']/tbody/tr[1]/td[3]/table[@class='text']/tbody/tr/td/form/table[@class='text']/tbody/tr[4]/td/table[@class='text' and position()=1]/tbody/tr[2]/td[@class='line-bottom-grey']".replace('tbody','').replace('//','/'), 'a span div p table td tr li ul ol strong img input'),
  6.             ('location', "/html/body/div/table[@class='text']/tbody/tr[5]/td/table[@class='text']/tbody/tr[1]/td[3]/table[@class='text']/tbody/tr/td/form/table[@class='text']/tbody/tr[2]/td/table[@class='text']/tbody/tr[3]/td[@class='border-gray' and position()=2]/table[@class='text']/tbody/tr[3]/td[@class='line-bottom-grey' and position()=2]/span[@class='data']".replace('tbody','').replace('//','/'), 'a span div p table td'),
  7.            
  8.             # site-specified
  9.             ('table_info', "/html/body/div/table[@class='text']/tbody/tr[5]/td/table[@class='text']/tbody/tr[1]/td[3]/table[@class='text']/tbody/tr/td/form/table[@class='text']/tbody/tr[2]/td/table[@class='text']/tbody/tr[3]/td[@class='border-gray' and position()=2]/table[@class='text']".replace('tbody','').replace('//','/'), 'a span div p table img td tr strong'),
  10.            
  11.             # additional processing with regexp
  12.             ('operation', ( "/html/body/div/table[@class='text']/tbody/tr[5]/td/table[@class='text']/tbody/tr[1]/td[3]/table[@class='text']/tbody/tr/td/form/table[@class='text']/tbody/tr[2]/td/table[@class='text']/tbody/tr[3]/td[@class='border-gray' and position()=2]/table[@class='text']/tbody/tr[2]/td[@class='line-bottom-grey' and position()=2]/span[@class='data']".replace('tbody','').replace('//','/'), 'Sale|Rent' ), '' ),
  13.             ('type', ( "/html/body/div/table[@class='text']/tbody/tr[5]/td/table[@class='text']/tbody/tr[1]/td[3]/table[@class='text']/tbody/tr/td/form/table[@class='text']/tbody/tr[2]/td/table[@class='text']/tbody/tr[3]/td[@class='border-gray' and position()=2]/table[@class='text']/tbody/tr[2]/td[@class='line-bottom-grey' and position()=2]/span[@class='data']".replace('tbody','').replace('//','/'), 'House|Condo|Land|Commercial Space|Bars & Restaurants|Guesthouses/Hotels' ), ''),
  14.             ('price', ( "/html/body/div/table[@class='text']/tbody/tr[5]/td/table[@class='text']/tbody/tr[1]/td[3]/table[@class='text']/tbody/tr/td/form/table[@class='text']/tbody/tr[3]/td/table[@class='border-orange']/tbody/tr".replace('tbody','').replace('//','/'), '[0-9, ]+THB*' ), '')
  15.     )
  16.        
  17.     parsed_url_regexp = 'PropertyID=[0-9]+' # PropertyID=0000003333
  18.    
  19.     re_urls = 'p=property_details'
  20.    
  21.     rules = (
  22.         Rule( SgmlLinkExtractor(allow=('',), deny=(re_urls)), follow=True ), # follow links found on any page
  23.         Rule( SgmlLinkExtractor(allow=(re_urls,)), callback='parse_item' ), # extract data from link which URL contains 'p=property_details'
  24.     )
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement