Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- elements = (  # extraction table: one (field name, selector, tag list) tuple per field; presumably consumed by the 'parse_item' callback (not visible here - TODO confirm)
- # pure html processing: the selector is a plain XPath string whose 'tbody' steps are removed and the resulting '//' collapsed to '/' by the chained .replace() calls; the third item is a space-separated list of HTML tag names (presumably tags the consumer keeps or strips - TODO confirm)
- ('title', '/html/body/div/table/tbody/tr[5]/td/table/tbody/tr/td[3]/table/tbody/tr/td/form/table/tbody/tr/td/span'.replace('tbody','').replace('//','/'), 'a span div p table td'),
- ('img_url', "/html/body/div/table[@class='text']/tbody/tr[5]/td/table[@class='text']/tbody/tr[1]/td[3]/table[@class='text']/tbody/tr/td/form/table[@class='text']/tbody/tr[2]/td/table[@class='text']/tbody/tr[3]/td[1]/table[@class='text']/tbody/tr[1]/td/a/img[@class='border-orange']/@src".replace('tbody','').replace('//','/'), ''),  # selects the image's src attribute; the tag list is empty
- ('description', "/html/body/div/table[@class='text']/tbody/tr[5]/td/table[@class='text']/tbody/tr[1]/td[3]/table[@class='text']/tbody/tr/td/form/table[@class='text']/tbody/tr[4]/td/table[@class='text' and position()=1]/tbody/tr[2]/td[@class='line-bottom-grey']".replace('tbody','').replace('//','/'), 'a span div p table td tr li ul ol strong img input'),
- ('location', "/html/body/div/table[@class='text']/tbody/tr[5]/td/table[@class='text']/tbody/tr[1]/td[3]/table[@class='text']/tbody/tr/td/form/table[@class='text']/tbody/tr[2]/td/table[@class='text']/tbody/tr[3]/td[@class='border-gray' and position()=2]/table[@class='text']/tbody/tr[3]/td[@class='line-bottom-grey' and position()=2]/span[@class='data']".replace('tbody','').replace('//','/'), 'a span div p table td'),
- # site-specific: selects the whole table that the 'location', 'operation' and 'type' XPaths descend into
- ('table_info', "/html/body/div/table[@class='text']/tbody/tr[5]/td/table[@class='text']/tbody/tr[1]/td[3]/table[@class='text']/tbody/tr/td/form/table[@class='text']/tbody/tr[2]/td/table[@class='text']/tbody/tr[3]/td[@class='border-gray' and position()=2]/table[@class='text']".replace('tbody','').replace('//','/'), 'a span div p table img td tr strong'),
- # additional processing with regexp: the selector is an (XPath, regexp) pair instead of a bare XPath string, and the tag list is empty
- ('operation', ( "/html/body/div/table[@class='text']/tbody/tr[5]/td/table[@class='text']/tbody/tr[1]/td[3]/table[@class='text']/tbody/tr/td/form/table[@class='text']/tbody/tr[2]/td/table[@class='text']/tbody/tr[3]/td[@class='border-gray' and position()=2]/table[@class='text']/tbody/tr[2]/td[@class='line-bottom-grey' and position()=2]/span[@class='data']".replace('tbody','').replace('//','/'), 'Sale|Rent' ), '' ),
- ('type', ( "/html/body/div/table[@class='text']/tbody/tr[5]/td/table[@class='text']/tbody/tr[1]/td[3]/table[@class='text']/tbody/tr/td/form/table[@class='text']/tbody/tr[2]/td/table[@class='text']/tbody/tr[3]/td[@class='border-gray' and position()=2]/table[@class='text']/tbody/tr[2]/td[@class='line-bottom-grey' and position()=2]/span[@class='data']".replace('tbody','').replace('//','/'), 'House|Condo|Land|Commercial Space|Bars & Restaurants|Guesthouses/Hotels' ), ''),  # same XPath as 'operation'; only the regexp alternatives differ
- ('price', ( "/html/body/div/table[@class='text']/tbody/tr[5]/td/table[@class='text']/tbody/tr[1]/td[3]/table[@class='text']/tbody/tr/td/form/table[@class='text']/tbody/tr[3]/td/table[@class='border-orange']/tbody/tr".replace('tbody','').replace('//','/'), '[0-9, ]+THB*' ), '')  # NOTE(review): 'THB*' matches 'TH' followed by zero or more 'B' - confirm whether a literal 'THB' suffix was intended
- )
- parsed_url_regexp = 'PropertyID=[0-9]+' # literal 'PropertyID=' followed by one or more digits, e.g. PropertyID=0000003333; not referenced in the visible code, presumably used to identify a property from its URL - TODO confirm
- re_urls = 'p=property_details' # URL pattern used by `rules` below: matching links are excluded from the follow rule and handed to the 'parse_item' rule
- rules = (
- Rule( SgmlLinkExtractor(allow=('',), deny=(re_urls)), follow=True ), # follow links found on any page
- Rule( SgmlLinkExtractor(allow=(re_urls,)), callback='parse_item' ), # extract data from link which URL contains 'p=property_details'
- )
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement