Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # Website: https://www.twinkledeals.com
- import dryscrape
- import lxml.html
- from scrappers.page import *
- # scrape_59916() returns a two-element list `ret`:
- #   ret[0] - description text fragments
- #   ret[1] - image URLs
- # One dryscrape session is created when the module is loaded and reused for every page fetched from this website
- session = dryscrape.Session()
- # Module-level accumulator shared by the extract* helpers: data[0] collects text fragments, data[1] collects image URLs
- data = [[], []]
# Extract from <div id='description'>
def extractText(root):
    """Append the text content of ``root`` to ``data[0]`` in document order.

    ``root`` is an lxml element. lxml keeps the text that precedes the first
    child in ``element.text`` and the text that follows a child element (for
    example whatever comes after a ``<br>``) in that child's ``tail``, so
    both attributes have to be read or most of a description that is laid
    out with ``<br>`` tags is lost. A ``<br>`` element contributes a newline.

    Nothing is returned; fragments are appended to the module-level ``data``.
    """
    # Text placed directly inside ``root`` before its first child.
    if root.text is not None:
        data[0].append(root.text)
    for node in root:
        if node.tag == 'br':
            data[0].append('\n')
        # The recursive call records the child's own text and descendants.
        extractText(node)
        # Text that follows this child inside ``root`` is stored on the child.
        if node.tail is not None:
            data[0].append(node.tail)
# Extract from image divs
def extractImages(root):
    """Append the URL of every ``<img>`` found below ``root`` to ``data[1]``.

    The URL is taken from the ``data-original`` attribute and its first two
    characters are dropped (presumably the leading ``//`` of a
    protocol-relative URL -- TODO confirm against the site markup). An
    ``<img>`` without that attribute is skipped instead of raising
    ``TypeError`` from slicing ``None``.

    Nothing is returned; URLs are appended to the module-level ``data``.
    """
    for node in root:
        if node.tag == 'img':
            source = node.get('data-original')
            if source is not None:
                data[1].append(source[2:])
        # Keep descending: images may be nested inside wrapper elements.
        extractImages(node)
def scrape_59916(link):
    """Scrape the description text and image URLs from the page at ``link``.

    The page is fetched through ``getPage`` with the shared ``session``
    (``getPage`` is presumably provided by ``scrappers.page`` -- verify),
    parsed with lxml, and the description and image containers are walked
    by ``extractText`` and ``extractImages``.

    Returns a new two-element list ``[descriptions, images]``: the text
    fragments and the image URLs collected for this page only.
    """
    # Empty the shared accumulators in place. Assigning ``data = [[], []]``
    # here would only bind a local name: the helpers would keep appending to
    # the module-level lists, results from earlier pages would pile up there,
    # and this function would return the untouched, empty local list.
    del data[0][:]
    del data[1][:]

    # Visit link, then extract text and images
    html_source = getPage(session, link)
    root = lxml.html.fromstring(html_source)

    text_containers = (
        '//div[@style="font-family:Tahoma;"]',
        '//div[@class="xxkkk"]',
    )
    image_containers = (
        '//div[@id="js_n_bigImg"]',
        '//div[@id="js_n_thumbImg"]',
    )
    for query in text_containers:
        for node in root.xpath(query):
            extractText(node)
    for query in image_containers:
        for node in root.xpath(query):
            extractImages(node)

    # Hand back copies so that a later call, which clears the shared lists,
    # cannot alter a result the caller is still holding.
    return [list(data[0]), list(data[1])]
# Manual smoke test: when run as a script, scrape one known product link and
# print the [descriptions, images] result.
if __name__ == '__main__':
    link = (
        'http://www.shareasale.com/m-pr.cfm'
        '?merchantID=67349&userID=1263175&productID=677766151'
    )
    scraped = scrape_59916(link)
    print(scraped)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement