Advertisement
Guest User

scrape

a guest
Aug 23rd, 2017
101
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 1.49 KB | None | 0 0
  1. # Website: https://www.twinkledeals.com
  2.  
  3. import dryscrape
  4. import lxml.html
  5. from scrappers.page import *
  6.  
# Intended result shape of the scrape function in this module: a two-element list
#   ret[0] - descriptions[]: text fragments appended by extractText
#   ret[1] - images[]: image URLs appended by extractImages

# Create one session for all items from this website
# (created at import time and passed to getPage on every scrape)
session = dryscrape.Session()

# Used to store extracted data: data[0] holds description text,
# data[1] holds image URLs. Shared module-level state that the
# extract* helpers append to.
data = [[], []]
  16.  
  17. # Extract from <div id='description'>
  18. def extractText(root):
  19.     for node in root:
  20.         if node.text != None:
  21.             data[0].append(node.text)
  22.         elif node.tag == 'br':
  23.             data[0].append('\n')
  24.  
  25.         extractText(node)
  26.  
  27. # Extract from image divs
  28. def extractImages(root):
  29.     for node in root:
  30.         if node.tag == 'img':
  31.             data[1].append(node.get('data-original')[2:])
  32.  
  33.         extractImages(node)
  34.  
  35. def scrape_59916(link):
  36.     # Clear data
  37.     data = [[], []]
  38.  
  39.     # Visit link, then extract text and images
  40.     html_source = getPage(session, link)
  41.     root = lxml.html.fromstring(html_source)
  42.     for node in root.xpath('//div[@style="font-family:Tahoma;"]'):
  43.         extractText(node)
  44.     for node in root.xpath('//div[@class="xxkkk"]'):
  45.         extractText(node)
  46.     for node in root.xpath('//div[@id="js_n_bigImg"]'):
  47.         extractImages(node)
  48.     for node in root.xpath('//div[@id="js_n_thumbImg"]'):
  49.         extractImages(node)
  50.  
  51.     return data
  52.  
  53. # Testing purpose
  54. if __name__ == '__main__':
  55.     link = 'http://www.shareasale.com/m-pr.cfm?merchantID=67349&userID=1263175&productID=677766151'
  56.     print(scrape_59916(link))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement