Guest User

Untitled

a guest
Feb 20th, 2018
63
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 1.93 KB | None | 0 0
  1. #!/usr/bin/env python
  2.  
import json
import sys
import urllib2
from pprint import pprint

from bs4 import BeautifulSoup
  7.  
  8. def dataList(element):
  9. categoryList = []
  10. try:
  11. for ul in categorySoup('ul', recursive=True):
  12. for li in ul('li', recursive=True):
  13. categoryList.append(li.a.contents)
  14. categoryList.append("new ccategory");
  15.  
  16.  
  17. return categoryList
  18. except:
  19. return ['broken!']
  20.  
  21. categories = ['20081', '550', '2984', '267', '12576', '625', '15032', '11450', '11116', '1', '58058', '293', '14339', '237', '11232', '45100', '99', '172008', '26395', '11700', '281', '11233', '619', '1281', '870', '10542', '316', '888', '64482', '260', '1305', '220', '3252', '1249']
  22.  
  23. print "nSetting user agent...",
  24. user_agent = 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_4; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.472.63 Safari/534.3'
  25. print "DONE"
  26.  
  27. print "Setting headers...",
  28. headers = { 'User-Agent' : user_agent }
  29. print "DONE"
  30.  
  31. data = {}
  32.  
  33. print "Iterating through dictionary of categoriesn"
  34. for rootID in categories:
  35. print "Requesting source code...",
  36. url = 'http://www.isoldwhat.com/getcats/fullcategorytree.php?RootID=%s' % rootID
  37. req = urllib2.Request(url, None, headers)
  38. response = urllib2.urlopen(req)
  39. print "DONE"
  40.  
  41. print "Turning HTML into soup..."
  42. text = response.read()
  43. soup = BeautifulSoup(text, 'html.parser')
  44. categorySoup = soup.find('div', id='catnumbers')
  45. print "DONE"
  46.  
  47. print "Parsing data...",
  48. pprint(dataList(categorySoup))
  49. print "DONEn"
  50.  
  51. response.close() # its always safe to close an open connection
  52. sys.exit()
  53.  
  54. print "Turning data into JSON...",
  55. #data = find_li(soup)
  56. data = json.dumps(data, ensure_ascii=False)
  57. print "DONEn"
  58.  
  59. print "Finished doing. Enjoy!"
  60.  
  61. def getCategory(root):
  62. children = root.contents
  63. if len(children) == 0:
  64. return root
  65. else:
  66. return root.append(getCategory(e) for e in children))
Add Comment
Please, Sign In to add comment