Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/env python
- import sys
- import urllib2
- from pprint import pprint
- from bs4 import BeautifulSoup
def dataList(element):
    """Collect the link contents of every list item found under *element*.

    *element* is called as ``element('ul', recursive=True)`` and each
    result as ``ul('li', recursive=True)``.  For every ``li`` the value
    of ``li.a.contents`` is appended to the result, and the marker string
    ``"new ccategory"`` is appended after each ``ul`` has been processed.

    Returns the accumulated list, or ``['broken!']`` when *element* is not
    callable (e.g. ``None``) or a list item has no usable ``a`` attribute.
    """
    categoryList = []
    try:
        # Use the argument that was passed in; the earlier version ignored
        # it and read a module-level ``categorySoup`` global instead.
        for ul in element('ul', recursive=True):
            categoryList.extend(
                li.a.contents for li in ul('li', recursive=True)
            )
            # NOTE(review): the paste this came from had lost its
            # indentation; the marker is assumed to close each <ul> group
            # rather than follow every single item -- confirm.
            categoryList.append("new ccategory")
    except (AttributeError, TypeError):
        # Best-effort sentinel kept from the original, but only for the
        # failures the traversal itself can produce, so that Ctrl-C and
        # genuine programming errors are no longer swallowed.
        return ['broken!']
    return categoryList
# Script body: fetch the full category tree page for every root category ID,
# parse each page with dataList(), and serialise the collected results.
import json  # needed by the final serialisation step; it was never imported


def _report(text):
    """Write *text* to stdout and flush so progress shows before slow steps."""
    sys.stdout.write(text)
    sys.stdout.flush()


# Root category IDs, substituted one at a time into the RootID query parameter.
categories = [
    '20081', '550', '2984', '267', '12576', '625', '15032', '11450',
    '11116', '1', '58058', '293', '14339', '237', '11232', '45100', '99',
    '172008', '26395', '11700', '281', '11233', '619', '1281', '870',
    '10542', '316', '888', '64482', '260', '1305', '220', '3252', '1249',
]

_report("\nSetting user agent... ")
user_agent = 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_4; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.472.63 Safari/534.3'
_report("DONE\n")

_report("Setting headers... ")
headers = {'User-Agent': user_agent}
_report("DONE\n")

# Maps each root category ID to the list returned by dataList() for its page.
data = {}

_report("Iterating through dictionary of categories\n\n")
for rootID in categories:
    _report("Requesting source code... ")
    url = 'http://www.isoldwhat.com/getcats/fullcategorytree.php?RootID=%s' % rootID
    req = urllib2.Request(url, None, headers)
    response = urllib2.urlopen(req)
    _report("DONE\n")

    _report("Turning HTML into soup...\n")
    try:
        text = response.read()
    finally:
        # Close the connection even when the read fails part-way.
        response.close()
    soup = BeautifulSoup(text, 'html.parser')
    # Kept as a module-level name because dataList() has historically read it.
    categorySoup = soup.find('div', id='catnumbers')
    _report("DONE\n")

    _report("Parsing data...")
    data[rootID] = dataList(categorySoup)
    pprint(data[rootID])
    _report("DONE\n\n")

# A leftover sys.exit() used to stop the script before this point, so the
# JSON step and the closing message were unreachable; it has been removed.
_report("Turning data into JSON... ")
# NOTE(review): dataList() returns raw ``contents`` lists; if an anchor holds
# nested tags rather than plain strings, json.dumps will raise TypeError --
# confirm against real pages.
data = json.dumps(data, ensure_ascii=False)
_report("DONE\n\n")
_report("Finished doing. Enjoy!\n")
- def getCategory(root):
- children = root.contents
- if len(children) == 0:
- return root
- else:
- return root.append(getCategory(e) for e in children))
Add Comment
Please, Sign In to add comment