SHOW:
|
|
- or go back to the newest paste.
# Fetch the RuneScape item-database "top 100" page and build `obj`, a dict that
# maps each table row's data-item-id attribute to the text of that row's sixth
# cell.
from bs4 import BeautifulSoup

try:
    import urllib2  # Python 2 - http://love-python.blogspot.nl/2008/02/get-html-source-of-url.html
except ImportError:
    # Python 3 moved urlopen into urllib.request; keep the old name so the
    # rest of the script reads the same on both versions.
    import urllib.request as urllib2

url = 'http://services.runescape.com/m=itemdb_rs/top100.ws'

usock = urllib2.urlopen(url)
try:
    data = usock.read()
finally:
    # Release the connection even if reading the response fails.
    usock.close()

# Name the parser explicitly so the parse does not depend on which optional
# parsers happen to be installed.
soup = BeautifulSoup(data, 'html.parser')

# http://www.crummy.com/software/BeautifulSoup/bs4/doc/#the-keyword-arguments
# "data-item-id" is not a valid Python keyword-argument name (it contains
# hyphens), so the attribute filter has to be passed through attrs= instead.
# True matches every <tr> that has the attribute, whatever its value.
tr = soup.find_all('tr', attrs={'data-item-id': True})

obj = {}
for thistr in tr:
    # http://www.crummy.com/software/BeautifulSoup/bs4/doc/#contents-and-children
    # Each result is already a Tag: it can be searched and subscripted directly,
    # so there is no need to wrap it in BeautifulSoup() again.
    td = thistr.find_all('td')
    # Sixth cell of the row - presumably the price column; confirm against the
    # page layout if the site changes.
    obj[thistr['data-item-id']] = td[5].string

#obj now contains all data in the format {"2":"107.6m",...}