#!/usr/bin/env python3
from bs4 import BeautifulSoup
from urllib.request import urlopen, Request
import re

if __name__ == "__main__":
    # full url with https://...
    # use a custom user agent, as the default Python one is blocked almost everywhere
    req = Request(
        "https://YOUR_WEBSITE",
        data=None,
        headers={
            'User-Agent': 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_6; en-en) AppleWebKit/533.19.4 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4'
        }
    )
    html = urlopen(req)
    # create a yummy soup from the html
    soup = BeautifulSoup(html, 'html.parser')
    # find your table by id
    tbody = soup.find('table', {'id': 'tableInfoParcels'}).tbody
    # get the first td of every row
    tds = [tr.td for tr in tbody.find_all('tr')]
    # the caption is the text between <b>...</b>
    captions = [td.b.get_text() for td in tds]
    # keep the raw markup of each td so the regex below can see the <b> tag
    # and pull out the address etc. text that follows it
    texts = [str(td) for td in tds]
    info_texts = []
    for text in texts:
        # we want the data between the quotes that follow the </b>, and we want
        # the . to be able to match newline characters, hence re.DOTALL
        found = re.search(r"<b.*>.*</b>.*\"(.*)\"", text, re.DOTALL)
        if found:
            info_texts.append(found.group(1))
    for caption, info_text in zip(captions, info_texts):
        print(caption, info_text)
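
For clarity, here is a small self-contained sketch of the same parsing steps run against an inline HTML sample. The table id tableInfoParcels comes from the script above, but the row layout (a <b> caption followed by a quoted info string) is only an assumption inferred from the regex, not the real page markup.

# Offline sketch: same parsing logic as above, fed with assumed sample markup.
from bs4 import BeautifulSoup
import re

SAMPLE_HTML = """
<table id="tableInfoParcels">
  <tbody>
    <tr><td><b>Address</b> note "123 Example Street"</td></tr>
    <tr><td><b>Status</b> note "in transit"</td></tr>
  </tbody>
</table>
"""

soup = BeautifulSoup(SAMPLE_HTML, 'html.parser')
tbody = soup.find('table', {'id': 'tableInfoParcels'}).tbody
tds = [tr.td for tr in tbody.find_all('tr')]
# caption = text inside <b>...</b>; info = quoted text after </b> in the raw markup
captions = [td.b.get_text() for td in tds]
info_texts = []
for td in tds:
    found = re.search(r"<b.*>.*</b>.*\"(.*)\"", str(td), re.DOTALL)
    if found:
        info_texts.append(found.group(1))
for caption, info_text in zip(captions, info_texts):
    print(caption, info_text)  # prints: Address 123 Example Street / Status in transit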