jxsl13

BeautifulSoup Python Example

Aug 19th, 2019
328
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 1.42 KB | None | 0 0
  1. #!/usr/bin/env python3
  2.  
  3. from bs4 import BeautifulSoup
  4. from urllib.request import urlopen, Request
  5. import re
  6.  
  7.  
  8. if __name__ == "__main__":
  9.  
  10.     # full url with htttps://...
  11.     # use a custom user agent, as the python default one is blocked everywhere
  12.     req = Request(
  13.         "https://YOUR_WEBSITE",
  14.         data=None,
  15.         headers={
  16.             'User-Agent': 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_6; en-en) AppleWebKit/533.19.4 (KHTML, like Gecko) Version/5.0.3 Safari/533.19.4'
  17.         }
  18.     )
  19.    
  20.     html = urlopen(req)
  21.  
  22.     # create a yummy soup from html
  23.     soup = BeautifulSoup(html, 'html.parser')
  24.  
  25.     # find your table by id
  26.     tbody = soup.find('table', {'id' : 'tableInfoParcels'}).tbody
  27.  
  28.     # get all td tags
  29.     tds = [tr.td for tr in tbody.find_all('tr')]
  30.  
  31.     # get all texts between <b>...text</b>
  32.     captions = [td.b.get_text() for td in tds]
  33.  
  34.  
  35.     # get that address etc text from within the <b>....text...</b>
  36.     texts = [td.b.get_text() for td in tds]
  37.  
  38.     info_texts = []
  39.     for text in texts:
  40.         # we want the data that is between (...) the parenthesis ans we want
  41.         # for the . to be able to match new line characters.
  42.         found = re.search(r"<b.*>.*</b>.*\"(.*)\"", text, re.DOTALL)
  43.  
  44.         if found:
  45.             info_texts.append(found.group(1))
  46.    
  47.  
  48.     for caption, info_text in zip(captions, info_texts):
  49.         print(caption, info_text)
Advertisement
Add Comment
Please, Sign In to add comment