Guest User

Untitled

a guest
Feb 24th, 2018
77
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 0.65 KB | None | 0 0
  1. import requests
  2. from bs4 import BeautifulSoup
  3. import re
  4.  
  5. SOURCE_URL = '<some url>'
  6. r = requests.get(SOURCE_URL)
  7. soup = BeautifulSoup(str(r.content, encoding='shift_jis'), 'lxml')
  8.  
  9.  
  10. pattern = r'^(\d+):\s(.+)\s\((\d+)\)'
  11.  
  12.  
  13. def thread_id(key):
  14. return re.sub(r'/l\d+', '', key)
  15.  
  16.  
  17. def parse_title(a):
  18. title = a.text
  19. groups = re.search(pattern, title)
  20. if not groups:
  21. return None
  22.  
  23. return {
  24. 'num': int(groups.group(1)),
  25. 'title': groups.group(2),
  26. 'res': int(groups.group(3)),
  27. 'url_key': thread_id(a.get('href'))
  28. }
  29.  
  30.  
  31. for a in soup.find_all('a'):
  32. th_data = parse_title(a)
  33. if th_data:
  34. print(th_data)
Add Comment
Please, Sign In to add comment