Advertisement
iama_alpaca

FW_Scraper_AllForums.py

Oct 7th, 2017
130
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 2.06 KB | None | 0 0
  1. #! /usr/bin/env python3
  2.  
  3. from bs4 import BeautifulSoup as bs4
  4. import requests
  5. import os
  6. import time
  7. import threading
  8.  
  # Make sure the folder that receives the threads' html files exists.
  # exist_ok=True avoids the race between a separate existence check and the creation.
  os.makedirs("Threads", exist_ok=True)

  # Setting user agent to standard Firefox-on-Windows because the site blocks the script otherwise
  headers = {
      'User-Agent': 'Mozilla/5.0 (Windows NT 6.2; Win64; x64; rv:55.0) Gecko/20100101 Firefox/55.0',
  }

  # Checking to see how many pages there are total so the main loop can have a definite end.
  # A timeout keeps the script from hanging forever on a stalled connection, and
  # raise_for_status() stops early instead of parsing an HTTP error page.
  pages_check = requests.get("https://www.fatwallet.com/forums/search/results.php?page=1&type=forums&query=&sort=date&match=titleop&forum=all&active=all", headers=headers, timeout=30)
  pages_check.raise_for_status()
  pages_soup = bs4(pages_check.text, 'html.parser')
  pages_buttons = pages_soup.select('.standardPageSection .pageLink')
  if not pages_buttons:
      # Without any page links the last page number cannot be determined.
      raise SystemExit('Could not find any page links on the first results page; aborting.')
  # The last page link holds the total page count, possibly with thousands separators.
  pages = int(pages_buttons[-1].text.replace(',', ''))

  # Checks file to see last page completed. Used for resuming in the event that the download was stopped.
  # An empty file (e.g. from an interrupted write) is treated like a missing one.
  start_page = 1
  if os.path.isfile('latest_page_done.txt'):
      with open('latest_page_done.txt', 'r') as f:
          last_done = f.read().strip()
      if last_done:
          start_page = int(last_done) + 1

  print('Beginning download at page {}'.format(start_page))
  31.  
  def getthreads(link=None):
      """Download one forum thread into Threads/<threadid>.html unless it already exists.

      link: the search-result element whose 'href' ends in the thread id.
          When omitted, the module-level loop variable `i` is used, so the
          function still works as a no-argument thread target.

      The page is fetched completely before anything is written, and the file
      is moved into place only after a successful write. A failed or
      interrupted download therefore never leaves an empty or partial file
      that later runs would mistake for a finished thread.
      """
      if link is None:
          # Capture the shared loop variable right away, before the main loop rebinds it.
          link = i
      threadid = link['href'].split('/')[-1]
      path = 'Threads/' + threadid + '.html'
      if os.path.isfile(path):
          return

      print('Downloading Thread: ID {}'.format(threadid))
      try:
          # The timeout prevents a stalled connection from blocking this worker forever.
          res = requests.get('https://www.fatwallet.com/forums/textthread.php?catid=52&threadid='+threadid, headers=headers, timeout=30)
          res.raise_for_status()
      except requests.RequestException as err:
          print('Failed to download thread {}: {}'.format(threadid, err))
          return

      # Write to a temporary name first, with an explicit encoding so the result
      # does not depend on the platform's default locale encoding.
      tmp_path = path + '.part'
      with open(tmp_path, 'w', encoding='utf-8') as f:
          f.write(res.text)
      os.replace(tmp_path, path)
  38.  
  # Main loop for downloading threads: one search-result page at a time, one
  # worker thread per listed forum thread.
  for p in range(start_page,pages+1):
      # The timeout avoids hanging forever on a stalled connection; raise_for_status()
      # stops the run instead of silently treating an HTTP error page as an empty result page.
      res = requests.get("https://www.fatwallet.com/forums/search/results.php?page="+str(p)+"&type=forums&query=&sort=date&match=titleop&forum=all&active=all", headers=headers, timeout=30)
      res.raise_for_status()
      soup = bs4(res.text, 'html.parser')
      threads = soup.select('.forumTopicListBNS .forumTopicTitle')

      workers = []
      # NOTE: the loop variable must stay named `i` -- getthreads() reads it as a global.
      for i in threads:
          t = threading.Thread(target=getthreads)
          t.start()
          workers.append(t)
          # Throttles thread creation; it also gives the worker time to read `i`
          # before the loop rebinds it to the next result.
          time.sleep(0.2)

      # Wait for every download of this page before recording the page as done;
      # otherwise an interruption would make the resume logic skip threads that
      # were never saved.
      for t in workers:
          t.join()

      print('Page {} finished; Starting page {}'.format(p, p+1))
      with open("latest_page_done.txt", 'w') as f:
          f.write(str(p))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement