Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #! /usr/bin/env python3
- from bs4 import BeautifulSoup as bs4
- import requests
- import os
- import time
- import threading
# Making a folder in which to put the threads' html files.
# exist_ok=True replaces the separate "exists?" check, so there is no window
# between checking for the folder and creating it.
os.makedirs("Threads", exist_ok=True)

# Setting user agent to standard Firefox-on-Windows because the site blocks the script otherwise
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.2; Win64; x64; rv:55.0) Gecko/20100101 Firefox/55.0',
}

# Checking to see how many pages there are total so the main loop can have a definite end
pages_check = requests.get("https://www.fatwallet.com/forums/search/results.php?page=1&type=forums&query=&sort=date&match=titleop&forum=all&active=all", headers=headers)
pages_soup = bs4(pages_check.text, 'html.parser')
pages_buttons = pages_soup.select('.standardPageSection .pageLink')
if not pages_buttons:
    # Without any page links the last-page number cannot be read; fail with a
    # clear message instead of an IndexError on pages_buttons[-1].
    raise SystemExit('Could not find any page links in the search results; cannot determine the page count')
# The last page link holds the total page count, possibly with thousands separators.
pages = int(pages_buttons[-1].text.replace(',', ''))

# Checks file to see last page completed. Used for resuming in the event that the download was stopped
if os.path.isfile('latest_page_done.txt'):
    # "with" closes the progress file promptly instead of leaking the handle.
    with open('latest_page_done.txt', 'r') as progress_file:
        progress_text = progress_file.read()
    try:
        start_page = int(progress_text) + 1
    except ValueError:
        # An empty or corrupt progress file (e.g. the script was killed while
        # rewriting it) should not block resuming. Restarting from page 1 is
        # safe because threads already saved in Threads/ are skipped.
        start_page = 1
else:
    start_page = 1
print('Beginning download at page {}'.format(start_page))
def getthreads(link=None):
    """Download one forum thread into ``Threads/<threadid>.html``.

    The thread id is the last path component of the link's ``href``. A thread
    whose file already exists is skipped, so an interrupted run can resume
    without downloading it again.

    Args:
        link: Element from a search-results page whose ``href`` ends in the
            thread id. When omitted, the module-level loop variable ``i`` is
            used, so existing zero-argument calls keep working. Passing the
            link explicitly is preferred when running in a thread, because
            ``i`` may already point at a later link by the time the thread
            reads it.
    """
    if link is None:
        link = i
    threadid = link['href'].split('/')[-1]
    thread_path = 'Threads/' + threadid + '.html'
    if os.path.isfile(thread_path):
        return
    print('Downloading Thread: ID {}'.format(threadid))
    # Fetch before opening the output file: a failed request must not leave an
    # empty file behind, since an existing file marks the thread as done.
    try:
        response = requests.get('https://www.fatwallet.com/forums/textthread.php?catid=52&threadid=' + threadid, headers=headers)
        response.raise_for_status()
    except requests.RequestException as err:
        print('Failed to download Thread: ID {} ({})'.format(threadid, err))
        return
    # Explicit UTF-8 so the platform's default encoding cannot raise
    # UnicodeEncodeError on characters it does not support.
    with open(thread_path, 'w', encoding='utf-8') as f:
        f.write(response.text)


# Main loop for downloading threads
for p in range(start_page, pages + 1):
    res = requests.get("https://www.fatwallet.com/forums/search/results.php?page=" + str(p) + "&type=forums&query=&sort=date&match=titleop&forum=all&active=all", headers=headers)
    # Stop on an HTTP error rather than parsing an error page as an empty
    # listing and recording the page as finished.
    res.raise_for_status()
    soup = bs4(res.text, 'html.parser')
    threads = soup.select('.forumTopicListBNS .forumTopicTitle')
    workers = []
    for i in threads:
        # Hand each thread its own link instead of letting it read the shared
        # loop variable, which changes on the next iteration.
        t = threading.Thread(target=getthreads, args=(i,))
        t.start()
        workers.append(t)
        # Short pause between thread starts to space out the requests.
        time.sleep(0.2)
    # Wait for every download of this page before recording it as done, so a
    # stop right after the progress write cannot skip unfinished threads.
    for t in workers:
        t.join()
    print('Page {} finished; Starting page {}'.format(p, p + 1))
    with open("latest_page_done.txt", 'w') as f:
        f.write(str(p))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement