# FW_Scraper_AllForums.py

#! /usr/bin/env python3

from bs4 import BeautifulSoup as bs4
import requests
import os
import time
import threading

# Making a folder in which to put the threads' html files.
# exist_ok=True makes this a no-op when the folder already exists and avoids
# the check-then-create race of testing os.path.exists() first.
os.makedirs("Threads", exist_ok=True)

# Setting user agent to standard Firefox-on-Windows because the site blocks the script otherwise
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.2; Win64; x64; rv:55.0) Gecko/20100101 Firefox/55.0',
}

# Checking to see how many pages there are total so the main loop can have a definite end.
# The timeout keeps the script from hanging forever on an unresponsive server.
pages_check = requests.get("https://www.fatwallet.com/forums/search/results.php?page=1&type=forums&query=&sort=date&match=titleop&forum=all&active=all", headers=headers, timeout=60)
pages_soup = bs4(pages_check.text, 'html.parser')
pages_buttons = pages_soup.select('.standardPageSection .pageLink')
if not pages_buttons:
    # Without any page links the total is unknown; stop with a readable message
    # instead of an IndexError on pages_buttons[-1].
    raise SystemExit('Could not find any page links in the search results; cannot determine the number of pages.')
# The last page link's text is the total page count; it may contain thousands separators.
pages = int(pages_buttons[-1].text.replace(',', ''))

# Checks file to see last page completed. Used for resuming in the event that the download was stopped
start_page = 1
if os.path.isfile('latest_page_done.txt'):
    with open('latest_page_done.txt', 'r') as checkpoint_file:
        checkpoint = checkpoint_file.read()
    try:
        start_page = int(checkpoint) + 1
    except ValueError:
        # An empty or corrupt checkpoint (e.g. the script was killed while writing it)
        # should not prevent a restart; begin again from the first page.
        print('Could not read a page number from latest_page_done.txt; starting over from page 1')

print('Beginning download at page {}'.format(start_page))

def getthreads(link=None):
    """Download one forum thread into ``Threads/<threadid>.html`` if it is not there yet.

    Args:
        link: The search-result element for the thread; its ``href`` must end
            with the thread ID. When omitted, the module-level loop variable
            ``i`` is used, which the main loop sets before starting the worker.

    The page is fetched *before* the output file is opened, so a failed
    request never leaves an empty file behind that a later run would
    mistake for a finished download.
    """
    if link is None:
        # Snapshot the shared loop variable immediately, before the main loop
        # has a chance to rebind it to the next search result.
        link = i
    threadid = link['href'].split('/')[-1]
    path = 'Threads/' + threadid + '.html'
    if os.path.isfile(path):
        return

    print('Downloading Thread: ID {}'.format(threadid))
    # NOTE(review): catid is fixed at 52 although the search covers all forums;
    # presumably the site resolves the thread from threadid alone -- confirm.
    url = 'https://www.fatwallet.com/forums/textthread.php?catid=52&threadid=' + threadid
    try:
        res = requests.get(url, headers=headers, timeout=60)
        # Do not store an HTTP error page as if it were the thread's content.
        res.raise_for_status()
    except requests.RequestException as err:
        print('Failed to download Thread: ID {} ({})'.format(threadid, err))
        return

    # Explicit UTF-8 so the write cannot fail on characters that the
    # platform's default encoding is unable to represent.
    with open(path, 'w', encoding='utf-8') as f:
        f.write(res.text)

# Main loop for downloading threads.
# For every search-result page: fetch it, start one worker per listed thread,
# wait for the workers, then record the page number so a later run can resume.
for p in range(start_page, pages + 1):
    res = requests.get("https://www.fatwallet.com/forums/search/results.php?page="+str(p)+"&type=forums&query=&sort=date&match=titleop&forum=all&active=all", headers=headers, timeout=60)
    # Stop on an HTTP error instead of parsing an error page, finding no
    # threads in it, and recording the page as done.
    res.raise_for_status()
    soup = bs4(res.text, 'html.parser')
    threads = soup.select('.forumTopicListBNS .forumTopicTitle')
    workers = []
    # The loop variable must stay named `i`: getthreads reads it as a global.
    for i in threads:
        t = threading.Thread(target=getthreads)
        t.start()
        workers.append(t)
        # Stagger the requests; this pause also gives the worker time to read `i`
        # before the loop rebinds it.
        time.sleep(0.2)
    # Wait for every download of this page before recording it as done;
    # otherwise an interrupted run would resume past threads that never finished.
    for t in workers:
        t.join()
    print('Page {} finished; Starting page {}'.format(p, p+1))
    with open("latest_page_done.txt", 'w') as f:
        f.write(str(p))