SHOW:
|
|
- or go back to the newest paste.
1 | from bs4 import BeautifulSoup | |
2 | import requests | |
3 | from requests import session | |
4 | ||
# Endpoints used by the scraper: the login form target and the post-history
# listing. The page number is appended to `url` when a page is requested.
login_page = "http://forums.somethingawful.com/account.php?action=loginform#form"
url = "http://forums.somethingawful.com/query.php?action=posthistory&userid=35304&page="

# Forum credentials. Left blank here; fill them in before running.
USERNAME = ''
PASSWORD = ''

# Form fields posted to `login_page` to authenticate with the forums.
payload = {
    'action': 'login',
    'username': USERNAME,
    'password': PASSWORD,
}
14 | - | """with session() as c: |
14 | + | |
15 | - | for x in range(1,101): |
def checkDiv(string):
    """Return the text to record for a post, or a placeholder if it is unusable.

    Used to filter out empty posts and posts that contain quotes.

    Args:
        string: The text of one post, or a falsy value (``None``, ``False``,
            ``""``) when the post has no usable text.

    Returns:
        ``"its u"`` when ``string`` is falsy or contains a ``[quote=`` tag;
        otherwise ``string`` unchanged.
    """
    # A truthiness test replaces the original `is False` comparison: a missing
    # value is normally None rather than the False singleton, and running the
    # `in` check against None would raise TypeError.
    if not string or "[quote=" in string:
        return "its u"
    return string
24 | - | with session() as c: |
24 | + | |
25 | - | c.post(login_page, data = payload) |
# Log in once, then walk the post-history pages and append one line per post
# to tori.txt. Both resources are managed by the `with` statement, so the HTTP
# session and the output file are closed even if a request or parse fails.
with session() as c, open("tori.txt", "a") as my_file:
    c.post(login_page, data=payload)
    # Page numbers of the post history; 1-100 is what is available.
    for x in range(1, 101):
        # Fetch the listing for page number x.
        response = c.get(url + str(x))
        soup = BeautifulSoup(response.text, 'html.parser')  # parse with BS4
        # Presumably each post body lives in a <div class="blurb"> -- verify
        # against the page markup if the output looks wrong.
        mydivs = soup.findAll("div", {"class": "blurb"})

        for div in mydivs:
            # checkDiv decides what text is recorded for this div.
            my_file.write(checkDiv(div.string) + "\n")