Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import bs4
- import mechanize
- import re
- import shelve
# Credentials submitted to the forum login form.  They are blank here, as in
# the original script; fill them in locally before running.
USERNAME = ''
PASSWORD = ''

USER_AGENT = ('Mozilla/5.0 (X11;U;Linux 2.4.2.-2 i586; en-us;m18) '
              'Gecko/200010131 Netscape6/6.01')
LOGIN_URL = 'https://forums.somethingawful.com/account.php?action=loginform'
# Thread listing filtered to one user's posts; ``%d`` is the page number.
THREAD_URL = ('https://forums.somethingawful.com/showthread.php'
              '?threadid=3904417&userid=171559&perpage=40&pagenumber=%d')
# File name handed to ``shelve.open``.
SHELF_PATH = 'xlol'

# Text from the top pager's class attribute up to the next ``div>``.
# DOTALL lets the match continue across line breaks in the markup.
PAGER_BLOCK = re.compile(r'class="pages top".*?div>', re.DOTALL)
PAGE_OPTION = re.compile(r'option value="([0-9]+)"')


def last_page_number(html_text):
    """Return the highest page number listed in the top pager of *html_text*.

    Every ``option value="N"`` found inside a ``class="pages top"`` block is
    considered and the largest ``N`` is returned.

    Args:
        html_text: Page markup as a ``str``.

    Returns:
        The largest page number found, or ``1`` when no pager block or no
        numeric option is present (presumably a thread that fits on a single
        page -- the original script raised ``IndexError`` in that case).
    """
    highest = 1
    for block in PAGER_BLOCK.findall(html_text):
        for value in PAGE_OPTION.findall(block):
            highest = max(highest, int(value))
    return highest


def record_post_revisions(store, revisions):
    """Append each post's markup to its stored history when it has changed.

    Args:
        store: Mutable mapping of post id -> list of markup strings, e.g. an
            open ``shelve`` shelf or a plain ``dict``.
        revisions: Iterable of ``(post_id, markup)`` string pairs.

    Returns:
        The number of revisions that were added to *store*.
    """
    added = 0
    for post_id, markup in revisions:
        history = store.get(post_id, [])
        if history and history[-1] == markup:
            # Unchanged since the last run; nothing to record.
            continue
        # Assign a new list back to the key.  A shelf opened without
        # writeback hands out a fresh unpickled copy on every read, so an
        # in-place ``append`` on that copy would never reach the disk.
        store[post_id] = history + [markup]
        added += 1
    return added


def main():
    """Log in, fetch the last page of the thread and archive its posts."""
    browser = mechanize.Browser()
    browser.set_handle_robots(False)
    browser.addheaders = [('User-agent', USER_AGENT)]

    # Fill in and submit the first form on the login page.
    browser.open(LOGIN_URL)
    browser.select_form(nr=0)
    browser['username'] = USERNAME
    browser['password'] = PASSWORD
    browser.submit()

    raw_html = browser.open(THREAD_URL % 1).read()
    # Latin-1 maps every byte to a character, so this decode cannot fail; the
    # text is only used to locate the ASCII pager markup.
    last_page = last_page_number(raw_html.decode('latin-1'))
    if last_page != 1:
        # Page 1 is already in hand, so only other pages need a new request.
        raw_html = browser.open(THREAD_URL % last_page).read()

    soup = bs4.BeautifulSoup(raw_html, 'html.parser')
    # Elements without an id cannot be keyed in the shelf, so they are
    # skipped rather than aborting the run with a KeyError.
    revisions = [
        (str(tag['id']), str(tag))
        for tag in soup.find_all(class_='post')
        if tag.has_attr('id')
    ]
    with shelve.open(SHELF_PATH) as store:
        record_post_revisions(store, revisions)


if __name__ == '__main__':
    main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement