Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
#!/usr/bin/env python3
"""Archive the first paragraph of each post linked from a saved listing page.

The script parses the HTML snapshot at ``./currMain.html`` (presumably a saved
copy of the Reddit front page -- confirm against how the snapshot is produced),
follows every post link it finds, and writes one text file per post into
``./posts``.  URLs that were already archived are remembered in
``./posts/allPosts`` (one URL per line) so repeated runs skip them.
"""
import time
import urllib.request
from pathlib import Path
from urllib.parse import urljoin

from bs4 import BeautifulSoup

CURR_MAIN = "./currMain.html"
REDDIT_URL = "https://www.reddit.com/"
POSTS_DIR = Path("./posts")
SEEN_FILE = POSTS_DIR / "allPosts"
# Name the parser explicitly so results do not depend on which optional
# parsers happen to be installed (and bs4 does not warn about guessing).
PARSER = "html.parser"
PULL_DELAY_SECONDS = 5
# Common filesystem limit for a single path component, in bytes.
MAX_NAME_BYTES = 255


def _safe_filename(title):
    """Return *title* reduced to a single path component inside POSTS_DIR.

    Path separators and NUL bytes are replaced so a title cannot create
    sub-directories or escape the output directory, and the name is cut to
    MAX_NAME_BYTES of UTF-8.  Empty or dot-only results become "untitled".
    """
    cleaned = title
    for forbidden in ("/", "\\", "\0"):
        cleaned = cleaned.replace(forbidden, "_")
    cleaned = cleaned.strip()
    # Truncate on the encoded form; "ignore" drops a trailing partial character.
    cleaned = cleaned.encode("utf-8")[:MAX_NAME_BYTES].decode("utf-8", "ignore")
    if cleaned in ("", ".", ".."):
        return "untitled"
    return cleaned


def _load_seen_urls():
    """Return the set of URLs already listed in SEEN_FILE (empty if absent)."""
    try:
        with open(SEEN_FILE, encoding="utf-8") as index:
            return {line.rstrip("\n") for line in index}
    except FileNotFoundError:
        return set()


def _fetch_first_paragraph(url):
    """Download *url* and return the text of the first <p> in its first div.md.

    Returns an empty string when the page has no such element, so posts
    without a text body do not abort the whole run.  Network errors raised by
    ``urlopen`` are left to propagate.
    """
    with urllib.request.urlopen(url) as response:
        page = BeautifulSoup(response.read().decode("utf-8"), PARSER)
    body = page.find("div", class_="md")
    paragraph = body.find("p") if body else None
    # get_text() also works when the paragraph contains nested markup, where
    # .string would be None.
    return paragraph.get_text() if paragraph else ""


def main():
    """Archive every not-yet-seen post linked from the snapshot."""
    POSTS_DIR.mkdir(parents=True, exist_ok=True)
    seen = _load_seen_urls()

    with open(CURR_MAIN, encoding="utf-8") as snapshot:
        listing = BeautifulSoup(snapshot, PARSER)

    count = 0
    with open(SEEN_FILE, "a", encoding="utf-8") as seen_file:
        # find_all covers every post, including the first one, and never
        # yields bare text nodes the way .next_siblings does.
        for thing in listing.find_all("div", class_="thing"):
            title_tag = thing.find("p", class_="title")
            link = title_tag.find("a", href=True) if title_tag else None
            if link is None:
                continue
            title = link.get_text()
            # urljoin handles both site-relative and already-absolute hrefs.
            url = urljoin(REDDIT_URL, link["href"])

            count += 1
            print(str(count) + ": Waiting for next pull")
            if url in seen:
                print("Already done")
                continue

            content = _fetch_first_paragraph(url)
            post_path = POSTS_DIR / _safe_filename(title)
            with open(post_path, "w", encoding="utf-8") as cur_post:
                cur_post.write("".join([
                    "========== URL ==========\n",
                    url + "\n",
                    "========== URL ==========\n",
                    "========== TITLE ==========\n",
                    title + "\n",
                    "========== TITLE ==========\n",
                    "========== CONTENT ==========\n",
                    content + "\n",
                    "========== CONTENT ==========\n",
                ]))

            # Record the URL only after the post is safely on disk, so a
            # failed download is retried on the next run.
            seen_file.write(url + "\n")
            seen_file.flush()
            seen.add(url)

            # Pause between downloads to avoid hammering the server.
            time.sleep(PULL_DELAY_SECONDS)


if __name__ == "__main__":
    main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement