Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/python
- import json
- import pickle
- import requests
- import time
- import re
- import datetime
- import requests
- import sys
class Scrape:
    """Download the catalog and threads of one 4chan board via a.4cdn.org.

    Attributes:
        verbose: When true, progress messages are printed.
        board: Board name used to build request URLs (e.g. ``'pol'``).
        timestamp / timestamps: Creation time formatted as an HTTP date, and
            a list holding it. Recorded only; no request header uses them.
        catalog: Parsed catalog JSON, or ``0`` until ``get_catalog`` runs.
        nums: Thread numbers found by the last ``get_thread_nums`` call.
        live_threads: Thread JSON objects from the last ``get_threads`` call.
        live_nums, archived_threads: Initialised empty; not written by any
            method of this class.
    """

    # Thread numbers that are always skipped on a given board. The /pol/
    # entry is the sticky id that was hard-coded in the original script.
    IGNORED_THREADS = {'pol': {51971506}}

    # Seconds to wait for a response before requests gives up.
    REQUEST_TIMEOUT = 30

    def __init__(self, board='pol'):
        """Create a scraper for ``board`` and record the creation time."""
        print("Starting class to scrape: " + board)
        self.verbose = True
        self.board = board
        self.timestamps = []
        self.timestamp = time.strftime('%a, %d %b %Y %H:%M:%S GMT', time.gmtime())
        self.timestamps.append(self.timestamp)
        self.catalog = 0  # Current catalog as JSON object
        self.live_threads = []  # JSON objects
        self.live_nums = []  # Current thread numbers
        self.nums = []  # Filled by get_thread_nums()
        self.archived_threads = []  # Put dead threads here

    def do_http(self, url):
        """Sleep one second, then GET ``url`` and return the response.

        The sleep throttles the scraper to at most one request per second.
        A timeout is passed so a stalled connection cannot hang the run.
        """
        time.sleep(1)
        return requests.get(url, timeout=self.REQUEST_TIMEOUT)

    def get_catalog(self, board=0):
        """Fetch and parse the catalog of ``board`` (default: ``self.board``).

        The parsed catalog is stored in ``self.catalog`` and the same object
        is returned.
        """
        if board == 0:
            board = self.board
        url = 'https://a.4cdn.org/' + board + '/catalog.json'
        if self.verbose:
            # Report the board actually requested, not always self.board.
            print('Pulling /' + board + '/ catalog: ' + url)
        response = self.do_http(url)
        self.catalog = json.loads(response.text)
        return self.catalog

    def get_thread_nums(self, catalog):
        """Return the thread numbers listed in ``catalog``.

        ``catalog`` is iterated as a list of pages, each with a ``'threads'``
        list whose entries have a ``'no'`` key. Entries with a truthy
        ``'sticky'`` field, and numbers in ``IGNORED_THREADS`` for this
        board, are left out. The result is also stored in ``self.nums``.
        """
        if self.verbose:
            print('Gathering thread numbers')
        # Empty set for boards without an entry, so no board raises NameError.
        ignored = self.IGNORED_THREADS.get(self.board, set())
        nums = []
        for page in catalog:
            for thread in page['threads']:
                num = thread['no']
                # The 4chan catalog marks pinned threads with sticky=1.
                if thread.get('sticky') or num in ignored:
                    continue
                nums.append(num)
        self.nums = nums
        if self.verbose:
            print('Found ' + str(len(nums)) + ' threads')
        return nums

    def get_threads(self, limit=50):
        """Fetch up to ``limit`` threads of ``self.board`` and return them.

        Refreshes the catalog, then downloads the threads at catalog
        positions ``limit`` down to 1. Position 0 is skipped, as in the
        original loop. Fewer threads are fetched when the catalog is shorter.
        A thread whose response body is not valid JSON is skipped. The list
        of parsed threads is stored in ``self.live_threads`` and returned.
        """
        self.get_catalog()
        nums = self.get_thread_nums(self.catalog)
        if self.verbose:
            print('Getting threads')
        threads = []
        # Slicing clamps to the available range, so a short catalog no
        # longer raises IndexError.
        for num in reversed(nums[1:limit + 1]):
            url = 'https://a.4cdn.org/' + self.board + '/thread/' + str(num) + '.json'
            response = self.do_http(url)
            try:
                thread = json.loads(response.text)
            except ValueError:
                # Body was not JSON (e.g. the thread vanished after the
                # catalog was fetched); skip it instead of aborting the run.
                if self.verbose:
                    print('Skipping ' + url + ': response was not JSON')
                continue
            print('Got ' + url + ' with ' + str(len(thread['posts'])) + ' posts')
            threads.append(thread)
        if self.verbose:
            print('Finished parsing catalog\'s threads')
        self.live_threads = threads
        return threads
def cleanhtml(raw_html):
    """Return ``raw_html`` with every ``<...>`` span on a single line removed.

    Matching is non-greedy and does not cross newlines. HTML entities such
    as ``&gt;`` are left untouched.
    """
    tag_pattern = re.compile('<.*?>')
    return tag_pattern.sub('', raw_html)
def cleanhtml2(raw_html):
    """Return ``raw_html`` with all newline characters removed."""
    newline_pattern = re.compile('\n')
    return newline_pattern.sub('', raw_html)
- # sendContent(content): POST one piece of text to the NBC endpoint below.
- # Builds a JSON string by concatenation (slug "paper", a formatted local
- # timestamp, `content` as "sentence", an empty "state"), sends it with
- # requests.post, and prints the HTTP status code and response body.
- # Returns nothing.
- #
- # NOTE(review): the usage text in this file describes this path as "to spam
- # NBC with 4chan content". The Origin/Referer/User-Agent headers are
- # hard-coded to resemble a browser on the NBC article page. This is automated
- # bulk submission to a third party's form. Recommend deleting this function
- # and the 'spam' mode; it was deliberately left unimproved in this review.
- def sendContent(content):
- # Timestamp comes from the local clock; the trailing "-500" is literal text
- # in the format string, not a computed UTC offset.
- currentDate = datetime.datetime.today().strftime ('%B %e %Y %H:%M:%S -500')
- date = str(currentDate)
- fieldContent = '{"slug":"paper", "date_submitted": "' + date + '", "sentence": "' + content + '", "state":""}'
- #print(str(fieldContent))
- # Debug output left in by the author.
- print("test")
- headerContent = {"Content-Type":"application/json", "Origin":"https://www.nbcnews.com", "Referer":"https://www.nbcnews.com/news/specials/climate-confessions-share-solutions-climate-change-n1054791", "Sec-Fetch-Mode":"cors", "User-Agent":"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36"}
- r = requests.post("https://election-confessions.nbcnews.com/", data=fieldContent, headers=headerContent)
- print("sent request, status " + str(r.status_code))
- print("response content " + str(r.text) + "\n")
# Command-line dispatch. The first argument selects the mode:
#   scrape - download /pol/ threads and append their comment text, with
#            <...> spans removed, to output.txt (one post per line)
#   spam   - read output.txt and pass each line to sendContent()
_OPTIONS = (
    "Here are your options... \n"
    "'python3 scrape.py scrape'\tto scrape 4chan\n"
    "'python3 scrape.py spam'\tto spam NBC with 4chan content"
)

# sys.argv always contains the script name, so "no arguments" means a length
# below 2. The original compared against 0 and then raised IndexError.
if len(sys.argv) < 2:
    print("No provided arguments. " + _OPTIONS)
elif sys.argv[1] == "scrape":
    print("scraping 4chan. The output will be in 'output.txt'")
    scraper = Scrape('pol')
    scraped_threads = scraper.get_threads()
    print("parsing threads")
    # The context manager closes the file even if a write fails.
    with open("output.txt", "a") as out_file:
        for scraped_thread in scraped_threads:
            for post in scraped_thread.get('posts', []):
                comment = post.get('com')
                if comment is None:
                    # No comment text on this post. Skip only this post,
                    # rather than abandoning the rest of the thread as the
                    # old bare except did.
                    continue
                try:
                    out_file.write(cleanhtml(comment) + "\n")
                except UnicodeEncodeError:
                    # Text not representable in the file's default encoding;
                    # drop this post and carry on.
                    continue
    print("finished. Run this again with the 'spam' argument to spam NBC")
elif sys.argv[1] == "spam":
    # NOTE(review): this mode bulk-submits scraped text to a third-party
    # website. It is kept behaviourally unchanged here; recommend removing
    # it together with sendContent().
    print("spamming")
    with open("output.txt", "r") as fp:
        for line in fp:
            sendContent(cleanhtml2(str(line)))
else:
    print("Error - invalid arguments. " + _OPTIONS)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement