Advertisement
Guest User

spam climate confessions with /pol posts

a guest
Sep 20th, 2019
258
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 4.50 KB | None | 0 0
  1. #!/usr/bin/python
  2. import json
  3. import pickle
  4. import requests
  5. import time
  6. import re
  7. import datetime
  8. import requests
  9. import sys
  10.  
  11.  
  12. class Scrape:
  13.     def __init__(self, board='pol'):
  14.         print("Starting class to scrape: " + board)
  15.         self.verbose = True
  16.         self.board = board
  17.         self.timestamps = []
  18.         self.timestamp = time.strftime('%a, %d %b %Y %H:%M:%S GMT', time.gmtime(time.time()))
  19.         self.timestamps.append(self.timestamp)
  20.         self.catalog = 0 # Current catalog as JSON object
  21.         self.live_threads = [] # JSON objects
  22.         self.live_nums = [] # Current thread numbers
  23.         self.archived_threads = [] # Put dead threads here
  24.  
  25.     def do_http(self,url):
  26.         time.sleep(1)
  27.         return requests.get(url)
  28.  
  29.  
  30.     def get_catalog(self, board=0):
  31.         if board == 0:
  32.             board = self.board
  33.         url = 'https://a.4cdn.org/' + board + '/catalog.json'
  34.         if self.verbose: print('Pulling /' + self.board + '/ catalog: ' + url)
  35.         #headers = {'If-Modified-Since': self.timestamp} # <-- Some wizardry header that I don't think works
  36.         catalog = self.do_http(url)
  37.         self.catalog = json.loads(catalog.text)
  38.         return json.loads(catalog.text)
  39.  
  40.     # Returns list of thread numbers
  41.     def get_thread_nums(self, catalog):
  42.         if self.verbose: print('Gathering thread numbers')
  43.         nums = []
  44.         if self.board == 'pol': # Ignore the sticky...
  45.             sticky = 51971506
  46.         for page in catalog:
  47.             for thread in page['threads']:
  48.                 num = thread['no']
  49.                 if not num == sticky:
  50.                     nums.append(num)
  51.         self.nums = nums
  52.         if self.verbose: print('Found ' + str(len(nums)) + ' threads')
  53.         return nums
  54.       # 11 pages, 15 threads per page
  55.  
  56.     # Returns array of threads
  57.     def get_threads(self):
  58.         self.get_catalog()
  59.         nums = self.get_thread_nums(self.catalog)
  60.         if self.verbose: print('Getting threads')
  61.         threads = []
  62.         #for num in nums:
  63.         i = 50
  64.         while(i > 0):
  65.             url = 'https://a.4cdn.org/pol/thread/' + str(nums[i]) + '.json'
  66.             thread = self.do_http(url)
  67.             thread = json.loads(thread.text)
  68.             print('Got ' + url + ' with ' + str(len(thread['posts'])) + ' posts')
  69.             threads.append(thread)
  70.             i = i - 1
  71.         if self.verbose: print('Finished parsing catalog\'s threads')
  72.         self.live_threads = threads
  73.         return threads
  74.  
  75. def cleanhtml(raw_html):
  76.   cleanr = re.compile('<.*?>')
  77.   cleantext = re.sub(cleanr, '', raw_html)
  78.   return cleantext
  79.  
  80. def cleanhtml2(raw_html):
  81.   cleanr = re.compile('\n')
  82.   cleantext = re.sub(cleanr, '', raw_html)
  83.   return cleantext
  84.  
  85.  
  86.  
def sendContent(content):
  """POST one piece of text to https://election-confessions.nbcnews.com/.

  The request body is assembled by string concatenation with the fields
  slug, date_submitted, sentence (set to ``content``) and state. It is sent
  with Origin, Referer and User-Agent headers naming the nbcnews.com article
  page and a desktop Chrome browser. The response status code and body are
  printed; nothing is returned.

  NOTE(review): this automates bulk submissions to a third-party form while
  presenting itself as a browser on that site, i.e. spam. The code is left
  exactly as found and has deliberately not been fixed or hardened.
  Recommend deleting this function and its caller rather than maintaining
  them.
  """
  currentDate = datetime.datetime.today().strftime ('%B %e %Y %H:%M:%S -500')
  date = str(currentDate)
  fieldContent = '{"slug":"paper", "date_submitted": "' + date + '", "sentence": "' + content + '", "state":""}'
  #print(str(fieldContent))
  print("test")
  headerContent = {"Content-Type":"application/json", "Origin":"https://www.nbcnews.com", "Referer":"https://www.nbcnews.com/news/specials/climate-confessions-share-solutions-climate-change-n1054791", "Sec-Fetch-Mode":"cors", "User-Agent":"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36"}
  r = requests.post("https://election-confessions.nbcnews.com/", data=fieldContent, headers=headerContent)
  print("sent request, status " + str(r.status_code))
  print("response content " + str(r.text) + "\n")
  97.  
  98.  
  99. if(len(sys.argv) == 0):
  100.     print("No provided arguments. Here are your options... \n'python3 scrape.py scrape'\tto scrape 4chan\n'python3 scrape.py spam'\tto spam NBC with 4chan content")
  101. elif(sys.argv[1] == "scrape"):
  102.     print("scraping 4chan. The output will be in 'output.txt")
  103.     a = Scrape('pol')
  104.     threadz = a.get_threads()
  105.     print("parsing threads")
  106.     f = open("output.txt", "a")
  107.     for threadd in threadz:
  108.         try:
  109.             for post in threadd['posts']:
  110.                 f.write(str(cleanhtml(post['com']) + "\n"))
  111.         except:
  112.             pass
  113.     f.close()
  114.     print("finished. Run this again with the 'spam' argument to spam NBC")
  115.  
  116. elif(sys.argv[1] == "spam"):
  117.     print("spamming")
  118.     with open("output.txt", "r") as fp:
  119.         for line in fp:
  120.             sendContent(cleanhtml2(str(line)))
  121. else:
  122.     print("Error - inavlid arguments. Here are your options... \n'python3 scrape.py scrape'\tto scrape 4chan\n'python3 scrape.py spam'\tto spam NBC with 4chan content")
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement