Not a member of Pastebin yet? Sign up — it unlocks many cool features!
- #!/usr/bin/env python2
- # Script for dumping json content from 4chan
- from gevent import monkey
- monkey.patch_all()
- import gevent.pool
- import gevent.queue
- import json
- import requests
- import sys
- import os
- import time
- import datetime
- import csv
- import mysql.connector
- import re
- from randua import randomua
class chan_dump():
    """Mirror 4chan /pol/ thread JSON into the `forumcrawler` MySQL database.

    One `run()` call performs a single catalog pass: every thread listed in
    http://a.4cdn.org/pol/threads.json is either inserted (new thread) or
    refreshed (its `last_modified` is newer than our stored crawl date).
    Rate limits from the API terms of use are honoured with `time.sleep`.
    """

    # Single source of truth for DB credentials (previously duplicated in
    # three methods).  NOTE(review): credentials are hard-coded; consider
    # moving them to environment/config.
    DB_CONFIG = {'user': 'forumcrawler',
                 'password': 'password',
                 'database': 'forumcrawler'}

    def __init__(self):
        # A fresh random User-Agent is re-rolled per request in fetch_thread().
        self.headers = {'User-Agent': randomua()}
        self.test_list = []

    def _connect(self):
        """Open a new connection to the crawler database."""
        return mysql.connector.connect(**self.DB_CONFIG)

    def insert_sql(self, dateCrawled, idSection, intThreadno, blobJSON):
        """Insert a newly crawled thread row into tbl_content.

        dateCrawled -- 'YYYY/MM/DD HH:MM:SS' crawl timestamp string
        idSection   -- numeric section id (1 == /pol/ in run())
        intThreadno -- 4chan thread number
        blobJSON    -- raw JSON text of the thread
        """
        cnx = self._connect()
        try:
            cursor = cnx.cursor()
            cursor.execute(
                "INSERT INTO tbl_content (dateCrawled, idSection, intThreadno, blobJSON) "
                " VALUES (%s, %s, %s, %s)",
                (dateCrawled, idSection, intThreadno, blobJSON))
            cnx.commit()
            cursor.close()
        finally:
            # Always release the connection, even if execute/commit raises
            # (the original leaked it on error).
            cnx.close()

    def update_sql(self, dateCrawled, idPost, intThreadno, blobJSON):
        """Refresh the stored JSON blob and crawl date of an existing row."""
        cnx = self._connect()
        try:
            cursor = cnx.cursor()
            cursor.execute(
                "UPDATE tbl_content SET dateCrawled = %s, blobJSON = %s "
                "WHERE ((idPost = %s) And (intThreadno = %s))",
                (dateCrawled, blobJSON, idPost, intThreadno))
            cnx.commit()
            cursor.close()
        finally:
            cnx.close()

    def fetch_thread(self, threadno):
        """Download one thread's JSON text; return the sentinel "0" on failure."""
        # New random User-Agent for every thread request.
        self.headers = {'User-Agent': randomua()}
        try:
            r = requests.get(
                'http://a.4cdn.org/pol/thread/' + str(threadno) + '.json',
                headers=self.headers)
            return r.text
        except requests.RequestException:
            # Narrowed from a bare `except:` that also swallowed
            # KeyboardInterrupt; network failure stays best-effort.
            print('Unable to connect.')
            return "0"

    def run(self):
        """One catalog pass: insert new threads, refresh modified ones."""
        try:
            r = requests.get('http://a.4cdn.org/pol/threads.json',
                             headers=self.headers)
            data = json.loads(r.text)
        except (requests.RequestException, ValueError):
            # ValueError covers a non-JSON body from json.loads.
            print('Unable to connect.')
            return
        cthreads = 0  # threads created this pass
        uthreads = 0  # threads updated this pass
        cnx = self._connect()
        cursor = cnx.cursor()
        try:
            # BUG FIX: the original used range(0, len(...) - 1), silently
            # skipping the last catalog page and the last thread of each page.
            for page in data:
                for thread in page["threads"]:
                    if "no" not in thread:
                        # BUG FIX: the original fell through with a stale
                        # threadno from the previous iteration.
                        continue
                    threadno = thread["no"]
                    postdate = thread.get("last_modified", 0)
                    # Parameterized query — the original interpolated
                    # threadno with %, which is injection-prone and
                    # inconsistent with the INSERT/UPDATE statements.
                    cursor.execute(
                        "SELECT intThreadno, dateCrawled, idPost from tbl_content "
                        "Where (intThreadno=%s And idSection=1)",
                        (threadno,))
                    existing_thread = cursor.fetchall()
                    if not existing_thread:
                        # New thread: fetch it and store everything.
                        fetched_thread = self.fetch_thread(threadno)
                        if fetched_thread != "0":
                            self.insert_sql(
                                time.strftime('%Y/%m/%d %H:%M:%S'),
                                1, threadno, fetched_thread)
                            # TOC requires 1 second between thread updates.
                            time.sleep(2)
                            cthreads += 1
                    # Else: refresh only if modified after our last crawl.
                    # time.mktime replaces the non-portable strftime('%s')
                    # (glibc extension); both interpret the naive datetime
                    # from MySQL in local time.
                    elif postdate > int(time.mktime(
                            existing_thread[0][1].timetuple())):
                        fetched_thread = self.fetch_thread(threadno)
                        # BUG FIX: the original unconditionally wrote the
                        # result, clobbering the blob with "0" on a failed
                        # fetch.
                        if fetched_thread != "0":
                            self.update_sql(
                                time.strftime('%Y/%m/%d %H:%M:%S'),
                                existing_thread[0][2], threadno,
                                fetched_thread)
                            # TOC requires 1 second between thread updates.
                            time.sleep(2)
                            uthreads += 1
        finally:
            cursor.close()
            cnx.close()
        print(time.strftime('%Y/%m/%d %H:%M:%S') + ': Created ' +
              str(cthreads) + ' threads and updated ' + str(uthreads) +
              ' threads.')
        # TOC requires 10 seconds between catalog queries.
        time.sleep(15)
# Crawl forever; each run() is one catalog pass (run() itself sleeps to
# honour the API rate limits).  Ctrl-C exits cleanly.
R = chan_dump()
while True:
    try:
        R.run()
    except KeyboardInterrupt:
        # print(...) with a single argument behaves identically on
        # Python 2 and 3 (the original `print 'x'` statement is a
        # SyntaxError under Python 3).
        print('Ctrl-C detected -- exiting.')
        sys.exit()
Add Comment
Please sign in to add a comment.