Guest User

Untitled

a guest
Jul 9th, 2016
34
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 4.03 KB | None | 0 0
  1. #!/usr/bin/env python2
  2. # Script for dumping json content from 4chan
  3.  
  4. from gevent import monkey
  5. monkey.patch_all()
  6.  
  7. import gevent.pool
  8. import gevent.queue
  9.  
  10. import json
  11. import requests
  12. import sys
  13. import os
  14. import time
  15. import datetime
  16. import csv
  17. import mysql.connector
  18. import re
  19. from randua import randomua
  20.  
  21. class chan_dump():
  22.  
  23. # TODO: Randomized User Agents
  24. def __init__(self):
  25. self.headers={'User-Agent':randomua()}
  26. self.test_list = []
  27.  
  28. def insert_sql(self, dateCrawled, idSection, intThreadno, blobJSON):
  29. cnx = mysql.connector.connect(user='forumcrawler', password='password', database='forumcrawler')
  30. cursor = cnx.cursor()
  31.  
  32. update_sql = ("INSERT INTO tbl_content (dateCrawled, idSection, intThreadno, blobJSON) "
  33. " VALUES (%s, %s, %s, %s)")
  34. update_data = (dateCrawled, idSection, intThreadno, blobJSON)
  35.  
  36. cursor.execute(update_sql, update_data)
  37.  
  38. cnx.commit()
  39.  
  40. cursor.close()
  41. cnx.close()
  42.  
  43. def update_sql(self, dateCrawled, idPost, intThreadno, blobJSON):
  44. cnx = mysql.connector.connect(user='forumcrawler', password='password', database='forumcrawler')
  45. cursor = cnx.cursor()
  46.  
  47. update_service = ("UPDATE tbl_content SET dateCrawled = %s, blobJSON = %s WHERE ((idPost = %s) And (intThreadno = %s))")
  48. update_service_data = (dateCrawled, blobJSON, idPost, intThreadno)
  49. cursor.execute(update_service, update_service_data)
  50.  
  51. cnx.commit()
  52.  
  53. cursor.close()
  54. cnx.close()
  55.  
  56. def fetch_thread(self, threadno):
  57. try:
  58. self.headers={'User-Agent':randomua()}
  59. r = requests.get('http://a.4cdn.org/pol/thread/' + str(threadno) + '.json', headers=self.headers)
  60. return r.text
  61. except:
  62. print 'Unable to connect.'
  63.  
  64. return "0"
  65.  
  66. def run(self):
  67. cnx = mysql.connector.connect(user='forumcrawler', password='password', database='forumcrawler')
  68. cursor = cnx.cursor()
  69. try:
  70. r = requests.get('http://a.4cdn.org/pol/threads.json', headers=self.headers)
  71. data = json.loads(r.text)
  72. except:
  73. print 'Unable to connect.'
  74. return
  75.  
  76. threadno = 0
  77. postdate = 0
  78. cthreads = 0
  79. uthreads = 0
  80.  
  81. # print json.dumps(data, sort_keys=True, indent=4, separators=(',', ': '))
  82. for x in range(0,(len(data)-1)):
  83. for y in range(0, (len(data[x]["threads"])-1)):
  84. if "no" in data[x]["threads"][y]:
  85. threadno = data[x]["threads"][y]["no"]
  86. if "last_modified" in data[x]["threads"][y]:
  87. postdate = data[x]["threads"][y]["last_modified"]
  88. #print datetime.datetime.fromtimestamp(int(data[x]["threads"][y]["last_modified"])).strftime('%Y-%m-%d %H:%M:%S')
  89. select_sql = "SELECT intThreadno, dateCrawled, idPost from tbl_content Where (intThreadno=%s And idSection=1)" % threadno
  90. cursor.execute(select_sql)
  91. existing_thread = cursor.fetchall()
  92.  
  93. # If thread doesn't exist, fetch it and add everything
  94. if not existing_thread:
  95. fetched_thread = self.fetch_thread(threadno)
  96. if fetched_thread != "0":
  97. self.insert_sql(time.strftime('%Y/%m/%d %H:%M:%S'), 1, threadno, fetched_thread)
  98.  
  99. #TOC requires 1 second between thread updates.
  100. time.sleep(2)
  101. cthreads += 1
  102.  
  103. # Else, if modified > dateCrawled, update entry and date crawled
  104. elif (postdate > int(existing_thread[0][1].strftime('%s'))):
  105. #print str(threadno) + " modified."
  106. fetched_thread = self.fetch_thread(threadno)
  107. self.update_sql(time.strftime('%Y/%m/%d %H:%M:%S'), existing_thread[0][2], threadno, fetched_thread)
  108.  
  109. #TOC requires 1 second between thread updates.
  110. time.sleep(2)
  111. uthreads += 1
  112. y+=1
  113. x+=1
  114.  
  115. cursor.close()
  116. cnx.close()
  117. print time.strftime('%Y/%m/%d %H:%M:%S') + ': Created ' + str(cthreads) + ' threads and updated ' + str(uthreads) + ' threads.'
  118.  
  119. # TOC requires 10 seconds between catalog queries.
  120. time.sleep(15)
  121.  
  122.  
  123. R=chan_dump()
  124.  
  125. while True:
  126. try:
  127. R.run()
  128. except KeyboardInterrupt:
  129. print 'Ctrl-C detected -- exiting.'
  130. sys.exit()
Add Comment
Please, Sign In to add comment