Advertisement
Guest User

Untitled

a guest
Feb 10th, 2018
106
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 9.86 KB | None | 0 0
  1. #!/usr/bin/env python3
  2. from math import ceil
  3. from optparse import OptionParser
  4. import os
  5. import re
  6. import time
  7. import sys
  8. import urllib
  9. import urllib.request
  10.  
  11. from pyquery import PyQuery
  12.  
  13. def get_timestamp():
  14. return time.strftime('%Y/%m/%d %H:%M:%S')
  15.  
  16. def all_python_encodings():
  17. return ["ascii",
  18. "big5",
  19. "big5hkscs",
  20. "euc_jis_2004",
  21. "euc_jisx0213",
  22. "euc_kr",
  23. "gb2312",
  24. "gbk",
  25. "gb18030",
  26. "hz",
  27. "iso2022_jp",
  28. "iso2022_jp_1",
  29. "iso2022_jp_2",
  30. "iso2022_jp_2004",
  31. "iso2022_jp_3",
  32. "iso2022_jp_ext",
  33. "iso2022_kr",
  34. "latin_1",
  35. "iso8859_2",
  36. "iso8859_3",
  37. "iso8859_4",
  38. "iso8859_5",
  39. "iso8859_6",
  40. "iso8859_7",
  41. "iso8859_8",
  42. "iso8859_9",
  43. "iso8859_10",
  44. "iso8859_13",
  45. "iso8859_14",
  46. "iso8859_15",
  47. "iso8859_16",
  48. "johab",
  49. "koi8_r",
  50. "koi8_u",
  51. "mac_cyrillic",
  52. "mac_greek",
  53. "mac_iceland",
  54. "mac_latin2",
  55. "mac_roman",
  56. "mac_turkish",
  57. "ptcp154",
  58. "shift_jis",
  59. "shift_jis_2004",
  60. "shift_jisx0213",
  61. "utf_32",
  62. "utf_32_be",
  63. "utf_32_le",
  64. "utf_16",
  65. "utf_16_be",
  66. "utf_16_le",
  67. "utf_7",
  68. "utf_8",
  69. "utf_8_sig"]
  70.  
  71.  
  72. class Logger:
  73.  
  74. shell_mod = {
  75. '':'',
  76. 'PURPLE' : '\033[95m',
  77. 'CYAN' : '\033[96m',
  78. 'DARKCYAN' : '\033[36m',
  79. 'BLUE' : '\033[94m',
  80. 'GREEN' : '\033[92m',
  81. 'YELLOW' : '\033[93m',
  82. 'RED' : '\033[91m',
  83. 'BOLD' : '\033[1m',
  84. 'UNDERLINE' : '\033[4m',
  85. 'RESET' : '\033[0m'
  86. }
  87.  
  88. def log ( self, message, is_bold=False, color='', log_time=True):
  89. prefix = ''
  90. suffix = ''
  91.  
  92. if log_time:
  93. prefix += '[{:s}] '.format(get_timestamp())
  94.  
  95. if os.name == 'posix':
  96. if is_bold:
  97. prefix += self.shell_mod['BOLD']
  98. prefix += self.shell_mod[color.upper()]
  99.  
  100. suffix = self.shell_mod['RESET']
  101.  
  102. message = prefix + message + suffix
  103. print ( message )
  104. sys.stdout.flush()
  105.  
  106. def error(self, err):
  107. self.log(err, True, 'RED')
  108.  
  109. def fatal_error(self, err):
  110. self.error(err)
  111. exit()
  112.  
  113. class Crawler:
  114.  
  115. Url = 'http://pastebin.com'
  116. PASTES_URL = Url + '/archive'
  117. Username = 'haker_hater'
  118. Password = 'Pasw0Rth123'
  119. REGEXES_FILE = 'regexes.txt'
  120. OK = 1
  121. ACCESS_DENIED = -1
  122. CONNECTION_FAIL = -2
  123. OTHER_ERROR = -3
  124.  
  125. prev_checked_ids = []
  126. new_checked_ids = []
  127.  
  128. def read_regexes(self):
  129. try:
  130. with open ( self.REGEXES_FILE, 'r') as f:
  131. try:
  132. self.regexes = [ [ field.strip() for field in line.split(',')] for line in f.readlines() if line.strip() != '' and not line.startswith('#')]
  133.  
  134. # In case commas exist in the regexes...merge everything.
  135. for i in range(len(self.regexes)):
  136. self.regexes[i] = [','.join(self.regexes[i][:-2])] + self.regexes[i][-2:]
  137. except KeyboardInterrupt:
  138. raise
  139. except:
  140. Logger().fatal_error('Malformed regexes file. Format: regex_pattern,URL logging file, directory logging file.')
  141. except KeyboardInterrupt:
  142. raise
  143. except:
  144. Logger().fatal_error('{:s} not found or not acessible.'.format(self.REGEXES_FILE))
  145.  
  146.  
  147. def __init__(self):
  148. self.read_regexes()
  149.  
  150.  
  151.  
  152. def get_pastes ( self ):
  153. Logger ().log ( 'Getting pastes', True )
  154. try:
  155. page = PyQuery ( url = self.PASTES_URL )
  156. except KeyboardInterrupt:
  157. raise
  158. except:
  159. return self.CONNECTION_FAIL,None
  160.  
  161.  
  162. try:
  163. page_html = page.html ()
  164. except KeyboardInterrupt:
  165. raise
  166. except:
  167. worked = False
  168. for enc in all_python_encodings():
  169. try:
  170. page_html = page.html(encoding=enc)
  171. worked = True
  172. break
  173. except KeyboardInterrupt:
  174. raise
  175. except:
  176. pass
  177. if not worked:
  178. # One last try...
  179. try:
  180. f = urllib.request.urlopen(Crawler.PASTES_URL)
  181. page_html = PyQuery(str(f.read()).encode('utf8')).html()
  182. f.close()
  183. except KeyboardInterrupt:
  184. raise
  185. except:
  186. return self.OTHER_ERROR, None
  187. if re.match ( r'Pastebin\.com - Access Denied Warning', page_html, re.IGNORECASE ) or 'blocked your IP' in page_html:
  188. return self.ACCESS_DENIED,None
  189. else:
  190. return self.OK,page('.maintable img').next('a')
  191.  
  192. def check_paste ( self, paste_id ):
  193. paste_url = self.Url + paste_id
  194. credentials = Username + Password
  195. try:
  196. paste_txt = PyQuery ( url = paste_url ,cookie = credentials)('#paste_code').text()
  197.  
  198. for regex,file,directory in self.regexes:
  199. if re.match ( regex, paste_txt, re.IGNORECASE ):
  200. Logger ().log ( 'Found a matching paste: ' + paste_url + ' (' + file + ')', True, 'CYAN' )
  201. self.save_result ( paste_url,paste_id,file,directory )
  202. return True
  203. Logger ().log ( 'Not matching paste: ' + paste_url )
  204. except KeyboardInterrupt:
  205. raise
  206. except:
  207. Logger ().log ( 'Error reading paste (probably a 404 or encoding issue).', True, 'YELLOW')
  208. return False
  209.  
  210. def save_result ( self, paste_url, paste_id, file, directory ):
  211. timestamp = get_timestamp()
  212. with open ( file, 'a' ) as matching:
  213. matching.write ( timestamp + ' - ' + paste_url + '\n' )
  214.  
  215. try:
  216. os.mkdir(directory)
  217. except KeyboardInterrupt:
  218. raise
  219. except:
  220. pass
  221.  
  222. with open( directory + '/' + timestamp.replace('/','_').replace(':','_').replace(' ','__') + '_' + paste_id.replace('/','') + '.txt', mode='w' ) as paste:
  223. paste_txt = PyQuery(url=paste_url)('#paste_code').text()
  224. paste.write(paste_txt + '\n')
  225.  
  226.  
  227. def start ( self, refresh_time = 30, delay = 1, ban_wait = 5, flush_after_x_refreshes=100, connection_timeout=60 ):
  228. count = 0
  229. while True:
  230. status,pastes = self.get_pastes ()
  231.  
  232. start_time = time.time()
  233. if status == self.OK:
  234. for paste in pastes:
  235. paste_id = PyQuery ( paste ).attr('href')
  236. self.new_checked_ids.append ( paste_id )
  237. if paste_id not in self.prev_checked_ids:
  238. self.check_paste ( paste_id )
  239. time.sleep ( delay )
  240. count += 1
  241.  
  242. if count == flush_after_x_refreshes:
  243. self.prev_checked_ids = self.new_checked_ids
  244. count = 0
  245. else:
  246. self.prev_checked_ids += self.new_checked_ids
  247. self.new_checked_ids = []
  248.  
  249. elapsed_time = time.time() - start_time
  250. sleep_time = ceil(max(0,(refresh_time - elapsed_time)))
  251. if sleep_time > 0:
  252. Logger().log('Waiting {:d} seconds to refresh...'.format(sleep_time), True)
  253. time.sleep ( sleep_time )
  254. elif status == self.ACCESS_DENIED:
  255. Logger ().log ( 'Damn! It looks like you have been banned (probably temporarily)', True, 'YELLOW' )
  256. for n in range ( 0, ban_wait ):
  257. Logger ().log ( 'Please wait ' + str ( ban_wait - n ) + ' minute' + ( 's' if ( ban_wait - n ) > 1 else '' ) )
  258. time.sleep ( 60 )
  259. elif status == self.CONNECTION_FAIL:
  260. Logger().log ( 'Connection down. Waiting {:d} seconds and trying again'.format(connection_timeout), True, 'RED')
  261. time.sleep(connection_timeout)
  262. elif status == self.OTHER_ERROR:
  263. Logger().log('Unknown error. Maybe an encoding problem? Trying again.'.format(connection_timeout), True,'RED')
  264. time.sleep(1)
  265.  
  266. def parse_input():
  267. parser = OptionParser()
  268. parser.add_option('-r', '--refresh-time', help='Set the refresh time (default: 30)', dest='refresh_time', type='int', default=30)
  269. parser.add_option('-d', '--delay-time', help='Set the delay time (default: 1)', dest='delay', type='float', default=1)
  270. parser.add_option('-b', '--ban-wait-time', help='Set the ban wait time (default: 5)', dest='ban_wait', type='int', default=5)
  271. parser.add_option('-f', '--flush-after-x-refreshes', help='Set the number of refreshes after which memory is flushed (default: 100)', dest='flush_after_x_refreshes', type='int', default=100)
  272. parser.add_option('-c', '--connection-timeout', help='Set the connection timeout waiting time (default: 60)', dest='connection_timeout', type='float', default=60)
  273. (options, args) = parser.parse_args()
  274. return options.refresh_time, options.delay, options.ban_wait, options.flush_after_x_refreshes, options.connection_timeout
  275.  
  276.  
  277. try:
  278. Crawler ().start (*parse_input())
  279. except KeyboardInterrupt:
  280. Logger ().log ( 'Bye! Hope you found what you were looking for :)', True )
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement