Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/env python3
- from math import ceil
- from optparse import OptionParser
- import os
- import re
- import time
- import sys
- import urllib
- import urllib.request
- from pyquery import PyQuery
def get_timestamp():
    """Return the current local time as a 'YYYY/MM/DD HH:MM:SS' string."""
    now = time.localtime()
    return time.strftime('%Y/%m/%d %H:%M:%S', now)
def all_python_encodings():
    """Return the ordered list of codec names to try when decoding a page."""
    return ("ascii big5 big5hkscs euc_jis_2004 euc_jisx0213 euc_kr gb2312 "
            "gbk gb18030 hz iso2022_jp iso2022_jp_1 iso2022_jp_2 "
            "iso2022_jp_2004 iso2022_jp_3 iso2022_jp_ext iso2022_kr latin_1 "
            "iso8859_2 iso8859_3 iso8859_4 iso8859_5 iso8859_6 iso8859_7 "
            "iso8859_8 iso8859_9 iso8859_10 iso8859_13 iso8859_14 iso8859_15 "
            "iso8859_16 johab koi8_r koi8_u mac_cyrillic mac_greek "
            "mac_iceland mac_latin2 mac_roman mac_turkish ptcp154 shift_jis "
            "shift_jis_2004 shift_jisx0213 utf_32 utf_32_be utf_32_le utf_16 "
            "utf_16_be utf_16_le utf_7 utf_8 utf_8_sig").split()
- class Logger:
- shell_mod = {
- '':'',
- 'PURPLE' : '\033[95m',
- 'CYAN' : '\033[96m',
- 'DARKCYAN' : '\033[36m',
- 'BLUE' : '\033[94m',
- 'GREEN' : '\033[92m',
- 'YELLOW' : '\033[93m',
- 'RED' : '\033[91m',
- 'BOLD' : '\033[1m',
- 'UNDERLINE' : '\033[4m',
- 'RESET' : '\033[0m'
- }
- def log ( self, message, is_bold=False, color='', log_time=True):
- prefix = ''
- suffix = ''
- if log_time:
- prefix += '[{:s}] '.format(get_timestamp())
- if os.name == 'posix':
- if is_bold:
- prefix += self.shell_mod['BOLD']
- prefix += self.shell_mod[color.upper()]
- suffix = self.shell_mod['RESET']
- message = prefix + message + suffix
- print ( message )
- sys.stdout.flush()
- def error(self, err):
- self.log(err, True, 'RED')
- def fatal_error(self, err):
- self.error(err)
- exit()
class Crawler:
    """Polls the Pastebin archive and scans new pastes against user regexes.

    Each matching paste's URL is appended to a per-regex log file and its
    body is saved into a per-regex directory.
    """
    Url = 'http://pastebin.com'
    PASTES_URL = Url + '/archive'
    Username = 'haker_hater'
    Password = 'Pasw0Rth123'
    REGEXES_FILE = 'regexes.txt'

    # Status codes returned by get_pastes().
    OK = 1
    ACCESS_DENIED = -1
    CONNECTION_FAIL = -2
    OTHER_ERROR = -3

    def read_regexes(self):
        """Load self.regexes from REGEXES_FILE.

        File format: one entry per line, 'regex_pattern,url_log_file,directory'.
        Blank lines and '#' comment lines are skipped.  Because the pattern
        itself may contain commas, everything up to the last two fields is
        re-joined into a single pattern.

        NOTE: narrowed the original bare `except:` clauses to `except
        Exception` — with bare excepts, the SystemExit raised by the inner
        fatal_error was swallowed by the outer handler, printing BOTH error
        messages for a malformed file.
        """
        try:
            with open(self.REGEXES_FILE, 'r') as f:
                try:
                    self.regexes = [[field.strip() for field in line.split(',')]
                                    for line in f.readlines()
                                    if line.strip() != '' and not line.startswith('#')]
                    # In case commas exist in the regexes...merge everything.
                    for i in range(len(self.regexes)):
                        self.regexes[i] = [','.join(self.regexes[i][:-2])] + self.regexes[i][-2:]
                except Exception:
                    Logger().fatal_error('Malformed regexes file. Format: regex_pattern,URL logging file, directory logging file.')
        except Exception:
            Logger().fatal_error('{:s} not found or not acessible.'.format(self.REGEXES_FILE))

    def __init__(self):
        # BUGFIX: these were mutable class attributes, shared across all
        # Crawler instances; they are per-instance state.
        self.prev_checked_ids = []
        self.new_checked_ids = []
        self.read_regexes()

    def get_pastes(self):
        """Fetch the archive page.

        Returns (status, links) where links is a PyQuery selection of paste
        anchors on success, or None for any non-OK status.
        """
        Logger().log('Getting pastes', True)
        try:
            page = PyQuery(url=self.PASTES_URL)
        except Exception:
            return self.CONNECTION_FAIL, None
        try:
            page_html = page.html()
        except Exception:
            # Serialization failed with the default encoding; brute-force
            # every codec Python ships until one works.
            worked = False
            for enc in all_python_encodings():
                try:
                    page_html = page.html(encoding=enc)
                    worked = True
                    break
                except Exception:
                    pass
            if not worked:
                # One last try...
                try:
                    f = urllib.request.urlopen(Crawler.PASTES_URL)
                    page_html = PyQuery(str(f.read()).encode('utf8')).html()
                    f.close()
                except Exception:
                    return self.OTHER_ERROR, None
        # BUGFIX: re.search instead of re.match — the warning text is not at
        # the start of the document, so re.match could never detect a ban.
        if re.search(r'Pastebin\.com - Access Denied Warning', page_html, re.IGNORECASE) or 'blocked your IP' in page_html:
            return self.ACCESS_DENIED, None
        else:
            return self.OK, page('.maintable img').next('a')

    def check_paste(self, paste_id):
        """Download one paste and test it against every configured regex.

        Returns True if any regex matched (and the match was saved).
        """
        paste_url = self.Url + paste_id
        # BUGFIX: the original referenced bare Username/Password (NameError);
        # they are class attributes.
        # NOTE(review): concatenating username+password and passing it as a
        # cookie looks suspect — verify the intended auth mechanism.
        credentials = self.Username + self.Password
        try:
            paste_txt = PyQuery(url=paste_url, cookie=credentials)('#paste_code').text()
            # NOTE: re.match anchors at the start of the paste; patterns in
            # the regexes file are written with that in mind.
            for regex, file, directory in self.regexes:
                if re.match(regex, paste_txt, re.IGNORECASE):
                    Logger().log('Found a matching paste: ' + paste_url + ' (' + file + ')', True, 'CYAN')
                    self.save_result(paste_url, paste_id, file, directory)
                    return True
            Logger().log('Not matching paste: ' + paste_url)
        except Exception:
            Logger().log('Error reading paste (probably a 404 or encoding issue).', True, 'YELLOW')
        return False

    def save_result(self, paste_url, paste_id, file, directory):
        """Append paste_url to `file` and save the paste body under `directory`."""
        timestamp = get_timestamp()
        with open(file, 'a') as matching:
            matching.write(timestamp + ' - ' + paste_url + '\n')
        try:
            os.mkdir(directory)
        except OSError:
            pass  # directory already exists (or cannot be created)
        # Timestamp separators are rewritten so the result is a valid filename.
        with open(directory + '/' + timestamp.replace('/', '_').replace(':', '_').replace(' ', '__') + '_' + paste_id.replace('/', '') + '.txt', mode='w') as paste:
            paste_txt = PyQuery(url=paste_url)('#paste_code').text()
            paste.write(paste_txt + '\n')

    def start(self, refresh_time=30, delay=1, ban_wait=5, flush_after_x_refreshes=100, connection_timeout=60):
        """Main polling loop; never returns.

        refresh_time: seconds between archive refreshes.
        delay: seconds to sleep between individual paste downloads.
        ban_wait: minutes to wait after an ACCESS_DENIED response.
        flush_after_x_refreshes: refreshes after which the seen-id memory is reset.
        connection_timeout: seconds to wait after a connection failure.
        """
        count = 0
        while True:
            status, pastes = self.get_pastes()
            start_time = time.time()
            if status == self.OK:
                for paste in pastes:
                    paste_id = PyQuery(paste).attr('href')
                    self.new_checked_ids.append(paste_id)
                    if paste_id not in self.prev_checked_ids:
                        self.check_paste(paste_id)
                        time.sleep(delay)
                count += 1
                # Periodically drop the old id memory so it cannot grow
                # without bound.
                if count == flush_after_x_refreshes:
                    self.prev_checked_ids = self.new_checked_ids
                    count = 0
                else:
                    self.prev_checked_ids += self.new_checked_ids
                self.new_checked_ids = []
                elapsed_time = time.time() - start_time
                sleep_time = ceil(max(0, (refresh_time - elapsed_time)))
                if sleep_time > 0:
                    Logger().log('Waiting {:d} seconds to refresh...'.format(sleep_time), True)
                    time.sleep(sleep_time)
            elif status == self.ACCESS_DENIED:
                Logger().log('Damn! It looks like you have been banned (probably temporarily)', True, 'YELLOW')
                for n in range(0, ban_wait):
                    Logger().log('Please wait ' + str(ban_wait - n) + ' minute' + ('s' if (ban_wait - n) > 1 else ''))
                    time.sleep(60)
            elif status == self.CONNECTION_FAIL:
                # BUGFIX: connection_timeout is a float when supplied via -c;
                # '{:d}' would raise ValueError, so convert explicitly.
                Logger().log('Connection down. Waiting {:d} seconds and trying again'.format(int(connection_timeout)), True, 'RED')
                time.sleep(connection_timeout)
            elif status == self.OTHER_ERROR:
                Logger().log('Unknown error. Maybe an encoding problem? Trying again.', True, 'RED')
                time.sleep(1)
def parse_input():
    """Parse command-line options.

    Returns (refresh_time, delay, ban_wait, flush_after_x_refreshes,
    connection_timeout) with the documented defaults when flags are omitted.
    """
    option_specs = (
        ('-r', '--refresh-time', 'refresh_time', 'int', 30,
         'Set the refresh time (default: 30)'),
        ('-d', '--delay-time', 'delay', 'float', 1,
         'Set the delay time (default: 1)'),
        ('-b', '--ban-wait-time', 'ban_wait', 'int', 5,
         'Set the ban wait time (default: 5)'),
        ('-f', '--flush-after-x-refreshes', 'flush_after_x_refreshes', 'int', 100,
         'Set the number of refreshes after which memory is flushed (default: 100)'),
        ('-c', '--connection-timeout', 'connection_timeout', 'float', 60,
         'Set the connection timeout waiting time (default: 60)'),
    )
    parser = OptionParser()
    for short_flag, long_flag, dest, opt_type, default, help_text in option_specs:
        parser.add_option(short_flag, long_flag, help=help_text, dest=dest,
                          type=opt_type, default=default)
    opts, _args = parser.parse_args()
    return (opts.refresh_time, opts.delay, opts.ban_wait,
            opts.flush_after_x_refreshes, opts.connection_timeout)
# Script entry point: run the crawler with the CLI-provided settings.
# Ctrl-C (KeyboardInterrupt propagates up from the crawler loop) exits
# gracefully with a goodbye message.
try:
    Crawler ().start (*parse_input())
except KeyboardInterrupt:
    Logger ().log ( 'Bye! Hope you found what you were looking for :)', True )
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement