
Bing Grabber

Jul 22nd, 2014
import urllib2
import urllib
import random
import re
import sys
import threading

class extracter:
    # Domains matching this pattern are noise (search engines, CMS and
    # hosting vendors, code-hosting sites, ...) and are never saved.
    blacklist = re.compile(r"[0-9]\.[0-9]|php|msn|drupal|e2bn|microsoft|soundcloud|wp-plugins|wpguide|godaddy|fan-sites|zone-h|osvdb|aa.org|aa.se|inspcloud|mondounix|yellowpages|iamtheproudownerofthelongestlongestlongestdomainnameinthisworld|cnet.com|encyclopedia|go.com|thepiratebay|wpbeginner|tripod|infospace|linkedin|ovh.net|a9.com|exploit|logo|music|altavista|github|gamesville|whowhere|gigablast|stackoverflow|teoma|download|wik|dictionary|theme|free|video|startpagina|startgoogle|lygo|dogpile|secure|security|hack|myspace|conduit|amfibi|lycos|blekko|metacrawler|exactseek|bing|dmoz|pathfinder|feedback|live\.com|w3|aol|yahoo|ask\.com|youtube|twitter|google|facebook|blogspot|wiki|sourceforge|phpmyadmin|forum|blog|share|wordpress|pastebin|4shared|tracker|python|host|lib|app|yandex|wphelp|helpcenter|digitalsports")
    # Matches result links in Bing's 2014-era result markup.
    result_re = re.compile('h3><a href="(.*?)" h=')

    def __init__(self):
        self.linkl = []  # unique domains collected so far
        # User agents rotated per request so the traffic looks less uniform.
        self.useragent = ['Mozilla/4.0 (compatible; MSIE 5.0; SunOS 5.10 sun4u; X11)',
                  'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.2.2pre) Gecko/20100207 Ubuntu/9.04 (jaunty) Namoroka/3.6.2pre',
                  'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Avant Browser;',
                  'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT 5.0)',
                  'Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.1)',
                  'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US; rv:1.9.0.6)',
                  'Microsoft Internet Explorer/4.0b1 (Windows 95)',
                  'Mozilla/4.0 (compatible; MSIE 5.0; AOL 4.0; Windows 95; c_athome)',
                  'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)',
                  'Mozilla/5.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.5 (like Gecko) (Kubuntu)',
                  'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.0; ZoomSpider.net bot; .NET CLR 1.1.4322)',
                  'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; QihooBot 1.0 [email protected])',
                  'Mozilla/4.0 (compatible; MSIE 5.0; Windows ME) Opera 5.11 [en]']

    def fetcher(self, url, semaphore, counter, lenlinks, key):
        """Fetch one Bing result page and save every new, non-blacklisted domain."""
        try:
            request = urllib2.Request(url, headers={"User-Agent": random.choice(self.useragent)})
            data = urllib2.urlopen(request).read()
            for link in self.result_re.findall(data):
                # Optional keyword filter given on the command line.
                if key and not re.search(key, link):
                    continue
                # Reduce the result URL to a bare http://domain form.
                link = "http://" + link.split("/")[2].replace("www.", "")
                if not self.blacklist.search(link) and link not in self.linkl:
                    self.linkl.append(link)
                    self.save(link)
            if key:
                sys.stdout.write("\rKeyword: %s || Links: %s || Fetching %s of %s" % (key, len(self.linkl), counter, lenlinks))
            else:
                sys.stdout.write("\rLinks: %s || Fetching %s of %s" % (len(self.linkl), counter, lenlinks))
            sys.stdout.flush()
        except Exception:
            pass  # a page that fails to download or parse is simply skipped
        semaphore.release()

    def save(self, link):
        """Append link to bing.txt unless the file already contains it."""
        try:
            check = open('bing.txt').read()
        except IOError:
            check = ""  # file does not exist yet
        if link in check:
            return False
        write = open('bing.txt', 'ab')
        write.write(link + "\r\n")
        write.close()

extract   = extracter()
dorks     = open(sys.argv[1]).readlines()
semaphore = threading.BoundedSemaphore(value=int(sys.argv[2]))
try:
    key = sys.argv[3]
except IndexError:
    key = ""

# One URL per 50-result page, covering the first ~250 results of each dork
# (first = 1, 51, 101, 151, 201).
url_list = []
for dork in dorks:
    dork = urllib.quote_plus(dork.strip())
    for first in range(1, 202, 50):
        url_list.append("http://www.bing.com/search?q=" + dork + "&count=50&first=" + str(first))
print "[ + ] %s URLs created to fetch" % len(url_list)

# One thread per result page; the bounded semaphore caps how many run at once.
threadlist = []
counter    = 1
for url in url_list:
    semaphore.acquire()
    thrd = threading.Thread(target=extract.fetcher, args=(url, semaphore, counter, len(url_list), key))
    thrd.start()
    threadlist.append(thrd)
    counter += 1
for t in threadlist:
    t.join()
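For reference, a typical invocation might look like the following (bing_grabber.py and dorks.txt are placeholder names; the dork file holds one search query per line):

    python bing_grabber.py dorks.txt 10 wordpress

The first argument names the dork file, the second caps the number of concurrent threads, and the optional third is a keyword that result URLs must match. Every new domain that passes the blacklist is appended to bing.txt in the working directory.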