zeerx7

commoncrawl grabber

Nov 4th, 2020 (edited)
122
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 1.89 KB | None | 0 0
  1. import re,requests, time
  2. from urlparse import urlparse
  3. #karena susah sekali buat download data JSON nya
  4. #mendingan pake ini aja :D
  5.  
  6. # Sorry Acak2an gan
  7. # Dah Males buat tools ginian soalnya :"
  8.  
  9. # Kalo mau recorde,,, ya tau diri lah :D
  10. print """
  11.  
  12.  
  13.  
  14.  
  15.  urlsearch.commoncrawl.org Grabber
  16.  
  17.  Coded by Zeerx7 # XploitSec ID
  18.  
  19. """
  20. print 'Masukan Domain! (ex: jp)'
  21. d = raw_input('Domain: ')                                                                            #d = 'uk'
  22. u = 'http://urlsearch.commoncrawl.org/?q='+d+'&start='
  23. tmp = []
  24. def x():                                                                                              #co = count_page() #karena sering 502, mendingan gk usah di pakai function ini!                      print 'Total/jumlah halaman Yang akan di grab, (ex: 10000)'
  25.  co = raw_input('Total Page: ')
  26.  print 'domain: %s Total Page Yang akan di grab: %s' % (d,co)
  27.  time.sleep(2)
  28.  print 'start'
  29.  #exit()
  30.  for o in range(int(co)):
  31.   print 'page: %s' % (o)
  32.   try:
  33.    #exit()
  34.    rum = 0+(int(o)-1)*20 #rumus suku ke N bjir, untung w nasih inget rumusnya :v
  35.    z = requests.get(u+str(rum))
  36.    f = z.text.replace('\n','')
  37.    #print f
  38.    r = re.findall('http(.+?)</a></li>',f)
  39.    #print r
  40.    for j in r:
  41.     if 'Content-Type' in j:pass
  42.     else:
  43.        uu = 'http'+j
  44.        uuu = urlparse(uu).netloc
  45.        if uuu in tmp:pass
  46.        else:
  47.            save('domains.txt',uuu)
  48.            tmp.append(uuu)
  49.            print uuu+' [Saved!]'
  50.        print uu
  51.    #print tmp
  52.    print z.status_code
  53.   except:pass
  54.  
  55. def count_page():
  56.    c = u+str(999999999)
  57.    p = requests.get(c)
  58.    #print p.text.replace('\n','')
  59.    #print c
  60.    h = re.findall('">(.+?)</a>',p.text)
  61.    b = len(h)
  62.    #print h
  63.    #print b
  64.    #print b-1
  65.    print c
  66.    print p.status_code
  67.    return h[b-2]
  68. def save(a,b):
  69.   fx = open(a, "a")
  70.   fx.write(b+"\n")
  71.   fx.close()
  72. ##print count_page()
  73. x()
Add Comment
Please, Sign In to add comment