Guest User

wordgen

a guest
May 18th, 2014
292
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 5.57 KB | None | 0 0
### wordlist generator ###
  2. import re, sys, os, urllib
  3. ### custom useragent   ###
  4. class AppURLopener(urllib.FancyURLopener):
  5.     version = "Mozilla/5.0(compatable;MSIE 9.0; Windows NT 6.1; Trident/5.0)"
  6.  
  7. urllib._urlopener = AppURLopener()
  8. uopen   = urllib.urlopen
  9. uencode = urllib.urlencode
  10.  
###########################
###########
### Helper Functions
###
  15.  
  16. def ls(file):
  17.     print(open(file,'rb').read())
  18.  
  19. def google(query, numget=10, verbose=0):
  20.    
  21.     numget = int(numget)
  22.     start = 0
  23.     results = []
  24.  
  25.     if verbose == 2:
  26.         print("[+]Getting " + str(numget) + " results")
  27.        
  28.         while len(results) < numget:
  29.             print("[+]" + str(len(results)) + " so far...")
  30.             data = uopen("https://www.google.com/search?q="+query+"&star="+str(start))
  31.  
  32.             if data.code != 200:
  33.                 print("Error " + str(data.code))
  34.                 break
  35.  
  36.             results.extend(re.findall("<a href="/%201D([^/%201D]*)/%201D">class=(?:1|s)",data.read()))
  37.             print(data.read())
  38.             start += 10
  39.  
  40.             if verbose == 2:
  41.                 print("[+] Got " + str(numget) + " results")
  42.  
  43.             return results[:numget]
  44.  
  45. def genWordlist(targetlist, word_reg, outfile, verbose=0, quotes=True):
  46.     quote_reg = re.compile("\"([^\"]{2,35})\"")
  47.     ###
  48.     ### Initialize Engine
  49.     ###
  50.     words = []
  51.     append = False
  52.     total_wb = 0
  53.     dircount = 0
  54.     totalcount = 0
  55.     ###
  56.     ### Read the old list
  57.     ###
  58.     if outfile.startswith("+"):
  59.         outfile = outfile[1:]
  60.         words = open(outfile).readlines()
  61.         append = True
  62.         total_wb = len(words)
  63.     ###
  64.     ### Hit the sources
  65.     ###
  66.  
  67.     for target in targetlist:
  68.         data = None
  69.         ###
  70.         ### Get the data
  71.         ###
  72.         if os.path.isfile(target):
  73.             data = open(target).read()
  74.         elif os.path.isdir(target):
  75.             dircount += 1 # for stats in end
  76.             subtargets = os.listdir(target)
  77.  
  78.             for subtarget in subtargets:
  79.                 if os.path.isfile(subtarget):
  80.                     data = "\n\n" + os.read(subtarget)
  81.                 else:
  82.                     targetlist.append(subtarget) # We will get it next time around
  83.         else:
  84.             try:
  85.                 res = uopen(target)
  86.  
  87.                 if res.code != 200:
  88.                     print("[!]Error: " + str(res.code))
  89.                 else:
  90.                     data = res.read()
  91.             except Exception as e:
  92.                 print("[!]" + str(e))
  93.  
  94.         totalcount += 1
  95.  
  96.         if not data:
  97.             if verbose:
  98.                 print("[-]No data from source: " + str(target))
  99.                 continue
  100.             else:
  101.                 if verbose:
  102.                     sys.stdout.write(str(totalcount) + " of ~" + str(len(targetlist)) + " sources complete\r")
  103.                     sys.stdout.flush()
  104.                 else:
  105.                     pass
  106.         ###
  107.         ### Format the data
  108.         ###
  109.         data = re.sub("(<!--|-->)","",data) # keep comments as normal text
  110.         data = re.sub("</?[^>]+>","",data)  # remove html tags
  111.         data = re.sub("\r|\n","",data)      # make it a strait file
  112.         ###
  113.         ### Add the new words
  114.         ###
  115.         allwords = word_reg.findall(data)
  116.         allquotes = quote_reg.findall(data)
  117.  
  118.         for quote in allquotes:
  119.             allwords.append(quote)
  120.             allwords.append(quote.replace(" ",""))
  121.  
  122.             #flw = ''
  123.             #for each in quote.split(''):
  124.                 # if len(each) > 0:
  125.                     #flw += each[0]
  126.                     # if flw:
  127.                         #allwords.append(flw)
  128.  
  129.         for word in allwords:
  130.             ###
  131.             ### Mangle
  132.             ###
  133.  
  134.             if( word.endswith('.') or
  135.                 word.endswith(',') or
  136.                 word.endswith('!') or
  137.                 word.endswith('?') or
  138.                 word.endswith(';') or
  139.                 word.endswith('"') or
  140.                 word.endswith('\'')):
  141.                 allwords.append(word.strip('.,!?;"\''))
  142.  
  143.                 if re.match("\A.*\.(jpg|png|txt|com|html)\Z", word):
  144.                     allwords.append(word.rsplit('.',1)[0])
  145.             ###
  146.             ### Add
  147.             ###
  148.             if not word in words:
  149.                 words.append(word)
  150.  
  151.     total_wa = len(words)
  152.     total_s = len(targetlist)
  153.     words.sort()
  154.     of = open(outfile, 'w')
  155.  
  156.     for word in words:
  157.         of.write(word+"\n")
  158.  
  159.     of.close()
  160.  
  161.     if verbose:
  162.         print("[+]Complete!")
  163.         print("[+]" + str(total_wb) + " words in the list.")
  164.  
  165.         if append:
  166.             print("[+]" + str(total_wa - total_wb) + " are new.")
  167.  
  168.         print("[+]Collected from " + str(total_s - dircount) + " sources.")
  169.  
  170. if __name__ == "__main__":
  171.     ###
  172.     ### User input
  173.     ###
  174.  
  175.     verbose = 2
  176.     minlen = 6
  177.     maxlen = None
  178.     find_quotes = True
  179.  
  180.     wordrules = ["A-z", "A-z0-9","A-z0-9*-.!$#@%"]
  181.     wordrule = None
  182.  
  183.     while not wordrule:
  184.         print("Select a word rule:")
  185.  
  186.         for i,rule in enumerate(wordrules):
  187.             print(str(i + 1) + " -- " + wordrules[i])
  188.             print(str(i + 2) + " Custom (WARNING: ADVANCED!! not validation)")
  189.  
  190.             que = raw_input("Rule[1-" + str(i+2) + "]:")
  191.  
  192.             try:
  193.                 que = int(que.strip())
  194.             except:
  195.                 que = -1
  196.             if que == i+2:
  197.                 wordrule = raw_input("Wordrule:").strip()
  198.             elif que < 1 or que > i+2:
  199.                 print("Not a valid selection")
  200.             else:
  201.                 wordrule = wordrules[i-1]
  202.  
  203.     if not minlen:
  204.         minlen = 3
  205.  
  206.     outfile = raw_input("Filename:")
  207.  
  208.     if os.path.exists(outfile) and not outfile.startswith("+"):
  209.         que = raw_input("[?]This file exists! Overwrite[y|N]:")
  210.         if not 'y' in que.lower():
  211.             exit(0)
  212.  
  213.     targetlist = raw_input("Input target list, separate by ';' no space or quote\n" + "Use %g<query>%<numresults> to use google query sites\n" +
  214.         "Targets:")
  215.     targetlist = targetlist.split(';')
  216.  
  217.     for target in targetlist:
  218.         if re.match("%g[^%]+%[0-9]+", target):
  219.             if verbose == 2:
  220.                 print("[+]Google sources: " + target[2:].split('%')[0])
  221.  
  222.         new_targets = google(target[2:].split("%")[0],target[2:].split("%")[1],verbose)
  223.         targetlist.remove(target)
  224.         targetlist.extend(new_targets)
  225.  
  226.     if verbose == 2:
  227.         print("[+]Gathering data from the following targets:")
  228.  
  229.         for target in targetlist:
  230.             print("[+]" + target)
  231.             print("================================================")
  232.  
  233.     ###
  234.     ### Prepare and call
  235.     ###
  236.     word_reg = re.compile("([" + wordrule + "]{" + str(minlen) + "," + str(maxlen) + "})")
  237.     genWordlist(targetlist,word_reg,outfile,verbose)
Advertisement
Add Comment
Please, Sign In to add comment