Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- ### wordlist generator ###
- import re, sys, os, urllib
- ### custom useragent ###
class AppURLopener(urllib.FancyURLopener):
    """FancyURLopener that announces itself with a browser User-Agent.

    ``version`` is the class attribute urllib uses as the opener's
    user agent, so overriding it replaces the default "Python-urllib/x.y".
    """

    # Well-formed Internet Explorer 9 identifier.  The previous value
    # ("Mozilla/5.0(compatable;MSIE 9.0; ...") was misspelled and lacked the
    # space before the parenthesis, so it did not look like a real browser.
    version = "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)"


# Install the custom opener as the one urllib.urlopen() uses internally.
urllib._urlopener = AppURLopener()

# Short aliases used by the rest of the script.
uopen = urllib.urlopen
uencode = urllib.urlencode
- ###########################
- ###########
- ### Helper Function
- ###
def ls(file):
    """Print the raw contents of the file at path ``file`` to stdout.

    The file is read in binary mode and printed in one go.
    Raises IOError/OSError if the path cannot be opened.
    """
    # The context manager closes the handle even if read() or print() fails;
    # the previous version never closed it.
    with open(file, 'rb') as handle:
        print(handle.read())
def google(query, numget=10, verbose=0):
    """Collect result links for ``query`` from Google's HTML search pages.

    Parameters:
        query   -- raw search text; it is URL-encoded before being sent.
        numget  -- number of links wanted (anything ``int()`` accepts).
        verbose -- any true value prints progress; 2 also prints the
                   start/finish banners.

    Returns a list of at most ``numget`` link strings.  Fewer are returned
    when a request fails or a result page contains no matching links.
    """
    numget = int(numget)
    start = 0
    results = []
    # NOTE(review): the pattern was garbled in the pasted source (the quote
    # characters were mangled); it is reconstructed here with plain double
    # quotes.  Confirm it against the markup Google actually serves.
    link_reg = re.compile('<a href="([^"]*)" class=(?:1|s)')
    if verbose == 2:
        print("[+]Getting " + str(numget) + " results")
    while len(results) < numget:
        if verbose:
            print("[+]" + str(len(results)) + " so far...")
        # "start" is the paging offset; the old "&star=" typo meant every
        # request fetched the first page again.  urlencode also escapes
        # spaces and other special characters in the query.
        url = "https://www.google.com/search?" + uencode(
            [("q", query), ("start", start)])
        data = uopen(url)
        if data.code != 200:
            print("Error " + str(data.code))
            break
        links = link_reg.findall(data.read())
        if not links:
            # No progress is possible; without this the loop never ends.
            break
        results.extend(links)
        start += 10
    results = results[:numget]
    if verbose == 2:
        # Report what was actually found, not what was asked for.
        print("[+] Got " + str(len(results)) + " results")
    return results
# Characters that mark a word as "ends with punctuation" and get stripped.
_TRAILING_PUNCTUATION = ('.', ',', '!', '?', ';', '"', '\'')
_STRIP_CHARS = '.,!?;"\''
# Words that look like file or host names also contribute their stem.
_KNOWN_SUFFIX_REG = re.compile(r"\A.*\.(jpg|png|txt|com|html)\Z")
# Double-quoted phrases of 2 to 35 characters.
_QUOTE_REG = re.compile("\"([^\"]{2,35})\"")


def _read_text(path):
    """Return the contents of ``path``, or None (after reporting) on failure."""
    try:
        with open(path) as handle:
            return handle.read()
    except (IOError, OSError, UnicodeError) as err:
        print("[!]" + str(err))
        return None


def _read_directory(directory, queue):
    """Return the joined text of the files in ``directory``.

    Sub-directories are appended to ``queue`` (as full paths) so the caller
    visits them later.
    """
    chunks = []
    for name in sorted(os.listdir(directory)):
        # listdir() yields bare names; they must be joined with the directory
        # before they can be tested or opened.
        path = os.path.join(directory, name)
        if os.path.isfile(path):
            text = _read_text(path)
            if text:
                chunks.append(text)
        elif os.path.isdir(path):
            queue.append(path)
    return "\n\n".join(chunks)


def _fetch_url(url):
    """Return the body fetched from ``url`` via ``uopen``, or None on failure."""
    # Best effort: any failure is reported and the source is skipped.
    try:
        res = uopen(url)
        if res.code != 200:
            print("[!]Error: " + str(res.code))
            return None
        return res.read()
    except Exception as e:
        print("[!]" + str(e))
        return None


def _expand_word(word):
    """Return ``word`` followed by every mangled variant derived from it."""
    variants = []
    pending = [word]
    while pending:
        current = pending.pop()
        variants.append(current)
        if current.endswith(_TRAILING_PUNCTUATION):
            pending.append(current.strip(_STRIP_CHARS))
        if _KNOWN_SUFFIX_REG.match(current):
            pending.append(current.rsplit('.', 1)[0])
    return variants


def genWordlist(targetlist, word_reg, outfile, verbose=0, quotes=True):
    """Harvest words from every target and write them, sorted, to ``outfile``.

    A target is a file path, a directory path (its files are read and its
    sub-directories are queued), or anything else, which is passed to the
    module-level ``uopen`` as a URL.

    Parameters:
        targetlist -- list of target strings; it is copied, never modified.
        word_reg   -- compiled regular expression whose ``findall`` result on
                      the cleaned text supplies the candidate words.
        outfile    -- output path.  A leading "+" means append: words already
                      in that file are kept and merged with the new ones.
        verbose    -- any true value prints progress and a final summary.
        quotes     -- when true (the default), double-quoted phrases are added
                      as well, both verbatim and with spaces removed.

    Returns None; the result is the rewritten ``outfile``.
    """
    ###
    ### Read the old list
    ###
    words = []
    append = False
    if outfile.startswith("+"):
        outfile = outfile[1:]
        append = True
        if os.path.isfile(outfile):
            with open(outfile) as existing:
                # Drop line endings: keeping them made every old word look
                # unique and doubled the newlines when writing back.
                old_words = [line.rstrip("\r\n") for line in existing]
            known = set()
            for old_word in old_words:
                if old_word and old_word not in known:
                    known.add(old_word)
                    words.append(old_word)
    total_wb = len(words)
    seen = set(words)  # O(1) duplicate checks

    ###
    ### Hit the sources
    ###
    queue = list(targetlist)  # private copy; grows when directories are found
    dircount = 0
    totalcount = 0
    position = 0
    while position < len(queue):
        target = queue[position]
        position += 1

        if os.path.isfile(target):
            data = _read_text(target)
        elif os.path.isdir(target):
            dircount += 1  # for stats in end
            data = _read_directory(target, queue)
        else:
            data = _fetch_url(target)
        totalcount += 1

        if not data:
            if verbose:
                print("[-]No data from source: " + str(target))
            continue
        if verbose:
            sys.stdout.write(str(totalcount) + " of ~" + str(len(queue)) + " sources complete\r")
            sys.stdout.flush()

        ###
        ### Format the data
        ###
        data = re.sub("(<!--|-->)", "", data)  # keep comments as normal text
        data = re.sub("</?[^>]+>", "", data)   # remove html tags
        # Flatten to one line.  A space (not "") is used so the last word of
        # one line is not fused with the first word of the next.
        data = re.sub("\r|\n", " ", data)

        ###
        ### Add the new words
        ###
        candidates = word_reg.findall(data)
        if quotes:
            for quote in _QUOTE_REG.findall(data):
                candidates.append(quote)
                candidates.append(quote.replace(" ", ""))
        for candidate in candidates:
            for word in _expand_word(candidate):
                # Stripping punctuation can leave an empty string; skip it.
                if word and word not in seen:
                    seen.add(word)
                    words.append(word)

    total_wa = len(words)
    words.sort()
    with open(outfile, 'w') as of:
        of.writelines(word + "\n" for word in words)

    if verbose:
        print("[+]Complete!")
        # Report the final size; the old code printed the pre-run count here.
        print("[+]" + str(total_wa) + " words in the list.")
        if append:
            print("[+]" + str(total_wa - total_wb) + " are new.")
        print("[+]Collected from " + str(len(queue) - dircount) + " sources.")
if __name__ == "__main__":
    # Interactive entry point: ask for a character rule, an output file and a
    # list of targets, then build the word list.
    ###
    ### User input
    ###
    verbose = 2
    minlen = 6
    maxlen = None  # None means "no upper bound"
    find_quotes = True
    # "A-Za-z" replaces "A-z", which also matched [ \ ] ^ _ and backtick.
    # NOTE(review): "*-." in the last rule is a range (* + , - .), kept as is.
    wordrules = ["A-Za-z", "A-Za-z0-9", "A-Za-z0-9*-.!$#@%"]
    custom_choice = len(wordrules) + 1
    wordrule = None
    while not wordrule:
        print("Select a word rule:")
        for number, rule in enumerate(wordrules, 1):
            print(str(number) + " -- " + rule)
        print(str(custom_choice) + " Custom (WARNING: ADVANCED!! not validation)")
        que = raw_input("Rule[1-" + str(custom_choice) + "]:")
        try:
            que = int(que.strip())
        except ValueError:
            que = -1
        if que == custom_choice:
            wordrule = raw_input("Wordrule:").strip()
        elif que < 1 or que > custom_choice:
            print("Not a valid selection")
        else:
            # Index by the user's answer; the old code used the stale loop
            # index and therefore always picked the same rule.
            wordrule = wordrules[que - 1]
    if not minlen:
        minlen = 3
    outfile = raw_input("Filename:")
    if os.path.exists(outfile) and not outfile.startswith("+"):
        que = raw_input("[?]This file exists! Overwrite[y|N]:")
        if 'y' not in que.lower():
            sys.exit(0)
    raw_targets = raw_input("Input target list, separate by ';' no space or quote\n" + "Use %g<query>%<numresults> to use google query sites\n" +
                            "Targets:")
    # Build a new list instead of removing/extending the one being iterated,
    # which used to skip the entry following each google target.
    targetlist = []
    for target in raw_targets.split(';'):
        if not target:
            continue  # ignore empty entries such as a trailing ';'
        google_spec = re.match("%g([^%]+)%([0-9]+)", target)
        if google_spec:
            query, numresults = google_spec.groups()
            if verbose == 2:
                print("[+]Google sources: " + query)
            targetlist.extend(google(query, numresults, verbose))
        else:
            targetlist.append(target)
    if verbose == 2:
        print("[+]Gathering data from the following targets:")
        for target in targetlist:
            print("[+]" + target)
        print("================================================")
    ###
    ### Prepare and call
    ###
    # An open-ended quantifier is "{m,}".  Rendering None as "{m,None}" is not
    # a valid quantifier, so re treated it as literal text.
    upper_bound = "" if maxlen is None else str(maxlen)
    word_reg = re.compile("([" + wordrule + "]{" + str(minlen) + "," + upper_bound + "})")
    genWordlist(targetlist, word_reg, outfile, verbose, find_quotes)
Advertisement
Add Comment
Please, Sign In to add comment