Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
#!/usr/bin/env python3
# Convert text documents to an alphabetic JSON list of unique words
# By Anthony Hartup. From the fifth article in the Python Password Analyzer series:
# https://anthscomputercave.com/tutorials/code/python_password_cracker_word_list.html
#
# Files named by the original header:
#   words.txt   - JSON word list (see `wordlist` below)
#   source.txt  - text to harvest words from (see `wordfile` below)
#   article_four_word_grabber.py - presumably this script's file name
import json
from collections import OrderedDict

wordfile = "source.txt"  # Source text
wordlist = "words.txt"   # JSON list to store all_words array

# mode is 'new' to create new list, 'append' to add to existing list
mode = "append"

# Alphabetic array to store words. The key order is kept exactly as written
# because the per-letter report at the end iterates the dict in this order.
all_words = {"a": [], "b": [], "c": [], "d": [], "e": [], "f": [], "g": [], "h": [], "i": [],
             "j": [], "k": [], "l": [], "m": [], "n": [], "o": [], "p": [], "r": [], "s": [],
             "t": [], "u": [], "w": [], "x": [], "y": [], "q": [], "v": [], "z": [], "common": []}

# Destination in words array, letters or common
destination = "letters"

line_count = 0  # Number of lines in source file
word_count = 0  # Number of unique words collected

# Items to exclude from words
junk = ['"', ".", ",", "!", "?", ":", ";", "\n"]
numbers = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '0']  # not used by this script

# Typographic characters that may be in texts copied from .docx format.
# Order matters: the contraction suffixes must be removed before the bare
# right single quote, otherwise "they\u2019ll" would collapse to "theyll".
docx_junk = (
    u'\u201c',    # left double quotation mark
    u'\u201d',    # right double quotation mark
    u'\u2018',    # left single quotation mark
    u'\u2026',    # horizontal ellipsis
    u'\u2019ll',
    u'\u2019s',
    u'\u2019t',
    u'\u2019d',
    u'\u2019',    # any remaining right single quotation mark
)


def extract_words(token, valid_initials):
    """Return the cleaned word(s) contained in one whitespace-delimited token.

    token          -- raw text of a single word as it appears in the source.
    valid_initials -- container of characters a word may start with (the
                      word-list dict itself works: its keys are the letters).

    The result is a list: empty when the token is rejected, one entry for a
    plain word, several entries when the token is hyphenated.
    """
    word = token

    # Handle single quotes
    if "'" in word:
        if word.startswith("'"):
            # If used as quotation marks, remove all
            word = word.replace("'", "")
        else:
            # If used as apostrophe, remove apostrophe and letters after
            word = word.split("'")[0]

    # Remove junk characters
    for char in junk:
        word = word.replace(char, "")

    # Convert to lower-case and remove hidden (whitespace) characters
    word = word.lower().strip()

    # Strip typographic quotes *before* validating the first character, so a
    # word that opens with a curly quote is cleaned instead of discarded.
    for fragment in docx_junk:
        word = word.replace(fragment, "")

    # Check that word still has length, and begins with a letter
    if len(word) < 2 or word[0] not in valid_initials:
        return []

    # Split word if hyphenated, keeping every part that begins with a letter
    return [part for part in word.split("-") if part and part[0] in valid_initials]


def collect_words(lines, sections, destination="letters"):
    """Add every not-yet-listed word found in `lines` to `sections`.

    lines       -- iterable of text lines (an open file works).
    sections    -- word-list dict mapping each letter, plus "common", to a
                   list of words; it is updated in place.
    destination -- "letters" files a word under its first letter; any other
                   value files it under "common".

    Returns a tuple (lines_read, words_added).
    """
    # Build set mirrors of each section once so duplicate checks do not
    # rescan the lists for every word.
    seen = {}
    for key in sections:
        seen[key] = set(sections[key])

    lines_read = 0
    words_added = 0
    for line in lines:
        lines_read += 1
        # split() with no argument breaks on any whitespace run (spaces,
        # tabs, newlines) and never yields empty strings.
        for token in str(line).split():
            for word in extract_words(token, sections):
                initial = word[0]
                if word in seen[initial] or word in seen["common"]:
                    continue
                # Add word to either alphabetical or common section of list
                target = initial if destination == "letters" else "common"
                sections[target].append(word)
                seen[target].add(word)
                words_added += 1
    return lines_read, words_added


if __name__ == "__main__":
    # Load existing word list
    if mode != "new":
        with open(wordlist) as word_file:
            word_holder = json.load(word_file)
        if len(word_holder[0]) > 0:
            all_words = word_holder[0]
        for section in all_words:
            word_count += len(all_words[section])
        print("Words loaded: " + str(word_count))

    # Read the source text; the with-block closes the file when done
    with open(wordfile, 'r') as word_source:
        line_count, new_words = collect_words(word_source, all_words, destination)
    word_count += new_words

    # Write updated list to file
    holder = [all_words]
    with open(wordlist, 'w') as data_file:
        json.dump(holder, data_file)

    # Create array to hold number of words for each letter
    ordered_letters = {}
    for let in all_words:
        ordered_letters[let] = len(all_words[let])
        print(let)
        print(len(all_words[let]))

    # Print the letters in order of most words
    letter_order = sorted([(value, key) for (key, value) in ordered_letters.items()], reverse=True)
    for entry in letter_order:
        print(entry[1])

    print("Lines read: " + str(line_count))
    print("Unique words: " + str(word_count))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement