# article_four_word_grabber

#!/usr/bin/env python3
# Convert text documents to an alphabetic JSON list of unique words
# By Anthony Hartup. From the fifth article in the Python Password Analyzer series:
# https://anthscomputercave.com/tutorials/code/python_password_cracker_word_list.html
#words.txt
#source.txt
#article_four_word_grabber.py

import json
from collections import OrderedDict


# Files used by the script
wordfile = "source.txt"   # Text document the words are taken from
wordlist = "words.txt"  # JSON file that stores the all_words structure

# 'new' starts from an empty list; any other value ('append') loads the
# existing wordlist file first and adds to it
mode = "append"

# Word store: one empty list per starting letter, plus a "common" list.
# The key order below is the order the sections appear in the saved file.
all_words = {section: [] for section in [*"abcdefghijklmnoprstuwxyqvz", "common"]}

# Where new words are filed: "letters" puts each word under its first
# letter, any other value puts it in the "common" list
destination = "letters"

# Running totals
line_count = 0  # Lines read from the source file
word_count = 0  # Unique words held in all_words

# Characters stripped out of every word
junk = list('".,!?:;') + ["\n"]
numbers = list("1234567890")

# Load the existing word list unless a fresh list was requested.
# The file is expected to hold a one-element JSON array whose only item is
# the all_words dictionary (the layout this script writes when it finishes).
if mode != "new":
    try:
        with open(wordlist) as word_file:
            word_holder = json.load(word_file)
    except FileNotFoundError:
        # First run in 'append' mode: there is nothing to load yet, so carry
        # on with the empty all_words structure instead of crashing.
        print("No existing word list found at " + wordlist + ", starting a new one")
        word_holder = []

    # An empty array or an empty dictionary leaves the defaults in place
    if word_holder and len(word_holder[0]) > 0:
        all_words = word_holder[0]
        # Count the words already stored in every section
        for section in all_words:
            word_count += len(all_words[section])
        print("Words loaded: " + str(word_count))

# Read the source text and collect every unique word into all_words.
#
# Each line is split on whitespace, each piece is cleaned by _clean_token,
# hyphenated pieces are broken into all of their parts, and every part that
# is longer than one character and starts with a key of all_words is stored
# once.  line_count and word_count are updated along the way.

# Leftovers that may be present in texts copied from .docx format.  They are
# removed in this order: the contraction endings must come before the bare
# apostrophe, otherwise the longer patterns could never match.
_DOCX_LEFTOVERS = (
    u'\u201c', u'\u201d', u'\u2018', u'\u2026',
    u'\u2019s', u'\u2019t', u'\u2019ll', u'\u2019d',
    u'\u2019',
)


def _clean_token(token):
    """Return token lower-cased, without quotes, junk characters or .docx leftovers."""
    if "'" in token:
        if token.startswith("'"):
            # Leading quote: used as quotation marks, remove them all
            token = token.replace("'", "")
        else:
            # Used as apostrophe: drop the apostrophe and the letters after it
            token = token.split("'")[0]

    # Remove junk characters
    for unwanted in junk:
        token = token.replace(unwanted, "")

    # Lower-case before the .docx cleanup so endings such as u'\u2019S' match
    token = token.lower()

    for leftover in _DOCX_LEFTOVERS:
        token = token.replace(leftover, "")
    return token


# Every word already stored in any section, so the duplicate check does not
# have to scan the lists for each word read
_known_words = {stored for section in all_words.values() for stored in section}

# The with-statement closes the source file even if processing fails
with open(wordfile, 'r') as word_source:
    for line in word_source:
        # split() without arguments breaks on any run of whitespace (spaces,
        # tabs, line endings) and never yields empty pieces
        for token in line.split():
            # A hyphenated word contributes each of its parts separately
            for part in _clean_token(token).split("-"):
                # Keep only parts with length that begin with a letter
                if len(part) < 2 or part[0] not in all_words:
                    continue
                if part in _known_words:
                    continue

                # File the word under its first letter or in the common list
                section = part[0] if destination == "letters" else "common"
                all_words[section].append(part)
                _known_words.add(part)
                word_count += 1
        line_count += 1

# Save the collected words as JSON, wrapped in a one-element list
holder = [all_words]
with open(wordlist, 'w') as data_file:
    data_file.write(json.dumps(holder))

# Record how many words each section of all_words holds, printing every
# section name followed by its total
ordered_letters = {}
for section, entries in all_words.items():
    total = len(entries)
    ordered_letters[section] = total
    print(section)
    print(total)

# Print the section names from the largest total to the smallest
# (sections with equal totals are ordered by name, descending)
letter_order = sorted(
    ((total, section) for section, total in ordered_letters.items()),
    reverse=True,
)
for _, section in letter_order:
    print(section)

# Final summary of the run
for label, amount in (("Lines read: ", line_count), ("Unique words: ", word_count)):
    print(label + str(amount))