Advertisement
skip420

article_four_word_grabber

Jan 14th, 2021
1,059
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 5.50 KB | None | 0 0
  1. #/bin/bash
  2. # Convert text documents to an alphabetic JSON list of unique words
  3. # By Anthony Hartup. From the fifth article in the Python Password Analyzer series:
  4. # https://anthscomputercave.com/tutorials/code/python_password_cracker_word_list.html
  5. #words.txt
  6. #source.txt
  7. #article_four_word_grabber.py
  8.  
  9. import json
  10. from collections import OrderedDict
  11.  
  12.  
  13. wordfile = "source.txt"   # Source text
  14. wordlist = "words.txt"  # JSON list to store all_words array
  15.  
  16. # mode is 'new' to create new list, 'append' to add to existing list
  17. mode = "append"
  18.  
  19. # Alphabetic array to store words
  20. all_words = {"a": [], "b": [], "c": [], "d": [], "e": [], "f": [], "g": [], "h": [], "i": [], \
  21.             "j": [], "k": [], "l": [], "m": [], "n": [], "o": [], "p": [], "r": [], "s": [], \
  22.             "t": [], "u": [], "w": [], "x": [], "y": [],  "q": [], "v": [], "z": [], "common": []}
  23. # Destination in words array, letters or common
  24. destination = "letters"
  25.  
  26. line_count = 0  # Number of lines in source file
  27. word_count = 0 # Number of unique words collected
  28.  
  29. # Items to exclude from words
  30. junk = ['"', ".", ",", "!", "?",":", ";", "\n"]
  31. numbers = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '0']
  32.  
  33. # Load existing word list
  34. if mode != "new":
  35.     with open(wordlist) as word_file:
  36.         word_holder = json.load(word_file)
  37.         if len(word_holder[0]) > 0:
  38.             all_words = word_holder[0]
  39.             for word in all_words:
  40.                 word_count += len(all_words[word])
  41.             print("Words loaded: " + str(word_count))
  42.    
  43. # Open the source file to read text
  44. word_source = open(wordfile, 'r')
  45. for line in word_source:
  46.     if len(str(line)) > 1:
  47.         # Split line into individual words
  48.         words = str(line).split(" ")
  49.         for w in words:
  50.             # Check that word has length
  51.             if w != "":
  52.                 bare_word = str(w)
  53.                
  54.                 # Handle single quotes
  55.                 if "'" in bare_word:
  56.                     if bare_word.index("'") == 0 or bare_word.index("'") == -1:
  57.                         # If used as quotation marks, remove all
  58.                         bare_word = bare_word.replace("'", "")
  59.                     else:
  60.                         # If used as apostrophe, remove apostrophe and letters after
  61.                         bare_word = bare_word.split("'")[0]
  62.                        
  63.                 # Remove junk characters
  64.                 for j in junk:
  65.                     if j in bare_word:
  66.                         bare_word = bare_word.replace(j, "")
  67.                        
  68.                 # Convert word to lower-case
  69.                 bare_word = bare_word.lower()
  70.  
  71.                 # Remove hidden characters
  72.                 bare_word = bare_word.strip()
  73.                
  74.                 # Check that word still has length, and begins with a letter
  75.                 if len(bare_word) > 1 and bare_word[0] in all_words:
  76.                    
  77.                     # These may be in texts copied from .docx format
  78.                     bare_word = bare_word.replace(u'\u201d', '')
  79.                     bare_word = bare_word.replace(u'\u2026', '')
  80.                     bare_word = bare_word.replace(u'\u2019s', '')
  81.                     bare_word = bare_word.replace(u'\u2019t', '')
  82.                     bare_word = bare_word.replace(u'\u2019', '')
  83.                     bare_word = bare_word.replace(u'\u2019ll', '')
  84.                     bare_word = bare_word.replace(u'\u2019d', '')
  85.                     bare_word = bare_word.replace('\n', '')
  86.                    
  87.                     # Split word if hyphenated
  88.                     second_word = ""
  89.                     if "-" in bare_word:
  90.                         second_word = bare_word.split("-")[1]
  91.                         bare_word = bare_word.split("-")[0]
  92.                        
  93.                     # Add word to either alphabetical or common section of list
  94.                     if bare_word not in all_words[bare_word[0]] and bare_word not in all_words["common"]:
  95.                         if destination == "letters":
  96.                             all_words[bare_word[0]].append(bare_word)
  97.                         else:
  98.                             all_words["common"].append(bare_word)
  99.                         word_count += 1
  100.                        
  101.                     # Add second word if original was hyphenated    
  102.                     if second_word != "" and second_word[0] in all_words:                            
  103.                         if second_word not in all_words[second_word[0]] and second_word not in all_words["common"]:
  104.                             if destination == "letters":
  105.                                 all_words[second_word[0]].append(second_word)
  106.                             else:
  107.                                 all_words["common"].append(second_word)
  108.                             word_count += 1                                                              
  109.     line_count += 1
  110.    
  111. # Write updated list to file
  112. holder = [all_words]
  113. with open(wordlist, 'w') as data_file:
  114.     json.dump(holder, data_file)
  115.  
  116. # Created array to hold number of words for each letter
  117. ordered_letters = {}
  118. for let in all_words:
  119.     ordered_letters[let] = len(all_words[let])
  120.     print(let)
  121.     print(len(all_words[let]))
  122.    
  123. # Print the letters in order of most words
  124. letter_order = sorted([(value,key) for (key,value) in ordered_letters.items()], reverse=True)
  125. for i in letter_order:
  126.     print(i[1])
  127.    
  128. print("Lines read: " + str(line_count))
  129. print("Unique words: " + str(word_count))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement