skip420

article_four_word_grabber

Jan 14th, 2021
739
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  # article_four_word_grabber.py
"""Convert a text document into an alphabetic JSON list of unique words.

By Anthony Hartup. From the fifth article in the Python Password Analyzer
series:
https://anthscomputercave.com/tutorials/code/python_password_cracker_word_list.html

Files used:
    source.txt  plain-text document the words are read from
    words.txt   JSON file holding a one-element list, ``[all_words]``

The script reads ``source.txt`` line by line, cleans every
whitespace-separated token (quotes, punctuation, typographic characters,
hyphens), and stores each previously unseen word either under its first
letter or under the ``"common"`` key, depending on ``destination``.  The
updated list is written back to ``words.txt`` and per-letter statistics are
printed.
"""

import json
from collections import OrderedDict


wordfile = "source.txt"   # Source text
wordlist = "words.txt"    # JSON list to store all_words array

# mode is 'new' to create new list, 'append' to add to existing list
mode = "append"

# Alphabetic array to store words
all_words = {"a": [], "b": [], "c": [], "d": [], "e": [], "f": [], "g": [], "h": [], "i": [],
             "j": [], "k": [], "l": [], "m": [], "n": [], "o": [], "p": [], "r": [], "s": [],
             "t": [], "u": [], "w": [], "x": [], "y": [], "q": [], "v": [], "z": [], "common": []}
# Destination in words array, letters or common
destination = "letters"

line_count = 0  # Number of lines in source file
word_count = 0  # Number of unique words collected

# Items to exclude from words
junk = ['"', ".", ",", "!", "?", ":", ";", "\n"]
numbers = ['1', '2', '3', '4', '5', '6', '7', '8', '9', '0']

# Typographic characters that may be in texts copied from .docx format.
# Order matters: the contraction suffixes have to be removed before the bare
# right single quote, otherwise they could never match.
_DOCX_REMOVALS = (
    u'\u201c', u'\u201d', u'\u2018', u'\u2026',
    u'\u2019ll', u'\u2019s', u'\u2019t', u'\u2019d',
    u'\u2019',
)


def _extract_words(token):
    """Return the cleaned, lower-case words contained in one raw token.

    A token is one whitespace-separated chunk of the source text.  The
    result holds zero or more words: a hyphenated token yields one word per
    part.  Only parts longer than one character whose first character is a
    key of ``all_words`` are returned.
    """
    word = token

    # Handle single quotes
    if "'" in word:
        if word.startswith("'"):
            # If used as quotation marks, remove all
            word = word.replace("'", "")
        else:
            # If used as apostrophe, remove apostrophe and letters after
            word = word.split("'")[0]

    # Remove junk characters
    for unwanted in junk:
        word = word.replace(unwanted, "")

    # Lower-case first so the contraction suffixes below match any casing,
    # then drop surrounding hidden characters.
    word = word.lower().strip()

    # Clean typographic characters *before* validating, so that a word
    # opening with a curly quote is not rejected for its first character.
    for fragment in _DOCX_REMOVALS:
        word = word.replace(fragment, "")

    # Split hyphenated words and validate every part the same way.
    return [part for part in word.split("-")
            if len(part) > 1 and part[0] in all_words]


def _add_word(word):
    """Store *word* if it is not already known; return True when added.

    A word counts as known when it is already in the section for its first
    letter or in the ``"common"`` section.
    """
    section = word[0]
    if word in _seen[section] or word in _seen["common"]:
        return False
    target = section if destination == "letters" else "common"
    all_words[target].append(word)
    _seen[target].add(word)
    return True


# Load existing word list
if mode != "new":
    try:
        with open(wordlist) as word_file:
            word_holder = json.load(word_file)
    except FileNotFoundError:
        # Appending to a list that does not exist yet is the same as
        # starting a new one.
        print("No existing word list found, starting a new one")
        word_holder = []
    if word_holder and word_holder[0]:
        # Merge onto the defaults so that every letter section exists even
        # if the stored file lacks some of them.
        all_words.update(word_holder[0])
        word_count = sum(len(section) for section in all_words.values())
        print("Words loaded: " + str(word_count))

# Shadow sets give constant-time duplicate checks; the lists in all_words
# stay the JSON-serialisable, order-preserving store.
_seen = {section: set(words) for section, words in all_words.items()}

# Open the source file to read text
with open(wordfile, 'r') as word_source:
    for line in word_source:
        # split() with no argument also separates on tabs and skips the
        # empty strings produced by repeated spaces.
        for token in line.split():
            for word in _extract_words(token):
                if _add_word(word):
                    word_count += 1
        line_count += 1

# Write updated list to file
holder = [all_words]
with open(wordlist, 'w') as data_file:
    json.dump(holder, data_file)

# Created array to hold number of words for each letter
ordered_letters = {}
for let in all_words:
    ordered_letters[let] = len(all_words[let])
    print(let)
    print(len(all_words[let]))

# Print the letters in order of most words
letter_order = sorted([(value, key) for (key, value) in ordered_letters.items()], reverse=True)
for i in letter_order:
    print(i[1])

print("Lines read: " + str(line_count))
print("Unique words: " + str(word_count))
RAW Paste Data

Adblocker detected! Please consider disabling it...

We've detected AdBlock Plus or some other adblocking software preventing Pastebin.com from fully loading.

We don't have any obnoxious sound or popup ads; we actively block these annoying types of ads!

Please add Pastebin.com to your ad blocker whitelist or disable your adblocking software.

×