acclivity

pyAnagramFinder

Jun 19th, 2021 (edited)
248
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 2.46 KB | None | 0 0
  1. # Script to find all the anagrams within a text file
  2. # I used "Alice In Wonderland" as a test.
  3. # This file is 149 kilobytes long, which is a trivial amount of data for a modern PC
  4.  
  5. # I decided to treat abbreviations such as hasn't and shan't as valid words and anagrams
  6. # Some fancy processing involving capital Q in place of ' in otherwise lowercase text fixes that
  7.  
  8. # Mike Kerry - June 2021
  9. # acclivity2@gmail.com
  10.  
  11. import re                               # import regex module for word extraction
  12. import time                             # import time module for computing elapsed time
  13.  
  14. st = time.time()                        # get the Unix time at the start
  15. fin = open("Alice.txt")
  16. mystr = fin.read()                      # read the whole text file into one string in memory
  17. mystr = mystr.lower()                   # Convert the whole shooting match to lower case
  18. mystr = mystr.replace("'", "Q")         # single quote becomes "Q" so e.g. shan't -> shanQt
  19. wordlist = re.split(r"\W+", mystr)      # split the string into words, making a list of words
  20. anagdict = {}                           # create empty dictionary. It will be a dict of lists.
  21. for word in wordlist:                   # process each word from the text
  22.     if word.startswith("Q") or word.endswith("Q"):        # Test for ' at start or end of word ...
  23.         continue                                          # ... and reject. But e.g. shan't is OK
  24.     sortword = "".join(sorted(word))        # sort the letters to make a dictionary key
  25.     if sortword in anagdict:                # if this key already exists ... we have an anagram
  26.         if word not in anagdict[sortword]:  # test to ignore duplicate anagrams
  27.             anagdict[sortword].append(word)     # add this new word to this dictionary list
  28.     else:
  29.         anagdict[sortword] = [word]         # create a new dictionary entry as a single item list
  30.  
  31. # All the text has been processed. We can now print all the anagrams found
  32. ctr = 0
  33. for anaglist in anagdict.values():
  34.     if len(anaglist) > 1:                   # only select entries where anagram(s) exist
  35.         ctr += 1                            # count number of anagrams found
  36.         for s in anaglist:
  37.             print(s.replace("Q", "'"), end=" : ")       # Revert Q to single quote
  38.         print()
  39.  
  40. print(ctr, "anagrams found in", round(time.time() - st, 4), "seconds")      # print the elapsed time
  41.  
  42. # 73 anagrams found in 0.08 seconds
  43. #
  44. # Process finished with exit code 0
  45.  
Add Comment
Please, Sign In to add comment