# pyAnagramFinder

# Script to find all the anagrams within a text file
# I used "Alice In Wonderland" as a test.
# This file is 149 kilobytes long, which is a trivial amount of data for a modern PC

# I decided to treat contractions such as hasn't and shan't as valid words (and possible anagrams).
# To keep them in one piece, each ' is temporarily replaced by a capital Q in the otherwise lowercase text.

# Mike Kerry - June 2021
# [email protected]

import re                               # import regex module for word extraction
import time                             # import time module for computing elapsed time

def find_anagrams(text):
    """Return the groups of mutually anagrammatic words found in *text*.

    The text is lower-cased and every single quote is temporarily replaced
    by a capital "Q" so that contractions such as shan't survive the
    ``\\W+`` split as one word (shanQt).  Words that start or end with a
    quote are rejected.  Each distinct word is filed under a key made of
    its sorted letters; keys that collect more than one distinct word are
    anagram groups.

    Returns a list of groups in order of first appearance.  Each group is
    a list of the distinct words (quotes restored) in the order they were
    first seen.
    """
    # After lower() a capital Q cannot occur naturally, so it is a safe
    # stand-in for the apostrophe, and it counts as a regex word character.
    text = text.lower().replace("'", "Q")

    groups = {}                                 # sorted letters -> list of distinct words
    for word in re.split(r"\W+", text):
        # re.split yields "" when the text starts or ends with a separator.
        if not word:
            continue
        # ' at the start or end of a word: reject. But e.g. shan't is OK.
        if word.startswith("Q") or word.endswith("Q"):
            continue
        key = "".join(sorted(word))
        members = groups.setdefault(key, [])
        if word not in members:                 # ignore repeated occurrences of a word
            members.append(word)

    return [
        [word.replace("Q", "'") for word in members]    # revert Q to single quote
        for members in groups.values()
        if len(members) > 1                     # only keys where anagram(s) exist
    ]


def main(path="Alice.txt"):
    """Print every anagram group found in the file at *path*, then a summary.

    Each group is printed on its own line with every word followed by
    " : ".  The final line reports the number of groups and the elapsed
    time in seconds, rounded to four decimals.
    """
    start = time.perf_counter()                 # monotonic clock for elapsed time

    # NOTE(review): the file is opened with the platform default encoding,
    # exactly as before; pass encoding=... here if the input encoding is known.
    with open(path) as fin:                     # 'with' guarantees the file is closed
        text = fin.read()                       # read the whole text file into memory

    groups = find_anagrams(text)
    for group in groups:
        for word in group:
            print(word, end=" : ")
        print()

    print(len(groups), "anagrams found in", round(time.perf_counter() - start, 4), "seconds")


if __name__ == "__main__":
    main()

# 73 anagrams found in 0.08 seconds
#
# Process finished with exit code 0