Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# Script to find all the anagrams within a text file
# I used "Alice In Wonderland" as a test.
# This file is 149 kilobytes long, which is a trivial amount of data for a modern PC
# I decided to treat abbreviations such as hasn't and shan't as valid words and anagrams.
# Because "'" is not a regex word character, splitting on \W+ would cut "shan't" in two.
# Replacing ' with a capital Q in otherwise lowercase text keeps such words in one piece.
# Mike Kerry - June 2021
# acclivity2@gmail.com
import re  # regex module for word extraction
import time  # time module for computing elapsed time

# Stand-in for the apostrophe. It is safe only because the text is lowercased
# first, so a capital "Q" can never occur naturally in the processed text.
_APOSTROPHE_MARK = "Q"

# Compiled once: one or more non-word characters separate the words.
_WORD_SEPARATOR = re.compile(r"\W+")


def find_anagrams(text):
    """Return the groups of mutually anagrammatic words found in *text*.

    The text is lowercased, so matching is case-insensitive. Words with an
    internal apostrophe (e.g. "shan't") count as ordinary words. Tokens that
    begin or end with an apostrophe (quoted speech, possessives such as
    "dogs'") are rejected outright rather than trimmed.

    Args:
        text: The text to scan.

    Returns:
        A list of groups, each a list of two or more distinct words built
        from the same letters. Groups and the words inside them appear in
        order of first occurrence in the text.
    """
    marked = text.lower().replace("'", _APOSTROPHE_MARK)

    groups = {}  # sorted letters of a word -> distinct words with those letters
    for word in _WORD_SEPARATOR.split(marked):
        # re.split yields "" when the text starts or ends with a separator.
        if not word:
            continue
        # Reject tokens with an apostrophe at either end; "shan't" is OK.
        if word.startswith(_APOSTROPHE_MARK) or word.endswith(_APOSTROPHE_MARK):
            continue
        key = "".join(sorted(word))
        members = groups.setdefault(key, [])
        if word not in members:  # ignore repeats of a word already recorded
            members.append(word)

    # Only keys shared by more than one word are anagrams; revert Q to '.
    return [
        [word.replace(_APOSTROPHE_MARK, "'") for word in members]
        for members in groups.values()
        if len(members) > 1
    ]


def main(path="Alice.txt"):
    """Read the text file at *path*, print its anagram groups and the time taken.

    Args:
        path: Name of the text file to analyse.
    """
    # perf_counter is monotonic, so the elapsed time cannot be skewed by a
    # system clock adjustment the way a time.time() difference can.
    start = time.perf_counter()

    # The context manager closes the file even if reading fails.
    with open(path) as fin:
        text = fin.read()  # read the whole text file into one string in memory

    anagrams = find_anagrams(text)

    # All the text has been processed. We can now print all the anagrams found
    for group in anagrams:
        for word in group:
            print(word, end=" : ")
        print()
    print(len(anagrams), "anagrams found in", round(time.perf_counter() - start, 4), "seconds")  # print the elapsed time


if __name__ == "__main__":
    main()

# 73 anagrams found in 0.08 seconds
#
# Process finished with exit code 0
Add Comment
Please, Sign In to add comment