# Untitled

#python3
from collections import defaultdict, Counter
import pprint as pp
# Key rows of the Dvorak layout, top row first, left to right.
# NOTE(review): DV_KEYS is not referenced by the code below; the DVORAK
# figures recorded at the end of this file were presumably produced by
# iterating DV_KEYS instead of KEYS in the loop below -- confirm.
DV_KEYS = [
    "',.pyfgcrl",
    "aoeuidhtns",
    ";qjkxbmwvz",
]
# Key rows of the QWERTY layout, top row first, left to right.
KEYS = [
    "qwertyuiop",
    "asdfghjkl;",
    "zxcvbnm,./",
]
# MIRRORS maps each key to the key at the mirrored position of the same row
# (first <-> last, second <-> second-to-last, ...). The mapping is symmetric.
MIRRORS = {}
for row in KEYS:
    # Pairing the row with its own reverse visits every position, so both
    # directions of each pair are filled in; for a row of odd length the
    # middle key maps to itself instead of being left out of the table.
    for left, right in zip(row, reversed(row)):
        MIRRORS[left] = right
pp.pprint(MIRRORS)


def key(word):
    """Return the mirror-insensitive form of *word*.

    The word is lower-cased and every character is replaced by whichever of
    the character itself and its MIRRORS partner compares smaller, so words
    that differ only by mirrored keys yield the same string.

    Raises KeyError if a character has no entry in MIRRORS.
    """
    canonical = []
    for ch in word.lower():
        partner = MIRRORS[ch]
        canonical.append(ch if ch <= partner else partner)
    return "".join(canonical)

# Group every word of the system dictionary by its mirror-insensitive key.
# Words that end up in the same group "collide": they cannot be told apart
# once mirrored keys are treated as the same key.
words_by_key = defaultdict(set)
total_words = 0  # number of lines read, including repeats and bad words
bad_words = set()
with open("/usr/share/dict/words") as f:
    for word in f:
        word = word.strip().lower()
        try:
            words_by_key[key(word)].add(word)
        except KeyError:
            # key() raises KeyError for characters that are not in MIRRORS.
            bad_words.add(word)
        total_words += 1
# Count the distinct words actually stored in the groups. Using
# total_words - len(bad_words) here would count lines that lower-case to the
# same word several times, while the groups (sets) hold them once, so the
# percentages printed below would not add up to 100.
processed_words = sum(len(group) for group in words_by_key.values())

print(total_words, "words total")
# Sorted so the sample shown is the same on every run (set order is arbitrary).
print(len(bad_words), "words unable to process: ", sorted(bad_words)[:10])

# lens maps group size -> number of groups of that size.
lens = Counter(len(val) for val in words_by_key.values())
print("Size of groups: (size of 1 means no collisions, 2 means 1 collision, etc.")
print(lens.most_common())

print("Probability of a word having N collisions:")
for numcoll, count in sorted(lens.items()):
    # numcoll * count is the number of words living in groups of this size;
    # a word in a group of size numcoll collides with numcoll - 1 others.
    probability = numcoll * count / processed_words * 100
    print(numcoll-1, probability)

cc = 0
print("Some sample collisions:")
for wds in words_by_key.values():
    if len(wds) > 1:
        cc += 1
        print(wds)
        # The check runs after printing, so 11 groups are shown in total.
        if cc > 10:
            break

# QWERTY
# 235886 words total
# 2 words unable to process:  ['jean-pierre', 'jean-christophe']
# Size of groups: (size of 1 means no collisions, 2 means 1 collision, etc.
# [(1, 221334), (2, 5101), (3, 602), (4, 165), (5, 38), (6, 20), (7, 7), (8, 1)]
# Probability of a word having N collisions:
# 0 93.83171389326958
# 1 4.325007206932221
# 2 0.7656305641756117
# 3 0.27979854504756574
# 4 0.08054806599854165
# 5 0.05087246273592105
# 6 0.020772922283834427
# 7 0.00339149751572807
# Some sample collisions:
# {'dub', 'dun'}
# {'killable', 'kissable'}
# {'percival', 'perceval'}
# {'it', 'ey'}
# {'scruf', 'scurf'}
# {'silverness', 'silverbill'}
# {'singer', 'linger'}
# {'wade', 'wake', 'wadi'}
# {'jag', 'fag'}
# {'wryly', 'outly'}
# {'pegasian', 'pegasean'}

# DVORAK
# 235886 words total
# 2 words unable to process:  ['jean-pierre', 'jean-christophe']
# Size of groups: (size of 1 means no collisions, 2 means 1 collision, etc.
# [(1, 227220), (2, 3017), (3, 305), (4, 46), (5, 2), (6, 1)]
# Probability of a word having N collisions:
# 0 96.3270081904665
# 1 2.5580370012378966
# 2 0.387902528361398
# 3 0.0780044428617456
# 4 0.004239371894660088
# 5 0.002543623136796052
# Some sample collisions:
# {'apathism', 'agathism'}
# {'balk', 'balm'}
# {'unary', 'hoary'}
# {'cypris', 'cypria'}
# {'indiscreetly', 'indiscretely'}
# {'pump', 'gump'}
# {'yond', 'food'}
# {'getae', 'geest'}
# {'trig', 'trip'}
# {'apselaphesia', 'apselaphesis'}
# {'tach', 'each'}