Untitled

import math
o = [
'|de|  | f|Cl|nf|ed|au| i|ti|  |ma|ha|or|nn|ou| S|on|nd|on|',
'|ry|  |is|th|is| b|eo|as|  |  |f |wh| o|ic| t|, |  |he|h |',
'|ab|  |la|pr|od|ge|ob| m|an|  |s |is|el|ti|ng|il|d |ua|c |',
'|he|  |ea|of|ho| m| t|et|ha|  | t|od|ds|e |ki| c|t |ng|br|',
'|wo|m,|to|yo|hi|ve|u | t|ob|  |pr|d |s |us| s|ul|le|ol|e |',
'| t|ca| t|wi| M|d |th|"A|ma|l |he| p|at|ap|it|he|ti|le|er|',
'|ry|d |un|Th|" |io|eo|n,|is|  |bl|f |pu|Co|ic| o|he|at|mm|',
'|hi|  |  |in|  |  | t|  |  |  |  |ye|  |ar|  |s |  |  |. |']
# Constants
gram = 4 # Use 4-letter freqs or 2-letter freqs
max_dictsize = 3000 # Cap dictionary at a certain number of words to make dict creation much faster
# Decapitalize
for i in range(len(o)):
  for j in range(len(o[i])):
    if ord(o[i][j]) < 96 and ord(o[i][j]) > 64: o[i] = o[i][:j] + chr(ord(o[i][j]) + 32) + o[i][j+1:]
  print o[i]
# Open source file (eg. book)
dat = open('source.txt').read()
newdat = ""
for i in range(len(dat)):
  if ord(dat[i]) < 96 and ord(dat[i]) > 64: newdat += chr(ord(dat[i]) + 32)
  else: newdat += dat[i]
success = 1
try:
  # Try to open a dictionary if one was already made
  dic = open('dict'+str(gram))
  print "Dictionary found!"
except: success = 0
counts = {}
if success == 1:
  # Read dictionary file into counts
  for line in dic.readlines():
    if len(line) > gram + 2:
      counts[line[:gram]] = int(line[gram + 1:-1])
if success == 0:
  # Make a new dictionary from source
  dictsize = 0;
  print "Creating new dictionary"
  for i in range(1,len(dat) - gram + 1):
    if newdat[i:i+gram] in counts.keys():
      counts[newdat[i:i+gram]] += 1
    elif dictsize < max_dictsize:
      counts[newdat[i:i+gram]] = 1
      dictsize += 1
    if i % 1000 == 0: print str(i) + "/" + str(len(dat))
  dic = open('dict'+str(gram),'w')
  for w in counts:
    dic.write(w + " " + str(counts[w]) + "\n")
  dic.close()
# Probabilities of an n-gram
probs = {}
for x in counts.keys():
  probs[x] = (counts[x] + 1.0) / len(newdat)
# Match probability of two columns
def matchprob(a,b):
  if a == b: return 0
  p = 1
  for i in range(len(o)):
    combine = o[i][a * 3 + 1: a * 3 + 3] + o[i][b * 3 + 1: b * 3 + 3]
    if gram == 4: key = combine
    elif gram == 2: key = combine[1:3]
    p *= probs[key] if key in probs.keys() else 1.0/len(newdat)
  return p
# Shows output
def show(li):
  strs = []
  for i in range(len(o)):
    strs.append("")
    for j in li:
      strs[i] += o[i][j*3+1:j*3+3]
    print strs[i]
print "Got probability dict"
# Grab list of match probabilities' logarithms
matchproblogs = []
highest = 0
lowest = 99999
for i in range(19):
  matchproblogs.append([])
  for j in range(19):
    mp = matchprob(i,j)
    mlog = 99999 if mp == 0 else -math.log10(mp)
    matchproblogs[i].append(mlog)
    if mlog > highest and mlog < 99999: highest = mlog
    if mlog < lowest: lowest = mlog
print matchproblogs
print "Highest: " + str(highest) + " Lowest: " + str(lowest)
# Returns probability logarithm of an entire list of columns, any length 2-19
def getproblog(li):
  p = 0
  for i in range(len(li) - 1):
    p += matchproblogs[li[i]][li[i+1]]
  for i in range(len(li)):
    for j in range(i):
      if li[i] == li[j]: return 99999
  return p
def geth(li): # A* heuristic
  return getproblog(li) + lowest * (19 - len(li))
# Search tree of possible permutations
possibilities = []
for i in range(0,19): possibilities.append([[i],geth([i])])
place = 0
def binsearch(val):
  left = place
  right = len(possibilities) - 1
  while right - left > 1:
    ave = int((left + right) / 2)
    if possibilities[ave][1] < val: left = ave
    else: right = ave
  if possibilities[left][1] > val: return left
  elif possibilities[right][1] > val: return right
  return right + 1
stepto = 3
valid = []
while place < len(possibilities):
  if len(possibilities[place][0]) >= stepto:
    valid.append(possibilities[place][0])
    print "Found: " + str(len(valid))
  if len(valid) > 100:
    # To make our algorithm go much faster (ie. billions of years -> a few minutes) restart the search from midway points
    place = 0
    possibilities = []
    for i in range(100 if stepto < 19 else 1):
      print "Showing: " + str(i)
      show(valid[i])
      possibilities.append([valid[i],geth(valid[i])])
    valid = []
    stepto += 2
    if stepto > 19: break
  for i in range(19):
    # Add new paths from given path
    newli = possibilities[place][0] + [i]
    newp = geth(newli)
    if newp < 99999:
      newplace = binsearch(newp)
      possibilities = possibilities[:newplace] + [[newli,newp]] + possibilities[newplace:]
  place += 1