Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import math
- o = [
- '|de| | f|Cl|nf|ed|au| i|ti| |ma|ha|or|nn|ou| S|on|nd|on|',
- '|ry| |is|th|is| b|eo|as| | |f |wh| o|ic| t|, | |he|h |',
- '|ab| |la|pr|od|ge|ob| m|an| |s |is|el|ti|ng|il|d |ua|c |',
- '|he| |ea|of|ho| m| t|et|ha| | t|od|ds|e |ki| c|t |ng|br|',
- '|wo|m,|to|yo|hi|ve|u | t|ob| |pr|d |s |us| s|ul|le|ol|e |',
- '| t|ca| t|wi| M|d |th|"A|ma|l |he| p|at|ap|it|he|ti|le|er|',
- '|ry|d |un|Th|" |io|eo|n,|is| |bl|f |pu|Co|ic| o|he|at|mm|',
- '|hi| | |in| | | t| | | | |ye| |ar| |s | | |. |']
- # Constants
- gram = 4 # Use 4-letter freqs or 2-letter freqs
- max_dictsize = 3000 # Cap dictionary at a certain number of words to make dict creation much faster
- # Decapitalize
- for i in range(len(o)):
- for j in range(len(o[i])):
- if ord(o[i][j]) < 96 and ord(o[i][j]) > 64: o[i] = o[i][:j] + chr(ord(o[i][j]) + 32) + o[i][j+1:]
- print o[i]
- # Open source file (eg. book)
- dat = open('source.txt').read()
- newdat = ""
- for i in range(len(dat)):
- if ord(dat[i]) < 96 and ord(dat[i]) > 64: newdat += chr(ord(dat[i]) + 32)
- else: newdat += dat[i]
- success = 1
- try:
- # Try to open a dictionary if one was already made
- dic = open('dict'+str(gram))
- print "Dictionary found!"
- except: success = 0
- counts = {}
- if success == 1:
- # Read dictionary file into counts
- for line in dic.readlines():
- if len(line) > gram + 2:
- counts[line[:gram]] = int(line[gram + 1:-1])
- if success == 0:
- # Make a new dictionary from source
- dictsize = 0;
- print "Creating new dictionary"
- for i in range(1,len(dat) - gram + 1):
- if newdat[i:i+gram] in counts.keys():
- counts[newdat[i:i+gram]] += 1
- elif dictsize < max_dictsize:
- counts[newdat[i:i+gram]] = 1
- dictsize += 1
- if i % 1000 == 0: print str(i) + "/" + str(len(dat))
- dic = open('dict'+str(gram),'w')
- for w in counts:
- dic.write(w + " " + str(counts[w]) + "\n")
- dic.close()
- # Probabilities of an n-gram
- probs = {}
- for x in counts.keys():
- probs[x] = (counts[x] + 1.0) / len(newdat)
- # Match probability of two columns
- def matchprob(a,b):
- if a == b: return 0
- p = 1
- for i in range(len(o)):
- combine = o[i][a * 3 + 1: a * 3 + 3] + o[i][b * 3 + 1: b * 3 + 3]
- if gram == 4: key = combine
- elif gram == 2: key = combine[1:3]
- p *= probs[key] if key in probs.keys() else 1.0/len(newdat)
- return p
- # Shows output
- def show(li):
- strs = []
- for i in range(len(o)):
- strs.append("")
- for j in li:
- strs[i] += o[i][j*3+1:j*3+3]
- print strs[i]
- print "Got probability dict"
- # Grab list of match probabilities' logarithms
- matchproblogs = []
- highest = 0
- lowest = 99999
- for i in range(19):
- matchproblogs.append([])
- for j in range(19):
- mp = matchprob(i,j)
- mlog = 99999 if mp == 0 else -math.log10(mp)
- matchproblogs[i].append(mlog)
- if mlog > highest and mlog < 99999: highest = mlog
- if mlog < lowest: lowest = mlog
- print matchproblogs
- print "Highest: " + str(highest) + " Lowest: " + str(lowest)
- # Returns probability logarithm of an entire list of columns, any length 2-19
- def getproblog(li):
- p = 0
- for i in range(len(li) - 1):
- p += matchproblogs[li[i]][li[i+1]]
- for i in range(len(li)):
- for j in range(i):
- if li[i] == li[j]: return 99999
- return p
- def geth(li): # A* heuristic
- return getproblog(li) + lowest * (19 - len(li))
- # Search tree of possible permutations
- possibilities = []
- for i in range(0,19): possibilities.append([[i],geth([i])])
- place = 0
- def binsearch(val):
- left = place
- right = len(possibilities) - 1
- while right - left > 1:
- ave = int((left + right) / 2)
- if possibilities[ave][1] < val: left = ave
- else: right = ave
- if possibilities[left][1] > val: return left
- elif possibilities[right][1] > val: return right
- return right + 1
- stepto = 3
- valid = []
- while place < len(possibilities):
- if len(possibilities[place][0]) >= stepto:
- valid.append(possibilities[place][0])
- print "Found: " + str(len(valid))
- if len(valid) > 100:
- # To make our algorithm go much faster (ie. billions of years -> a few minutes) restart the search from midway points
- place = 0
- possibilities = []
- for i in range(100 if stepto < 19 else 1):
- print "Showing: " + str(i)
- show(valid[i])
- possibilities.append([valid[i],geth(valid[i])])
- valid = []
- stepto += 2
- if stepto > 19: break
- for i in range(19):
- # Add new paths from given path
- newli = possibilities[place][0] + [i]
- newp = geth(newli)
- if newp < 99999:
- newplace = binsearch(newp)
- possibilities = possibilities[:newplace] + [[newli,newp]] + possibilities[newplace:]
- place += 1
Advertisement
Add Comment
Please, Sign In to add comment