Guest User

Untitled

a guest
Dec 19th, 2011
54
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 4.61 KB | None | 0 0
  1. import math
  2. o = [
  3. '|de|  | f|Cl|nf|ed|au| i|ti|  |ma|ha|or|nn|ou| S|on|nd|on|',
  4. '|ry|  |is|th|is| b|eo|as|  |  |f |wh| o|ic| t|, |  |he|h |',
  5. '|ab|  |la|pr|od|ge|ob| m|an|  |s |is|el|ti|ng|il|d |ua|c |',
  6. '|he|  |ea|of|ho| m| t|et|ha|  | t|od|ds|e |ki| c|t |ng|br|',
  7. '|wo|m,|to|yo|hi|ve|u | t|ob|  |pr|d |s |us| s|ul|le|ol|e |',
  8. '| t|ca| t|wi| M|d |th|"A|ma|l |he| p|at|ap|it|he|ti|le|er|',
  9. '|ry|d |un|Th|" |io|eo|n,|is|  |bl|f |pu|Co|ic| o|he|at|mm|',
  10. '|hi|  |  |in|  |  | t|  |  |  |  |ye|  |ar|  |s |  |  |. |']
  11. # Constants
  12. gram = 4 # Use 4-letter freqs or 2-letter freqs
  13. max_dictsize = 3000 # Cap dictionary at a certain number of words to make dict creation much faster
  14. # Decapitalize
  15. for i in range(len(o)):
  16.   for j in range(len(o[i])):
  17.     if ord(o[i][j]) < 96 and ord(o[i][j]) > 64: o[i] = o[i][:j] + chr(ord(o[i][j]) + 32) + o[i][j+1:]
  18.   print o[i]
  19. # Open source file (eg. book)
  20. dat = open('source.txt').read()
  21. newdat = ""
  22. for i in range(len(dat)):
  23.   if ord(dat[i]) < 96 and ord(dat[i]) > 64: newdat += chr(ord(dat[i]) + 32)
  24.   else: newdat += dat[i]
  25. success = 1
  26. try:
  27.   # Try to open a dictionary if one was already made
  28.   dic = open('dict'+str(gram))
  29.   print "Dictionary found!"
  30. except: success = 0
  31. counts = {}
  32. if success == 1:
  33.   # Read dictionary file into counts
  34.   for line in dic.readlines():
  35.     if len(line) > gram + 2:
  36.       counts[line[:gram]] = int(line[gram + 1:-1])
  37. if success == 0:
  38.   # Make a new dictionary from source
  39.   dictsize = 0;
  40.   print "Creating new dictionary"
  41.   for i in range(1,len(dat) - gram + 1):
  42.     if newdat[i:i+gram] in counts.keys():
  43.       counts[newdat[i:i+gram]] += 1
  44.     elif dictsize < max_dictsize:
  45.       counts[newdat[i:i+gram]] = 1
  46.       dictsize += 1
  47.     if i % 1000 == 0: print str(i) + "/" + str(len(dat))
  48.   dic = open('dict'+str(gram),'w')
  49.   for w in counts:
  50.     dic.write(w + " " + str(counts[w]) + "\n")
  51.   dic.close()
  52. # Probabilities of an n-gram
  53. probs = {}
  54. for x in counts.keys():
  55.   probs[x] = (counts[x] + 1.0) / len(newdat)
  56. # Match probability of two columns
  57. def matchprob(a,b):
  58.   if a == b: return 0
  59.   p = 1
  60.   for i in range(len(o)):
  61.     combine = o[i][a * 3 + 1: a * 3 + 3] + o[i][b * 3 + 1: b * 3 + 3]
  62.     if gram == 4: key = combine
  63.     elif gram == 2: key = combine[1:3]
  64.     p *= probs[key] if key in probs.keys() else 1.0/len(newdat)
  65.   return p
  66. # Shows output
  67. def show(li):
  68.   strs = []
  69.   for i in range(len(o)):
  70.     strs.append("")
  71.     for j in li:
  72.       strs[i] += o[i][j*3+1:j*3+3]
  73.     print strs[i]
  74. print "Got probability dict"
  75. # Grab list of match probabilities' logarithms
  76. matchproblogs = []
  77. highest = 0
  78. lowest = 99999
  79. for i in range(19):
  80.   matchproblogs.append([])
  81.   for j in range(19):
  82.     mp = matchprob(i,j)
  83.     mlog = 99999 if mp == 0 else -math.log10(mp)
  84.     matchproblogs[i].append(mlog)
  85.     if mlog > highest and mlog < 99999: highest = mlog
  86.     if mlog < lowest: lowest = mlog
  87. print matchproblogs
  88. print "Highest: " + str(highest) + " Lowest: " + str(lowest)
  89. # Returns probability logarithm of an entire list of columns, any length 2-19
  90. def getproblog(li):
  91.   p = 0
  92.   for i in range(len(li) - 1):
  93.     p += matchproblogs[li[i]][li[i+1]]
  94.   for i in range(len(li)):
  95.     for j in range(i):
  96.       if li[i] == li[j]: return 99999
  97.   return p
  98. def geth(li): # A* heuristic
  99.   return getproblog(li) + lowest * (19 - len(li))
  100. # Search tree of possible permutations
  101. possibilities = []
  102. for i in range(0,19): possibilities.append([[i],geth([i])])
  103. place = 0
  104. def binsearch(val):
  105.   left = place
  106.   right = len(possibilities) - 1
  107.   while right - left > 1:
  108.     ave = int((left + right) / 2)
  109.     if possibilities[ave][1] < val: left = ave
  110.     else: right = ave
  111.   if possibilities[left][1] > val: return left
  112.   elif possibilities[right][1] > val: return right
  113.   return right + 1
  114. stepto = 3
  115. valid = []
  116. while place < len(possibilities):
  117.   if len(possibilities[place][0]) >= stepto:
  118.     valid.append(possibilities[place][0])
  119.     print "Found: " + str(len(valid))
  120.   if len(valid) > 100:
  121.     # To make our algorithm go much faster (ie. billions of years -> a few minutes) restart the search from midway points
  122.     place = 0
  123.     possibilities = []
  124.     for i in range(100 if stepto < 19 else 1):
  125.       print "Showing: " + str(i)
  126.       show(valid[i])
  127.       possibilities.append([valid[i],geth(valid[i])])
  128.     valid = []
  129.     stepto += 2
  130.     if stepto > 19: break
  131.   for i in range(19):
  132.     # Add new paths from given path
  133.     newli = possibilities[place][0] + [i]
  134.     newp = geth(newli)
  135.     if newp < 99999:
  136.       newplace = binsearch(newp)
  137.       possibilities = possibilities[:newplace] + [[newli,newp]] + possibilities[newplace:]
  138.   place += 1
Advertisement
Add Comment
Please, Sign In to add comment