Advertisement
alvations

Gale-Church

Jan 9th, 2013
247
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 4.56 KB | None | 0 0
  1. import math, codecs
  2. from itertools import izip
  3. try:
  4.   import scipy.stats.norm
  5.   norm_logsf = scipy.stats.norm.logsf
  6. except ImportError:
  7.   def norm_cdf(z):
  8.     """ Cumulative distribution for N(0, 1) """
  9.     t = 1 / (1 + 0.2316419 * z)
  10.     return (1 - 0.3989423 * math.exp(-z * z / 2) *
  11.         ((((1.330274429 * t - 1.821255978) * t
  12.           + 1.781477937) * t - 0.356563782) * t + 0.319381530) * t)
  13.  
  14.   def norm_logsf(z):
  15.     """ Logarithm of the survival function for N(0, 1) """
  16.     try:
  17.       return math.log(1 - norm_cdf(z))
  18.     except ValueError:
  19.       return float('-inf')
  20.  
# Alignment costs: -100*log(p(x:y)/p(1:1))
# Keys are (di, dj) pairs: the number of consecutive items taken from the
# first and from the second sequence by one alignment step in _align.
# The value is added to the length cost of that step.
bead_costs = {
   (1, 1): 0,
   (2, 1): 230,
   (1, 2): 230,
   (0, 1): 450,
   (1, 0): 450,
   (2, 2): 440
}

# Length cost parameters
# mean_xy multiplies the second sequence's total length in length_cost and
# variance_xy multiplies the mean length under the square root there.
# NOTE(review): the active values replace the commented-out 1 / 6.8 pair;
# presumably they were estimated from this script's corpus -- confirm.
#mean_xy = 1
#variance_xy = 6.8
mean_xy = 0.725443229284
variance_xy = 77.640515511
# Added to the log survival function in length_cost.
LOG2 = math.log(2)
  37.  
  38. def length_cost(sx, sy):
  39.   """ -100*log[p(|N(0, 1)|>delta)] """
  40.   lx, ly = sum(sx), sum(sy)
  41.   m = (lx + ly * mean_xy) / 2
  42.   try:
  43.     delta = (lx - ly * mean_xy) / math.sqrt(m * variance_xy)
  44.   except ZeroDivisionError:
  45.     return float('-inf')
  46.   return - 100 * (LOG2 + norm_logsf(abs(delta)))
  47.  
  48.  
  49. def _align(x, y):
  50.   m = {}
  51.   for i in range(len(x) + 1):
  52.     for j in range(len(y) + 1):
  53.       if i == j == 0:
  54.         m[0, 0] = (0, 0, 0)
  55.       else:
  56.         m[i, j] = min((m[i-di, j-dj][0] +
  57.                       length_cost(x[i-di:i], y[j-dj:j]) +
  58.                       bead_cost, di, dj)
  59.                       for (di, dj), bead_cost in bead_costs.iteritems()
  60.                       if i-di>=0 and j-dj>=0)
  61.  
  62.   i, j = len(x), len(y)
  63.   while True:
  64.     (c, di, dj) = m[i, j]
  65.     if di == dj == 0:
  66.       break
  67.     yield (i-di, i), (j-dj, j)
  68.     i -= di
  69.     j -= dj
  70.  
  71. # Returns sentence length
  72. def sent_length(sentence):
  73.   return len(" ".join(sentence).split(" "))
  74.   #return sum(1 for c in sentence if c != ' ')
  75.    
  76.  
  77. def align(sx, sy):
  78.   """ Align two groups of sentences """
  79.   '''print str(len(sx)), "sx:",sx
  80.  print " ".join(sx).split(" ")
  81.  print len(" ".join(sx).split(" "))'''
  82.   #cx = map(sent_length, sx)
  83.   #cy = map(sent_length, sy)
  84.   cx = map(sent_length,sx); cy = map(sent_length, sy)
  85.   for (i1, i2), (j1, j2) in reversed(list(_align(cx, cy))):
  86.     yield ' '.join(sx[i1:i2]), ' '.join(sy[j1:j2])
  87.  
  88.  
  89. # Reads file in alignment format, uses "#" to delimit paragraphs.
  90. def readFile(filename):
  91.   reader = codecs.open(filename, "r","utf8").readlines()
  92.   reader = reader[1:]
  93.   text = {} ;paragraph = []
  94.   doc = ""
  95.   for line in reader:
  96.     if line.strip() == "#" or line[0] == "#":
  97.       if paragraph != [] and doc != "":
  98.         text[doc] = paragraph
  99.         paragraph = []
  100.       doc = line.strip()[::-1].split("/")[0][::-1].split(".")[0]
  101.       continue
  102.     else:
  103.       paragraph.append(line.strip())
  104.   return text
  105.  
  106. s = readFile('all.eng')
  107. t = readFile('all.jpn')
  108. g = readFile('en-ja.human.clean')
  109.  
  110.  
  111. glen,res, srl, tgl = 0,0,0,0
  112. for sd,td in izip(sorted(s),sorted(t)):
  113.   src = s[sd]; trg = t[td]
  114.   answers = []
  115.   for (sentence_x, sentence_y) in align(src, trg):
  116.     answers.append(sentence_x + "\t" + sentence_y)
  117.     #print('%s ||| %s' % (sentence_x, sentence_y))
  118.   if g.has_key(sd):
  119.     print "##################################"
  120.     print "#", sd
  121.     print "###GOLD###"
  122.     results = 0
  123.     for gold in g[sd]:
  124.       match = [ans for ans in answers if ans.replace(" ","").lower() == gold.replace(" ","").lower()]
  125.       results+=len(match)
  126.       print gold
  127.     print "###ANS###"
  128.     print "RESULTS: ", results, "matches out of", str(len(g[sd])),"aligned sentences"
  129.     print "SRC_len:"+str(len(src)),"TRG_len:"+str(len(trg))
  130.     res +=results; srl +=len(src); tgl +=len(trg); glen+=len(g[sd])
  131.     for a in answers:
  132.       print a
  133.      
  134. print "%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%"    
  135. print "Match_with_gold:", str(res),"out of",str(glen),"gold aligned sentences"
  136. print "out of",srl,"source sentences and",tgl,"target sentences"
  137.  
  138. '''
  139. def read_blocks(f):
  140.  block = []
  141.  for l in f:
  142.    if not l.strip():
  143.      yield block
  144.      block = []
  145.    else:
  146.      block.append(l.strip())
  147.  if block:
  148.    yield block
  149.  
  150.  
  151. def main(corpus_x, corpus_y):
  152.  with open(corpus_x) as fx, open(corpus_y) as fy:
  153.    for block_x, block_y in izip(read_blocks(fx), read_blocks(fy)):
  154.      for (sentence_x, sentence_y) in align(block_x, block_y):
  155.        print('%s ||| %s' % (sentence_x, sentence_y))
  156.  
  157.  
  158. if __name__ == '__main__':
  159.  import sys
  160.  if len(sys.argv) != 3:
  161.    sys.stderr.write('Usage: %s corpus.x corpus.y\n' % sys.argv[0])
  162.    sys.exit(1)
  163.  main(*sys.argv[1:])'''
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement