# Gale-Church sentence alignment

import math, codecs
from itertools import izip
try:
  # "import scipy.stats.norm" always raises ImportError: norm is a distribution
  # object living in the scipy.stats module, not a submodule, so the SciPy
  # branch could never be taken.  log_ndtr(x) is log(cdf(x)) for N(0, 1), and
  # by symmetry log(sf(z)) == log(cdf(-z)).
  from scipy.special import log_ndtr as _log_ndtr

  def norm_logsf(z):
    """ Logarithm of the survival function for N(0, 1) """
    return _log_ndtr(-z)
except ImportError:
  def norm_cdf(z):
    """ Cumulative distribution for N(0, 1) (polynomial approximation) """
    if z < 0:
      # The polynomial below is only valid for z >= 0; mirror negative inputs.
      return 1 - norm_cdf(-z)
    t = 1 / (1 + 0.2316419 * z)
    poly = ((((1.330274429 * t - 1.821255978) * t
              + 1.781477937) * t - 0.356563782) * t + 0.319381530) * t
    # 2.0 keeps the exponent a float even when z is an int under Python 2.
    return 1 - 0.3989423 * math.exp(-z * z / 2.0) * poly

  def norm_logsf(z):
    """ Logarithm of the survival function for N(0, 1) """
    try:
      return math.log(1 - norm_cdf(z))
    except ValueError:
      # The tail probability underflowed to 0, so its logarithm is -inf.
      return float('-inf')

# Alignment costs: -100*log(p(x:y)/p(1:1))
# Each key is a bead shape (source sentences, target sentences); _align adds
# the value to the length cost of every bead of that shape.
bead_costs = {
  (1, 1): 0,
  (2, 1): 230,
  (1, 2): 230,
  (0, 1): 450,
  (1, 0): 450,
  (2, 2): 440,
}

# Length cost parameters used by length_cost: the factor applied to the
# target length and the variance that scales the length difference.
# Values that were previously in use (kept for reference):
#   mean_xy = 1
#   variance_xy = 6.8
mean_xy = 0.725443229284
variance_xy = 77.640515511

# Natural logarithm of 2, precomputed once for length_cost.
LOG2 = math.log(2)

def length_cost(sx, sy):
  """ -100*log[p(|N(0, 1)|>delta)]

  sx and sy are sequences of sentence lengths; each group is summed before
  the two totals are compared.  Returns the length cost of pairing the two
  groups, or float('-inf') when the normalising term is zero.
  """
  total_x = sum(sx)
  total_y = sum(sy)
  expected_x = total_y * mean_xy
  midpoint = (total_x + expected_x) / 2
  try:
    delta = (total_x - expected_x) / math.sqrt(midpoint * variance_xy)
  except ZeroDivisionError:
    # Happens when both totals are zero.
    # NOTE(review): -inf would win every min() in _align -- confirm intended.
    return float('-inf')
  log_two_sided_tail = LOG2 + norm_logsf(abs(delta))
  return -100 * log_two_sided_tail


def _align(x, y):
  """ Find the cheapest bead sequence for two lists of sentence lengths.

  x and y hold the sentence lengths of the two texts.  Yields
  ((x_start, x_end), (y_start, y_end)) index ranges, one per bead, walking
  from the end of both texts back to the beginning.
  """
  size_x, size_y = len(x), len(y)
  # table[i, j] = (best cost of aligning x[:i] with y[:j], di, dj) where
  # (di, dj) is the shape of the last bead; (0, 0) marks the start cell.
  table = {(0, 0): (0, 0, 0)}
  for i in range(size_x + 1):
    for j in range(size_y + 1):
      if i == 0 and j == 0:
        continue
      candidates = []
      for (di, dj), bead_cost in bead_costs.items():
        if di > i or dj > j:
          # This bead would reach before the start of one of the texts.
          continue
        previous = table[i - di, j - dj][0]
        cost = previous + length_cost(x[i - di:i], y[j - dj:j]) + bead_cost
        candidates.append((cost, di, dj))
      table[i, j] = min(candidates)

  # Follow the stored bead shapes back from the final cell.
  i, j = size_x, size_y
  while True:
    _, di, dj = table[i, j]
    if di == 0 and dj == 0:
      break
    yield (i - di, i), (j - dj, j)
    i, j = i - di, j - dj

def sent_length(sentence):
  """ Returns sentence length

  The items of sentence are joined with single spaces and the result is split
  on single spaces again; the number of pieces is the length.  Iterating a
  plain string yields its characters, so "ab cd" counts as 6 and "" as 1.
  """
  joined = " ".join(sentence)
  pieces = joined.split(" ")
  return len(pieces)
  # Alternative measure (disabled): sum(1 for c in sentence if c != ' ')


def align(sx, sy):
  """ Align two groups of sentences

  sx and sy are lists of sentences.  Yields (source_text, target_text) pairs
  in document order, where each side is the space-joined sentences of one
  bead (possibly the empty string).
  """
  lengths_x = [sent_length(sentence) for sentence in sx]
  lengths_y = [sent_length(sentence) for sentence in sy]
  # _align walks backwards from the end of the texts, so flip its output.
  beads = list(_align(lengths_x, lengths_y))
  beads.reverse()
  for (x_start, x_end), (y_start, y_end) in beads:
    source_text = ' '.join(sx[x_start:x_end])
    target_text = ' '.join(sy[y_start:y_end])
    yield source_text, target_text


def readFile(filename):
  """ Reads file in alignment format, uses "#" to delimit paragraphs.

  The first line of the file is skipped.  Every later line that starts with
  "#" opens a new document whose name is the last "/"-separated component of
  the line, cut at its first ".".  All other lines are stripped and collected
  as that document's paragraph.

  Returns a dict mapping document name -> list of stripped lines.  Documents
  without any lines are not included.
  """
  # The context manager closes the handle; it used to be left open.
  with codecs.open(filename, "r", "utf8") as handle:
    lines = handle.readlines()[1:]

  text = {}
  paragraph = []
  doc = ""
  for line in lines:
    stripped = line.strip()
    if stripped == "#" or line.startswith("#"):
      if paragraph and doc != "":
        text[doc] = paragraph
        paragraph = []
      # NOTE(review): lines seen before the first "#" header stay in
      # `paragraph` and end up in the first named document -- confirm intended.
      doc = stripped.rsplit("/", 1)[-1].split(".")[0]
    else:
      paragraph.append(stripped)

  # Store the final document too; previously it was dropped unless the file
  # happened to end with a "#" line.
  if paragraph and doc != "":
    text[doc] = paragraph
  return text

s = readFile('all.eng')
t = readFile('all.jpn')
g = readFile('en-ja.human.clean')


glen,res, srl, tgl = 0,0,0,0
for sd,td in izip(sorted(s),sorted(t)):
  src = s[sd]; trg = t[td]
  answers = []
  for (sentence_x, sentence_y) in align(src, trg):
    answers.append(sentence_x + "\t" + sentence_y)
    #print('%s ||| %s' % (sentence_x, sentence_y))
  if g.has_key(sd):
    print "##################################"
    print "#", sd
    print "###GOLD###"
    results = 0
    for gold in g[sd]:
      match = [ans for ans in answers if ans.replace(" ","").lower() == gold.replace(" ","").lower()]
      results+=len(match)
      print gold
    print "###ANS###"
    print "RESULTS: ", results, "matches out of", str(len(g[sd])),"aligned sentences"
    print "SRC_len:"+str(len(src)),"TRG_len:"+str(len(trg))
    res +=results; srl +=len(src); tgl +=len(trg); glen+=len(g[sd])
    for a in answers:
      print a

print "%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%"
print "Match_with_gold:", str(res),"out of",str(glen),"gold aligned sentences"
print "out of",srl,"source sentences and",tgl,"target sentences"

# Disabled command-line entry point, kept for reference only.  Everything
# below sits inside a bare string literal, so none of it is executed: it holds
# a reader for blank-line-separated blocks and a main() that aligns two corpus
# files block by block and prints "source ||| target" lines.
'''
def read_blocks(f):
  block = []
  for l in f:
    if not l.strip():
      yield block
      block = []
    else:
      block.append(l.strip())
  if block:
    yield block


def main(corpus_x, corpus_y):
  with open(corpus_x) as fx, open(corpus_y) as fy:
    for block_x, block_y in izip(read_blocks(fx), read_blocks(fy)):
      for (sentence_x, sentence_y) in align(block_x, block_y):
        print('%s ||| %s' % (sentence_x, sentence_y))


if __name__ == '__main__':
  import sys
  if len(sys.argv) != 3:
    sys.stderr.write('Usage: %s corpus.x corpus.y\n' % sys.argv[0])
    sys.exit(1)
  main(*sys.argv[1:])'''