calmean

import codecs, math
from itertools import izip, izip_longest
from scipy.integrate import quad
import scipy.stats as stat
import numpy as np
from scipy import stats


from numpy import arange,array,ones,linalg
from pylab import plot,show
from numpy import loadtxt, zeros, ones, array, linspace, logspace, cov
from pylab import *

# Reads a file in alignment format; lines starting with "#" delimit paragraphs.
def readFile(filename):
  """Read a UTF-8 alignment file and return its paragraphs.

  The first line of the file is skipped.  Every following line that is
  "#" once stripped, or that starts with "#", closes the current
  paragraph; every other line is stripped and added to the current
  paragraph.

  Returns a list of paragraphs, each a list of stripped line strings.
  A final paragraph that is not followed by a "#" line is kept as long
  as it contains at least one non-empty line.
  """
  # The context manager closes the handle even if decoding fails.
  with codecs.open(filename, "r", "utf8") as handle:
    lines = handle.readlines()[1:]
  text = []
  paragraph = []
  for line in lines:
    if line.strip() == "#" or line.startswith("#"):
      text.append(paragraph)
      paragraph = []
    else:
      paragraph.append(line.strip())
  # Do not lose text after the last delimiter; trailing blank lines alone
  # do not count as a paragraph.
  if any(paragraph):
    text.append(paragraph)
  return text

# Word or character length of a text string (e.g. a joined paragraph
# taken from the output of readFile()).
def calculateLength(text, option="char"):
  """Return the length of `text` in words or in characters.

  With option "word" the text is split on single spaces and the pieces
  are counted; any other option returns len(text).
  """
  if option != "word":
    return len(text)
  return len(text.split(" "))

'''def text2flat(text):
  txt = ""
  for p in text:
    txt+=" "; txt+=" ".join(p)
  return txt'''

# Calculate the mean length ratio: mean = len(text2) / len(text1)
def calculateMean(para1, para2, option='char'):
  """Return the target/source length ratio of two paragraphs.

  para1 and para2 are iterated item by item and the calculateLength()
  of every item is summed up per side.  With option "gale-church" the
  ratio is fixed to 1 and nothing is measured.

  Raises ZeroDivisionError when the summed source length is zero.
  """
  if option == "gale-church":
    return int(1)
  src_total = sum(calculateLength(item, option) for item in para1)
  trg_total = sum(calculateLength(item, option) for item in para2)
  return trg_total / float(src_total)

# Estimate the variance parameter from paragraph length differences.
def calculateVariance(text1, text2, option='char'):
  """Return the slope of squared length difference against source length.

  text1 and text2 are first tried as file names for readFile().  If they
  cannot be opened they are treated as the text itself: a list/tuple is
  taken as one paragraph of lines, anything else as a one-line paragraph.

  For every paragraph the length (see calculateLength) of its lines
  joined by spaces is measured.  A degree-1 polyfit of the squared
  target-minus-source differences over the source lengths is computed
  and its slope is returned.

  Side effects: prints the source lengths, target lengths, squared
  differences and the slope, and shows a plot through plotGraph().
  """
  try:
    src = readFile(text1); trg = readFile(text2)
  except (IOError, OSError, TypeError):
    # Wrap raw text so that each paragraph is a list of lines; joining a
    # bare string would put a space between every character.
    src = [text1] if isinstance(text1, (list, tuple)) else [[text1]]
    trg = [text2] if isinstance(text2, (list, tuple)) else [[text2]]
  srcPara_lens = [calculateLength(" ".join(p), option) for p in src]  # x-axis
  trgPara_lens = [calculateLength(" ".join(p), option) for p in trg]
  # Paragraphs are paired in order; pairing stops at the shorter side.
  diffsquares = [math.pow(trg_len - src_len, 2)  # y-axis
                 for src_len, trg_len in zip(srcPara_lens, trgPara_lens)]

  print(srcPara_lens); print(trgPara_lens); print(diffsquares)
  (m, b) = polyfit(srcPara_lens, diffsquares, 1)
  print(m); plotGraph(srcPara_lens, diffsquares)
  return m

def plotGraph(xlist,ylist,label4x="x-axis",label4y="y-axis"):
  """Show a scatter plot of the points with their fitted straight line.

  A degree-1 polyfit of ylist over xlist is evaluated at xlist and drawn
  together with the raw points, a grid and the two axis labels; show()
  is then called.  Always returns None.
  """
  slope, intercept = polyfit(xlist, ylist, 1)
  fitted = polyval([slope, intercept], xlist)
  plot(xlist, fitted)
  scatter(xlist, ylist)
  grid(True)
  xlabel(label4x)
  ylabel(label4y)
  show()
  return None

def list2data(xlist,ylist):
  """Pair two sequences position by position into a numpy array.

  Each row of the result is [x, y]; pairing stops at the shorter input.
  The builtin zip is used so the function does not depend on the
  Python-2-only itertools.izip.
  """
  return np.array([[x, y] for x, y in zip(xlist, ylist)])

def getCovarianceMatrix(xlist, ylist):
  """Return np.cov() of the paired x and y values.

  The inputs are paired position by position (stopping at the shorter
  one) into rows [x, y]; the transposed array is handed to np.cov, so x
  and y are the two variables.  The builtin zip is used so the function
  does not depend on the Python-2-only itertools.izip.
  """
  data = np.array([[x, y] for x, y in zip(xlist, ylist)])
  return np.cov(data.T)

def calculateSigma(l1,l2,mean,variance):
  """Return (l2 - l1*mean) / sqrt(variance*l2).

  Raises ZeroDivisionError when variance*l2 is zero and ValueError when
  it is negative.
  """
  # math.sqrt instead of math.pow(..., 1/2): under Python 2 the exponent
  # 1/2 is integer division and evaluates to 0, making the divisor 1.0.
  return (l2 - (l1*mean)) / math.sqrt(variance*l2)

#def calculateProbSigma(sigma):

def func(z):
    """Return exp(-z**2 / 2) for a scalar z."""
    half_square = -1*math.pow(z,2)/2
    return exp(half_square)

# Equation 26.2.17 from Abramowitz and Stegun (1964: p. 932).
# The error term is ignored because it is negligible:
# |error(x)| < 7.5*math.pow(10,-8)
def pnorm(z):
  """Approximate the standard normal cumulative probability at scalar z.

  The polynomial approximation only holds for z >= 0, so a negative z is
  evaluated through the symmetry pnorm(z) = 1 - pnorm(-z).
  """
  if z < 0:
    return 1 - pnorm(-z)
  t = 1/float(1+0.2316419*z) # t = 1/(1+p*z), p = 0.2316419
  # Fifth-degree polynomial in t (no constant term), in Horner form.
  poly = t * (0.319381530 +
         t * (-0.356563782 +
         t * (1.781477937 +
         t * (-1.821255978 +
         t * 1.330274429))))
  # 0.3989423 approximates 1/sqrt(2*pi); 2.0 keeps the division a float
  # division for integer z under Python 2.
  pd = 1 - 0.3989423*exp(-z*z/2.0) * poly
  return pd

def calculateProbSigma(sent1,sent2,option="char",c=None,s2=None):
  """Return a length-based match cost for two sentences.

  c  -- expected target/source length ratio.  When None it is obtained
        from calculateMean(sent1, sent2, option).
  s2 -- variance parameter.  When None it is obtained from
        calculateVariance(sent1, sent2, option), which also prints and
        plots.

  The lengths of both sentences (see calculateLength) are turned into
  z = |c*len1 - len2| / sqrt(s2 * mean) with mean = (len1 + len2/c) / 2,
  and pd = 2 * (1 - pnorm(z)).  Returns -100 * log10(pd), or 2500 when
  pd is not positive.

  Raises ZeroDivisionError or ValueError when s2 * mean is zero or
  negative.
  """
  if c is None:
    c = calculateMean(sent1,sent2,option) # c = length(sent2) / length(sent1)
  if s2 is None:
    s2 = calculateVariance(sent1,sent2,option) # s2 = variance

  len1 = calculateLength(sent1,option)
  len2 = calculateLength(sent2,option)
  # Only len2 is rescaled by c (Gale & Church); the original divided the
  # whole sum.  float() avoids integer division under Python 2.
  mean = (len1 + len2/float(c)) / 2
  # pnorm is only used on the non-negative side, hence the abs().
  z = abs(c*len1 - len2) / math.sqrt(s2*mean)
  pd = 2 * (1-pnorm(z))
  if pd > 0:
    return -100 * math.log10(pd)
  return 2500

def getXthPlusYSentence(paragraph,x,y):
  """Return paragraph[x] joined by a space with paragraph[x+y].

  When x+y is past the end of the paragraph only paragraph[x] is
  returned.  An IndexError is raised if x itself is out of range.
  """
  try:
    return paragraph[x] + " " + paragraph[x+y]
  except IndexError:
    # Only the missing partner sentence is tolerated; other errors
    # (including KeyboardInterrupt) are no longer swallowed.
    return paragraph[x]


############################################################################
'''sentence1 = "this is a sentence ."
sentence2 = "c'est le sentence ."
sentence3 = "whatever shit this is , it doesnt matches anyshit ."
sentence4 = "this might be sentence ."

#print calculateProbSigma(sentence1, sentence2, "word")
#print calculateProbSigma(sentence3, sentence2, "word")
#print calculateProbSigma(sentence4, sentence2, "word")

print calculateLength(sentence1, "char")
print calculateLength(sentence2, "char")
print calculateProbSigma(sentence1, sentence2, "char")'''

#print calculateVariance('all.eng','all.jpn', 'word')
# Length ratio of every aligned paragraph pair of the two corpus files.
means = [calculateMean(src_para, trg_para)
         for src_para, trg_para in izip(readFile('all.eng'), readFile('all.jpn'))]

print(means)
print(sum(means)/len(means))

#print calculateMean("srctext","trgtext")
'''calculateVariance("all.eng","all.jpn","word")
print
calculateVariance("all.eng","all.ind","word")
print
calculateVariance("all.ind","all.jpn","word")'''