Advertisement
alvations

calmean

Jan 9th, 2013
240
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 5.21 KB | None | 0 0
  1. import codecs, math
  2. from itertools import izip, izip_longest
  3. from scipy.integrate import quad
  4. import scipy.stats as stat
  5. import numpy as np
  6. from scipy import stats
  7.  
  8.  
  9. from numpy import arange,array,ones,linalg
  10. from pylab import plot,show
  11. from numpy import loadtxt, zeros, ones, array, linspace, logspace, cov
  12. from pylab import *
  13.  
  14. # Reads file in alignment format, uses "#" to delimit paragraphs.
  15. def readFile(filename):
  16.   reader = codecs.open(filename, "r","utf8").readlines()
  17.   reader = reader[1:]
  18.   text = [] ;paragraph = []
  19.   for line in reader:
  20.     if line.strip() == "#" or line[0] == "#":
  21.       text.append(paragraph)
  22.       paragraph = []
  23.       continue
  24.     else:
  25.       paragraph.append(line.strip())
  26.   return text
  27.  
  28. # Calculate word/char length of text from the output of readFile().
  29. def calculateLength(text, option="char"):
  30.   if option == "word":
  31.     length = len(text.split(" "))
  32.   else:
  33.     length = len(text)
  34.   return length
  35.  
'''def text2flat(text):
 txt = ""
 for p in text:
   txt+=" "; txt+=" ".join(p)
 return txt'''
  41.  
  42. # Caluclate mean length: mean = len(text2) / len(txt1)
  43. def calculateMean(para1, para2, option='char'):
  44.   if option == "gale-church":
  45.     return int(1)
  46.   src_len, trg_len = 0,0
  47.   for i in para1:
  48.     src_len+=calculateLength(i,option)
  49.   for j in para2:
  50.     trg_len+=calculateLength(j,option)
  51.   c = trg_len/float(src_len)
  52.   return c
  53.  
  54. # Calculate covariance
  55. def calculateVariance(text1, text2, option='char'):
  56.   try:
  57.     src = readFile(text1); trg = readFile(text2)
  58.   except:
  59.     src = [text1]; trg = [text2]
  60.   srcPara_lens = [] # x-axis
  61.   diffsquares = [] # y-axis
  62.   trgPara_lens = []
  63.   for p in src:
  64.     srcPara_lens.append(calculateLength(" ".join(p),option))
  65.   for p in trg:
  66.     trgPara_lens.append(calculateLength(" ".join(p),option))
  67.   for ps, pt in izip(src, trg):
  68.     #print calculateLength(" ".join(pt),"word"), calculateLength(" ".join(ps),"word")
  69.     diff = (math.pow(calculateLength(" ".join(pt),option) - \
  70.                                 calculateLength(" ".join(ps),option),2))
  71.     diffsquares.append(diff)
  72.    
  73.   print srcPara_lens; print trgPara_lens; print diffsquares
  74.   (m,b)=polyfit(srcPara_lens,diffsquares,1)
  75.   print m; plotGraph(srcPara_lens,diffsquares)
  76.   return m
  77.  
  78. def plotGraph(xlist,ylist,label4x="x-axis",label4y="y-axis"):
  79.   (m,b)=polyfit(xlist,ylist,1)
  80.   yp=polyval([m,b],xlist)
  81.   #print m, b
  82.   plot(xlist,yp); scatter(xlist,ylist)
  83.   grid(True)
  84.   xlabel(label4x); ylabel(label4y); show()
  85.   return None
  86.  
  87. def list2data(xlist,ylist):
  88.   arraylist = []
  89.   for x,y in izip(xlist, ylist):
  90.     arraylist.append([x,y])
  91.   data = np.array(arraylist)
  92.   return data
  93.  
  94. def getCovarianceMatrix(xlist, ylist):
  95.   arraylist = []
  96.   for x,y in izip(xlist, ylist):
  97.     arraylist.append([x,y])
  98.   data = np.array(arraylist)
  99.   covariance = np.cov(data.T)
  100.   return covariance
  101.  
  102. def calculateSigma(l1,l2,mean,variance):
  103.   return (l2 - (l1*mean)) / math.pow(variance*l2,1/2)
  104.  
  105. #def calculateProbSigma(sigma):
  106.  
  107. def func(z):
  108.     return exp(-1*math.pow(z,2)/2)
  109.  
  110. # Equation 26.2.17 from Abramowitz and Stegun (1964:p.932) ,
  111. # error(x) is ignored cos negligible, |error(x)| < 7.5*math.pow(10,-8)
  112. def pnorm(z):  
  113.   t = 1/float(1+0.2316419*z) # t = 1/(1+pz) , z=0.2316419
  114.   pd = 1 - 0.3989423*exp(-z*z/2) * ((0.319381530*t)+ \
  115.                                     (-0.356563782*math.pow(t,2))+ \
  116.                                     (1.781477937*math.pow(t,3)) + \
  117.                                     (-1.821255978*math.pow(t,4)) + \
  118.                                     (1.330274429*math.pow(t,5)))
  119.   return pd
  120.  
  121. def calculateProbSigma(sent1,sent2,option="char"):
  122.   c = calculateMean(sent1,sent2,option) # c= numchar(sent1) / numchar(sent2)
  123.   s2 = calculateVariance(sent1,sent2,option) # s2 = variance
  124.  
  125.   len1 = calculateLength(sent1,option)
  126.   len2 = calculateLength(sent2,option)
  127.   mean = ( (len1 + len2)/c ) / 2
  128.   z = (c*len1 - len2) / math.pow((s2*mean), 0.5)
  129.   if (z<0): z = -1*z  # Just in case z is negative.
  130.   pd = 2 * (1-pnorm(z))
  131.   if (pd>0):
  132.     return -100 * math.log10(pd)
  133.   else:
  134.     return 2500
  135.  
  136. def getXthPlusYSentence(paragraph,x,y):
  137.   try:
  138.     return paragraph[x] + " " + paragraph[x+y]
  139.   except:
  140.     return paragraph[x]
  141.  
  142.  
  143.  
############################################################################
'''sentence1 = "this is a sentence ."
sentence2 = "c'est le sentence ."
sentence3 = "whatever shit this is , it doesnt matches anyshit ."
sentence4 = "this might be sentence ."

#print calculateProbSigma(sentence1, sentence2, "word")
#print calculateProbSigma(sentence3, sentence2, "word")
#print calculateProbSigma(sentence4, sentence2, "word")

print calculateLength(sentence1, "char")
print calculateLength(sentence2, "char")
print calculateProbSigma(sentence1, sentence2, "char")'''
  157.  
  158. #print calculateVariance('all.eng','all.jpn', 'word')
  159. means = []
  160. for x,y in izip(readFile('all.eng'),readFile('all.jpn')): # Reads each paragrah.
  161.   means.append(calculateMean(x,y))
  162.  
  163. print means
  164. print sum(means)/len(means)
  165.  
#print calculateMean("srctext","trgtext")
'''calculateVariance("all.eng","all.jpn","word")
print
calculateVariance("all.eng","all.ind","word")
print
calculateVariance("all.ind","all.jpn","word")'''
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement