Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import codecs, math
- from itertools import izip, izip_longest
- from scipy.integrate import quad
- import scipy.stats as stat
- import numpy as np
- from scipy import stats
- from numpy import arange,array,ones,linalg
- from pylab import plot,show
- from numpy import loadtxt, zeros, ones, array, linspace, logspace, cov
- from pylab import *
# Reads file in alignment format, uses "#" to delimit paragraphs.
def readFile(filename):
    """Read an alignment-format file and return its paragraphs.

    The first line of the file is skipped. Each following line that equals
    "#" once stripped, or that starts with "#", closes the current
    paragraph. Every other line is stripped and added to the current
    paragraph.

    Args:
        filename: Path of a UTF-8 encoded text file.

    Returns:
        A list of paragraphs, each one a list of stripped line strings.
        Lines that follow the last "#" delimiter are not returned.
    """
    # The context manager closes the handle even when decoding fails.
    with codecs.open(filename, "r", "utf8") as handle:
        lines = handle.readlines()[1:]

    text = []
    paragraph = []
    for line in lines:
        if line.strip() == "#" or line.startswith("#"):
            text.append(paragraph)
            paragraph = []
        else:
            paragraph.append(line.strip())
    return text
# Calculate word/char length of text from the output of readFile().
def calculateLength(text, option="char"):
    """Return the length of `text` in the unit selected by `option`.

    With option "word" the result is the number of pieces produced by
    splitting `text` on single spaces. Any other option yields len(text).
    """
    if option != "word":
        return len(text)
    return len(text.split(" "))
- '''def text2flat(text):
- txt = ""
- for p in text:
- txt+=" "; txt+=" ".join(p)
- return txt'''
# Calculate mean length ratio: mean = len(text2) / len(text1)
def calculateMean(para1, para2, option='char'):
    """Return the ratio of the total length of para2 to that of para1.

    Each item of `para1` and `para2` is measured with calculateLength()
    using `option`, and the target total is divided by the source total.
    With option "gale-church" the constant 1 is returned instead and the
    inputs are not inspected.
    """
    if option == "gale-church":
        return 1
    src_len = sum(calculateLength(item, option) for item in para1)
    trg_len = sum(calculateLength(item, option) for item in para2)
    return trg_len / float(src_len)
# Estimate the variance parameter from paragraph lengths.
def calculateVariance(text1, text2, option='char'):
    """Fit squared length differences against source paragraph length.

    For every source/target paragraph pair the squared difference of the
    two paragraph lengths is computed. A straight line is fitted to those
    values against the source paragraph lengths, and its slope is returned.

    Args:
        text1: Path of the source alignment file. If it cannot be opened,
            the value itself is used as a single one-line paragraph.
        text2: Same as text1, for the target side.
        option: Length unit handed to calculateLength().

    Returns:
        The slope of the fitted line.

    Side effects:
        Prints the length lists, the squared differences and the slope,
        and displays a plot through plotGraph().
    """
    try:
        src = readFile(text1)
        trg = readFile(text2)
    except (IOError, OSError):
        # Not readable as files: treat each argument as one paragraph made
        # of one line. The nested list matters, because " ".join() applied
        # to a bare string would put a space between every character and
        # inflate the measured length.
        src = [[text1]]
        trg = [[text2]]

    srcPara_lens = [calculateLength(" ".join(p), option) for p in src]  # x-axis
    trgPara_lens = [calculateLength(" ".join(p), option) for p in trg]
    # y-axis: squared length difference of each aligned paragraph pair.
    diffsquares = [math.pow(trg_len - src_len, 2)
                   for src_len, trg_len in zip(srcPara_lens, trgPara_lens)]

    print(srcPara_lens)
    print(trgPara_lens)
    print(diffsquares)
    (m, b) = polyfit(srcPara_lens, diffsquares, 1)
    print(m)
    plotGraph(srcPara_lens, diffsquares)
    return m
def plotGraph(xlist, ylist, label4x="x-axis", label4y="y-axis"):
    """Display a scatter plot of the points with their fitted line.

    A degree-1 polynomial is fitted to (xlist, ylist). The fitted line and
    the raw points are drawn on a grid with the given axis labels, and the
    figure is shown. Returns None.
    """
    slope, intercept = polyfit(xlist, ylist, 1)
    fitted = polyval([slope, intercept], xlist)
    plot(xlist, fitted)
    scatter(xlist, ylist)
    grid(True)
    xlabel(label4x)
    ylabel(label4y)
    show()
def list2data(xlist, ylist):
    """Pair up the two lists and return the [x, y] rows as a numpy array.

    Pairing stops at the end of the shorter list.
    """
    rows = [[x, y] for x, y in zip(xlist, ylist)]
    return np.array(rows)
def getCovarianceMatrix(xlist, ylist):
    """Return np.cov() of the paired x and y values.

    The lists are paired element by element (stopping at the shorter one)
    into [x, y] rows. The transposed array is handed to np.cov, so x and y
    are the two variables of the resulting matrix.
    """
    pairs = np.array([[x, y] for x, y in zip(xlist, ylist)])
    return np.cov(pairs.T)
def calculateSigma(l1, l2, mean, variance):
    """Return (l2 - l1 * mean) / sqrt(variance * l2).

    Args:
        l1: Length of the first text.
        l2: Length of the second text.
        mean: Factor applied to l1 before it is subtracted from l2.
        variance: Variance factor used in the normalisation.

    Raises:
        ValueError: If variance * l2 is negative.
        ZeroDivisionError: If variance * l2 is zero.
    """
    # math.sqrt replaces math.pow(x, 1/2): under Python 2 the exponent 1/2
    # is integer division and evaluates to 0, so the denominator was
    # always 1.0.
    # NOTE(review): Gale & Church normalise by sqrt(l1 * variance); this
    # code uses l2. Confirm which one is intended.
    return (l2 - (l1 * mean)) / math.sqrt(variance * l2)
- #def calculateProbSigma(sigma):
def func(z):
    """Return exp(-z**2 / 2)."""
    half_square = math.pow(z, 2) / 2
    return exp(-half_square)
# Equation 26.2.17 from Abramowitz and Stegun (1964:p.932);
# the error term is ignored because it is negligible, |error(x)| < 7.5e-8.
def pnorm(z):
    """Approximate the standard normal cumulative distribution at z.

    Uses the polynomial approximation
        P(x) = 1 - Z(x) * (b1*t + b2*t^2 + b3*t^3 + b4*t^4 + b5*t^5)
    with t = 1 / (1 + p*x), p = 0.2316419 and Z(x) = exp(-x^2/2)/sqrt(2*pi).

    Args:
        z: A real number.

    Returns:
        The approximate probability that a standard normal variable is
        less than or equal to z.
    """
    # The approximation only holds for x >= 0, so evaluate it at |z| and
    # use the symmetry P(-x) = 1 - P(x) for negative arguments.
    x = abs(z)
    t = 1.0 / (1.0 + 0.2316419 * x)
    # Horner evaluation of b1*t + b2*t^2 + ... + b5*t^5.
    poly = t * (0.319381530 + t * (-0.356563782 + t * (1.781477937
               + t * (-1.821255978 + t * 1.330274429))))
    # -0.5 * x * x keeps the exponent a float even for integer input
    # (-z*z/2 floor-divides under Python 2).
    upper = 1.0 - 0.3989423 * math.exp(-0.5 * x * x) * poly
    if z < 0:
        return 1.0 - upper
    return upper
def calculateProbSigma(sent1, sent2, option="char"):
    """Return a length-based match cost for a pair of sentences.

    The difference between the two lengths is normalised into a z-score,
    turned into a two-tailed probability with pnorm(), and reported as
    -100 * log10(probability).

    Args:
        sent1: Source sentence.
        sent2: Target sentence.
        option: Length unit handed to the length helpers.

    Returns:
        -100 * log10(pd) when the probability pd is positive, else 2500.
    """
    # NOTE(review): c and s2 are estimated from the very pair being scored.
    # For the "char" option c comes out as len2/len1, which makes the
    # numerator of z vanish. These look like they should be corpus-level
    # parameters; confirm against the callers.
    c = calculateMean(sent1, sent2, option)      # c = length(sent2) / length(sent1)
    s2 = calculateVariance(sent1, sent2, option)  # s2 = variance
    len1 = calculateLength(sent1, option)
    len2 = calculateLength(sent2, option)
    # Gale & Church average len1 with len2 rescaled by c. The original
    # divided the sum (len1 + len2) by c instead, and floor-divided under
    # Python 2 when c was the integer 1.
    mean = (len1 + len2 / float(c)) / 2.0
    # Only the magnitude of the deviation matters.
    z = abs((c * len1 - len2) / math.sqrt(s2 * mean))
    pd = 2 * (1 - pnorm(z))
    if pd > 0:
        return -100 * math.log10(pd)
    return 2500
def getXthPlusYSentence(paragraph, x, y):
    """Return sentence x joined by a space with sentence x + y.

    Args:
        paragraph: Sequence of sentence strings.
        x: Index of the first sentence.
        y: Offset from x to the second sentence.

    Returns:
        paragraph[x] + " " + paragraph[x + y], or paragraph[x] alone when
        x + y lies outside the paragraph.

    Raises:
        IndexError: If x itself lies outside the paragraph.
    """
    try:
        return paragraph[x] + " " + paragraph[x + y]
    except IndexError:
        # Only a missing partner sentence is expected here. Anything else
        # (wrong types, interrupts) should surface, not be swallowed.
        return paragraph[x]
- ############################################################################
- '''sentence1 = "this is a sentence ."
- sentence2 = "c'est le sentence ."
- sentence3 = "whatever shit this is , it doesnt matches anyshit ."
- sentence4 = "this might be sentence ."
- #print calculateProbSigma(sentence1, sentence2, "word")
- #print calculateProbSigma(sentence3, sentence2, "word")
- #print calculateProbSigma(sentence4, sentence2, "word")
- print calculateLength(sentence1, "char")
- print calculateLength(sentence2, "char")
- print calculateProbSigma(sentence1, sentence2, "char")'''
- #print calculateVariance('all.eng','all.jpn', 'word')
# Length ratio of every aligned paragraph pair, then their average.
means = [calculateMean(x, y)
         for x, y in zip(readFile('all.eng'), readFile('all.jpn'))]  # Reads each paragraph.
print(means)
print(sum(means) / len(means))
- #print calculateMean("srctext","trgtext")
- '''calculateVariance("all.eng","all.jpn","word")
- print
- calculateVariance("all.eng","all.ind","word")
- print
- calculateVariance("all.ind","all.jpn","word")'''
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement