View difference between Paste ID: <a href="/8aaDu74Z">8aaDu74Z</a> and <a href="/0dMTMv4e">0dMTMv4e</a>

#!/usr/bin/env python
1		#!/usr/bin/env python
2		# -- coding: utf-8 --
3		# ------------------------------------------------------------------------------
4
5		import sys
6		sys.dont_write_bytecode = True;
7
8		import subprocess
9		import re
10		from unicodedata import category
11		import os
12		import xsampa_helper
13		import operator
14		from collections import defaultdict
15
16		# ------------------------------------------------------------------------------
# Module-level accumulators, filled in by countdiphonefreqFile().
# diphoneCount:    diphone string -> number of occurrences seen so far.
# diphoneExamples: diphone string -> list of example words containing it.
diphoneCount = defaultdict(int)
diphoneExamples = defaultdict(list)
def main():
    """Command-line entry point.

    Takes the input file name from ``sys.argv[1]`` and the output file name
    from ``sys.argv[2]`` and passes both to ``countdiphonefreqFile``.
    Additional arguments are ignored.

    If fewer than two file names are supplied, a usage message is written
    to stderr and the process exits with status 1 (previously this case
    surfaced as an uncaught IndexError).
    """
    if len(sys.argv) < 3:
        sys.stderr.write('Error: Invalid commandline arguments.\n')
        sys.stderr.write(
            'Usage: python %s <input_file> <output_file>\n' % sys.argv[0])
        sys.exit(1)

    fnInput = sys.argv[1]
    fnOutput = sys.argv[2]

    countdiphonefreqFile(fnInput, fnOutput)
33
34
35
def _uniqueInOrder(items):
    """Return a list of the elements of *items* without duplicates, keeping first-seen order."""
    unique = []
    for item in items:
        if item not in unique:
            unique.append(item)
    return unique


def countdiphonefreqFile(fnInput, fnOutput):
    """Count diphone frequencies over every line of a text file.

    Each line of ``fnInput`` is stripped of trailing whitespace, wrapped in
    leading/trailing ``[Sil]`` tokens and passed to
    ``xsampa_helper.getAllPhnsInSentence``.  Every diphone reported by
    ``xsampa_helper.getAllArticulationsInSentence(allPhns, 2)`` is tallied in
    the module-level ``diphoneCount``; up to ``maxNumExamples`` distinct
    example words (or word pairs) per diphone are stored in the module-level
    ``diphoneExamples``.

    Two files are written, both sorted by descending count:
      * ``fnOutput``  -- one ``diphone : count`` line per diphone;
      * ``list.txt`` (in the current directory) -- ``diphone : count :
        examples``, followed by a line holding a single space.

    Because the tallies live in module-level dictionaries, calling this
    function repeatedly accumulates counts across calls.

    Parameters:
      fnInput  -- path of the text file to read, one sentence per line.
      fnOutput -- path of the frequency table to write.
    """
    maxNumExamples = 8  # store at most this many examples per diphone

    # All three files are managed by ``with`` so they are closed even when
    # a helper call raises (the second output file used to be left open).
    with open(fnInput, 'rt') as fhInput, \
            open(fnOutput, 'wt') as fhOutput, \
            open('list.txt', 'wt') as fhOutput2:
        for line in fhInput:
            sentence = u'[Sil] ' + line.rstrip() + u' [Sil]'
            (allPhns, allPhnSyllIdxes, allPhnWordIdxes,
             allPhnStressLevels) = xsampa_helper.getAllPhnsInSentence(sentence)

            # NOTE(review): assumes the word indices returned by
            # getAllPhnsInSentence index the whitespace-separated tokens of
            # `sentence` (including both [Sil] tokens) -- confirm against
            # xsampa_helper.  Out-of-range indices are skipped below.
            wrds = sentence.split()

            diphoneBes = xsampa_helper.getAllArticulationsInSentence(allPhns, 2)
            for be in diphoneBes:
                dphn = xsampa_helper.joinArt(allPhns, be)
                diphoneCount[dphn] += 1

                examples = diphoneExamples[dphn]
                if len(examples) >= maxNumExamples:
                    continue

                # One word index per phoneme in the diphone; both phonemes
                # may belong to the same word, so collapse duplicates while
                # keeping the order.
                widxes = _uniqueInOrder(allPhnWordIdxes[be[0]:be[1]])
                example = u' '.join(
                    [wrds[idx] for idx in widxes if 0 <= idx < len(wrds)])
                if example and example not in examples:
                    examples.append(example)

        # items() rather than the Python-2-only iteritems(); sorted() copies
        # the pairs either way.
        diphoneCountSorted = sorted(
            diphoneCount.items(), key=operator.itemgetter(1), reverse=True)

        for dphn, count in diphoneCountSorted:
            fhOutput.write(u'%s : %d\n' % (dphn, count))

        for dphn, count in diphoneCountSorted:
            fhOutput2.write(u'%s : %d : %s\n \n'
                            % (dphn, count, '; '.join(diphoneExamples[dphn])))
95
# Run the command-line entry point only when executed as a script,
# not when this module is imported.
if __name__ == '__main__':
    main()