View difference between Paste ID: 8aaDu74Z and 0dMTMv4e
SHOW: | | - or go back to the newest paste.
1
#!/usr/bin/env python
2
# -*- coding: utf-8 -*-
3
# ------------------------------------------------------------------------------
4
5
import sys
6
sys.dont_write_bytecode = True;
7
8
import subprocess
9
import re
10
from unicodedata import category
11
import os
12
import xsampa_helper
13
import operator
14
from collections import defaultdict
15
16
# ------------------------------------------------------------------------------
17
# Module-level accumulators shared by every call in this script.
# diphone -> number of occurrences seen so far
diphoneCount = defaultdict(int)
# diphone -> list of example words collected for that diphone
diphoneExamples = defaultdict(list)
19
def main():
    """Command-line entry point: ``<script> <input_file> <output_file>``.

    Reads the input and output file names from ``sys.argv`` and runs
    ``countdiphonefreqFile`` on them.  When fewer than two arguments are
    given the script exits with a usage message and a non-zero status
    instead of failing with an ``IndexError``; extra arguments are ignored.
    """
    if len(sys.argv) < 3:
        prog = sys.argv[0] if sys.argv else 'script'
        # sys.exit() with a string prints it to stderr and exits with status 1.
        sys.exit('Error: Invalid commandline arguments.\n'
                 'Usage: python %s <input_file> <output_file>' % prog)

    fnInput = sys.argv[1]
    fnOutput = sys.argv[2]

    countdiphonefreqFile(fnInput, fnOutput)
33
    
34
35
36
def countdiphonefreqFile(fnInput, fnOutput):
    """Tally diphone frequencies for every line of a text file.

    Each input line is wrapped in ``[Sil]`` markers and handed to
    ``xsampa_helper`` to obtain its phonemes and diphones.  Counts are
    accumulated in the module-level ``diphoneCount`` and a few example words
    per diphone in ``diphoneExamples`` (both persist across calls).

    Args:
        fnInput: path of the file to read, one sentence per line.
        fnOutput: path that receives ``<diphone> : <count>`` lines, most
            frequent diphone first.

    Side effects:
        Also writes ``list.txt`` in the current working directory, in the
        same order, with the collected example words for each diphone.
    """
    # All three files are opened up front (as before) and are now guaranteed
    # to be closed, including list.txt, even if processing raises.
    with open(fnInput, 'rt') as fhInput, \
            open(fnOutput, 'wt') as fhOutput, \
            open('list.txt', 'wt') as fhOutput2:
        for line in fhInput:
            _tallyDiphonesInLine(line.rstrip())  # drop trailing newline/whitespace

        # items() instead of iteritems() so this runs on Python 2 and 3 alike.
        diphoneCountSorted = sorted(diphoneCount.items(),
                                    key=operator.itemgetter(1),
                                    reverse=True)

        for dphn, count in diphoneCountSorted:
            fhOutput.write(u'%s : %d\n' % (dphn, count))
            fhOutput2.write(u'%s : %d : %s\n \n'
                            % (dphn, count, '; '.join(diphoneExamples[dphn])))


def _tallyDiphonesInLine(line, maxNumExamples=8):
    """Add the diphones of one input line to the module-level accumulators.

    Stores at most ``maxNumExamples`` distinct example strings per diphone.
    """
    sentence = u'[Sil] ' + line + u' [Sil]'
    allPhns, _, allPhnWordIdxes, _ = xsampa_helper.getAllPhnsInSentence(sentence)

    # NOTE(review): assumes the word indexes returned by xsampa_helper refer to
    # the whitespace-separated tokens of `sentence` -- confirm against the helper.
    wrds = sentence.split()

    for be in xsampa_helper.getAllArticulationsInSentence(allPhns, 2):
        dphn = xsampa_helper.joinArt(allPhns, be)
        diphoneCount[dphn] += 1

        examples = diphoneExamples[dphn]
        if len(examples) >= maxNumExamples:
            continue

        # One word index per phoneme in the diphone (may repeat); drop
        # duplicates while keeping first-seen order.
        widxesNoDupes = []
        for widx in allPhnWordIdxes[be[0]:be[1]]:
            if widx not in widxesNoDupes:
                widxesNoDupes.append(widx)

        # The example is the word (or word pair) containing the diphone.
        example = u' '.join(wrds[widx] for widx in widxesNoDupes
                            if widx < len(wrds))
        if example and example not in examples:
            examples.append(example)
95
    
96
# Run the command-line entry point only when executed as a script.
if __name__ == '__main__':
    main()