SHOW:
|
|
- or go back to the newest paste.
1 | #!/usr/bin/env python | |
2 | # -*- coding: utf-8 -*- | |
3 | # ------------------------------------------------------------------------------ | |
4 | ||
5 | import sys | |
6 | sys.dont_write_bytecode = True; | |
7 | ||
8 | import subprocess | |
9 | import re | |
10 | from unicodedata import category | |
11 | import os | |
12 | import xsampa_helper | |
13 | import operator | |
14 | from collections import defaultdict | |
15 | ||
16 | # ------------------------------------------------------------------------------ | |
# Module-level accumulators shared by every call to countdiphonefreqFile():
#   diphoneCount    -- diphone -> number of occurrences counted so far
#   diphoneExamples -- diphone -> list of example strings gathered for it
diphoneCount = defaultdict(int)
diphoneExamples = defaultdict(list)
def main():
    """Command-line entry point.

    Reads the input and output file names from ``sys.argv[1]`` and
    ``sys.argv[2]`` and passes them to ``countdiphonefreqFile``.

    When fewer than two file names are given, a usage message is written to
    stderr and the process exits with status 2 instead of failing with a
    bare ``IndexError``.  Extra arguments are ignored.
    """
    if len(sys.argv) < 3:
        sys.stderr.write('Error: Invalid commandline arguments.\n')
        sys.stderr.write(
            'Usage: python %s <input_file> <output_file>\n' % sys.argv[0])
        sys.exit(2)

    fnInput, fnOutput = sys.argv[1], sys.argv[2]
    countdiphonefreqFile(fnInput, fnOutput)
33 | ||
34 | ||
35 | ||
def countdiphonefreqFile(fnInput, fnOutput):
    """Count diphone frequencies over every line of a text file.

    Each line of ``fnInput`` is wrapped in ``[Sil]`` markers and passed to
    ``xsampa_helper`` to obtain its phonemes and the diphone spans within
    them.  Results accumulate in the module-level ``diphoneCount`` and
    ``diphoneExamples`` dictionaries, so calling this function more than
    once keeps adding to the same totals.

    Two reports are written, sorted by descending count:

    * ``fnOutput``: one ``<diphone> : <count>`` line per diphone.
    * ``list.txt`` in the current working directory:
      ``<diphone> : <count> : <examples>`` per diphone, each followed by a
      line holding a single space.

    Args:
        fnInput: Path of the text file to analyse, one sentence per line.
        fnOutput: Path of the frequency report; overwritten if it exists.
    """
    # Imported here so the block does not rely on a new file-level import.
    # io.open behaves the same on Python 2 and 3 and yields unicode text,
    # which the u'' literals below require.
    # NOTE(review): UTF-8 is assumed from the file's coding line -- confirm.
    import io

    maxNumExamples = 8  # store at most this many examples per diphone

    with io.open(fnInput, 'rt', encoding='utf-8') as fhInput:
        for line in fhInput:
            sentence = u'[Sil] ' + line.rstrip() + u' [Sil]'
            allPhns, _syllIdxes, allPhnWordIdxes, _stressLevels = \
                xsampa_helper.getAllPhnsInSentence(sentence)

            # NOTE(review): assumes the word indices returned by
            # xsampa_helper index the whitespace-separated tokens of the
            # sentence it was given ([Sil] markers included) -- confirm.
            wrds = sentence.split()

            for be in xsampa_helper.getAllArticulationsInSentence(allPhns, 2):
                dphn = xsampa_helper.joinArt(allPhns, be)
                diphoneCount[dphn] += 1

                examples = diphoneExamples[dphn]
                if len(examples) >= maxNumExamples:
                    continue

                # One word index per phoneme in the diphone; drop repeats
                # but keep first-seen order.
                widxesNoDupes = []
                for widx in allPhnWordIdxes[be[0]:be[1]]:
                    if widx not in widxesNoDupes:
                        widxesNoDupes.append(widx)

                # The word (or word pair) that contains the diphone.
                example = u' '.join(
                    wrds[widx] for widx in widxesNoDupes if widx < len(wrds))
                if example and example not in examples:
                    examples.append(example)

    # items() instead of iteritems() so this also runs on Python 3.
    diphoneCountSorted = sorted(
        diphoneCount.items(), key=operator.itemgetter(1), reverse=True)

    with io.open(fnOutput, 'wt', encoding='utf-8') as fhOutput:
        for dphn, count in diphoneCountSorted:
            fhOutput.write(u'%s : %d\n' % (dphn, count))

    with io.open('list.txt', 'wt', encoding='utf-8') as fhOutput2:
        for dphn, count in diphoneCountSorted:
            fhOutput2.write(
                u'%s : %d : %s\n \n'
                % (dphn, count, u'; '.join(diphoneExamples[dphn])))
95 | ||
# Run the command-line entry point only when executed as a script,
# not when this module is imported.
if __name__ == '__main__':
    main()