Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/env python
- # -*- coding: utf-8 -*-
- # ------------------------------------------------------------------------------
- import sys
- sys.dont_write_bytecode = True;
- import subprocess
- import re
- from unicodedata import category
- import os
- import xsampa_helper
- import operator
- from collections import defaultdict
- # ------------------------------------------------------------------------------
# Module-level accumulators filled by countdiphonefreqFile(). They are never
# reset, so repeated calls within one process keep adding to the same tables.
# diphone string -> number of occurrences seen so far
diphoneCount = defaultdict(int)
# diphone string -> example strings collected for that diphone
diphoneExamples = defaultdict(list)
def main():
    """Command-line entry point: count diphones in one input file.

    Expects ``sys.argv[1]`` to be the input file path and ``sys.argv[2]`` the
    output file path, and passes both to ``countdiphonefreqFile``. Any
    additional arguments are ignored.

    Raises:
        SystemExit: with status 1 when fewer than two arguments are given.
    """
    # Fail with a usage message instead of an IndexError traceback.
    if len(sys.argv) < 3:
        sys.stderr.write('Error: Invalid commandline arguments.\n')
        sys.stderr.write('Usage: python %s <input_file> <output_file>\n' % sys.argv[0])
        sys.exit(1)

    fnInput = sys.argv[1]
    fnOutput = sys.argv[2]
    countdiphonefreqFile(fnInput, fnOutput)
def countdiphonefreqFile(fnInput, fnOutput):
    """Count diphone occurrences in a text file and write two reports.

    Every line of ``fnInput`` is stripped of trailing whitespace, wrapped in
    ``[Sil]`` markers and handed to ``xsampa_helper`` to obtain its phonemes
    and the diphone spans over them. Occurrences are tallied in the
    module-level ``diphoneCount`` table and example strings are collected in
    ``diphoneExamples``.

    After the whole input has been read, the diphones are sorted by
    descending count and written to:

    * ``fnOutput``  -- one ``<diphone> : <count>`` line per diphone;
    * ``list.txt`` (in the current working directory) -- one
      ``<diphone> : <count> : <examples joined by '; '>`` entry per diphone.

    Args:
        fnInput: path of the text file to read, one sentence per line.
        fnOutput: path of the frequency report to write.

    NOTE(review): the indentation of this function was lost in the paste it
    was recovered from. The example-collection loop is reconstructed so that
    each in-range word index yields its own example entry; confirm against
    the original layout.
    """
    # Cap tested before each diphone occurrence is processed; one occurrence
    # may still append more than one example.
    maxNumExamples = 8

    with open(fnInput, 'rt') as fhInput:
        for line in fhInput:
            line = line.rstrip()  # remove trailing newline
            # Syllable indexes and stress levels are returned but not needed here.
            allPhns, _allPhnSyllIdxes, allPhnWordIdxes, _allPhnStressLevels = \
                xsampa_helper.getAllPhnsInSentence(u'[Sil] ' + line + u' [Sil]')

            # presumably (begin, end) index pairs into allPhns -- TODO confirm
            diphoneBes = xsampa_helper.getAllArticulationsInSentence(allPhns, 2)
            wrds = xsampa_helper.splitSentence2Words(line)

            for be in diphoneBes:
                dphn = xsampa_helper.joinArt(allPhns, be)
                diphoneCount[dphn] += 1

                examples = diphoneExamples[dphn]
                if len(examples) >= maxNumExamples:
                    continue

                # One word index per phoneme in the diphone (may repeat);
                # visit each distinct index once, in order of appearance.
                seenWordIdxes = []
                for widx in allPhnWordIdxes[be[0]:be[1]]:
                    if widx in seenWordIdxes:
                        continue
                    seenWordIdxes.append(widx)
                    # Indexes past the end of wrds are skipped. NOTE(review):
                    # wrds comes from the unpadded line while the indexes come
                    # from the [Sil]-padded one -- confirm they are aligned.
                    if widx >= len(wrds):
                        continue
                    example = u' '.join(wrds[widx])
                    if example not in examples:
                        examples.append(example)

    # Most frequent diphones first; .items() works on Python 2 and 3.
    diphoneCountSorted = sorted(diphoneCount.items(),
                                key=operator.itemgetter(1), reverse=True)

    with open(fnOutput, 'wt') as fhOutput:
        for dphn, count in diphoneCountSorted:
            fhOutput.write(u'%s : %d\n' % (dphn, count))

    with open('list.txt', 'wt') as fhOutput2:
        for dphn, count in diphoneCountSorted:
            fhOutput2.write(u'%s : %d : %s\n \n'
                            % (dphn, count, '; '.join(diphoneExamples[dphn])))
# Run the command-line entry point only when executed as a script,
# not when this module is imported.
if __name__ == '__main__':
    main()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement