# diphone_frequency.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# ------------------------------------------------------------------------------

import sys
sys.dont_write_bytecode = True;

import subprocess
import re
from unicodedata import category
import os
import xsampa_helper
import operator
from collections import defaultdict

# ------------------------------------------------------------------------------
# Module-level accumulators filled in by countdiphonefreqFile():
#   diphoneExamples -- diphone key (as returned by xsampa_helper.joinArt)
#                      -> list of example strings collected for that diphone
#   diphoneCount    -- same diphone key -> number of occurrences seen so far
diphoneExamples = defaultdict(list)
diphoneCount = defaultdict(int)
def main():
    """Command-line entry point.

    Expects ``<input_file> <output_file>`` on the command line and passes
    them to countdiphonefreqFile().  When fewer than two arguments are
    given, a usage message is written to stderr and the process exits with
    status 1 instead of failing with an IndexError.  Extra arguments are
    ignored, as before.
    """
    if len(sys.argv) < 3:
        sys.stderr.write('Error: Invalid commandline arguments.\n')
        sys.stderr.write('Usage: python %s <input_file> <output_file>\n' % sys.argv[0])
        sys.exit(1)

    fnInput, fnOutput = sys.argv[1:3]
    countdiphonefreqFile(fnInput, fnOutput)


def _exampleForWordIdxes(wrds, widxes):
    """Return the example text for the words referenced by *widxes*.

    Duplicate indexes are dropped (keeping first-seen order) and indexes at
    or beyond ``len(wrds)`` are ignored.  Returns None when no index is in
    range, so the caller can skip the example instead of reusing a stale one.
    """
    widxesNoDupes = []
    for idx in widxes:
        if idx not in widxesNoDupes:
            widxesNoDupes.append(idx)

    example = None
    for idx in widxesNoDupes:
        if idx < len(wrds):
            # NOTE(review): each in-range word overwrites the previous one, so
            # only the last word is kept even when the diphone spans two
            # words -- confirm whether word pairs should be joined instead.
            example = u' '.join(wrds[idx])
    return example


def _countDiphonesInLine(line, maxNumExamples):
    """Add the diphones found in one input line to the module-level tallies."""
    # The line is padded with [Sil] on both sides before being split into phonemes.
    allPhns, _syllIdxes, allPhnWordIdxes, _stressLevels = \
        xsampa_helper.getAllPhnsInSentence(u'[Sil] ' + line + u' [Sil]')
    diphoneBes = xsampa_helper.getAllArticulationsInSentence(allPhns, 2)
    wrds = xsampa_helper.splitSentence2Words(line)

    for be in diphoneBes:
        dphn = xsampa_helper.joinArt(allPhns, be)
        diphoneCount[dphn] += 1

        # Store up to the first maxNumExamples distinct examples per diphone.
        examples = diphoneExamples[dphn]
        if len(examples) >= maxNumExamples:
            continue

        # One word index per phoneme in the diphone (may be the same word).
        widxes = allPhnWordIdxes[be[0]:be[1]]
        example = _exampleForWordIdxes(wrds, widxes)
        if example is not None and example not in examples:
            examples.append(example)


def countdiphonefreqFile(fnInput, fnOutput, fnExamples='list.txt', maxNumExamples=8):
    """Count diphone occurrences in a text file and write two reports.

    Every line of *fnInput* is stripped of trailing whitespace and analysed
    with xsampa_helper; the results are accumulated in the module-level
    ``diphoneCount`` and ``diphoneExamples`` mappings (so repeated calls
    keep adding to the same tallies).

    Args:
        fnInput: path of the text file to read, one sentence per line.
        fnOutput: path of the frequency report; one ``<diphone> : <count>``
            line per diphone, most frequent first.
        fnExamples: path of the second report, which additionally lists the
            collected examples (``<diphone> : <count> : <ex1>; <ex2>...``).
            Defaults to ``'list.txt'`` in the current directory.
        maxNumExamples: maximum number of examples stored per diphone.

    All files are opened through ``with`` so they are closed (and flushed)
    even when processing fails part-way.
    """
    with open(fnInput, 'rt') as fhInput:
        for line in fhInput:
            _countDiphonesInLine(line.rstrip(), maxNumExamples)

    # items() instead of iteritems() so this also runs on Python 3.
    diphoneCountSorted = sorted(diphoneCount.items(),
                                key=operator.itemgetter(1), reverse=True)

    with open(fnOutput, 'wt') as fhOutput:
        for dphn, count in diphoneCountSorted:
            fhOutput.write(u'%s : %d\n' % (dphn, count))

    with open(fnExamples, 'wt') as fhExamples:
        for dphn, count in diphoneCountSorted:
            fhExamples.write(u'%s : %d : %s\n \n'
                             % (dphn, count, '; '.join(diphoneExamples[dphn])))

# Run main() only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()