Advertisement
neo01124

diphone_frequency.py

Nov 28th, 2012
62
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. #!/usr/bin/env python
  2. # -*- coding: utf-8 -*-
  3. # ------------------------------------------------------------------------------
  4.  
  5. import sys
  6. sys.dont_write_bytecode = True;
  7.  
  8. import subprocess
  9. import re
  10. from unicodedata import category
  11. import os
  12. import xsampa_helper
  13. import operator
  14. from collections import defaultdict
  15.  
  16. # ------------------------------------------------------------------------------
  17. diphoneCount = defaultdict(int);
  18. diphoneExamples = defaultdict(list);
  19. def main():
  20.     #if len(sys.argv) != 3:
  21.     #   print('Error: Invalid commandline arguments.');
  22.     #   print('Usage: python hinditranscribe.py <input_file> <output_file>');
  23.     #   sys.exit(0);
  24.     #path = 'lyrics/'
  25.     #listing = os.listdir(path)
  26.    
  27.     #for infile in listing:
  28.      #   print "current file is: " + infile
  29.     fnInput = sys.argv[1];
  30.     fnOutput = sys.argv[2];
  31.    
  32.     countdiphonefreqFile(fnInput, fnOutput);
  33.    
  34.  
  35.  
  36. def countdiphonefreqFile(fnInput, fnOutput):
  37.     fhInput = open(fnInput, 'rt');
  38.     fhOutput = open(fnOutput, 'wt');
  39.     fhOutput2 = open('list.txt', 'wt');
  40.     Num=0
  41.     for line in fhInput:
  42.         line = line.rstrip(); # remove trailing newline
  43.         [allPhns, allPhnSyllIdxes, allPhnWordIdxes, allPhnStressLevels] = xsampa_helper.getAllPhnsInSentence(u'[Sil] ' + line + u' [Sil]');
  44.         #print allPhnWordIdxes
  45.         numSylls = allPhnSyllIdxes[-1] + 1;
  46.         numSylls -= 2; # excluding leading / trailing [Sil]
  47.         #print line
  48.         # compute diphone frequency:
  49.         diphoneBes = xsampa_helper.getAllArticulationsInSentence(allPhns, 2);
  50.         #print diphoneBes
  51.         wrds = xsampa_helper.splitSentence2Words(line)
  52.         #print wrds
  53.         #print len(wrds)
  54.  
  55.         for be in diphoneBes:
  56.             dphn = xsampa_helper.joinArt(allPhns, be);
  57.             diphoneCount[dphn] += 1;
  58.             #print dphn
  59.             # store upto first N example words/word pairs that contain the diphone:
  60.             maxNumExamples = 8;
  61.             #print dphn
  62.             if len(diphoneExamples[dphn]) < maxNumExamples:
  63.                 #ort = u'# ' + ortho + u' #';
  64.                 #wrds = ort.split();
  65.                
  66.                # print wrds
  67.                 widxes = allPhnWordIdxes[be[0]:be[1]]; # one index per phoneme in the diphone (may be same)
  68.          #       print widxes
  69.                 widxesNoDupes = [];
  70.                 [widxesNoDupes.append(i) for i in widxes if not widxesNoDupes.count(i)]; # remove duplicates, but keep order
  71.                 #print widxesNoDupes
  72.                 #example = u' '.join([wrds[idx] for idx in widxesNoDupes and idx < len(wrds) ]);
  73.                 for idx in widxesNoDupes:
  74.                     if idx<len(wrds):
  75.                         example = u' '.join(wrds[idx])
  76.                 #print example
  77.                 if not example in diphoneExamples[dphn]:
  78.                     diphoneExamples[dphn].append(example);
  79.  
  80.         #print 'abc'
  81.     diphoneCountSorted = sorted(diphoneCount.iteritems(), key=operator.itemgetter(1), reverse=True)
  82.     for countTuple in diphoneCountSorted:
  83.         fhOutput.write(u'%s : %d\n' % (countTuple[0], countTuple[1]));
  84.    
  85.     for k in range(len(diphoneCountSorted)):
  86.         countTuple = diphoneCountSorted[k];
  87.         fhOutput2.write(u'%s : %d : %s\n \n' % (countTuple[0], countTuple[1], '; '.join(diphoneExamples[countTuple[0]])));
  88.  
  89.  
  90.     #fhOutput.write(diphoneCount);
  91.         #fhOutputOrtho.write(lineO + '\n');
  92.  
  93.     fhInput.close();
  94.     fhOutput.close();
  95.    
  96. if __name__ == '__main__':
  97. ##  s = '-Y vosotros,';
  98. ##  print normalizeOrthography(s);
  99.     main();
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement