SHARE
TWEET

lexeme_match.py

a guest Jul 10th, 2014 215 Never
  1. from optparse import OptionParser
  2. import sys
  3. import codecs
  4. import shlex
  5.  
  6. parser = OptionParser()
  7.  
  8. # Input files
  9. parser.add_option("-f", "--freq-file", dest="freq_file", default=None,
  10.                   help="Frequency list (Tab delimited, UTF-8)", metavar="FILE")
  11. parser.add_option("-m", "--match-file", dest="match_file",
  12.                   help="Lexemes to match (Tab delimited, UTF-8)", metavar="FILE")
  13.  
  14. # Column names
  15. parser.add_option("-l", dest="lexeme_column",
  16.                   help="Lexeme column", default='Standard (Newspaper) Orthography')
  17. parser.add_option("-r", dest="rank_column",
  18.                   help="Rank column", default='Word Ranking for General Learners')
  19.  
  20. class Lexeme:
  21.     def __init__(self, lexeme, rank):
  22.         self.lexeme = lexeme
  23.         self.rank = rank
  24.  
  25. def process(options, args):
  26.     if not options.freq_file:
  27.         print >> sys.stderr, 'Missing Frequency file'
  28.         parser.print_help()
  29.         sys.exit(1)
  30.     if not options.match_file:
  31.         print >> sys.stderr, 'Missing match file'
  32.         parser.print_help()
  33.         sys.exit(1)
  34.     suw_fp = codecs.open(options.freq_file, "rb", encoding="utf-8")
  35.  
  36.     print "Using lexeme DB from '%s'" % options.freq_file
  37.     print "Matching lexemes in '%s'" % options.match_file
  38.     print "Using '%s' lexemes" % options.lexeme_column
  39.     print "Using '%s' ranks" % options.rank_column
  40.    
  41.     WORD_RANGE_SIZE = 5000
  42.     MAX_WORD_RANGES = 10
  43.  
  44.     lexemes = {}
  45.     readings = {}
  46.     wordRanks = [0] * MAX_WORD_RANGES
  47.     lexeme_col = None
  48.     rank_col = None
  49.     n = 0
  50.     with suw_fp:
  51.         # Read line
  52.         while True:
  53.             line = ''
  54.             # Deal with quoted multi line fields
  55.             while True:
  56.                 newline = suw_fp.readline()
  57.                 if not newline:
  58.                     break
  59.                 line += newline
  60.                 # Stop if quotes are balanced
  61.                 if (line.count('"') % 2) == 0:
  62.                     break
  63.             if line == '':
  64.                 break
  65.  
  66.             # Split into fields
  67.             cells = line.replace('"', '').split('\t')
  68.             #print n, cells
  69.             if n == 0:
  70.                 # Find column numbers in header row
  71.                 for i, c in enumerate(cells):
  72.                     if lexeme_col == None and options.lexeme_column in c:
  73.                         lexeme_col = i
  74.                     if rank_col == None and options.rank_column in c:
  75.                         rank_col = i
  76.             else:
  77.                 # Read lexeme
  78.                 lexeme = cells[lexeme_col]
  79.                 rank = cells[rank_col]
  80.                 if (rank != '') and (lexeme not in lexemes):
  81.                     lexemes[lexeme] = Lexeme(lexeme, float(rank))
  82.             n += 1
  83.             if n > 200000:
  84.                 break
  85.  
  86.     print "Loaded %d lexemes" % (n - 1)
  87.  
  88.     dn_fp = codecs.open(options.match_file, "rb", encoding="utf-8")
  89.     dn_matches = [x.strip().split('\t') for x in dn_fp.readlines()]
  90.     dn_fp.close()
  91.  
  92.     missing = 0
  93.     for m in dn_matches:
  94.         if len(m) < 1:
  95.             continue
  96.         if m[0] not in lexemes:
  97.             missing += 1
  98.         else:
  99.             lex = lexemes[m[0]]
  100.             level = (int(lex.rank) - 1)  / WORD_RANGE_SIZE
  101.             if level > MAX_WORD_RANGES - 1:
  102.                 level = MAX_WORD_RANGES - 1
  103.                 #print "Rare", m[0]
  104.             wordRanks[level] += 1
  105.  
  106.     print "Found", len(dn_matches) - missing, "Missing", missing
  107.     print "Coverage of found lexemes"
  108.     runningTotal = 0
  109.     print "          Range   Match   Tot       %"
  110.     for i, r in enumerate(wordRanks):
  111.         runningTotal += r
  112.         if i < MAX_WORD_RANGES-1:
  113.             print "%6d ~ %6d :" % (i * WORD_RANGE_SIZE + 1, i * WORD_RANGE_SIZE + WORD_RANGE_SIZE),
  114.         else:
  115.             print "%6d ~ %6d :" % (i * WORD_RANGE_SIZE + 1, n-1),
  116.         print "%5d" % r, "%5d" % runningTotal, "%6.2f%%" % (100.0 * float(runningTotal) / (len(dn_matches) - missing))
  117.  
  118. if __name__ == "__main__":
  119.     (options, args) = parser.parse_args()
  120.     process(options, args)
  121.  
  122. # Example options:
  123. #   -f "C:\temp\VDRJ_Ver1_1_Research_Top60894.csv"
  124. #   -m "C:\temp\deathnote-morphemes.tsv"
  125. #   -r "Word Ranking for General Learners"
RAW Paste Data
Top