# lexeme_match.py
#
# Matches a list of lexemes against a tab-delimited frequency/ranking list
# and reports token coverage by rank band.
#
# Usage: python lexeme_match.py -f FREQ_FILE -m MATCH_FILE
from optparse import OptionParser
import sys
import codecs
import shlex  # NOTE(review): imported but never used below — confirm before removing

# Command-line interface for the script.  `process()` below reads these
# options; the parser itself is also used there to print help on bad input.
parser = OptionParser()

# Input files
parser.add_option("-f", "--freq-file", dest="freq_file", default=None,
                  help="Frequency list (Tab delimited, UTF-8)", metavar="FILE")
parser.add_option("-m", "--match-file", dest="match_file",
                  help="Lexemes to match (Tab delimited, UTF-8)", metavar="FILE")

# Column names — header substrings used to locate the lexeme and rank
# columns in the frequency file (defaults match the VDRJ research list).
parser.add_option("-l", dest="lexeme_column",
                  help="Lexeme column", default='Standard (Newspaper) Orthography')
parser.add_option("-r", dest="rank_column",
                  help="Rank column", default='Word Ranking for General Learners')
  20. class Lexeme:
  21.     def __init__(self, lexeme, rank):
  22.         self.lexeme = lexeme
  23.         self.rank = rank
  24.  
def process(options, args):
    """Load the frequency list, match the lexemes file against it, and
    print a coverage report by 5000-word rank band.

    options: optparse values — freq_file, match_file, lexeme_column,
             rank_column (see the parser definition above).
    args:    unused positional arguments from optparse.

    Exits with status 1 (after printing help) when either input file
    option is missing.  All output goes to stdout.  Python 2 only:
    uses print statements and relies on Py2 integer division.
    """
    if not options.freq_file:
        print >> sys.stderr, 'Missing Frequency file'
        parser.print_help()
        sys.exit(1)
    if not options.match_file:
        print >> sys.stderr, 'Missing match file'
        parser.print_help()
        sys.exit(1)
    suw_fp = codecs.open(options.freq_file, "rb", encoding="utf-8")

    print "Using lexeme DB from '%s'" % options.freq_file
    print "Matching lexemes in '%s'" % options.match_file
    print "Using '%s' lexemes" % options.lexeme_column
    print "Using '%s' ranks" % options.rank_column
   
    # Bands of 5000 ranks each; everything past band 10 is folded into
    # the last band below.
    WORD_RANGE_SIZE = 5000
    MAX_WORD_RANGES = 10

    lexemes = {}       # lexeme string -> Lexeme (first occurrence wins)
    readings = {}      # NOTE(review): populated nowhere — confirm dead code
    wordRanks = [0] * MAX_WORD_RANGES   # matched-token count per rank band
    lexeme_col = None  # column index of the lexeme column (from header row)
    rank_col = None    # column index of the rank column (from header row)
    n = 0              # rows read, including the header row
    with suw_fp:
        # Read line
        while True:
            line = ''
            # Deal with quoted multi line fields: keep appending physical
            # lines until the double quotes in the buffer are balanced.
            while True:
                newline = suw_fp.readline()
                if not newline:
                    break
                line += newline
                # Stop if quotes are balanced
                if (line.count('"') % 2) == 0:
                    break
            if line == '':
                break

            # Split into fields; quotes are stripped rather than parsed,
            # so embedded tabs inside quoted fields would still split.
            cells = line.replace('"', '').split('\t')
            #print n, cells
            if n == 0:
                # Find column numbers in header row by substring match.
                # NOTE(review): `== None` should be `is None` (style only).
                for i, c in enumerate(cells):
                    if lexeme_col == None and options.lexeme_column in c:
                        lexeme_col = i
                    if rank_col == None and options.rank_column in c:
                        rank_col = i
            else:
                # Read lexeme; rows with an empty rank cell are skipped,
                # and only the first occurrence of a lexeme is kept.
                lexeme = cells[lexeme_col]
                rank = cells[rank_col]
                if (rank != '') and (lexeme not in lexemes):
                    lexemes[lexeme] = Lexeme(lexeme, float(rank))
            n += 1
            # Hard safety cap on the number of rows read.
            if n > 200000:
                break

    print "Loaded %d lexemes" % (n - 1)

    # Match file: either "count<TAB>lexeme" or just "lexeme" per line.
    # utf-8-sig transparently drops a leading BOM if present.
    dn_fp = codecs.open(options.match_file, "rb", encoding="utf-8-sig")
    dn_matches = [x.split('\t') for x in dn_fp.readlines()]
    dn_fp.close()

    missing = 0        # distinct match-file lines not found in the DB
    missing_tok = 0    # token count carried by those missing lines
    total_tok = 0      # total token count across all match-file lines
    for i, m in enumerate(dn_matches):
        if len(m) < 1:
            continue
        if len(m) == 2:
            num_tokens = int(m[0].strip())
            morph = m[1].strip()
        else:
            num_tokens = 1
            morph = m[0].strip()
        total_tok += num_tokens
        if morph not in lexemes:
            missing += 1
            missing_tok += num_tokens
        else:
            lex = lexemes[morph]
            # Rank band index; relies on Python 2 integer `/` division.
            level = (int(lex.rank) - 1)  / WORD_RANGE_SIZE
            if level > MAX_WORD_RANGES - 1:
                level = MAX_WORD_RANGES - 1
                #print "Rare", m[0]
            wordRanks[level] += num_tokens

    print "Lexemes matched:", len(dn_matches) - missing, "Missing", missing
    print "Tokens matched: ", total_tok - missing_tok, "Missing", missing_tok
    print "Coverage of found tokens"
    runningTotal = 0
    print "          Range   Match   Tot       %"
    # Cumulative coverage table; trailing commas keep the range label and
    # the counts on one printed line (Python 2 print semantics).
    # NOTE(review): divides by (total_tok - missing_tok) — raises
    # ZeroDivisionError when no tokens matched at all; confirm acceptable.
    for i, r in enumerate(wordRanks):
        runningTotal += r
        if i < MAX_WORD_RANGES-1:
            print "%6d ~ %6d :" % (i * WORD_RANGE_SIZE + 1, i * WORD_RANGE_SIZE + WORD_RANGE_SIZE),
        else:
            print "%6d ~ %6d :" % (i * WORD_RANGE_SIZE + 1, n-1),
        print "%5d" % r, "%5d" % runningTotal, "%6.2f%%" % (100.0 * float(runningTotal) / (total_tok - missing_tok))
  129. if __name__ == "__main__":
  130.     (options, args) = parser.parse_args()
  131.     process(options, args)
  132.  
# Example options:
#   -f "C:\temp\VDRJ_Ver1_1_Research_Top60894.csv"
#   -m "C:\temp\deathnote-morphemes.tsv"
#   -r "Word Ranking for General Learners"