# --- Pastebin page header (web-page residue, not part of the script) ---
# SHARE
# TWEET
# python lexeme_match.py
# a guest
# Jul 12th, 2014
# 197
# Never
import codecs
import io
import shlex
import sys
from optparse import OptionParser
# Command-line option parser.  It is kept at module level because code
# elsewhere in this file refers to the global name `parser` (to print the help
# text and to parse sys.argv).
parser = OptionParser()

# Input files
parser.add_option(
    "-f", "--freq-file", dest="freq_file", default=None,
    help="Frequency list (Tab delimited, UTF-8)", metavar="FILE")
# default=None is optparse's implicit default; spelled out so both file
# options read the same way.
parser.add_option(
    "-m", "--match-file", dest="match_file", default=None,
    help="Lexemes to match (Tab delimited, UTF-8)", metavar="FILE")

# Column names: matched as substrings of the header cells of the frequency
# list, so they must agree with that file's header row.
parser.add_option(
    "-l", dest="lexeme_column",
    help="Lexeme column", default='Standard (Newspaper) Orthography')
parser.add_option(
    "-r", dest="rank_column",
    help="Rank column", default='Word Ranking for General Learners')
class Lexeme(object):
    """A single entry of the frequency list: a lexeme and its rank.

    Inherits from ``object`` so that it is a new-style class on Python 2 as
    well as Python 3.
    """

    def __init__(self, lexeme, rank):
        """Store the lexeme text and its rank exactly as given."""
        self.lexeme = lexeme
        self.rank = rank

    def __repr__(self):
        """Return a debugging representation showing both fields."""
        return "Lexeme(%r, %r)" % (self.lexeme, self.rank)
# Width of one rank bucket and number of buckets in the coverage table.
# Everything ranked beyond the last boundary is folded into the final bucket.
WORD_RANGE_SIZE = 5000
MAX_WORD_RANGES = 10
# Upper bound on the number of data rows read from the frequency list.
MAX_FREQ_ROWS = 200000


def _usage_error(message):
    """Write *message* to stderr, print the option help, and exit with status 1."""
    sys.stderr.write(message + '\n')
    parser.print_help()
    sys.exit(1)


def _iter_records(fp):
    """Yield logical rows from *fp*, gluing physical lines while quotes are unbalanced.

    A quoted field may contain line breaks; a row is complete once the number
    of double quotes seen so far is even.  An unterminated row at end of file
    is still yielded so that no input is dropped.
    """
    record = ''
    for physical in fp:
        record += physical
        if record.count('"') % 2 == 0:
            yield record
            record = ''
    if record:
        yield record


def _split_cells(record):
    """Drop all double quotes from *record* and split it on tabs."""
    return record.replace('"', '').split('\t')


def _find_columns(header_cells, lexeme_name, rank_name):
    """Return (lexeme_col, rank_col): first header cells containing each name.

    Either index is None when no header cell contains the wanted name.
    """
    lexeme_col = rank_col = None
    for index, cell in enumerate(header_cells):
        if lexeme_col is None and lexeme_name in cell:
            lexeme_col = index
        if rank_col is None and rank_name in cell:
            rank_col = index
    return lexeme_col, rank_col


def _load_lexemes(path, lexeme_name, rank_name):
    """Read the frequency list and return ``(lexemes, data_rows)``.

    ``lexemes`` maps lexeme text to a Lexeme; the first row seen for a lexeme
    wins and rows with an empty rank are ignored.  ``data_rows`` is the number
    of rows read after the header (whether or not they produced an entry).

    Exits with a message on stderr when the file is empty or when the header
    row does not contain both requested columns.  A non-numeric rank still
    raises ValueError from float().
    """
    lexemes = {}
    data_rows = 0
    with io.open(path, encoding="utf-8") as fp:
        records = _iter_records(fp)
        header = next(records, None)
        if header is None:
            sys.exit("Frequency file '%s' is empty" % path)
        lexeme_col, rank_col = _find_columns(
            _split_cells(header), lexeme_name, rank_name)
        if lexeme_col is None:
            sys.exit("Column '%s' not found in '%s'" % (lexeme_name, path))
        if rank_col is None:
            sys.exit("Column '%s' not found in '%s'" % (rank_name, path))
        needed = max(lexeme_col, rank_col) + 1

        for record in records:
            cells = _split_cells(record)
            # Rows too short to hold both columns (e.g. a trailing blank
            # line) carry no usable data; skip them instead of crashing.
            if len(cells) >= needed:
                # Strip so that a value in the last column does not keep its
                # line terminator (which would make it unmatchable).
                lexeme = cells[lexeme_col].strip()
                rank = cells[rank_col].strip()
                if rank and lexeme not in lexemes:
                    lexemes[lexeme] = Lexeme(lexeme, float(rank))
            data_rows += 1
            if data_rows >= MAX_FREQ_ROWS:
                break
    return lexemes, data_rows


def _read_matches(path):
    """Return a list of ``(token_count, morph)`` pairs from the match file.

    A line with exactly two tab-separated fields is ``count<TAB>morph``; any
    other non-blank line contributes its first field with a count of 1.
    Blank lines are skipped.  The file is decoded as UTF-8 with an optional
    byte-order mark.
    """
    entries = []
    with io.open(path, encoding="utf-8-sig") as fp:
        for line in fp:
            if not line.strip():
                continue
            fields = line.split('\t')
            if len(fields) == 2:
                entries.append((int(fields[0].strip()), fields[1].strip()))
            else:
                entries.append((1, fields[0].strip()))
    return entries


def _rank_bucket(rank):
    """Map a rank to its bucket index, clamped to ``[0, MAX_WORD_RANGES - 1]``."""
    # Floor division keeps the index an int on Python 3 as well as Python 2.
    level = (int(rank) - 1) // WORD_RANGE_SIZE
    # Clamp low as well as high: a rank <= 0 would otherwise yield -1 and
    # silently be counted in the last bucket via negative indexing.
    return min(max(level, 0), MAX_WORD_RANGES - 1)


def process(options, args):
    """Report how many tokens of the match file fall into each rank bucket.

    Parameters:
        options: object with the attributes ``freq_file``, ``match_file``,
            ``lexeme_column`` and ``rank_column`` (as produced by the
            module-level option parser).
        args: leftover positional arguments; accepted for interface
            compatibility and not used.

    Prints a summary (matched/missing lexemes and tokens) followed by a
    cumulative coverage table to stdout.  Exits with status 1 when a required
    file option is missing or the frequency list has no usable header.
    """
    if not options.freq_file:
        _usage_error('Missing Frequency file')
    if not options.match_file:
        _usage_error('Missing match file')

    # Single-argument print(...) behaves identically as a Python 2 print
    # statement and as the Python 3 print function.
    print("Using lexeme DB from '%s'" % options.freq_file)
    print("Matching lexemes in '%s'" % options.match_file)
    print("Using '%s' lexemes" % options.lexeme_column)
    print("Using '%s' ranks" % options.rank_column)

    lexemes, data_rows = _load_lexemes(
        options.freq_file, options.lexeme_column, options.rank_column)
    # Reports rows read from the frequency list (not distinct dictionary keys).
    print("Loaded %d lexemes" % data_rows)

    entries = _read_matches(options.match_file)

    word_ranks = [0] * MAX_WORD_RANGES
    missing = 0
    missing_tok = 0
    total_tok = 0
    for num_tokens, morph in entries:
        total_tok += num_tokens
        found = lexemes.get(morph)
        if found is None:
            missing += 1
            missing_tok += num_tokens
        else:
            word_ranks[_rank_bucket(found.rank)] += num_tokens

    matched_tok = total_tok - missing_tok
    print("Lexemes matched: %d Missing %d" % (len(entries) - missing, missing))
    print("Tokens matched:  %d Missing %d" % (matched_tok, missing_tok))
    print("Coverage of found tokens")
    print(" Range Match Tot %")

    running_total = 0
    for i, count in enumerate(word_ranks):
        running_total += count
        low = i * WORD_RANGE_SIZE + 1
        if i < MAX_WORD_RANGES - 1:
            high = i * WORD_RANGE_SIZE + WORD_RANGE_SIZE
        else:
            # The last bucket is open-ended; label it with the row count.
            high = data_rows
        # Nothing matched: report 0% rather than dividing by zero.
        percent = 100.0 * running_total / matched_tok if matched_tok else 0.0
        print("%6d ~ %6d : %5d %5d %6.2f%%"
              % (low, high, count, running_total, percent))
if __name__ == "__main__":
    # parse_args() returns the parsed options plus any leftover positional
    # arguments; both are handed to process().
    (options, args) = parser.parse_args()
    process(options, args)

# Example options:
# -f "C:\temp\VDRJ_Ver1_1_Research_Top60894.csv"
# -m "C:\temp\deathnote-morphemes.tsv"
# -r "Word Ranking for General Learners"
# RAW Paste Data  (Pastebin page footer, not part of the script)
