import random
import time

# ConllEntry, isProj and inorder are defined elsewhere in the surrounding
# module; minimal stand-ins are sketched after the function.

def read_conll_kalimat(fh, language=None, maxSize=-1, hard_lim=False, vocab_prep=False, drop_nproj=False):
    # Generator: yields one sentence at a time as a list starting with the
    # artificial root, containing ConllEntry objects plus raw comment and
    # multi-word/empty-node lines kept as strings.
    # hard lim means capping the corpus size across the whole training procedure
    # soft lim means using a sample of the whole corpus at each epoch
    #fh = codecs.open(filename, 'r', encoding='utf-8')
    print("read...")
    if vocab_prep and not hard_lim:
        maxSize = -1  # when preparing the vocab with a soft limit we need to use the whole corpus
    ts = time.time()
    dropped = 0
    read = 0
    root = ConllEntry(0, '*root*', '*root*', 'ROOT-POS', 'ROOT-CPOS', '_', -1, 'rroot', '_', '_')
    root.language_id = language
    tokens = [root]
    yield_count = 0
    if maxSize > 0 and not hard_lim:
        all_tokens = []
    for line in fh:
        tok = line.strip().split('\t')
        if not tok or line.strip() == '':
            # blank line marks the end of a sentence
            if len(tokens) > 1:
                conll_tokens = [t for t in tokens if isinstance(t, ConllEntry)]
                if not drop_nproj or isProj(conll_tokens):
                    # keep going if it's projective or we're not dropping non-projective sents
                    #dropping the proj for exploring swap
                    #if not isProj([t for t in tokens if isinstance(t, ConllEntry)]):
                    inorder_tokens = inorder(conll_tokens)
                    for i, t in enumerate(inorder_tokens):
                        t.projective_order = i
                    for tok in conll_tokens:
                        tok.rdeps = [i.id for i in conll_tokens if i.parent_id == tok.id]
                        if tok.id != 0:
                            tok.parent_entry = [i for i in conll_tokens if i.id == tok.parent_id][0]
                    if maxSize > 0:
                        if not hard_lim:
                            all_tokens.append(tokens)
                        elif yield_count < maxSize:
                            yield tokens
                            yield_count += 1
                        else:
                            print("Capping size of corpus at " + str(yield_count) + " sentences")
                            break
                    else:
                        yield tokens
                else:
                    #print('Non-projective sentence dropped')
                    dropped += 1
                read += 1
                tokens = [root]
        else:
            # keep comments and multi-word/empty-node ids as raw lines
            if line[0] == '#' or '-' in tok[0] or '.' in tok[0]:
                tokens.append(line.strip())
            else:
                token = ConllEntry(int(tok[0]), tok[1], tok[2], tok[4], tok[3], tok[5],
                                   int(tok[6]) if tok[6] != '_' else -1,
                                   tok[7], tok[8], tok[9])
                token.language_id = language
                tokens.append(token)
    if hard_lim and yield_count < maxSize:
        print('Warning: unable to yield ' + str(maxSize) + ' sentences, only ' + str(yield_count) + ' found')
    # TODO: deal with case where there are still unyielded tokens
    # e.g. when there is no newline at end of file
    # if len(tokens) > 1:
    #     yield tokens
    print(read, 'sentences read')
    if maxSize > 0 and not hard_lim:
        random.shuffle(all_tokens)
        all_tokens = all_tokens[:maxSize]
        print("Yielding " + str(len(all_tokens)) + " random sentences")
        for toks in all_tokens:
            yield toks
    te = time.time()
    print('Time: %.2gs' % (te - ts))
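ConllEntry, isProj and inorder are not part of the paste. The sketch below is a hypothetical reconstruction, inferred only from how the function above calls them: the constructor's ten positional arguments, the attributes that get set on each entry, a projectivity test, and an in-order traversal. A real parser's versions will differ.

# Hypothetical stand-ins inferred from the call sites above, not taken from
# any particular parser; shown only to make the snippet self-contained.
class ConllEntry(object):
    def __init__(self, id, form, lemma, pos, cpos, feats=None,
                 parent_id=None, relation=None, deps=None, misc=None):
        self.id = id                # position in the sentence (0 = artificial root)
        self.form = form            # surface form
        self.lemma = lemma
        self.pos = pos
        self.cpos = cpos
        self.feats = feats
        self.parent_id = parent_id  # head position, -1 for '_'
        self.relation = relation    # dependency label
        self.deps = deps
        self.misc = misc
        # filled in later by read_conll_kalimat:
        self.language_id = None
        self.projective_order = None
        self.rdeps = []
        self.parent_entry = None

def isProj(sentence):
    # A tree is projective iff no two dependency arcs cross; quadratic check,
    # fine for sentence-sized inputs. Assumes ids 0..n with the root at 0 and
    # skips tokens whose head is unknown (parent_id == -1).
    arcs = [(min(t.id, t.parent_id), max(t.id, t.parent_id))
            for t in sentence if t.id != 0 and t.parent_id >= 0]
    return not any(a < c < b < d for (a, b) in arcs for (c, d) in arcs)

def inorder(sentence):
    # In-order traversal of the tree: left dependents, then the head, then
    # right dependents. Assumes sentence[i].id == i and a well-formed tree.
    def visit(i):
        out = []
        for t in sentence:
            if t.parent_id == i and t.id < i:
                out += visit(t.id)
        out.append(sentence[i])
        for t in sentence:
            if t.parent_id == i and t.id > i:
                out += visit(t.id)
        return out
    return visit(0)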
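A usage sketch, assuming the stand-ins above and a CoNLL-style file at the made-up path train.conllu (language='en' is likewise a made-up value). Because the function is a generator, the hard-limit path streams sentences and stops reading the file early, while the soft-limit path only starts yielding after the whole file has been read, shuffled, and truncated.

# Hypothetical usage; 'train.conllu' and language='en' are made-up values.
with open('train.conllu', encoding='utf-8') as fh:
    # hard limit: yields at most 500 sentences, then stops reading the file
    for sentence in read_conll_kalimat(fh, language='en', maxSize=500, hard_lim=True):
        words = [t.form for t in sentence if isinstance(t, ConllEntry) and t.id != 0]
        print(len(words), ' '.join(words))

with open('train.conllu', encoding='utf-8') as fh:
    # soft limit: reads everything, then yields a random sample of 500;
    # re-running this each epoch draws a different sample
    epoch_sample = list(read_conll_kalimat(fh, language='en', maxSize=500))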