import random
import time

# ConllEntry, isProj and inorder are defined elsewhere in the surrounding
# module; minimal stand-ins are sketched after the function.

def read_conll_kalimat(fh, language=None, maxSize=-1, hard_lim=False, vocab_prep=False, drop_nproj=False):
    # Generator: yields one sentence at a time as a list starting with the
    # artificial root, containing ConllEntry objects plus raw comment and
    # multi-word/empty-node lines kept as strings.
    # hard lim means capping the corpus size across the whole training procedure
    # soft lim means using a sample of the whole corpus at each epoch
    #fh = codecs.open(filename, 'r', encoding='utf-8')
    print("read...")
    if vocab_prep and not hard_lim:
        maxSize = -1  # when preparing the vocab with a soft limit we need to use the whole corpus
    ts = time.time()
    dropped = 0
    read = 0
    root = ConllEntry(0, '*root*', '*root*', 'ROOT-POS', 'ROOT-CPOS', '_', -1, 'rroot', '_', '_')
    root.language_id = language
    tokens = [root]
    yield_count = 0
    if maxSize > 0 and not hard_lim:
        all_tokens = []
    for line in fh:
        tok = line.strip().split('\t')
        if not tok or line.strip() == '':
            # blank line marks the end of a sentence
            if len(tokens) > 1:
                conll_tokens = [t for t in tokens if isinstance(t, ConllEntry)]
                if not drop_nproj or isProj(conll_tokens):
                    # keep going if it's projective or we're not dropping non-projective sents
                    #dropping the proj for exploring swap
                    #if not isProj([t for t in tokens if isinstance(t, ConllEntry)]):
                    inorder_tokens = inorder(conll_tokens)
                    for i, t in enumerate(inorder_tokens):
                        t.projective_order = i
                    for tok in conll_tokens:
                        tok.rdeps = [i.id for i in conll_tokens if i.parent_id == tok.id]
                        if tok.id != 0:
                            tok.parent_entry = [i for i in conll_tokens if i.id == tok.parent_id][0]
                    if maxSize > 0:
                        if not hard_lim:
                            all_tokens.append(tokens)
                        elif yield_count < maxSize:
                            yield tokens
                            yield_count += 1
                        else:
                            print("Capping size of corpus at " + str(yield_count) + " sentences")
                            break
                    else:
                        yield tokens
                else:
                    #print('Non-projective sentence dropped')
                    dropped += 1
                read += 1
                tokens = [root]
        else:
            # keep comments and multi-word/empty-node ids as raw lines
            if line[0] == '#' or '-' in tok[0] or '.' in tok[0]:
                tokens.append(line.strip())
            else:
                token = ConllEntry(int(tok[0]), tok[1], tok[2], tok[4], tok[3], tok[5],
                                   int(tok[6]) if tok[6] != '_' else -1,
                                   tok[7], tok[8], tok[9])
                token.language_id = language
                tokens.append(token)
    if hard_lim and yield_count < maxSize:
        print('Warning: unable to yield ' + str(maxSize) + ' sentences, only ' + str(yield_count) + ' found')
    # TODO: deal with case where there are still unyielded tokens
    # e.g. when there is no newline at end of file
    # if len(tokens) > 1:
    #     yield tokens
    print(read, 'sentences read')
    if maxSize > 0 and not hard_lim:
        random.shuffle(all_tokens)
        all_tokens = all_tokens[:maxSize]
        print("Yielding " + str(len(all_tokens)) + " random sentences")
        for toks in all_tokens:
            yield toks
    te = time.time()
    print('Time: %.2gs' % (te - ts))
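ConllEntry, isProj and inorder are not part of the paste. The sketch below is a hypothetical reconstruction, inferred only from how the function above calls them: the constructor's ten positional arguments, the attributes that get set on each entry, a projectivity test, and an in-order traversal. A real parser's versions will differ.

# Hypothetical stand-ins inferred from the call sites above, not taken from
# any particular parser; shown only to make the snippet self-contained.
class ConllEntry(object):
    def __init__(self, id, form, lemma, pos, cpos, feats=None,
                 parent_id=None, relation=None, deps=None, misc=None):
        self.id = id                # position in the sentence (0 = artificial root)
        self.form = form            # surface form
        self.lemma = lemma
        self.pos = pos
        self.cpos = cpos
        self.feats = feats
        self.parent_id = parent_id  # head position, -1 for '_'
        self.relation = relation    # dependency label
        self.deps = deps
        self.misc = misc
        # filled in later by read_conll_kalimat:
        self.language_id = None
        self.projective_order = None
        self.rdeps = []
        self.parent_entry = None

def isProj(sentence):
    # A tree is projective iff no two dependency arcs cross; quadratic check,
    # fine for sentence-sized inputs. Assumes ids 0..n with the root at 0 and
    # skips tokens whose head is unknown (parent_id == -1).
    arcs = [(min(t.id, t.parent_id), max(t.id, t.parent_id))
            for t in sentence if t.id != 0 and t.parent_id >= 0]
    return not any(a < c < b < d for (a, b) in arcs for (c, d) in arcs)

def inorder(sentence):
    # In-order traversal of the tree: left dependents, then the head, then
    # right dependents. Assumes sentence[i].id == i and a well-formed tree.
    def visit(i):
        out = []
        for t in sentence:
            if t.parent_id == i and t.id < i:
                out += visit(t.id)
        out.append(sentence[i])
        for t in sentence:
            if t.parent_id == i and t.id > i:
                out += visit(t.id)
        return out
    return visit(0)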
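A usage sketch, assuming the stand-ins above and a CoNLL-style file at the made-up path train.conllu (language='en' is likewise a made-up value). Because the function is a generator, the hard-limit path streams sentences and stops reading the file early, while the soft-limit path only starts yielding after the whole file has been read, shuffled, and truncated.

# Hypothetical usage; 'train.conllu' and language='en' are made-up values.
with open('train.conllu', encoding='utf-8') as fh:
    # hard limit: yields at most 500 sentences, then stops reading the file
    for sentence in read_conll_kalimat(fh, language='en', maxSize=500, hard_lim=True):
        words = [t.form for t in sentence if isinstance(t, ConllEntry) and t.id != 0]
        print(len(words), ' '.join(words))

with open('train.conllu', encoding='utf-8') as fh:
    # soft limit: reads everything, then yields a random sample of 500;
    # re-running this each epoch draws a different sample
    epoch_sample = list(read_conll_kalimat(fh, language='en', maxSize=500))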