SHARE
TWEET

read_conll_kalimat

rifalpg Jun 7th, 2019 18 Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. def read_conll_kalimat(fh, language=None, maxSize=-1, hard_lim=False, vocab_prep=False, drop_nproj=False):
  2.     # hard lim means capping the corpus size across the whole training procedure
  3.     # soft lim means using a sample of the whole corpus at each epoch
  4.     #fh = codecs.open(filename,'r',encoding='utf-8')
  5.     print "read..."
  6.     if vocab_prep and not hard_lim:
  7.         maxSize = -1 # when preparing the vocab with a soft limit we need to use the whole corpus
  8.     ts = time.time()
  9.     dropped = 0
  10.     read = 0
  11.     root = ConllEntry(0, '*root*', '*root*', 'ROOT-POS', 'ROOT-CPOS', '_', -1, 'rroot', '_', '_')
  12.     root.language_id = language
  13.     tokens = [root]
  14.     yield_count = 0
  15.     if maxSize > 0 and not hard_lim:
  16.         all_tokens = []
  17.     for line in fh:
  18.         tok = line.strip().split('\t')
  19.         if not tok or line.strip() == '':
  20.             if len(tokens)>1:
  21.                 conll_tokens = [t for t in tokens if isinstance(t,ConllEntry)]
  22.                 if not drop_nproj or isProj(conll_tokens): # keep going if it's projective or we're not dropping non-projective sents
  23.                 #dropping the proj for exploring swap
  24.                 #if not isProj([t for t in tokens if isinstance(t, ConllEntry)]):
  25.                     inorder_tokens = inorder(conll_tokens)
  26.                     for i,t in enumerate(inorder_tokens):
  27.                         t.projective_order = i
  28.                     for tok in conll_tokens:
  29.                         tok.rdeps = [i.id for i in conll_tokens if i.parent_id == tok.id]
  30.                         if tok.id != 0:
  31.                             tok.parent_entry = [i for i in conll_tokens if i.id == tok.parent_id][0]
  32.                     if maxSize > 0:
  33.                         if not hard_lim:
  34.                             all_tokens.append(tokens)
  35.                         elif yield_count < maxSize:
  36.                             yield tokens
  37.                             yield_count += 1
  38.                         else:
  39.                             print "Capping size of corpus at " + str(yield_count) + " sentences"
  40.                             break;
  41.                     else:
  42.                         yield tokens
  43.                 else:
  44.                     #print 'Non-projective sentence dropped'
  45.                     dropped += 1
  46.                 read += 1
  47.             tokens = [root]
  48.         else:
  49.             if line[0] == '#' or '-' in tok[0] or '.' in tok[0]:
  50.                 tokens.append(line.strip())
  51.             else:
  52.                 token = ConllEntry(int(tok[0]), tok[1], tok[2], tok[4], tok[3], tok[5], int(tok[6]) if tok[6] != '_' else -1, tok[7], tok[8], tok[9])
  53.                 token.language_id = language
  54.                 tokens.append(token)
  55.     if hard_lim and yield_count < maxSize:
  56.         print 'Warning: unable to yield ' + str(maxSize) + ' sentences, only ' + str(yield_count) + ' found'
  57.  
  58. # TODO: deal with case where there are still unyielded tokens
  59. # e.g. when there is no newline at end of file
  60. #    if len(tokens) > 1:
  61. #        yield tokens
  62.  
  63.     print read, 'sentences read'
  64.  
  65.     if maxSize > 0 and not hard_lim:
  66.         random.shuffle(all_tokens)
  67.         all_tokens = all_tokens[:maxSize]
  68.         print "Yielding " + str(len(all_tokens)) + " random sentences"
  69.         for toks in all_tokens:
  70.             yield toks
  71.  
  72.     te = time.time()
  73.     print 'Time: %.2gs'%(te-ts)
RAW Paste Data
We use cookies for various purposes including analytics. By continuing to use Pastebin, you agree to our use of cookies as described in the Cookies Policy. OK, I Understand
Top