Advertisement
Guest User

Untitled

a guest
Jun 20th, 2019
106
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 2.61 KB | None | 0 0
  1. df.info()
  2. <class 'dask.dataframe.core.DataFrame'>
  3. Columns: 5 entries, claim_no to litigation
  4. dtypes: object(2), int64(3)
  5.  
  6. claim_no claim_txt I CL ICC lit
  7. 0 8697278-17 battery comprising interior battery active ele... 106 2 0
  8.  
  9. >>tagged_document[0]
  10. >>TaggedDocument(words=['battery', 'comprising', 'interior', 'battery', 'active', 'elements', 'battery', 'cell', 'casing', 'said', 'cell', 'casing', 'comprising', 'first', 'casing', 'element', 'first', 'contact', 'surface', 'second', 'casing', 'element', 'second', 'contact', 'surface', 'wherein', 'assembled', 'position', 'first', 'second', 'contact', 'surfaces', 'contact', 'first', 'second', 'casing', 'elements', 'encase', 'active', 'materials', 'battery', 'cell', 'interior', 'space', 'wherein', 'least', 'one', 'gas', 'tight', 'seal', 'layer', 'arranged', 'first', 'second', 'contact', 'surfaces', 'seal', 'interior', 'space', 'characterized', 'one', 'first', 'second', 'contact', 'surfaces', 'comprises', 'electrically', 'insulating', 'void', 'volume', 'layer', 'first', 'second', 'contact', 'surfaces', 'comprises', 'formable', 'material', 'layer', 'fills', 'voids', 'surface', 'void', 'volume', 'layer', 'hermetically', 'assembled', 'position', 'form', 'seal', 'layer'], tags=['8697278-17'])
  11. >>len(tagged_document) == len(df['claim_txt'])
  12.  
  def read_corpus_tag_sub(df, corp='claim_txt', tags=['claim_no']):
      """Lazily yield one ``TaggedDocument`` per row of ``df``.

      Args:
          df: Frame holding the text column and the tag columns
              (presumably a pandas DataFrame or one partition of a dask
              DataFrame -- confirm against the caller).
          corp: Label of the column containing the raw text.
          tags: List of column labels whose per-row values become the
              document's tags. The default list is only read, never mutated.

      Yields:
          ``gensim.models.doc2vec.TaggedDocument`` whose words are produced by
          ``gensim.utils.simple_preprocess`` and whose tags are the row's
          values in the ``tags`` columns.
      """
      # Pair text and tags by position. Looking tags up with
      # ``df.loc[i, tags]`` where ``i`` comes from enumerate() treats a
      # 0-based counter as an index label, which only works when the index
      # is exactly 0..n-1 (not guaranteed, e.g. for a later partition).
      tag_rows = df[tags].itertuples(index=False, name=None)
      for line, tag_row in zip(df[corp], tag_rows):
          yield gensim.models.doc2vec.TaggedDocument(
              gensim.utils.simple_preprocess(line), list(tag_row)
          )
  16.  
  # Run read_corpus_tag_sub on every partition of df and materialise the result.
  # NOTE(review): meta is given as the TaggedDocument class; dask documents
  # meta as an empty pandas object or a dtype/name spec -- confirm this works.
  tagged_document = (
      df.map_partitions(read_corpus_tag_sub, meta=TaggedDocument)
      .compute()
  )
  19.  
  def read_corpus_tag_sub(df, corp='claim_txt', tags=['claim_no']):
      """Return one ``TaggedDocument`` per row of ``df`` as a list.

      Args:
          df: Frame holding the text column and the tag columns
              (presumably a pandas DataFrame or one partition of a dask
              DataFrame -- confirm against the caller).
          corp: Label of the column containing the raw text.
          tags: List of column labels whose per-row values become the
              document's tags. The default list is only read, never mutated.

      Returns:
          A list of ``gensim.models.doc2vec.TaggedDocument``, in row order;
          empty when ``df`` has no rows.
      """
      # The earlier form had ``return`` inside the loop, so it stopped after
      # the first row (and returned None for an empty frame). Build the whole
      # list instead, pairing text and tags by position rather than using the
      # enumerate() counter as an index label.
      tag_rows = df[tags].itertuples(index=False, name=None)
      return [
          gensim.models.doc2vec.TaggedDocument(
              gensim.utils.simple_preprocess(line), list(tag_row)
          )
          for line, tag_row in zip(df[corp], tag_rows)
      ]
  23.  
  # Apply read_corpus_tag_sub partition by partition, then compute eagerly.
  # NOTE(review): meta is given as the TaggedDocument class; dask documents
  # meta as an empty pandas object or a dtype/name spec -- confirm this works.
  tagged_document = (
      df.map_partitions(read_corpus_tag_sub, meta=TaggedDocument)
      .compute()
  )
  26.  
  def read_corpus_tag_sub(df, corp='claim_txt', tags=['claim_no']):
      """Build a list with one ``TaggedDocument`` for every row of ``df``.

      Args:
          df: Frame holding the text column and the tag columns
              (presumably a pandas DataFrame or one partition of a dask
              DataFrame -- confirm against the caller).
          corp: Label of the column containing the raw text.
          tags: List of column labels whose per-row values become the
              document's tags. The default list is only read, never mutated.

      Returns:
          A list of ``gensim.models.doc2vec.TaggedDocument`` in row order.
      """
      make_doc = gensim.models.doc2vec.TaggedDocument
      tokenize = gensim.utils.simple_preprocess
      # Iterate tags positionally alongside the text; ``df.loc[i, tags]`` with
      # an enumerate() counter is a label lookup and fails or mis-pairs rows
      # whenever the index is not exactly 0..n-1.
      tag_rows = df[tags].itertuples(index=False, name=None)
      return [
          make_doc(tokenize(line), list(tag_row))
          for line, tag_row in zip(df[corp], tag_rows)
      ]
  33.  
  def tag_corp(corp, tag):
      """Wrap one text and one tag into a gensim ``TaggedDocument``.

      Args:
          corp: Raw text; it is tokenised with ``gensim.utils.simple_preprocess``.
          tag: Single tag value; stored as a one-element list.

      Returns:
          ``gensim.models.doc2vec.TaggedDocument`` for the given text and tag.
      """
      words = gensim.utils.simple_preprocess(corp)
      doc_tags = [tag]
      return gensim.models.doc2vec.TaggedDocument(words, doc_tags)
  36.  
  # Tag every (text, claim number) pair of the sample frame. zip() is consumed
  # directly; wrapping it in list() only built a throwaway copy.
  tagged_document = [
      tag_corp(text, claim_no)
      for text, claim_no in zip(df_smple['claim_txt'], df_smple['claim_no'])
  ]

  # Alternative: collect whatever read_corpus_tag_sub produces for the full frame.
  tagged_document = list(read_corpus_tag_sub(df))
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement