Advertisement
Guest User

Untitled

a guest
Jan 23rd, 2017
136
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 2.27 KB | None | 0 0
  1. class Doc2VecSentences(gensim.interfaces.CorpusABC):
  2.  
  3. def __init__(self, index, dictionary, max_documents=None):
  4. assert isinstance(index, pyndri.Index)
  5.  
  6. self.index = index
  7. self.dictionary = dictionary
  8.  
  9. self.max_documents = max_documents
  10.  
  11. def _maximum_document(self):
  12. if self.max_documents is None:
  13. return self.index.maximum_document()
  14. else:
  15. return min(
  16. self.max_documents + self.index.document_base(),
  17. self.index.maximum_document())
  18.  
  19. def __iter__(self):
  20. for int_doc_id in range(self.index.document_base(),
  21. self._maximum_document()):
  22. ext_doc_id, tokens = self.index.document(int_doc_id)
  23.  
  24. tokens = tuple(
  25. self.dictionary[token_id]
  26. for token_id in tokens
  27. if token_id > 0 and token_id in self.dictionary)
  28.  
  29. yield TaggedDocument(words=tokens, tags=[int_doc_id])
  30.  
  31. def __len__(self):
  32. return self._maximum_document() - self.index.document_base()
  33.  
  34. class LDALSISentences(gensim.interfaces.CorpusABC):
  35.  
  36. def __init__(self, index, dictionary, max_documents=None):
  37. assert isinstance(index, pyndri.Index)
  38.  
  39. self.index = index
  40. self.dictionary = dictionary
  41.  
  42. self.max_documents = max_documents
  43.  
  44. def _maximum_document(self):
  45. if self.max_documents is None:
  46. return self.index.maximum_document()
  47. else:
  48. return min(
  49. self.max_documents + self.index.document_base(),
  50. self.index.maximum_document())
  51.  
  52. def _doc2bow(self, doc):
  53.  
  54. di = collections.defaultdict(int)
  55.  
  56. for token_id in doc:
  57. di[token_id] += 1
  58.  
  59. return [(key, value) for value, key in di.items()]
  60.  
  61. def __iter__(self):
  62. for int_doc_id in range(self.index.document_base(),
  63. self._maximum_document()):
  64. ext_doc_id, tokens = self.index.document(int_doc_id)
  65.  
  66. tokens = tuple(
  67. token_id
  68. for token_id in tokens
  69. if token_id > 0 and token_id in self.dictionary)
  70.  
  71. yield self._doc2bow(tokens)
  72.  
  73. def __len__(self):
  74. return self._maximum_document() - self.index.document_base()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement