Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
class Doc2VecSentences(gensim.interfaces.CorpusABC):
    """Stream the documents of a pyndri index as ``TaggedDocument`` objects.

    Each yielded document carries the dictionary entries of its tokens as
    ``words`` and its internal document id as the single tag.

    Args:
        index: The ``pyndri.Index`` to read documents from.
        dictionary: Mapping-like object that supports ``token_id in
            dictionary`` and ``dictionary[token_id]``.
        max_documents: Optional cap on how many documents are iterated,
            counted from ``index.document_base()``. ``None`` means no cap.
    """

    def __init__(self, index, dictionary, max_documents=None):
        assert isinstance(index, pyndri.Index)

        self.max_documents = max_documents
        self.dictionary = dictionary
        self.index = index

    def _maximum_document(self):
        """Return the exclusive upper bound of the document ids to visit."""
        if self.max_documents is None:
            return self.index.maximum_document()

        # Never run past the end of the index, even when the cap is larger
        # than the number of documents available.
        capped = self.max_documents + self.index.document_base()
        return min(capped, self.index.maximum_document())

    def __iter__(self):
        """Yield one ``TaggedDocument`` per document in the id range."""
        first_id = self.index.document_base()
        stop_id = self._maximum_document()

        for doc_id in range(first_id, stop_id):
            # The first element of the pair is unused here.
            _, token_ids = self.index.document(doc_id)

            words = []
            for token_id in token_ids:
                # Skip non-positive ids and ids absent from the dictionary.
                if token_id > 0 and token_id in self.dictionary:
                    words.append(self.dictionary[token_id])

            yield TaggedDocument(words=tuple(words), tags=[doc_id])

    def __len__(self):
        """Return the number of document ids covered by iteration."""
        stop_id = self._maximum_document()
        return stop_id - self.index.document_base()
class LDALSISentences(gensim.interfaces.CorpusABC):
    """Stream the documents of a pyndri index as bag-of-words vectors.

    Each yielded document is a list of ``(token_id, count)`` pairs, the
    sparse bag-of-words layout consumed by gensim's LDA/LSI models.

    Args:
        index: The ``pyndri.Index`` to read documents from.
        dictionary: Container supporting ``token_id in dictionary``; tokens
            that are not in it are dropped.
        max_documents: Optional cap on how many documents are iterated,
            counted from ``index.document_base()``. ``None`` means no cap.
    """

    def __init__(self, index, dictionary, max_documents=None):
        assert isinstance(index, pyndri.Index)

        self.index = index
        self.dictionary = dictionary
        self.max_documents = max_documents

    def _maximum_document(self):
        """Return the exclusive upper bound of the document ids to visit."""
        if self.max_documents is None:
            return self.index.maximum_document()

        # Clamp so the cap can never point past the end of the index.
        return min(
            self.max_documents + self.index.document_base(),
            self.index.maximum_document())

    def _doc2bow(self, doc):
        """Convert an iterable of token ids into a bag-of-words list.

        Args:
            doc: Iterable of hashable token ids.

        Returns:
            A list of ``(token_id, count)`` tuples sorted by token id; an
            empty list for an empty document.
        """
        # The previous implementation unpacked ``items()`` in swapped order
        # and therefore emitted ``(count, token_id)`` pairs.
        counts = collections.Counter(doc)
        return sorted(counts.items())

    def __iter__(self):
        """Yield one bag-of-words list per document in the id range."""
        for int_doc_id in range(self.index.document_base(),
                                self._maximum_document()):
            # The first element of the pair is not needed for the vector.
            _, tokens = self.index.document(int_doc_id)

            # Keep only positive ids that the dictionary knows about.
            kept = tuple(
                token_id
                for token_id in tokens
                if token_id > 0 and token_id in self.dictionary)

            yield self._doc2bow(kept)

    def __len__(self):
        """Return the number of document ids covered by iteration."""
        return self._maximum_document() - self.index.document_base()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement