Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
from gensim.models import Phrases

# Toy corpus: each string is one sentence with single-space-separated tokens.
documents = [
    "the mayor of new york was there",
    "machine learning can be useful sometimes",
    "new york mayor was present",
]

# Phrases is trained on token lists, so split every sentence on spaces.
sentence_stream = [doc.split(" ") for doc in documents]

# Low min_count/threshold values, presumably so that the tiny corpus above is
# enough to detect a bigram -- see the gensim Phrases docs for exact semantics.
bigram = Phrases(sentence_stream, min_count=1, threshold=2)

# Apply the trained model to one tokenized sentence.
sent = [u'the', u'mayor', u'of', u'new', u'york', u'was', u'there']
print(bigram[sent])
# Output recorded in the original paste ("new" + "york" joined into one token):
# [u'the', u'mayor', u'of', u'new_york', u'was', u'there']
# Read every line of every .txt file in text_data/ into `documents`.
# Imported locally because no import of os/glob is visible for this snippet.
import glob
import os

# NOTE: this changes the process working directory, so relative paths used
# after this point resolve inside text_data/.
os.chdir('text_data')

documents = []
for txt_path in glob.glob("*.txt"):  # all .txt files in the working directory
    # `with` closes the handle even if reading fails; the original opened
    # each file and never closed it.
    with open(txt_path, "r") as handle:
        # splitlines() drops line terminators, so each entry is one bare line.
        documents.extend(handle.read().splitlines())
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement