# Alternative approach with NLTK (left disabled): tokenize, then count n-grams.
# import nltk
# from nltk import word_tokenize
# from nltk.util import ngrams
# from collections import Counter
# text = "I need to write a program in NLTK that breaks a corpus (a large collection of txt files) into unigrams, bigrams, trigrams, fourgrams and fivegrams. I need to write a program in NLTK that breaks a corpus"
# tokens = nltk.word_tokenize(text)
# bigrams = ngrams(tokens, 2)
# trigrams = ngrams(tokens, 3)
# print(Counter(bigrams))
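# A minimal, runnable sketch of the NLTK route disabled above: everygrams()
# yields every n-gram from unigrams up to fivegrams in one pass. Assumes the
# 'punkt' tokenizer data has already been downloaded (nltk.download('punkt')).
# import nltk
# from nltk.util import everygrams
# from collections import Counter
# sample = "I need to write a program in NLTK that breaks a corpus"
# tokens = nltk.word_tokenize(sample)
# counts = Counter(everygrams(tokens, min_len=1, max_len=5))
# print(counts.most_common(10))  # the ten most frequent 1- to 5-grams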
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import pandas as pd

def wm2df(wm, feat_names):
    # Turn a (sparse) document-term matrix into a labelled DataFrame:
    # rows become Doc0, Doc1, ... and columns the extracted n-grams.
    doc_names = ['Doc{:d}'.format(idx) for idx, _ in enumerate(wm)]
    df = pd.DataFrame(data=wm.toarray(), index=doc_names, columns=feat_names)
    return df
arr = ["Car was cleaned by Jack", "Jack was cleaned by the Car.",
       "I? am? feeling good", "okay!!"]

# vectorizer = TfidfVectorizer(ngram_range=(2, 2))
vectorizer = CountVectorizer(lowercase=False, ngram_range=(1, 3),
                             tokenizer=lambda x: x.split(' '),
                             token_pattern=None)  # tokenizer overrides token_pattern; None avoids the warning
X = vectorizer.fit_transform(arr)
tokens = vectorizer.get_feature_names_out()  # get_feature_names() was removed in scikit-learn 1.2
print(tokens)
print(X.toarray())
# print(wm2df(X, tokens))
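# The commented-out TfidfVectorizer line above is a drop-in alternative: the
# same wm2df() helper then shows tf-idf weights instead of raw counts. A
# sketch, reusing arr from above:
tfidf = TfidfVectorizer(ngram_range=(2, 2))
X_tfidf = tfidf.fit_transform(arr)
print(wm2df(X_tfidf, tfidf.get_feature_names_out()))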
from stanfordcorenlp import StanfordCoreNLP

nlp = StanfordCoreNLP(r'stanford-corenlp-full-2016-10-31')
sentence = 'Guangdong University of Foreign Studies is located in Guangzhou.'
print('Dependency Parsing:', nlp.dependency_parse(sentence))
nlp.close()  # Do not forget to close! The backend server will consume a lot of memory.
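# Since a forgotten close() leaves the Java backend server running, wrapping
# the calls in try/finally guarantees the shutdown even if a parse raises.
# A sketch using the same local CoreNLP directory as above:
nlp = StanfordCoreNLP(r'stanford-corenlp-full-2016-10-31')
try:
    print('Dependency Parsing:', nlp.dependency_parse(sentence))
finally:
    nlp.close()  # always stop the backend server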