# import nltk
# from nltk import word_tokenize
# from nltk.util import ngrams
# from collections import Counter

# text = "I need to write a program in NLTK that breaks a corpus (a large collection of txt files) into unigrams, bigrams, trigrams, fourgrams and fivegrams. I need to write a program in NLTK that breaks a corpus"
# token = nltk.word_tokenize(text)
# bigrams = ngrams(token, 2)
# trigrams = ngrams(token, 3)

# print(Counter(bigrams))
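# A sketch completing the commented-out NLTK approach above, extended to the
# fourgrams and fivegrams the sample text mentions. Assumes nltk is installed
# and the 'punkt' tokenizer data has been downloaded (nltk.download('punkt'));
# uncomment together with the block above to run it.
# for n in range(1, 6):  # unigrams through fivegrams
#     print(n, Counter(ngrams(token, n)).most_common(3))
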
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import pandas as pd

def wm2df(wm, feat_names):
    # Turn a sparse document-term matrix into a labelled DataFrame:
    # one row per document ('Doc0', 'Doc1', ...), one column per feature.
    doc_names = ['Doc{:d}'.format(idx) for idx, _ in enumerate(wm)]
    df = pd.DataFrame(data=wm.toarray(), index=doc_names,
                      columns=feat_names)
    return df

arr = ["Car was cleaned by Jack", "Jack was cleaned by the Car.", "I? am? feeling good", "okay!!"]

# vectorizer = TfidfVectorizer(ngram_range=(2, 2))
# Count unigrams through trigrams; the custom tokenizer just splits on spaces,
# so punctuation stays attached to its token.
vectorizer = CountVectorizer(lowercase=False, ngram_range=(1, 3),
                             tokenizer=lambda x: x.split(' '))

X = vectorizer.fit_transform(arr)

# Note: get_feature_names() was removed in scikit-learn 1.2;
# on newer versions use get_feature_names_out() instead.
tokens = vectorizer.get_feature_names()
print(tokens)

print(X.toarray())

# print(wm2df(X, tokens))

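# The TfidfVectorizer imported above drops into the same helper; a minimal
# sketch (the tfidf/X_tfidf names are illustrative, not from the original):
tfidf = TfidfVectorizer(ngram_range=(2, 2))
X_tfidf = tfidf.fit_transform(arr)
print(wm2df(X_tfidf, tfidf.get_feature_names()))
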
from stanfordcorenlp import StanfordCoreNLP

# Point the wrapper at an unpacked CoreNLP distribution; it launches a Java
# backend server in the background.
nlp = StanfordCoreNLP(r'stanford-corenlp-full-2016-10-31')

sentence = 'Guangdong University of Foreign Studies is located in Guangzhou.'

print('Dependency Parsing:', nlp.dependency_parse(sentence))
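# The same wrapper exposes the other standard annotators as well; a short
# sketch using calls documented in the stanfordcorenlp README:
print('Tokenize:', nlp.word_tokenize(sentence))
print('Part of Speech:', nlp.pos_tag(sentence))
print('Named Entities:', nlp.ner(sentence))
print('Constituency Parsing:', nlp.parse(sentence))
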
nlp.close()  # Do not forget to close! The backend server consumes a lot of memory.