# import nltk
# from nltk import word_tokenize
# from nltk.util import ngrams
# from collections import Counter

# text = "I need to write a program in NLTK that breaks a corpus (a large collection of txt files) into unigrams, bigrams, trigrams, fourgrams and fivegrams.I need to write a program in NLTK that breaks a corpus"
# token = nltk.word_tokenize(text)
# bigrams = ngrams(token, 2)
# trigrams = ngrams(token, 3)

# print(Counter(bigrams))

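# A minimal sketch extending the commented-out NLTK approach above to every n-gram size
# the text mentions (unigrams through fivegrams); it assumes nltk is installed and the
# 'punkt' tokenizer data is available (nltk.download('punkt')):
#
# for n in range(1, 6):
#     grams = Counter(ngrams(token, n))
#     print(n, grams.most_common(5))
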
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import pandas as pd

def wm2df(wm, feat_names):
    # create an index for each row of the document-term matrix
    doc_names = ['Doc{:d}'.format(idx) for idx, _ in enumerate(wm)]
    df = pd.DataFrame(data=wm.toarray(), index=doc_names,
                      columns=feat_names)
    return df

arr = ["Car was cleaned by Jack", "Jack was cleaned by the Car.", "I? am? felling good", "okay!!"]

# vectorizer = TfidfVectorizer(ngram_range=(2, 2))
vectorizer = CountVectorizer(lowercase=False, ngram_range=(1, 3), tokenizer=lambda x: x.split(' '))

X = vectorizer.fit_transform(arr)

tokens = vectorizer.get_feature_names()  # renamed to get_feature_names_out() in newer scikit-learn
print(tokens)

print(X.toarray())

# print(wm2df(X, tokens))

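# A minimal sketch of the TfidfVectorizer alternative commented out above, reusing the
# wm2df helper to view the weighted document-term matrix as a labelled DataFrame; the
# whitespace tokenizer and lowercase=False are assumptions carried over from the
# CountVectorizer call for consistency.
tfidf_vectorizer = TfidfVectorizer(lowercase=False, ngram_range=(2, 2), tokenizer=lambda x: x.split(' '))
X_tfidf = tfidf_vectorizer.fit_transform(arr)
print(wm2df(X_tfidf, tfidf_vectorizer.get_feature_names()))
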
from stanfordcorenlp import StanfordCoreNLP

# path to a local Stanford CoreNLP distribution; this launches a backend Java server
nlp = StanfordCoreNLP(r'stanford-corenlp-full-2016-10-31')

sentence = 'Guangdong University of Foreign Studies is located in Guangzhou.'

print('Dependency Parsing:', nlp.dependency_parse(sentence))

nlp.close()  # Do not forget to close! The backend server consumes a lot of memory.
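
# The stanfordcorenlp wrapper exposes more annotators than dependency parsing
# (word_tokenize, pos_tag, ner, parse, per its README). A minimal sketch, assuming
# the same local CoreNLP distribution; these calls would go before nlp.close():
#
# print('Tokenize:', nlp.word_tokenize(sentence))
# print('Part of Speech:', nlp.pos_tag(sentence))
# print('Named Entities:', nlp.ner(sentence))
# print('Constituency Parsing:', nlp.parse(sentence))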