
Untitled

a guest
Mar 19th, 2018
import re

import nltk
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer

# NOTE: requires the NLTK 'punkt', 'stopwords' and 'wordnet' data packages
# (see the setup sketch below).
lmtzr = WordNetLemmatizer()
stopwords = nltk.corpus.stopwords.words('english')
stemmer = SnowballStemmer("english")

texts = [
    "New Year's Eve in New York",
    "New Year's Eve in London",
    "York is closer to London than to New York",
    "London is closer to Bucharest than to New York"
]


def tokenize_only(text):
    # First tokenize by sentence, then by word, so that
    # punctuation is caught as its own token.
    tokens = [word.lower()
              for sent in nltk.sent_tokenize(text)
              for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # Filter out any tokens not containing letters
    # (e.g., numeric tokens, raw punctuation).
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    return filtered_tokens


def remove_stopwords(tokens):
    # Drop English stopwords ("in", "to", "than", ...).
    without_stopwords = []
    for word in tokens:
        if word not in stopwords:
            without_stopwords.append(word)
    return without_stopwords


def tokenize_and_lemm(text):
    # Tokenize, drop stopwords, then lemmatize each remaining token.
    tokens = tokenize_only(text)
    tokens = remove_stopwords(tokens)
    lemm_tokens = []
    for token in tokens:
        lemm_tokens.append(lmtzr.lemmatize(token))
    return lemm_tokens


def tokenize_and_stem(text):
    # Tokenize, drop stopwords, then stem each remaining token.
    tokens = tokenize_only(text)
    tokens = remove_stopwords(tokens)
    stem_tokens = []
    for token in tokens:
        stem_tokens.append(stemmer.stem(token))
    return stem_tokens


tokenized_text = []
tokenized_text_clear = []
tokenized_lemmatized = []
tokenized_stemmed = []

for sentence in texts:
    tokenized_text.append(tokenize_only(sentence))

print(tokenized_text)  # with stopwords

print('#############')

for tokens in tokenized_text:
    tokenized_text_clear.append(remove_stopwords(tokens))

print(tokenized_text_clear)  # without stopwords

print('#############')

for sentence in texts:
    tokenized_lemmatized.append(tokenize_and_lemm(sentence))

print(tokenized_lemmatized)

print('#############')

for sentence in texts:
    tokenized_stemmed.append(tokenize_and_stem(sentence))

print(tokenized_stemmed)

print('#############')
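Setup note: the script assumes the NLTK data packages for tokenization, stopwords and WordNet are already installed; if they are not, NLTK raises a LookupError at runtime. A minimal one-time setup sketch (exact package names can vary slightly between NLTK versions, e.g. some newer releases also ask for 'punkt_tab' or 'omw-1.4'):

import nltk

# One-time download of the corpora/models used above.
nltk.download('punkt')      # sentence/word tokenizer models
nltk.download('stopwords')  # English stopword list
nltk.download('wordnet')    # lexical database used by WordNetLemmatizer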