import re
import nltk
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer

lmtzr = WordNetLemmatizer()
stopwords = nltk.corpus.stopwords.words('english')
stemmer = SnowballStemmer("english")
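# The script assumes the required NLTK corpora are already installed; if they
# are not, these one-time downloads should cover it ('punkt' for the tokenizers,
# 'stopwords' for the stopword list, 'wordnet' for the lemmatizer):
# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('wordnet')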
texts = [
    "New Year's Eve in New York",
    "New Year's Eve in London",
    "York is closer to London than to New York",
    "London is closer to Bucharest than to New York"
]
def tokenize_only(text):
    # first tokenize by sentence, then by word to ensure that
    # punctuation is caught as its own token
    tokens = [word.lower() for sent in nltk.sent_tokenize(text)
              for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters
    # (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    return filtered_tokens
def remove_stopwords(tokens):
    # keep only the tokens that are not in the English stopword list
    without_stopwords = []
    for word in tokens:
        if word not in stopwords:
            without_stopwords.append(word)
    return without_stopwords
def tokenize_and_lemm(text):
    tokens = tokenize_only(text)
    tokens = remove_stopwords(tokens)
    lemm_tokens = []
    # reduce each token to its WordNet lemma (treated as a noun by default)
    for token in tokens:
        lemm_tokens.append(lmtzr.lemmatize(token))
    return lemm_tokens
def tokenize_and_stem(text):
    tokens = tokenize_only(text)
    tokens = remove_stopwords(tokens)
    stem_tokens = []
    # reduce each token to its Snowball (Porter2) stem
    for token in tokens:
        stem_tokens.append(stemmer.stem(token))
    return stem_tokens
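# Rough illustration of the difference (example values, assuming standard NLTK
# behaviour, not taken from this script's output): the lemmatizer maps
# "cities" -> "city", while the stemmer cuts it down to "citi"; stemming is
# cruder but needs no dictionary lookup.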
tokenized_text = []
tokenized_text_clear = []
tokenized_lemmatized = []
tokenized_stemmatized = []

for sentence in texts:
    tokenized_text.append(tokenize_only(sentence))
print(tokenized_text)  # with stopwords
print('#############')

for sentence_tokens in tokenized_text:
    tokenized_text_clear.append(remove_stopwords(sentence_tokens))
print(tokenized_text_clear)  # without stopwords
print('#############')

for sentence in texts:
    tokenized_lemmatized.append(tokenize_and_lemm(sentence))
print(tokenized_lemmatized)
print('#############')

for sentence in texts:
    tokenized_stemmatized.append(tokenize_and_stem(sentence))
print(tokenized_stemmatized)
print('#############')
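# WordNetLemmatizer treats every token as a noun unless told otherwise; a
# sketch of passing an explicit part-of-speech tag when lemmatizing verbs:
# lmtzr.lemmatize("running", pos='v')  # -> "run"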