Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# Clean every entry of ``df.words``: lowercase it, strip HTML tags and a
# fixed set of punctuation characters, tokenize it, drop English stop words,
# and store the space-joined result back into the same column.
#
# NOTE(review): assumes ``df`` is a pandas DataFrame whose ``words`` column
# holds strings -- confirm against the code that builds ``df``.

stop_words = set(stopwords.words('english'))

# Loop-invariant, so compiled once instead of once per row.
cleanr = re.compile('<.*?>')


def _clean_sentence(text):
    """Return *text* lowercased, with tags, punctuation and stop words removed."""
    sentence = text.lower()  # Converting to lowercase
    sentence = re.sub(cleanr, ' ', sentence)  # Removing HTML tags
    # Inside [...] the '|' is a literal pipe, not alternation, so pipes are
    # deleted here along with ? ! ' " and #.
    sentence = re.sub(r'[?|!|\'|"|#]', r'', sentence)
    # NOTE(review): '\|' is an escaped pipe, so a literal backslash is NOT
    # matched by this class -- confirm whether backslashes were meant to be
    # replaced as well.
    sentence = re.sub(r'[.|,|)|(|\|/]', r' ', sentence)  # Removing Punctuations
    tokens = word_tokenize(sentence)
    return ' '.join(token for token in tokens if token not in stop_words)


# Assign the whole column in one step. The previous ``df.words[i] = ...``
# form was chained assignment (it may write to a temporary copy) and looked
# rows up by label, which breaks when the index is not 0..n-1.
df['words'] = df['words'].apply(_clean_sentence)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement