Advertisement
hb20007

Cleaning Tweets with NLTK

Mar 22nd, 2018
488
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
Python 2.71 KB | None | 0 0
  1. import re
  2. from string import punctuation
  3. from nltk.corpus import stopwords
  4. from nltk.tokenize import TweetTokenizer
  5.  
  6. punctuation += '΄´’…“”–—―»«' # string.punctuation misses these.
  7.  
  8. cache_english_stopwords = stopwords.words('english')
  9.  
  10. def tweet_clean(tweet):
  11.     print('Original tweet:', tweet, '\n')
  12.     # Remove HTML special entities (e.g. &)
  13.     tweet_no_special_entities = re.sub(r'\&\w*;', '', tweet)
  14.     print('No special entitites:', tweet_no_special_entities, '\n')
  15.     # Remove tickers
  16.     tweet_no_tickers = re.sub(r'\$\w*', '', tweet_no_special_entities)
  17.     print('No tickers:', tweet_no_tickers, '\n')
  18.     # Remove hyperlinks
  19.     tweet_no_hyperlinks = re.sub(r'https?:\/\/.*\/\w*', '', tweet_no_tickers)
  20.     print('No hyperlinks:', tweet_no_hyperlinks, '\n')
  21.     # Remove hashtags
  22.     tweet_no_hashtags = re.sub(r'#\w*', '', tweet_no_hyperlinks)
  23.     print('No hashtags:', tweet_no_hashtags, '\n')
  24.     # Remove Punctuation and split 's, 't, 've with a space for filter
  25.     tweet_no_punctuation = re.sub(r'[' + punctuation.replace('@', '') + ']+', ' ', tweet_no_hashtags)
  26.     print('No punctuation:', tweet_no_punctuation, '\n')
  27.     # Remove words with 2 or fewer letters
  28.     tweet_no_small_words = re.sub(r'\b\w{1,2}\b', '', tweet_no_punctuation)
  29.     print('No small words:', tweet_no_small_words, '\n')
  30.     # Remove whitespace (including new line characters)
  31.     tweet_no_whitespace = re.sub(r'\s\s+', ' ', tweet_no_small_words)
  32.     tweet_no_whitespace = tweet_no_whitespace.lstrip(' ') # Remove single space remaining at the front of the tweet.
  33.     print('No whitespace:', tweet_no_whitespace, '\n')
  34.     # Remove characters beyond Basic Multilingual Plane (BMP) of Unicode:
  35.     tweet_no_emojis = ''.join(c for c in tweet_no_whitespace if c <= '\uFFFF') # Apart from emojis (plane 1), this also removes historic scripts and mathematical alphanumerics (also plane 1), ideographs (plane 2) and more.
  36.     print('No emojis:', tweet_no_emojis, '\n')
  37.     # Tokenize: Change to lowercase, reduce length and remove handles
  38.     tknzr = TweetTokenizer(preserve_case=False, reduce_len=True, strip_handles=True) # reduce_len changes, for example, waaaaaayyyy to waaayyy.
  39.     tw_list = tknzr.tokenize(tweet_no_emojis)
  40.     print('Tweet tokenize:', tw_list, '\n')
  41.     # Remove stopwords
  42.     list_no_stopwords = [i for i in tw_list if i not in cache_english_stopwords]
  43.     print('No stop words:', list_no_stopwords, '\n')
  44.     # Final filtered tweet
  45.     tweet_filtered =' '.join(list_no_stopwords)
  46.     print('Final tweet:', tweet_filtered)
  47.  
  48. s = '    RT @Amila #Test\nTom\'s newly listed Co. &amp; Mary\'s unlisted     Group to supply tech for nlTK.\nh.. $TSLA $AAPL https:// t.co/x34afsfQsh'
  49. tweet_clean(s)
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement