Advertisement
Guest User

Untitled

a guest
Jul 18th, 2019
127
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 2.29 KB | None | 0 0
  1. #This is the process i follow to clean tweets, from df[tweetText] which has the "dirty Tweets":
  2. #Pss 0 - I have in df['tweetText'] the dirty text
  3. text = " ".join(review for review in df['tweetText'])
  4.  
  5. print ("There are {} words in the combination of all review.".format(len(text)))
  6.  
  7. #Out solution: There are 4077075 words in the combination of all review.
  8. ##I USE TEXT IN WORDCLOUD TO GENERATE THE IMAGE WITH THE WORDS: https://amueller.github.io/word_cloud/
  9. #Pass 1 - Remove punctuation
  10. import string
  11.  
  12. string.punctuation
  13. #'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
  14. def remove_punct(text):
  15. text = "".join([char for char in text if char not in string.punctuation])
  16. text = re.sub('[0-9]+', '', text)
  17. return text
  18.  
  19. df['Tweet_punct'] = df['tweetText'].apply(lambda x: remove_punct(x))
  20. df.head(10)
  21.  
  22. #Pass 2 Tokenize into words
  23. def tokenization(text):
  24. text = re.split('\W+', text)
  25. return text
  26.  
  27. df['Tweet_tokenized'] = df['Tweet_punct'].apply(lambda x: tokenization(x.lower()))
  28. df.head()
  29.  
  30. #Pass 3 - Delete stopwords
  31. #stopwordslist holds the stopwords of all the languages, taken from: https://pypi.org/project/stop-words/
  32. #3.1 - Create a stopword list with all the languages...
  33. from stop_words import get_stop_words
  34. stop_words_lang = ['Arabic','Bulgarian','ca','Czech','Danish','Dutch','en','Finnish','French','German','Hindi',\
  35. 'Hungarian','Indonesian','Italian','Norwegian','Norwegian','Polish','Portuguese','Romanian',\
  36. 'Russian','Slovak','es','Swedish','Turkish','Ukrainian','Vietnamese']
  37. stopwordslist = []
  38. stopwordslist.clear()
  39. #stopwords = set(STOPWORDS)
  40. for lang in stop_words_lang:
  41. stopwordslist.extend(get_stop_words(lang.lower()))
  42. #stopwords.update(get_stop_words(lang.lower()))
  43.  
  44. stopwordslist.sort()
  45. print(stopwordslist)
  46.  
  47. #3.2 - Create a list with only words that ARE NOT stopwords in either language...
  48. def stoppedWords(listWords):
  49. listWordsCleaned = []
  50. for word in listWords:
  51. if word not in stopwordslist:
  52. listWordsCleaned.append(word)
  53. return listWordsCleaned
  54.  
  55. df['Tweet_stopped'] = df['Tweet_tokenized'].apply(lambda listWords: stoppedWords(listWords))
  56. df.head()
  57.  
  58. #So here I have to count the remaining words, and the total has to be less than the one in pass 0. The result should be a list of strings...
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement