Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# Pass 0 - concatenate every raw ("dirty") tweet from df['tweetText'] into one
# string, separated by single spaces. This combined text is later fed to the
# WordCloud library (https://amueller.github.io/word_cloud/) to draw the image.
text = " ".join(review for review in df['tweetText'])
# BUG FIX: len(text) is a CHARACTER count, not a word count — the original
# message reported 4077075 "words" which was really the character total.
# Count whitespace-separated tokens instead so the pass-3 comparison is valid.
print("There are {} words in the combination of all review.".format(len(text.split())))
# Pass 1 - remove punctuation and digits.
import re  # BUG FIX: `re` was used below but never imported anywhere in the file
import string

# Translation table built once: maps every ASCII punctuation character
# ('!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~') to None, i.e. deletes it.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)


def remove_punct(text):
    """Return *text* with ASCII punctuation and digit runs removed.

    Args:
        text: the raw tweet string.

    Returns:
        The cleaned string; letters, whitespace and non-ASCII characters
        are kept unchanged.
    """
    # str.translate does one C-level pass instead of a per-character loop.
    text = text.translate(_PUNCT_TABLE)
    return re.sub('[0-9]+', '', text)
# Run pass 1 over every tweet and eyeball the first rows.
df['Tweet_punct'] = df['tweetText'].apply(remove_punct)
df.head(10)
# Pass 2 - tokenize into words.
import re  # BUG FIX: `re` was used but never imported anywhere in the file


def tokenization(text):
    """Split *text* on runs of non-word characters.

    Args:
        text: a (lowercased, punctuation-stripped) tweet string.

    Returns:
        List of word tokens. NOTE: per re.split semantics a leading or
        trailing non-word character produces an empty-string token.
    """
    # Raw string avoids the invalid-escape DeprecationWarning of '\W+'.
    return re.split(r'\W+', text)
# Lowercase each cleaned tweet, then run pass 2 over it; peek at the result.
df['Tweet_tokenized'] = df['Tweet_punct'].apply(lambda tweet: tokenization(tweet.lower()))
df.head()
# Pass 3 - delete stopwords.
# 3.1 - Build one stopword collection covering many languages, using the
# stop-words package: https://pypi.org/project/stop-words/
from stop_words import get_stop_words

# Mix of full language names and ISO codes ('ca', 'en', 'es'); get_stop_words
# accepts both once lowercased. FIX: the duplicate 'Norwegian' entry is gone.
stop_words_lang = ['Arabic', 'Bulgarian', 'ca', 'Czech', 'Danish', 'Dutch', 'en',
                   'Finnish', 'French', 'German', 'Hindi', 'Hungarian',
                   'Indonesian', 'Italian', 'Norwegian', 'Polish', 'Portuguese',
                   'Romanian', 'Russian', 'Slovak', 'es', 'Swedish', 'Turkish',
                   'Ukrainian', 'Vietnamese']

# PERF FIX: a set gives O(1) membership tests in pass 3.2 (the original list
# made every `word not in stopwordslist` check O(n) over ~4M words) and also
# de-duplicates words shared between languages. `in` works on it unchanged.
stopwordslist = set()
for lang in stop_words_lang:
    stopwordslist.update(get_stop_words(lang.lower()))
print(sorted(stopwordslist))
# 3.2 - Keep only the words that are NOT stopwords in any language.
def stoppedWords(listWords, stopwords=None):
    """Filter stopwords out of a token list.

    Args:
        listWords: list of word tokens (one tokenized tweet).
        stopwords: optional collection to filter against; defaults to the
            module-level ``stopwordslist`` so existing callers are unchanged.

    Returns:
        New list containing the tokens of *listWords* that are not in the
        stopword collection, in their original order.
    """
    if stopwords is None:
        stopwords = stopwordslist
    return [word for word in listWords if word not in stopwords]
# Run pass 3 over every tokenized tweet and inspect the result.
df['Tweet_stopped'] = df['Tweet_tokenized'].apply(stoppedWords)
df.head()
# Each cell is now a list of strings; the remaining word count should be
# lower than the pass-0 total.
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement