Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
# Pass 0 - concatenate every raw ("dirty") tweet from df['tweetText'] into one
# string, separated by single spaces. This combined text is later fed to the
# WordCloud library (https://amueller.github.io/word_cloud/) to draw the image.
text = " ".join(review for review in df['tweetText'])
# BUG FIX: len(text) is a CHARACTER count, not a word count — the original
# message reported 4077075 "words" which was really the character total.
# Count whitespace-separated tokens instead so the pass-3 comparison is valid.
print("There are {} words in the combination of all review.".format(len(text.split())))
# Pass 1 - remove punctuation and digits.
import re  # BUG FIX: `re` was used below but never imported anywhere in the file
import string

# Translation table built once: maps every ASCII punctuation character
# ('!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~') to None, i.e. deletes it.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)


def remove_punct(text):
    """Return *text* with ASCII punctuation and digit runs removed.

    Args:
        text: the raw tweet string.

    Returns:
        The cleaned string; letters, whitespace and non-ASCII characters
        are kept unchanged.
    """
    # str.translate does one C-level pass instead of a per-character loop.
    text = text.translate(_PUNCT_TABLE)
    return re.sub('[0-9]+', '', text)
# Run pass 1 over every tweet and eyeball the first rows.
df['Tweet_punct'] = df['tweetText'].apply(remove_punct)
df.head(10)
# Pass 2 - tokenize into words.
import re  # BUG FIX: `re` was used but never imported anywhere in the file


def tokenization(text):
    """Split *text* on runs of non-word characters.

    Args:
        text: a (lowercased, punctuation-stripped) tweet string.

    Returns:
        List of word tokens. NOTE: per re.split semantics a leading or
        trailing non-word character produces an empty-string token.
    """
    # Raw string avoids the invalid-escape DeprecationWarning of '\W+'.
    return re.split(r'\W+', text)
# Lowercase each cleaned tweet, then run pass 2 over it; peek at the result.
df['Tweet_tokenized'] = df['Tweet_punct'].apply(lambda tweet: tokenization(tweet.lower()))
df.head()
# Pass 3 - delete stopwords.
# 3.1 - Build one stopword collection covering many languages, using the
# stop-words package: https://pypi.org/project/stop-words/
from stop_words import get_stop_words

# Mix of full language names and ISO codes ('ca', 'en', 'es'); get_stop_words
# accepts both once lowercased. FIX: the duplicate 'Norwegian' entry is gone.
stop_words_lang = ['Arabic', 'Bulgarian', 'ca', 'Czech', 'Danish', 'Dutch', 'en',
                   'Finnish', 'French', 'German', 'Hindi', 'Hungarian',
                   'Indonesian', 'Italian', 'Norwegian', 'Polish', 'Portuguese',
                   'Romanian', 'Russian', 'Slovak', 'es', 'Swedish', 'Turkish',
                   'Ukrainian', 'Vietnamese']

# PERF FIX: a set gives O(1) membership tests in pass 3.2 (the original list
# made every `word not in stopwordslist` check O(n) over ~4M words) and also
# de-duplicates words shared between languages. `in` works on it unchanged.
stopwordslist = set()
for lang in stop_words_lang:
    stopwordslist.update(get_stop_words(lang.lower()))
print(sorted(stopwordslist))
# 3.2 - Keep only the words that are NOT stopwords in any language.
def stoppedWords(listWords, stopwords=None):
    """Filter stopwords out of a token list.

    Args:
        listWords: list of word tokens (one tokenized tweet).
        stopwords: optional collection to filter against; defaults to the
            module-level ``stopwordslist`` so existing callers are unchanged.

    Returns:
        New list containing the tokens of *listWords* that are not in the
        stopword collection, in their original order.
    """
    if stopwords is None:
        stopwords = stopwordslist
    return [word for word in listWords if word not in stopwords]
# Run pass 3 over every tokenized tweet and inspect the result.
df['Tweet_stopped'] = df['Tweet_tokenized'].apply(stoppedWords)
df.head()
# Each cell is now a list of strings; the remaining word count should be
# lower than the pass-0 total.
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement