Untitled

#!/usr/bin/env python
# coding: utf-8

# In[5]:


import locale
import pandas as pd

# Switch every locale category to Spanish (Spain) so that locale-aware
# number formatting follows the es_ES conventions.
locale.setlocale(locale.LC_ALL, 'es_ES.UTF-8')


def _format_float(value):
    """Format a float with zero decimals and locale-specific digit grouping."""
    return locale.format_string('%.0f', value, grouping=True)


# Have pandas render floats through the locale-aware formatter above.
pd.set_option('display.float_format', _format_float)

# Load the tweet sample; later cells read its 'tweetText' column.
df = pd.read_csv('twitter_cleanedsample.csv')

df.head()


# ## Text before cleaning anything

# In[13]:


# Concatenate every raw tweet into a single string so the size of the corpus
# can be measured before any cleaning is applied.
text = " ".join(review for review in df['tweetText'])

# The message reports a number of words, so count whitespace-separated tokens;
# len(text) would report the number of characters instead.
word_count = len(text.split())
print("There are {} words in the combination of all review.".format(word_count))


# ## Begin cleaning

# In[7]:


import string
import re
# Punctuation stripped from tweets. Compared with string.punctuation this
# leaves out '#' and '@', so those two characters survive the cleaning. It is
# kept in its own constant so the standard-library value is not overwritten
# for every other piece of code running in the same process.
TWEET_PUNCTUATION = '!"$%&\'()*+,-./:;<=>?[\\]^_`{|}~'

# Translation table that deletes the punctuation above plus the ASCII digits.
_PUNCT_AND_DIGITS_TABLE = str.maketrans('', '', TWEET_PUNCTUATION + string.digits)


def remove_punct(text):
    """Return ``text`` with punctuation and ASCII digits removed.

    Args:
        text: The string to clean.

    Returns:
        A copy of ``text`` without any character from ``TWEET_PUNCTUATION``
        and without the digits 0-9. All other characters, including
        whitespace, '#' and '@', are left untouched.
    """
    # A single translate pass replaces the per-character filter followed by a
    # regex substitution; the set of removed characters is the same.
    return text.translate(_PUNCT_AND_DIGITS_TABLE)

# Store, in a new column, each tweet after remove_punct has been applied to it.
df['Tweet_punct'] = df['tweetText'].apply(remove_punct)
df.head(10)


# In[8]:


def tokenization(text):
    """Split ``text`` into word tokens.

    Args:
        text: The string to tokenize.

    Returns:
        A list with every maximal run of word characters (``\\w``) found in
        ``text``, in order. Text with no word characters yields an empty list.
    """
    # findall on r'\w+' returns the same tokens as splitting on r'\W+', minus
    # the empty strings that split leaves when the text starts or ends with a
    # separator (or is empty). The raw string also avoids the invalid '\W'
    # escape of a plain string literal.
    return re.findall(r'\w+', text)

def _lower_and_tokenize(tweet):
    """Lower-case ``tweet`` and hand it to tokenization."""
    return tokenization(tweet.lower())


# One list of lower-cased tokens per punctuation-free tweet.
df['Tweet_tokenized'] = df['Tweet_punct'].apply(_lower_and_tokenize)
df.head()


# In[9]:


from stop_words import get_stop_words

# Languages whose stop-word lists are merged. Entries are language names or
# short codes; each one is lower-cased before being passed to get_stop_words.
stop_words_lang = [
    'Arabic', 'Bulgarian', 'ca', 'Czech', 'Danish', 'Dutch', 'en', 'Finnish',
    'French', 'German', 'Hindi', 'Hungarian', 'Indonesian', 'Italian',
    'Norwegian', 'Polish', 'Portuguese', 'Romanian', 'Russian', 'Slovak',
    'es', 'Swedish', 'Turkish', 'Ukrainian', 'Vietnamese',
]

# Merge through a set so a word shared by several languages is stored once.
unique_stop_words = set()
for lang in stop_words_lang:
    unique_stop_words.update(get_stop_words(lang.lower()))

# Keep a sorted list under the name that later cells look up.
stopwordslist = sorted(unique_stop_words)

# Show the size and a short preview instead of dumping the whole list.
print("{} stop words loaded, e.g. {}".format(len(stopwordslist), stopwordslist[:20]))


# In[10]:


def stoppedWords(listWords, stopwords=None):
    """Return the words of ``listWords`` that are not stop words.

    Args:
        listWords: Iterable of word tokens to filter.
        stopwords: Optional collection of stop words. When omitted, the
            module-level ``stopwordslist`` is used, which is the behaviour
            existing callers rely on. Passing a ``set`` or ``frozenset``
            avoids rebuilding the lookup set on every call.

    Returns:
        A new list with the words that are not stop words, in their original
        order.
    """
    if stopwords is None:
        stopwords = stopwordslist
    # Membership tests on a list scan the whole list for every token; a set
    # makes each lookup constant time.
    if not isinstance(stopwords, (set, frozenset)):
        stopwords = set(stopwords)
    return [word for word in listWords if word not in stopwords]

# Run every token list through stoppedWords and keep the result in a new column.
df['Tweet_stopped'] = df['Tweet_tokenized'].apply(stoppedWords)
df.head()


# ## Result: word count after cleaning (should be lower than the count before cleaning)

# In[21]:


# Flatten the per-tweet token lists into one list holding every remaining
# word. Empty tokens are skipped so they are not counted as words.
all_words = [word for review in df['Tweet_stopped'] for word in review if word]

# Single cleaned string built from all tweets. Joining the word list (rather
# than one word at a time) keeps words intact instead of spacing out their
# characters, and leaves the variable defined even when no word survives.
textcleaned = " ".join(all_words)

# Report the number of words, not the number of characters in the string.
print("Found {} words.".format(len(all_words)))


# In[ ]: