Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- #!/usr/bin/env python
- # coding: utf-8
- # In[5]:
- import locale
- import pandas as pd
def _format_float_grouped(value):
    """Render a float with no decimals and locale-aware thousands grouping."""
    return locale.format_string('%.0f', value, grouping=True)


# The es_ES locale determines the grouping separator used by the formatter
# that pandas calls when it displays floats.
locale.setlocale(locale.LC_ALL, 'es_ES.UTF-8')
pd.set_option('display.float_format', _format_float_grouped)

# Load the tweet sample; later steps read its 'tweetText' column.
df = pd.read_csv('twitter_cleanedsample.csv')
df.head()
- # ## Text before cleaning anything
- # In[13]:
# Concatenate every raw tweet into one string to size the corpus before any
# cleaning is applied.
text = " ".join(df['tweetText'])
# Count whitespace-separated words: len(text) alone would report the number
# of characters, which contradicts the message below.
print("There are {} words in the combination of all review.".format(len(text.split())))
- # ## Begin cleaning
- # In[7]:
- import string
- import re
# Characters stripped from tweets. '#' and '@' are not in this set, so
# hashtags and mentions keep their markers. This is a local constant rather
# than an assignment to string.punctuation: overwriting the stdlib attribute
# would change its value for every other module in the process.
TWEET_PUNCTUATION = '!"$%&\'()*+,-./:;<=>?[\\]^_`{|}~'

# Built once so each call is a single pass over the text.
_PUNCT_TABLE = str.maketrans('', '', TWEET_PUNCTUATION)
_DIGITS_RE = re.compile(r'[0-9]+')


def remove_punct(text):
    """Return *text* with punctuation characters and digit runs removed.

    Args:
        text: The string to clean.

    Returns:
        A new string without any character from ``TWEET_PUNCTUATION`` and
        without ASCII digits. Whitespace is left untouched.
    """
    without_punct = text.translate(_PUNCT_TABLE)
    return _DIGITS_RE.sub('', without_punct)
# Clean every raw tweet with remove_punct and keep the result in a new column.
df['Tweet_punct'] = df['tweetText'].apply(remove_punct)
df.head(10)
- # In[8]:
def tokenization(text):
    """Split *text* into its word tokens.

    Args:
        text: The string to tokenize.

    Returns:
        A list of the maximal runs of word characters in *text*, in order.
        An empty or all-punctuation string yields an empty list.
    """
    # Raw string: '\W' / '\w' in a plain literal is an invalid escape sequence.
    # Matching \w+ gives the same tokens as splitting on \W+, without the
    # empty strings that split emits when the text starts or ends with a
    # non-word character (e.g. a leading '#').
    return re.findall(r'\w+', text)
def _tokenize_lowercased(tweet):
    """Lower-case *tweet* and split it with tokenization()."""
    return tokenization(tweet.lower())


# Tokenize the punctuation-free tweets into a new column.
df['Tweet_tokenized'] = df['Tweet_punct'].apply(_tokenize_lowercased)
df.head()
- # In[9]:
- from stop_words import get_stop_words
# Languages whose stop words are removed from the tweets. Each entry is
# lower-cased before being passed to get_stop_words. 'Norwegian' was listed
# twice in the original, which only duplicated its words in the result.
stop_words_lang = [
    'Arabic', 'Bulgarian', 'ca', 'Czech', 'Danish', 'Dutch', 'en', 'Finnish',
    'French', 'German', 'Hindi', 'Hungarian', 'Indonesian', 'Italian',
    'Norwegian', 'Polish', 'Portuguese', 'Romanian', 'Russian', 'Slovak',
    'es', 'Swedish', 'Turkish', 'Ukrainian', 'Vietnamese',
]

# One sorted, duplicate-free list covering all languages above. Words shared
# by several languages appear once; membership tests are unaffected.
stopwordslist = sorted({
    word
    for lang in stop_words_lang
    for word in get_stop_words(lang.lower())
})
print(stopwordslist)
- # In[10]:
def stoppedWords(listWords):
    """Return the entries of *listWords* that are not in ``stopwordslist``.

    The kept words stay in their original order, duplicates included.
    """
    return [token for token in listWords if token not in stopwordslist]
# Filter the stop words out of every tokenized tweet.
df['Tweet_stopped'] = df['Tweet_tokenized'].apply(stoppedWords)
df.head()
- # ## Result counting text with all the words ( should be less than without)
- # In[21]:
# Gather the remaining words of every tweet into one flat list.
all_words = []
for review in df['Tweet_stopped']:
    print(review)
    all_words.extend(review)

# Join the words of the whole corpus. The original joined the characters of
# a single word, so only the last word, letter by letter, survived the loop.
textcleaned = " ".join(all_words)
print("Text cleanded" + textcleaned)
# Report the number of words, not the character length of the joined text.
print("Found {} words.".format(len(all_words)))
- # In[ ]:
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement