#!/usr/bin/env python
# coding: utf-8

# In[5]:


import locale
import pandas as pd

# Spanish locale so that numbers are displayed with grouping separators
locale.setlocale(locale.LC_ALL, 'es_ES.UTF-8')

pd.set_option('display.float_format', lambda x: locale.format_string('%.0f', x, grouping=True))

df = pd.read_csv('twitter_cleanedsample.csv')

df.head()


# ## Text before cleaning anything

# In[13]:


text = " ".join(review for review in df['tweetText'])

print("There are {} words in the combination of all reviews.".format(len(text.split())))


# ## Begin cleaning

# In[7]:


import re

# Custom punctuation set: the default string.punctuation minus '#' and '@',
# so that hashtags and mentions survive this step.
punctuation = '!"$%&\'()*+,-./:;<=>?[\\]^_`{|}~'

def remove_punct(text):
    text = "".join([char for char in text if char not in punctuation])
    text = re.sub('[0-9]+', '', text)  # also drop digits
    return text

df['Tweet_punct'] = df['tweetText'].apply(lambda x: remove_punct(x))
df.head(10)


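# A quick sanity check of remove_punct on a made-up tweet (the sample string is
# hypothetical, not taken from the dataset): punctuation and digits are removed,
# while '#' and '@' are kept because they are not in the custom punctuation string.

# In[ ]:


sample = "Check out https://example.com!! @user #news 2019"
print(remove_punct(sample))  # -> 'Check out httpsexamplecom @user #news '

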
# In[8]:


def tokenization(text):
    # Split on runs of non-word characters (raw string avoids an invalid
    # escape-sequence warning). Note that this also discards the '#' and '@'
    # kept by the previous step.
    text = re.split(r'\W+', text)
    return text

df['Tweet_tokenized'] = df['Tweet_punct'].apply(lambda x: tokenization(x.lower()))
df.head()


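# A hypothetical example (not from the dataset): when a tweet starts with '#' or
# '@', re.split leaves an empty string as the first token, so empty tokens can
# end up in the token lists.

# In[ ]:


print(tokenization("#news great match today".lower()))
# -> ['', 'news', 'great', 'match', 'today']

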
# In[9]:


from stop_words import get_stop_words

# Languages can be given as full names or as ISO codes ('ca', 'en', 'es');
# get_stop_words accepts both once lower-cased.
stop_words_lang = ['Arabic', 'Bulgarian', 'ca', 'Czech', 'Danish', 'Dutch', 'en', 'Finnish',
                   'French', 'German', 'Hindi', 'Hungarian', 'Indonesian', 'Italian',
                   'Norwegian', 'Polish', 'Portuguese', 'Romanian', 'Russian', 'Slovak',
                   'es', 'Swedish', 'Turkish', 'Ukrainian', 'Vietnamese']

stopwordslist = []
for lang in stop_words_lang:
    stopwordslist.extend(get_stop_words(lang.lower()))

stopwordslist.sort()
print(stopwordslist)


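# The combined list spans 25 languages, so it is long and may contain duplicates
# (short words such as 'a' appear in several languages). A quick check of its size:

# In[ ]:


print(len(stopwordslist), len(set(stopwordslist)))

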
# In[10]:


def stoppedWords(listWords):
    listWordsCleaned = []
    for word in listWords:
        if word not in stopwordslist:
            listWordsCleaned.append(word)
    return listWordsCleaned

df['Tweet_stopped'] = df['Tweet_tokenized'].apply(lambda listWords: stoppedWords(listWords))
df.head()


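# Membership tests against a plain list are O(n) per token, which gets slow with
# thousands of stop words on a large frame. A minimal sketch of the same filter
# using a set (same result, faster lookups); the names stopwordsset and
# stoppedWordsFast are new here, not part of the original notebook:

# In[ ]:


stopwordsset = set(stopwordslist)

def stoppedWordsFast(listWords):
    return [word for word in listWords if word not in stopwordsset]

# df['Tweet_stopped'] = df['Tweet_tokenized'].apply(stoppedWordsFast)

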
# ## Result: count the words that remain after cleaning (should be fewer than before)

# In[21]:


all_words = []

for review in df['Tweet_stopped']:
    all_words.extend(review)

textcleaned = " ".join(all_words)

print("Found {} words after cleaning.".format(len(all_words)))


# In[ ]: