Advertisement
Guest User

Untitled

a guest
Jul 18th, 2019
137
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 2.37 KB | None | 0 0
  1. #!/usr/bin/env python
  2. # coding: utf-8
  3.  
  4. # In[5]:
  5.  
  6.  
  7. import locale
  8. import pandas as pd
  9.  
  10. locale.setlocale(locale.LC_ALL,'es_ES.UTF-8')
  11.  
  12. pd.set_option('display.float_format', lambda x: locale.format_string('%.0f', x, grouping=True))
  13.  
  14. df = pd.read_csv('twitter_cleanedsample.csv')
  15.  
  16. df.head()
  17.  
  18.  
  19. # ## Text before cleaning anything
  20.  
  21. # In[13]:
  22.  
  23.  
  24. text = " ".join(review for review in df['tweetText'])
  25.  
  26. print ("There are {} words in the combination of all review.".format(len(text)))
  27.  
  28.  
  29. # ## Begin cleaning
  30.  
  31. # In[7]:
  32.  
  33.  
  34. import string
  35. import re
  36. string.punctuation = '!"$%&\'()*+,-./:;<=>?[\\]^_`{|}~'
  37.  
  38. def remove_punct(text):
  39. text = "".join([char for char in text if char not in string.punctuation])
  40. text = re.sub('[0-9]+', '', text)
  41. return text
  42.  
  43. df['Tweet_punct'] = df['tweetText'].apply(lambda x: remove_punct(x))
  44. df.head(10)
  45.  
  46.  
  47. # In[8]:
  48.  
  49.  
  50. def tokenization(text):
  51. text = re.split('\W+', text)
  52. return text
  53.  
  54. df['Tweet_tokenized'] = df['Tweet_punct'].apply(lambda x: tokenization(x.lower()))
  55. df.head()
  56.  
  57.  
  58. # In[9]:
  59.  
  60.  
  61. from stop_words import get_stop_words
  62.  
  63. stop_words_lang = ['Arabic','Bulgarian','ca','Czech','Danish','Dutch','en','Finnish','French','German','Hindi', 'Hungarian','Indonesian','Italian','Norwegian','Norwegian','Polish','Portuguese','Romanian', 'Russian','Slovak','es','Swedish','Turkish','Ukrainian','Vietnamese']
  64. stopwordslist = []
  65. stopwordslist.clear()
  66. #stopwords = set(STOPWORDS)
  67. for lang in stop_words_lang:
  68. stopwordslist.extend(get_stop_words(lang.lower()))
  69. #stopwords.update(get_stop_words(lang.lower()))
  70.  
  71. stopwordslist.sort()
  72. print(stopwordslist)
  73.  
  74.  
  75. # In[10]:
  76.  
  77.  
  78. def stoppedWords(listWords):
  79. listWordsCleaned = []
  80. for word in listWords:
  81. if word not in stopwordslist:
  82. listWordsCleaned.append(word)
  83. return listWordsCleaned
  84.  
  85. df['Tweet_stopped'] = df['Tweet_tokenized'].apply(lambda listWords: stoppedWords(listWords))
  86. df.head()
  87.  
  88.  
  89. # ## Word count after cleaning (should be lower than the count before cleaning)
  90.  
  91. # In[21]:
  92.  
  93.  
  94. all_words = []
  95. #text = " ".join(review for review in df['tweetText'])
  96.  
  97. for review in df['Tweet_stopped']:
  98. print(review)
  99. for word in review:
  100. textcleaned = " ".join(word)
  101. print("Text cleanded" + textcleaned)
  102.  
  103.  
  104.  
  105. print ("Found {} words.".format(len(textcleaned)))
  106. #resul
  107.  
  108.  
  109. # In[ ]:
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement