import re
from datetime import datetime

import pandas as pd
from nltk.corpus import stopwords


def preprocess(raw_text):
    # keep only letters
    letters_only_text = re.sub("[^a-zA-Z]", " ", raw_text)

    # convert to lower case and split into words
    words = letters_only_text.lower().split()

    # remove stopwords
    stopword_set = set(stopwords.words("english"))
    meaningful_words = [w for w in words if w not in stopword_set]

    # join the cleaned words back into a single string
    cleaned_word_list = " ".join(meaningful_words)

    return cleaned_word_list


def process_data(dataset):
    tweets_df = pd.read_csv(dataset, delimiter='|', header=None)

    num_tweets = tweets_df.shape[0]
    print("Total tweets: " + str(num_tweets))

    cleaned_tweets = []
    print("Beginning processing of tweets at: " + str(datetime.now()))

    for i in range(num_tweets):
        cleaned_tweet = preprocess(tweets_df.iloc[i][1])
        cleaned_tweets.append(cleaned_tweet)
        if i % 10000 == 0:
            print(str(i) + " tweets processed")

    print("Finished processing of tweets at: " + str(datetime.now()))
    return cleaned_tweets


cleaned_data = process_data("tweets.csv")

# Output:
# Total tweets: 216041
# Beginning processing of tweets at: 2017-05-16 13:45:47.183113
# Finished processing of tweets at: 2017-05-16 13:47:01.436338

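# A minimal timing sketch (an assumption, not part of the original paste):
# instead of printing datetime.now() before and after, time.perf_counter()
# gives the elapsed time of a run as a single number.
import time

start = time.perf_counter()
cleaned_data = process_data("tweets.csv")
print("Elapsed seconds:", time.perf_counter() - start)
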
# column you are working on
df_ = tweets_df[1]

stopword_set = set(stopwords.words("english"))

# keep only letters (drop digits, punctuation and other symbols)
regex_pat = re.compile(r'[^a-zA-Z\s]')
df_ = df_.str.replace(regex_pat, ' ', regex=True)

# convert to lower case and split into word lists
df_ = df_.str.lower().str.split()

# remove stopwords
df_ = df_.apply(lambda x: [item for item in x if item not in stopword_set])

# join the cleaned words back into strings
df_ = df_.str.join(" ")

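# A small follow-up sketch (an assumption, not part of the original paste):
# the cleaned column can be written back into the frame so it sits next to
# the raw tweet text; "clean_text" is just an illustrative column name.
tweets_df["clean_text"] = df_
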
def preprocess2(raw_text):
    stopword_set = set(stopwords.words("english"))
    return " ".join([i for i in re.sub(r'[^a-zA-Z\s]', "", raw_text).lower().split()
                     if i not in stopword_set])

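# A minimal usage sketch (an assumption, not part of the original paste):
# preprocess2 can be applied over the whole tweet column with Series.apply,
# assuming tweets_df has been read as above and column 1 holds the raw text.
cleaned_series = tweets_df[1].apply(preprocess2)
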
import string
from nltk.corpus import stopwords

remove_word_list = set(stopwords.words("english") + list(string.punctuation))

msg = "i need a refund asap of inr3232 and sa32dff? ."
msg1 = ' '.join([y for y in msg.replace(',', ' ').split(' ')
                 if y.isalpha() and y not in remove_word_list])
print('msg1 == ', msg1)
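
# A reusable sketch of the same idea (an assumption, not part of the original
# paste): wrap the punctuation/stopword filter in a function so it can be
# applied to a pandas column like the functions above; preprocess3 is just an
# illustrative name.
def preprocess3(raw_text):
    return ' '.join([y for y in raw_text.replace(',', ' ').split(' ')
                     if y.isalpha() and y not in remove_word_list])

# cleaned_series = tweets_df[1].apply(preprocess3)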