Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import re
- from nltk.corpus import stopwords
- import pandas as pd
# Default stopword sets, keyed by language, so the NLTK list is loaded once
# instead of once per tweet.
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return the NLTK English stopwords as a frozenset, building it only once."""
    if "english" not in _STOPWORD_CACHE:
        _STOPWORD_CACHE["english"] = frozenset(stopwords.words("english"))
    return _STOPWORD_CACHE["english"]


def preprocess(raw_text, stopword_set=None):
    """Clean one piece of text and return it as a single space-joined string.

    Every character that is not an ASCII letter is replaced by a space, the
    result is lower-cased and split on whitespace, and stopwords are dropped.

    Args:
        raw_text: The text to clean.
        stopword_set: Optional container of lower-case words to remove.
            When omitted, the NLTK English stopword list is used (loaded
            once and reused across calls).

    Returns:
        The remaining words joined by single spaces ("" if none remain).
    """
    if stopword_set is None:
        stopword_set = _english_stopwords()
    # keep only letters, then convert to lower case and split
    words = re.sub("[^a-zA-Z]", " ", raw_text).lower().split()
    # remove stopwords and join the remaining words
    return " ".join(w for w in words if w not in stopword_set)
def process_data(dataset):
    """Read a '|'-delimited, header-less CSV and clean the text in column 1.

    Prints the row count, start/finish timestamps, and a progress line every
    10000 rows.

    Args:
        dataset: Path (or anything ``pd.read_csv`` accepts) of the CSV file.

    Returns:
        A list with one ``preprocess`` result per row, in file order.
    """
    # Local import: datetime is used below but is not among the imports
    # visible in this file, so the original raised NameError.
    from datetime import datetime

    tweets_df = pd.read_csv(dataset, delimiter='|', header=None)
    num_tweets = tweets_df.shape[0]
    print("Total tweets: " + str(num_tweets))
    cleaned_tweets = []
    print("Beginning processing of tweets at: " + str(datetime.now()))
    # With header=None the columns are labelled 0, 1, ...; iterate column 1
    # directly instead of materializing a row Series per tweet via iloc.
    for i, raw_tweet in enumerate(tweets_df[1]):
        cleaned_tweets.append(preprocess(raw_tweet))
        if i % 10000 == 0:
            print(str(i) + " tweets processed")
    print("Finished processing of tweets at: " + str(datetime.now()))
    return cleaned_tweets
# Clean every tweet in tweets.csv (the closing quote was missing in the original).
cleaned_data = process_data("tweets.csv")
- Total tweets: 216041
- Beginning processing of tweets at: 2017-05-16 13:45:47.183113
- Finished processing of tweets at: 2017-05-16 13:47:01.436338
# Vectorized variant of the per-tweet cleaning, using pandas string methods.
# NOTE(review): in the visible code tweets_df only exists inside
# process_data(); this fragment assumes the same DataFrame is in scope here.
# column you are working on
df_ = tweets_df[1]
stopword_set = set(stopwords.words("english"))
# keep only letters and whitespace; this must run while each element is
# still a string, i.e. before splitting into word lists
regex_pat = re.compile(r'[^a-zA-Z\s]', flags=re.IGNORECASE)
df_ = df_.str.replace(regex_pat, '', regex=True)
# convert to lower case and split
df_ = df_.str.lower().str.split()
# remove stopwords
df_ = df_.apply(lambda x: [item for item in x if item not in stopword_set])
# join the cleaned words back into one space-separated string per tweet
df_ = df_.str.join(" ")
def preprocess2(raw_text, stopword_set=None):
    """Clean one piece of text and return it as a single space-joined string.

    Characters that are neither ASCII letters nor whitespace are deleted
    (not replaced by a space), the result is lower-cased and split on
    whitespace, and stopwords are dropped.

    Args:
        raw_text: The text to clean.
        stopword_set: Optional container of lower-case words to remove.
            When omitted, the NLTK English stopword list is used.

    Returns:
        The remaining words joined by single spaces ("" if none remain).
    """
    if stopword_set is None:
        stopword_set = set(stopwords.words("english"))
    # \s keeps whitespace so words stay separated; the original class
    # '[^a-zA-Zs]' removed spaces too and merged the text into one token.
    letters_and_spaces = re.sub(r'[^a-zA-Z\s]', "", raw_text)
    return " ".join(
        [i for i in letters_and_spaces.lower().split() if i not in stopword_set]
    )
- import string
- from nltk.corpus import stopwords
# Tokens to discard: English stopwords plus every punctuation character.
remove_word_list = set(stopwords.words("english")) | set(string.punctuation)
msg = "i need a refund asap of inr3232 and sa32dff? ."
# Treat commas as separators, split on single spaces, and keep only purely
# alphabetic tokens that are not in the discard set.
msg1 = ' '.join(
    token
    for token in msg.replace(',', ' ').split(' ')
    if token.isalpha() and token not in remove_word_list
)
print('msg1 == ', msg1)
Add Comment
Please, Sign In to add comment