import re
from datetime import datetime

import pandas as pd
from nltk.corpus import stopwords


def preprocess(raw_text):
    # keep only letters
    letters_only_text = re.sub("[^a-zA-Z]", " ", raw_text)

    # convert to lower case and split into words
    words = letters_only_text.lower().split()

    # remove stopwords
    stopword_set = set(stopwords.words("english"))
    meaningful_words = [w for w in words if w not in stopword_set]

    # join the cleaned words back into a single string
    cleaned_word_list = " ".join(meaningful_words)

    return cleaned_word_list


def process_data(dataset):
    tweets_df = pd.read_csv(dataset, delimiter='|', header=None)

    num_tweets = tweets_df.shape[0]
    print("Total tweets: " + str(num_tweets))

    cleaned_tweets = []
    print("Beginning processing of tweets at: " + str(datetime.now()))

    for i in range(num_tweets):
        cleaned_tweet = preprocess(tweets_df.iloc[i][1])
        cleaned_tweets.append(cleaned_tweet)
        if i % 10000 == 0:
            print(str(i) + " tweets processed")

    print("Finished processing of tweets at: " + str(datetime.now()))
    return cleaned_tweets


cleaned_data = process_data("tweets.csv")

# Output:
# Total tweets: 216041
# Beginning processing of tweets at: 2017-05-16 13:45:47.183113
# Finished processing of tweets at: 2017-05-16 13:47:01.436338

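# A minimal timing sketch (an assumption, not part of the original paste):
# instead of printing datetime.now() before and after, time.perf_counter()
# gives the elapsed time of a run as a single number.
import time

start = time.perf_counter()
cleaned_data = process_data("tweets.csv")
print("Elapsed seconds:", time.perf_counter() - start)
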
# column you are working on
df_ = tweets_df[1]

stopword_set = set(stopwords.words("english"))

# keep only letters (drop digits, punctuation and other symbols)
regex_pat = re.compile(r'[^a-zA-Z\s]')
df_ = df_.str.replace(regex_pat, ' ', regex=True)

# convert to lower case and split into word lists
df_ = df_.str.lower().str.split()

# remove stopwords
df_ = df_.apply(lambda x: [item for item in x if item not in stopword_set])

# join the cleaned words back into strings
df_ = df_.str.join(" ")

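# A small follow-up sketch (an assumption, not part of the original paste):
# the cleaned column can be written back into the frame so it sits next to
# the raw tweet text; "clean_text" is just an illustrative column name.
tweets_df["clean_text"] = df_
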
def preprocess2(raw_text):
    stopword_set = set(stopwords.words("english"))
    return " ".join([i for i in re.sub(r'[^a-zA-Z\s]', "", raw_text).lower().split()
                     if i not in stopword_set])

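# A minimal usage sketch (an assumption, not part of the original paste):
# preprocess2 can be applied over the whole tweet column with Series.apply,
# assuming tweets_df has been read as above and column 1 holds the raw text.
cleaned_series = tweets_df[1].apply(preprocess2)
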
import string
from nltk.corpus import stopwords

remove_word_list = set(stopwords.words("english") + list(string.punctuation))

msg = "i need a refund asap of inr3232 and sa32dff? ."
msg1 = ' '.join([y for y in msg.replace(',', ' ').split(' ')
                 if y.isalpha() and y not in remove_word_list])
print('msg1 == ', msg1)
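
# A reusable sketch of the same idea (an assumption, not part of the original
# paste): wrap the punctuation/stopword filter in a function so it can be
# applied to a pandas column like the functions above; preprocess3 is just an
# illustrative name.
def preprocess3(raw_text):
    return ' '.join([y for y in raw_text.replace(',', ' ').split(' ')
                     if y.isalpha() and y not in remove_word_list])

# cleaned_series = tweets_df[1].apply(preprocess3)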