Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
from __future__ import print_function

# One import per line, grouped stdlib / third-party per PEP 8.
import codecs
import os
import re
import string

import nltk.stem
import pymysql

# Snowball stemmer shared by StemmedCountVectorizer below.
stemmer = nltk.stem.SnowballStemmer('english')
def get_sentences(root_dir="/Users/Documents/test1"):
    """Yield every line of text from all .txt files under *root_dir*.

    Parameters
    ----------
    root_dir : str
        Directory tree to walk. Defaults to the original hard-coded
        path, so existing callers keep working.

    Yields
    ------
    str
        One line at a time (trailing newline included), decoded as
        UTF-8 with any BOM stripped ("utf-8-sig").
    """
    for root, _dirs, files in os.walk(root_dir):
        for fname in files:  # renamed from 'file' to avoid shadowing the builtin
            if fname.endswith(".txt"):
                with codecs.open(os.path.join(root, fname), "r", "utf-8-sig") as fh:
                    # Iterate the file object directly instead of readlines():
                    # streams line-by-line without loading the whole file.
                    for line in fh:
                        yield line
formoreprocessing = get_sentences()

from nltk.tokenize import sent_tokenize

# Split the concatenated corpus into individual sentences.
raw_docs = sent_tokenize(''.join(formoreprocessing))

# Remove punctuation marks. The compiled pattern was previously dead
# code (a slower per-character comprehension re-did its job); use it.
regex = re.compile('[%s]' % re.escape(string.punctuation))
nw = [regex.sub('', review) for review in raw_docs]

# Replace bracketed text and digit runs with a dummy '#' symbol.
# FIX: the pasted source had lost its backslashes ('[^[]]+(?=])');
# restored to the intended escaped pattern.
nr = (re.sub(r'[^\[\]]+(?=\])', '#', j) for j in nw)
nrfinal = (re.sub('[0-9]+', '#', j) for j in nr)
- from sklearn.feature_extraction.text import CountVectorizer
- #The below class StemmedCountVectorizer does the following tasks:
- #Lower casing the raw post in the preprocessing step (done in the parent class).
- #Extracting all individual words in the tokenization step (done in the parent class).
- #Converting each word into its stemmed version.
class StemmedCountVectorizer(CountVectorizer):
    """CountVectorizer whose analyzer additionally stems each token.

    The parent class performs lower-casing (preprocessing) and word
    extraction (tokenization); this subclass wraps the stock analyzer
    so every extracted word is reduced to its Snowball stem before
    counting.
    """

    def build_analyzer(self):
        base_analyzer = super(StemmedCountVectorizer, self).build_analyzer()

        def stemming_analyzer(doc):
            # Lazily stem each token the stock analyzer produces.
            return (stemmer.stem(token) for token in base_analyzer(doc))

        return stemming_analyzer
vectorizer = StemmedCountVectorizer(min_df=1, stop_words='english')

# Learn the vocabulary and build the dense document-term matrix.
X_train = vectorizer.fit_transform(nrfinal).toarray()
num_samples, num_features = X_train.shape
print("#samples: %d, #features: %d" % (num_samples, num_features))

# Write per-feature corpus frequencies to a report file.
# BUG FIX: the pasted source had lost its escape backslashes ('n' for
# '\n', 'ttt...tt' for tab runs) and one quote had become a curly
# quote, so the report came out on a single corrupted line. Also use
# write() for single strings (writelines expects an iterable of lines).
with codecs.open("/Users/Desktop/test_dict.txt", "w", "utf-8-sig") as output:
    output.write("Number of samples: %d, Number of features: %d\n"
                 % (num_samples, num_features))
    # Column sums of the document-term matrix give each term's total
    # frequency across the corpus (clearer than builtin sum() over rows).
    for tag, count in zip(vectorizer.get_feature_names(), X_train.sum(axis=0)):
        output.write('%10s \t\t\t----------\t\t %5s\n' % (tag, count))
- Also, when I tried to write it to the database, the same problem occurred.
- #Inserting into database
- ......
def connect():
    """Insert each (feature name, frequency) pair into hawa.feature_name.

    Reads the module-level ``vectorizer`` and ``X_train`` built above.
    The connection is always closed, even if an insert fails.
    """
    conn = pymysql.connect(user='root', password='',
                           unix_socket="/tmp/mysql.sock", database='hawa')
    try:
        with conn.cursor() as cursor:
            # Parameterized query: the driver escapes values itself.
            # The original %-formatted string was SQL-injectable and
            # wrapped the numeric placeholder in quotes, storing the
            # count as a string. int() unboxes any numpy integer.
            rows = [(tag, int(count))
                    for tag, count in zip(vectorizer.get_feature_names(),
                                          X_train.sum(axis=0))]
            cursor.executemany(
                "INSERT INTO feature_name (Feature_name, Frequency) "
                "VALUES (%s, %s)",
                rows)
        conn.commit()
    finally:
        conn.close()


if __name__ == '__main__':
    connect()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement