Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
from __future__ import print_function

# One import per line, grouped stdlib / third-party per PEP 8.
import codecs
import os
import re
import string

import nltk.stem
import pymysql

# Snowball stemmer shared by StemmedCountVectorizer below.
stemmer = nltk.stem.SnowballStemmer('english')
def get_sentences(root_dir="/Users/Documents/test1"):
    """Yield every line of text from all .txt files under *root_dir*.

    Parameters
    ----------
    root_dir : str
        Directory tree to walk. Defaults to the original hard-coded
        path, so existing callers keep working.

    Yields
    ------
    str
        One line at a time (trailing newline included), decoded as
        UTF-8 with any BOM stripped ("utf-8-sig").
    """
    for root, _dirs, files in os.walk(root_dir):
        for fname in files:  # renamed from 'file' to avoid shadowing the builtin
            if fname.endswith(".txt"):
                with codecs.open(os.path.join(root, fname), "r", "utf-8-sig") as fh:
                    # Iterate the file object directly instead of readlines():
                    # streams line-by-line without loading the whole file.
                    for line in fh:
                        yield line
formoreprocessing = get_sentences()

from nltk.tokenize import sent_tokenize

# Split the concatenated corpus into individual sentences.
raw_docs = sent_tokenize(''.join(formoreprocessing))

# Remove punctuation marks. The compiled pattern was previously dead
# code (a slower per-character comprehension re-did its job); use it.
regex = re.compile('[%s]' % re.escape(string.punctuation))
nw = [regex.sub('', review) for review in raw_docs]

# Replace bracketed text and digit runs with a dummy '#' symbol.
# FIX: the pasted source had lost its backslashes ('[^[]]+(?=])');
# restored to the intended escaped pattern.
nr = (re.sub(r'[^\[\]]+(?=\])', '#', j) for j in nw)
nrfinal = (re.sub('[0-9]+', '#', j) for j in nr)
- from sklearn.feature_extraction.text import CountVectorizer
- #The below class StemmedCountVectorizer does the following tasks:
- #Lower casing the raw post in the preprocessing step (done in the parent class).
- #Extracting all individual words in the tokenization step (done in the parent class).
- #Converting each word into its stemmed version.
class StemmedCountVectorizer(CountVectorizer):
    """CountVectorizer whose analyzer additionally stems each token.

    The parent class performs lower-casing (preprocessing) and word
    extraction (tokenization); this subclass wraps the stock analyzer
    so every extracted word is reduced to its Snowball stem before
    counting.
    """

    def build_analyzer(self):
        base_analyzer = super(StemmedCountVectorizer, self).build_analyzer()

        def stemming_analyzer(doc):
            # Lazily stem each token the stock analyzer produces.
            return (stemmer.stem(token) for token in base_analyzer(doc))

        return stemming_analyzer
vectorizer = StemmedCountVectorizer(min_df=1, stop_words='english')

# Learn the vocabulary and build the dense document-term matrix.
X_train = vectorizer.fit_transform(nrfinal).toarray()
num_samples, num_features = X_train.shape
print("#samples: %d, #features: %d" % (num_samples, num_features))

# Write per-feature corpus frequencies to a report file.
# BUG FIX: the pasted source had lost its escape backslashes ('n' for
# '\n', 'ttt...tt' for tab runs) and one quote had become a curly
# quote, so the report came out on a single corrupted line. Also use
# write() for single strings (writelines expects an iterable of lines).
with codecs.open("/Users/Desktop/test_dict.txt", "w", "utf-8-sig") as output:
    output.write("Number of samples: %d, Number of features: %d\n"
                 % (num_samples, num_features))
    # Column sums of the document-term matrix give each term's total
    # frequency across the corpus (clearer than builtin sum() over rows).
    for tag, count in zip(vectorizer.get_feature_names(), X_train.sum(axis=0)):
        output.write('%10s \t\t\t----------\t\t %5s\n' % (tag, count))
- Also, when I tried to write it to the database, the same problem occurred.
- #Inserting into database
- ......
def connect():
    """Insert each (feature name, frequency) pair into hawa.feature_name.

    Reads the module-level ``vectorizer`` and ``X_train`` built above.
    The connection is always closed, even if an insert fails.
    """
    conn = pymysql.connect(user='root', password='',
                           unix_socket="/tmp/mysql.sock", database='hawa')
    try:
        with conn.cursor() as cursor:
            # Parameterized query: the driver escapes values itself.
            # The original %-formatted string was SQL-injectable and
            # wrapped the numeric placeholder in quotes, storing the
            # count as a string. int() unboxes any numpy integer.
            rows = [(tag, int(count))
                    for tag, count in zip(vectorizer.get_feature_names(),
                                          X_train.sum(axis=0))]
            cursor.executemany(
                "INSERT INTO feature_name (Feature_name, Frequency) "
                "VALUES (%s, %s)",
                rows)
        conn.commit()
    finally:
        conn.close()


if __name__ == '__main__':
    connect()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement