Advertisement
Guest User

Untitled

a guest
Aug 13th, 2016
86
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 2.54 KB | None | 0 0
  1. from __future__ import print_function
  2. import os, codecs, nltk.stem, re, string, pymysql
  3. stemmer = nltk.stem.SnowballStemmer('english')
  4. def get_sentences():
  5. for root, dirs, files in os.walk("/Users/Documents/test1"):
  6. for file in files:
  7. if file.endswith(".txt"):
  8. with codecs.open(os.path.join(root,file),"r", "utf-8-sig") as x_:
  9. for lines in x_.readlines():
  10. yield lines
# Read the whole corpus into one string and split it into sentences.
formoreprocessing = get_sentences()
from nltk.tokenize import sent_tokenize
raw_docs = sent_tokenize(''.join(formoreprocessing))

# Removing punctuation marks
# NOTE(review): `regex` is compiled here but never used below -- the loop
# strips punctuation via membership in string.punctuation instead. Confirm
# whether it can be removed or was meant to replace the per-char filter.
regex = re.compile('[%s]' % re.escape(string.punctuation))
nw = []
for review in raw_docs:
    new_review = ''.join(ch for ch in review if ch not in string.punctuation)
    nw.append(new_review)

# Removing numbers using dummy symbols
# NOTE(review): the first pattern matches text that sits in front of a ']';
# since all punctuation (including brackets) was stripped above, this sub is
# presumably a no-op -- confirm the intended pattern. Both subs are lazy
# generators: they only run when `nrfinal` is consumed (by fit_transform).
nr = (re.sub(r'[^[]]+(?=])', '#', j) for j in nw)
nrfinal = (re.sub('[0-9]+', '#', j) for j in nr)
  24. from sklearn.feature_extraction.text import CountVectorizer
  25. #The below class StemmedCountVectorizer does the following tasks:
  26. #Lower casing the raw post in the preprocessing step (done in the parent class).
  27. #Extracting all individual words in the tokenization step (done in the parent class).
  28. #Converting each word into its stemmed version.
  29. class StemmedCountVectorizer(CountVectorizer):
  30. def build_analyzer(self):
  31. analyzer = super(StemmedCountVectorizer, self).build_analyzer()
  32. return lambda doc: (stemmer.stem(w) for w in analyzer(doc))
  33. vectorizer = StemmedCountVectorizer(min_df = 1, stop_words = 'english')
  34. #print(vectorizer)
  35. X_train = vectorizer.fit_transform(nrfinal).toarray()
  36. num_samples, num_features = X_train.shape
  37. print("#samples: %d, #features: %d" % (num_samples, num_features))
  38. with codecs.open("/Users/Desktop/test_dict.txt", "w","utf-8-sig") as output:
  39. output.writelines(("Number of samples: %d, Number of features: %d" % (num_samples, num_features))+'n')
  40. for tag, count in zip(vectorizer.get_feature_names(), sum(X_train)):
  41. output.writelines ('%10s ttt----------tt %5s' %(tag, count) + ‘n')
  42.  
  43. I also tried writing the output to a database, and the same problem occurs there.
  44.  
  45. #Inserting into database
  46. ......
  47. def connect():
  48. conn = pymysql.connect(user = 'root', password = '', unix_socket = "/tmp/mysql.sock", database = 'hawa' )
  49. cursor = conn.cursor()
  50. for tag, count in zip(vectorizer.get_feature_names(), sum(X_train)):
  51. cursor.execute("""INSERT INTO feature_name(Feature_name, Frequency) VALUES ('%s', '%d')""" %(tag, count))
  52. conn.commit()
  53. conn.close()
  54. if __name__ == '__main__':
  55. connect()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement