daily pastebin goal
1%
SHARE
TWEET

Untitled

a guest Aug 13th, 2016 68 Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. from __future__ import print_function
  2. import os, codecs, nltk.stem, re, string, pymysql
  3. stemmer = nltk.stem.SnowballStemmer('english')
  4. def get_sentences():
  5.     for root, dirs, files in os.walk("/Users/Documents/test1"):
  6.         for file in files:
  7.             if file.endswith(".txt"):
  8.                 with codecs.open(os.path.join(root,file),"r", "utf-8-sig") as x_:
  9.                     for lines in x_.readlines():
  10.                         yield lines
  11. formoreprocessing = get_sentences()
  12. from nltk.tokenize import sent_tokenize
  13. raw_docs = sent_tokenize(''.join(formoreprocessing))
  14.  
  15. #Removing punctation marks
  16. regex = re.compile('[%s]' % re.escape(string.punctuation))
  17. nw = []
  18. for review in raw_docs:
  19.     new_review = ''.join(ch for ch in review if ch not in string.punctuation)
  20.     nw.append(new_review)
  21. # Removing numbers using dummy symbols
  22. nr = (re.sub(r'[^[]]+(?=])', '#', j) for j in nw)
  23. nrfinal = (re.sub('[0-9]+', '#', j) for j in nr)
  24. from sklearn.feature_extraction.text import CountVectorizer
  25. #The below class StemmedCountVectorizer does the following tasks:
  26. #Lower casing the raw post in the preprocessing step (done in the parent class).
  27. #Extracting all individual words in the tokenization step (done in the parent class).
  28. #Converting each word into its stemmed version.
  29. class StemmedCountVectorizer(CountVectorizer):
  30.     def build_analyzer(self):
  31.         analyzer = super(StemmedCountVectorizer, self).build_analyzer()
  32.         return lambda doc: (stemmer.stem(w) for w in analyzer(doc))
  33. vectorizer = StemmedCountVectorizer(min_df = 1, stop_words = 'english')
  34. #print(vectorizer)
  35. X_train = vectorizer.fit_transform(nrfinal).toarray()
  36. num_samples, num_features = X_train.shape
  37. print("#samples: %d, #features: %d" % (num_samples, num_features))
  38. with codecs.open("/Users/Desktop/test_dict.txt", "w","utf-8-sig") as output:
  39.     output.writelines(("Number of samples: %d, Number of features: %d" %  (num_samples, num_features))+'n')
  40.     for tag, count in zip(vectorizer.get_feature_names(), sum(X_train)):
  41.         output.writelines ('%10s ttt----------tt %5s' %(tag, count) + ‘n')
  42.  
I also tried writing the output to a database, and the same problem occurs.
  44.  
  45. #Inserting into database
  46. ......
  47. def connect():
  48.     conn = pymysql.connect(user = 'root', password = '', unix_socket = "/tmp/mysql.sock", database = 'hawa' )
  49.     cursor = conn.cursor()
  50.     for tag, count in zip(vectorizer.get_feature_names(), sum(X_train)):
  51.         cursor.execute("""INSERT INTO feature_name(Feature_name, Frequency) VALUES ('%s', '%d')""" %(tag, count))
  52.     conn.commit()
  53.     conn.close()
  54. if __name__ == '__main__':
  55.     connect()
RAW Paste Data
We use cookies for various purposes including analytics. By continuing to use Pastebin, you agree to our use of cookies as described in the Cookies Policy. OK, I Understand
 
Top