Advertisement
Guest User

Untitled

a guest
Jun 14th, 2016
91
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 1.96 KB | None | 0 0
  1. # -*- coding: utf-8 -*-
  2. from __future__ import print_function
  3. import os, codecs, mysql, mysql.connector, re, string
  4.  
  5. #Reading files with txt extension
  6. def get_sentences():
  7. for root, dirs, files in os.walk("/Users/Documents/sourcedocument/test1"):
  8. for file in files:
  9. if file.endswith(".txt"):
  10. x_ = codecs.open(os.path.join(root,file),"r", "utf-8-sig")
  11. for lines in x_.readlines():
  12. yield lines
  13. formoreprocessing = get_sentences()
  14.  
  15. #Tokenizing sentences of the text files
  16. from nltk.tokenize import sent_tokenize
  17. for i in formoreprocessing:
  18. raw_docs = sent_tokenize(i)
  19. tokenized_docs = [sent_tokenize(i) for sent in raw_docs]
  20. # print (tokenized_docs)
  21.  
  22. #Removing stop words
  23. stopword_removed_sentences = []
  24. from nltk.corpus import stopwords
  25. stopset = stopwords.words("English")
  26. def strip_stopwords(sentence):
  27. return ' '.join(word for word in sentence.split() if word not in stopset)
  28. stopword_removed_sentences = (strip_stopwords(sentence) for sentence in raw_docs)
  29. ------------>error here
  30. #Removing punctation marks
  31. regex = re.compile('[%s]' % re.escape(string.punctuation))
  32. nw = []
  33. for review in stopword_removed_sentences:
  34. new_review = ''
  35. for token in review:
  36. new_token = regex.sub(u'', token)
  37. if not new_token == u'':
  38. new_review += new_token
  39. nw.append(new_review)
  40.  
  41. #Lowercasing letters after removing puctuation marks.
  42. lw = (i.lower() for i in nw)
  43. #Removing number with a dummy symbol
  44. nr = (re.sub(r'[^[]]+(?=])', '#', j) for j in lw)
  45. nrfinal = (re.sub('[0-9]+', '#', j) for j in nr)
  46.  
  47. #Inserting into database
  48. def connect():
  49. conn = mysql.connector.connect(user = 'root', password = '', unix_socket = "/tmp/mysql.sock", database = 'Thesis' )
  50. cursor = conn.cursor()
  51. for j in nrfinal:
  52. cursor.execute("""INSERT INTO splitted_sentences(splitted_sentences) VALUES (%s)""",(j,))
  53. conn.commit()
  54. conn.close()
  55. if __name__ == '__main__':
  56. connect()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement