Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # -*- coding: utf-8 -*-
- from __future__ import print_function
- import os, codecs, mysql, mysql.connector, re, string
- #Reading files with txt extension
def get_sentences(source_dir="/Users/Documents/sourcedocument/test1"):
    """Yield every line of every ``.txt`` file found under *source_dir*.

    The directory tree is walked recursively with ``os.walk``.  Each matching
    file is decoded as ``utf-8-sig`` (so a leading byte-order mark is
    dropped) and its lines are yielded one at a time, line endings included.

    Args:
        source_dir: Root directory to search.  Defaults to the path that was
            previously hard-coded, so existing ``get_sentences()`` calls
            behave as before.

    Yields:
        One decoded line of text at a time.
    """
    for root, _dirs, filenames in os.walk(source_dir):
        for filename in filenames:
            if not filename.endswith(".txt"):
                continue
            path = os.path.join(root, filename)
            # The context manager closes each file once it has been read;
            # the original never closed the handles it opened.
            with codecs.open(path, "r", "utf-8-sig") as handle:
                for line in handle:
                    yield line
formoreprocessing = get_sentences()

# Tokenizing sentences of the text files
from nltk.tokenize import sent_tokenize

# Collect the sentences of *every* line.  The original rebound ``raw_docs``
# on each iteration, so only the last line read survived the loop, and the
# name was left undefined when no line was read at all.
raw_docs = []
for line in formoreprocessing:
    raw_docs.extend(sent_tokenize(line))

# One entry per sentence.  The original comprehension re-tokenized the stale
# outer loop variable instead of ``sent``.
# NOTE(review): nothing visible here reads ``tokenized_docs``; word-level
# tokenization may have been the intent -- confirm before relying on it.
tokenized_docs = [sent_tokenize(sent) for sent in raw_docs]
# Removing stop words
from nltk.corpus import stopwords

# NLTK stores its stop word lists under lower-case names ("english"); the
# capitalised spelling relies on a case-insensitive file system to resolve.
# A set gives constant-time membership tests for the per-word filtering.
stopset = set(stopwords.words("english"))
def strip_stopwords(sentence, stop_words=None):
    """Return *sentence* with stop words removed.

    The sentence is split on whitespace, words found in the stop word
    collection are dropped, and the remaining words are re-joined with
    single spaces.

    Args:
        sentence: Text to filter.
        stop_words: Optional collection of lower-case stop words.  When
            omitted, the module-level ``stopset`` is used, which keeps
            existing one-argument calls working.

    Returns:
        The filtered sentence; an empty string if no word remains.
    """
    if stop_words is None:
        stop_words = stopset
    # Compare in lower case: this step runs before the text is lower-cased,
    # and NLTK's English stop word list is lower-case, so a capitalised stop
    # word (e.g. a sentence-initial "The") previously slipped through.
    return ' '.join(
        word for word in sentence.split() if word.lower() not in stop_words
    )
# NOTE(review): the original paste flagged an error at this statement
# ("error here") without saying what the error was.
stopword_removed_sentences = (strip_stopwords(sentence) for sentence in raw_docs)

# Removing punctuation marks
regex = re.compile('[%s]' % re.escape(string.punctuation))
# One substitution per sentence yields the same text as the original
# character-by-character loop, without the repeated string concatenation.
nw = [regex.sub(u'', review) for review in stopword_removed_sentences]

# Lowercasing letters after removing punctuation marks.
lw = (i.lower() for i in nw)

# Replacing numbers with a dummy symbol
# NOTE(review): this first pattern needs a literal "]" in the text to match,
# and all punctuation has already been stripped above, so it appears to be
# a no-op at this point -- confirm the intent before changing it.
nr = (re.sub(r'[^[]]+(?=])', '#', j) for j in lw)
# Every run of digits collapses to a single "#".
nrfinal = (re.sub('[0-9]+', '#', j) for j in nr)
- #Inserting into database
def connect():
    """Insert every processed sentence into the ``splitted_sentences`` table.

    Opens a MySQL connection to the ``Thesis`` database over the local Unix
    socket, inserts each item of the module-level ``nrfinal`` iterable as one
    row, commits once after the last insert and closes the connection.

    ``nrfinal`` is a generator in this file, so a second call finds it
    already exhausted and inserts nothing.

    Raises:
        Whatever ``mysql.connector`` raises on connection or query failure.
        The cursor and connection are closed in that case too, and nothing
        is committed.
    """
    conn = mysql.connector.connect(
        user='root',
        password='',
        unix_socket="/tmp/mysql.sock",
        database='Thesis',
    )
    try:
        cursor = conn.cursor()
        try:
            for sentence in nrfinal:
                # Parameterized query: the driver handles quoting.
                cursor.execute(
                    """INSERT INTO splitted_sentences(splitted_sentences) VALUES (%s)""",
                    (sentence,),
                )
            conn.commit()
        finally:
            cursor.close()
    finally:
        # Always release the connection, even when an insert fails.
        conn.close()
# Script entry point: run the database insert only when this file is
# executed directly.
if __name__ == '__main__':
    connect()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement