Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- # -*- coding: utf-8 -*-
- from __future__ import print_function
- import os, codecs, re, string, mysql
- import mysql.connector
# Read every .txt file under the source directory into one string.
def read_text_files(root_dir):
    """Return the concatenated contents of all ``.txt`` files under root_dir.

    The directory tree is walked with ``os.walk``. Each matching file is
    decoded with the "utf-8-sig" codec, so a leading byte-order mark is
    dropped. File contents are joined with no separator, in the order
    ``os.walk`` reports them. A directory that does not exist yields an
    empty string, because ``os.walk`` ignores listing errors by default.
    """
    chunks = []
    for root, _dirs, files in os.walk(root_dir):
        for name in files:
            if not name.endswith(".txt"):
                continue
            # `with` closes each handle; the original left every file open.
            with codecs.open(os.path.join(root, name), "r", "utf-8-sig") as handle:
                chunks.append(handle.read())
    # A single join replaces the repeated `y_ = y_ + line` concatenation.
    return "".join(chunks)


y_ = read_text_files("/Users/Documents/source-document/part1")
# Split the combined text into sentences.
from nltk.tokenize import sent_tokenize

raw_docs = sent_tokenize(y_)
# The stop-word step reads the sentence list from tokenized_docs[0]. The
# original rebuilt that list once per sentence by re-tokenizing the whole
# text inside a comprehension: quadratic work, N identical copies, and an
# empty outer list (so tokenized_docs[0] raised IndexError) when the text
# held no sentences. Wrapping the single result keeps tokenized_docs[0]
# equal to raw_docs in every case.
tokenized_docs = [raw_docs]
# Remove stop words from every sentence.
from nltk.corpus import stopwords

# The stop-word corpus file id is lowercase "english"; "English" only
# resolves on case-insensitive filesystems. A set gives O(1) membership
# tests instead of scanning a list for every word.
stopset = set(stopwords.words("english"))

# Guard the [0] lookup so an empty outer list does not raise IndexError.
sentences = tokenized_docs[0] if tokenized_docs else []

# The stop list is lowercase and the script only lowercases its text in a
# later step, so compare case-insensitively here; otherwise capitalised
# stop words such as "The" would survive. The original casing of the kept
# words is left untouched.
stopword_removed_sentences = [
    ' '.join(word for word in sentence.split() if word.lower() not in stopset)
    for sentence in sentences
]
# Strip punctuation marks from every sentence.
# One character class covering everything in string.punctuation.
regex = re.compile('[%s]' % re.escape(string.punctuation))

# Substituting over the whole sentence removes exactly the characters the
# original dropped one at a time, without a regex call per character or
# repeated string concatenation.
nw = [regex.sub(u'', review) for review in stopword_removed_sentences]
# Lowercase every sentence once its punctuation has been removed.
# `lw` holds the lowercased sentences, in the same order as `nw`.
lw = [sentence.lower() for sentence in nw]
# Replace numbers with a dummy symbol ("#").

# Pattern kept verbatim from the original script: it matches a non-"["
# character followed by one or more "]" that are themselves followed by
# another "]".
# NOTE(review): the intent of this pattern is unclear, and punctuation has
# already been stripped by the time this step runs -- confirm whether this
# pass is still needed.
bracket_run = re.compile(r'[^[]]+(?=])')

# A run of ASCII digits (code points 48-57), the same range the original
# tested with ord().
digit_run = re.compile(r'[0-9]+')

# First pass. The original assigned each sentence to a variable named
# `string`, which rebound the imported `string` module; a comprehension
# avoids that.
nr = [bracket_run.sub('#', sentence) for sentence in lw]

# Second pass: every run of consecutive digits collapses to a single "#",
# which is what the original character-by-character counter produced.
nrfinal = [digit_run.sub('#', sentence) for sentence in nr]
# Insert the processed sentences into the database.
def connect():
    """Insert every sentence in ``nrfinal`` into ``splitted_sentences``.

    A single connection and cursor serve the whole batch; the original
    opened a new connection for every sentence. All rows are committed
    together after the loop, so a failure part-way through closes the
    connection without committing. The cursor and connection are closed
    even when an insert raises.
    """
    # NOTE(review): the credentials are hard-coded (root, empty password).
    conn = mysql.connector.connect(
        user='root',
        password='',
        unix_socket="/tmp/mysql.sock",
        database='Thesis',
    )
    try:
        cursor = conn.cursor()
        try:
            # The original passed cursor.lastrowid from a brand-new cursor
            # as sentence_id for every row. Read it once before any insert
            # so that reusing the cursor does not feed the previous row's
            # id into the next insert.
            # NOTE(review): presumably this is None (NULL) and the column
            # generates its own id -- verify against the table schema.
            fresh_row_id = cursor.lastrowid
            for sentence in nrfinal:
                cursor.execute(
                    """INSERT INTO splitted_sentences(sentence_id, splitted_sentences) VALUES(%s, %s)""",
                    (fresh_row_id, sentence),
                )
            conn.commit()
        finally:
            cursor.close()
    finally:
        conn.close()


if __name__ == '__main__':
    connect()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement