Advertisement
Guest User

Untitled

a guest
Jun 8th, 2016
79
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 2.29 KB | None | 0 0
  1. # -*- coding: utf-8 -*-
  2. from __future__ import print_function
  3. import os, codecs, re, string, mysql
  4. import mysql.connector
  5.  
  6. '''Reading files with txt extension'''
  7. y_ = ""
  8. for root, dirs, files in os.walk("/Users/Documents/source-document/part1"):
  9. for file in files:
  10. if file.endswith(".txt"):
  11. x_ = codecs.open(os.path.join(root,file),"r", "utf-8-sig")
  12. for lines in x_.readlines():
  13. y_ = y_ + lines
  14. #print(tokenized_docs)
  15.  
  16. '''Tokenizing sentences of the text files'''
  17.  
  18. from nltk.tokenize import sent_tokenize
  19. raw_docs = sent_tokenize(y_)
  20.  
  21. tokenized_docs = [sent_tokenize(y_) for sent in raw_docs]
  22.  
  23. '''Removing stop words'''
  24.  
  25. stopword_removed_sentences = []
  26. from nltk.corpus import stopwords
  27. stopset = stopwords.words("English")
  28. for i in tokenized_docs[0]:
  29. tokenized_docs = ' '.join([word for word in i.split() if word not in stopset])
  30. stopword_removed_sentences.append(tokenized_docs)
  31.  
  32. ''' Removing punctuation marks'''
  33.  
  34. regex = re.compile('[%s]' % re.escape(string.punctuation))
  35. nw = []
  36. for review in stopword_removed_sentences:
  37. new_review = ''
  38. for token in review:
  39. new_token = regex.sub(u'', token)
  40. if not new_token == u'':
  41. new_review += new_token
  42. nw.append(new_review)
  43.  
  44. '''Lowercasing letters after removing puctuation marks.'''
  45.  
  46. lw = [] #lw stands for lowercase word.
  47. for i in nw:
  48. k = i.lower()
  49. lw.append(k)
  50.  
  51. '''Removing number with a dummy symbol'''
  52. nr = []
  53. for j in lw:
  54. string = j
  55. regex = r'[^[]]+(?=])'
  56. # let "#" be the dummy symbol
  57. output = re.sub(regex,'#',string)
  58. nr.append(output)
  59. nrfinal = []
  60. for j in nr:
  61. rem = 0
  62. outr = ''
  63. for i in j:
  64. if ord(i)>= 48 and ord(i)<=57:
  65. rem += 1
  66. if rem == 1:
  67. outr = outr+ '#'
  68. else:
  69. rem = 0
  70. outr = outr+i
  71. nrfinal.append(outr)
  72. '''Inserting into database'''
  73. def connect():
  74. for j in nrfinal:
  75. conn = mysql.connector.connect(user = 'root', password = '', unix_socket = "/tmp/mysql.sock", database = 'Thesis' )
  76. cursor = conn.cursor()
  77. cursor.execute("""INSERT INTO splitted_sentences(sentence_id, splitted_sentences) VALUES(%s, %s)""",(cursor.lastrowid,j))
  78. conn.commit()
  79. conn.close()
  80. if __name__ == '__main__':
  81. connect()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement