Guest User

Untitled

a guest
Jul 15th, 2018
146
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 1.21 KB | None | 0 0
  1. from tika import parser
  2. from mysql import connector
  3. from multiprocessing import Pool
  4.  
  5.  
  6. def tika_parser(file_path):
  7. # Extract text from document
  8. content = parser.from_file(file_path)
  9. if 'content' in content:
  10. text = content['content']
  11. else:
  12. return
  13. # Convert to string
  14. text = str(text)
  15. # Ensure text is utf-8 formatted
  16. safe_text = text.encode('utf-8', errors='ignore')
  17. # Escape any \ issues
  18. safe_text = str(safe_text).replace('\\', '\\\\').replace('"', '\\"')
  19. # Connect and send to database (in multiprocessing must re-connect each time)
  20. update_query = f'UPDATE file_index SET content = "{safe_text}" WHERE path = "{file_path}";'
  21. connection = connector.connect(database='clustering', user='root', password='mydba')
  22. cursor = cnx.cursor()
  23. cursor.execute(update_query)
  24. connection.commit()
  25. cur.close()
  26. cnx.close()
  27.  
  28.  
  29. if __name__ == '__main__':
  30. # Retrieve file paths from database
  31. query = 'SELECT path from content;'
  32. cnx = connector.connect(database='clustering', user='root', password='mydba')
  33. cur = cnx.cursor()
  34. cur.execute(query)
  35. paths = cur.fetchall()
  36. cur.close()
  37. cnx.close()
  38.  
  39. pool = Pool()
  40. pool.map(tika_parser, paths)
Add Comment
Please, Sign In to add comment