Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- from tika import parser
- from mysql import connector
- from multiprocessing import Pool
def tika_parser(file_path):
    """Extract a document's text with Tika and store it in ``file_index``.

    Meant to run as a ``multiprocessing.Pool`` worker, so it opens and closes
    its own database connection on every call instead of sharing one across
    processes.

    Args:
        file_path: Path of the document to parse. A one-element row such as
            ``('/a/b.pdf',)`` (the shape ``cursor.fetchall()`` produces) is
            also accepted and unwrapped.

    Returns:
        None. The extracted text is written to the ``content`` column of the
        ``file_index`` row whose ``path`` equals ``file_path``. Nothing is
        written when the path is empty or no text was extracted.
    """
    # Rows coming straight from fetchall() are tuples; unwrap to the bare path.
    if isinstance(file_path, (tuple, list)):
        file_path = file_path[0] if file_path else None
    if not file_path:
        return

    # Extract text from document
    parsed = parser.from_file(file_path)
    text = parsed.get('content') if parsed else None
    if text is None:
        # No text was extracted; skip rather than storing the string "None".
        return

    # Ensure the text is UTF-8 clean: drop anything that cannot be encoded,
    # then decode back so a real string (not a bytes repr) is stored.
    safe_text = str(text).encode('utf-8', errors='ignore').decode('utf-8')

    # Connect and send to database (in multiprocessing must re-connect each time)
    connection = connector.connect(database='clustering', user='root', password='mydba')
    try:
        cursor = connection.cursor()
        try:
            # Parameterized query: the driver handles quoting/escaping of both
            # the document text and the path.
            cursor.execute(
                'UPDATE file_index SET content = %s WHERE path = %s',
                (safe_text, file_path),
            )
            connection.commit()
        finally:
            cursor.close()
    finally:
        connection.close()
if __name__ == '__main__':
    # Retrieve file paths from database.
    # NOTE(review): this selects from a table named `content`, while the
    # worker updates `file_index` -- confirm which table is intended.
    query = 'SELECT path from content;'
    cnx = connector.connect(database='clustering', user='root', password='mydba')
    try:
        cur = cnx.cursor()
        try:
            cur.execute(query)
            rows = cur.fetchall()
        finally:
            cur.close()
    finally:
        # Close before forking workers; each worker opens its own connection.
        cnx.close()

    # fetchall() returns one tuple per row; hand the workers the bare path.
    paths = [row[0] for row in rows]

    # The context manager tears the pool down once map() has returned.
    with Pool() as pool:
        pool.map(tika_parser, paths)
Add Comment
Please, Sign In to add comment