# Untitled

from tika import parser
from mysql import connector
from multiprocessing import Pool


def tika_parser(file_path):
    """Extract a document's text with Tika and store it in ``file_index``.

    The extracted text is written to the ``content`` column of the row whose
    ``path`` column equals ``file_path``.

    Args:
        file_path: Path of the document to parse. A one-element row
            (tuple or list), as produced by ``cursor.fetchall()``, is also
            accepted and unwrapped to its first item.

    Returns:
        None. Nothing is written when Tika yields no ``content`` value.
    """
    # Rows fetched from the database arrive as 1-tuples; unwrap them so the
    # same value can be used both as a file path and as a query parameter.
    if isinstance(file_path, (tuple, list)):
        file_path = file_path[0]

    # Extract text from document
    parsed = parser.from_file(file_path)
    text = parsed.get('content')
    if text is None:
        # No 'content' key, or Tika produced no text: nothing to store.
        return

    # Ensure text is utf-8 encodable: drop characters that cannot be encoded
    # (e.g. lone surrogates) but keep the result as str, not a bytes repr.
    safe_text = str(text).encode('utf-8', errors='ignore').decode('utf-8')

    # Parameterized query: the driver handles quoting/escaping of both the
    # document text and the path, so no manual backslash/quote escaping.
    update_query = 'UPDATE file_index SET content = %s WHERE path = %s;'

    # Connect and send to database (in multiprocessing must re-connect each time)
    connection = connector.connect(database='clustering', user='root', password='mydba')
    try:
        cursor = connection.cursor()
        try:
            cursor.execute(update_query, (safe_text, file_path))
            connection.commit()
        finally:
            cursor.close()
    finally:
        # Always release the connection, even if the update fails.
        connection.close()


if __name__ == '__main__':
    # Retrieve file paths from database
    # NOTE(review): this reads from table `content`, while tika_parser updates
    # table `file_index` -- confirm which table is intended.
    query = 'SELECT path from content;'
    cnx = connector.connect(database='clustering', user='root', password='mydba')
    try:
        cur = cnx.cursor()
        try:
            cur.execute(query)
            # fetchall() returns one tuple per row; unwrap the single selected
            # column so each worker receives a plain path value.
            paths = [row[0] for row in cur.fetchall()]
        finally:
            cur.close()
    finally:
        cnx.close()

    # Parse the documents in parallel. map() blocks until every path has been
    # processed, and the context manager then shuts the worker pool down.
    with Pool() as pool:
        pool.map(tika_parser, paths)