# Untitled

import json
import logging as log
import os
import sys
import subprocess
import time

# Root logger setup: emit DEBUG and above through a default StreamHandler and
# into a log file named after this module's __name__ ('__main__.log' when the
# file is executed as a script).
_stream_handler = log.StreamHandler()
_file_handler = log.FileHandler('{}.log'.format(__name__))

log.basicConfig(
  level=log.DEBUG,
  datefmt='%Y-%m-%d %H:%M:%S',
  format='%(asctime)s [%(levelname)s]: %(message)s',
  handlers=[_stream_handler, _file_handler],
)


class InitialParser:
  """
  Runs AWS Textract text detection (via the AWS CLI) on a document stored in
  S3 and saves the raw CLI response to a local .txt file.

  @param filename: name (key) of the document inside the S3 bucket.
  @param s3_bucket: the S3 Bucket where the document is located.
  @param version: stored on the instance; not used by any method of this class.
  """

  # Seconds to wait between starting the Textract job and fetching its output.
  TEXTRACT_WAIT_SECONDS = 10

  def __init__(self, filename, s3_bucket, version=1):
    self.filename = filename
    self.s3_bucket = s3_bucket
    self.version = version

    self.file = None
    self.filepath = None
    self.data = None
    # Local file that receives the Textract response ('.pdf' swapped for '.txt').
    self.output_filename = self.filename.replace('.pdf', '.txt')

  def run(self):
    """
    Runs the Textract step and terminates the process if it fails.

    Raises SystemExit with status 1 when Textract did not succeed, so that
    calling shells/schedulers can tell the failure apart from a success.
    """
    log.info('Running initial parser on file {}'.format(self.filename))
    if self.run_textract():
      log.info('Textract process successful')
    else:
      log.error('Textract process failed, exiting')
      sys.exit(1)

  def load_file_data(self):
    """
    Loads JSON from ../extracted/<filename> (relative to the current working
    directory) into self.data.

    A missing file is logged and leaves self.data untouched. Invalid JSON is
    not handled here and propagates to the caller.
    """
    self.filepath = os.path.join(os.getcwd(), '../extracted/{}'.format(self.filename))
    try:
      with open(self.filepath, 'r') as file:
        # Kept for backward compatibility; the handle is closed once the
        # `with` block ends, so only its metadata (e.g. .name) stays usable.
        self.file = file
        self.data = json.load(file)
    except FileNotFoundError as e:
      log.error('Could not find parsing file: {}'.format(e))

  def run_textract(self):
    """
    Runs AWS Textract commands on the document self.filename located in the
    S3 Bucket self.s3_bucket and sends the output to self.output_filename.

    @return: True if the job was started and its output was written,
             False otherwise (the reason is logged).
    """
    job_id = self._start_textract_job()
    if not job_id:
      # Nothing was started, so there is nothing to wait for or fetch.
      return False

    # Wait for Textract to do its thing.
    # NOTE(review): this is a fixed delay, not a poll; the fetched response is
    # not checked for completion here, so a slow job may still be in progress.
    time.sleep(self.TEXTRACT_WAIT_SECONDS)

    return self._fetch_textract_output(job_id)

  def _start_textract_job(self):
    """Starts the text-detection job and returns its JobId ('' on failure)."""
    # note: adding "Version":<str> to the AWS object below breaks the command
    # Compact separators keep the JSON free of structural spaces without
    # touching spaces that are part of the bucket name or the object key.
    aws_object = json.dumps(
      {"S3Object": {"Bucket": self.s3_bucket, "Name": self.filename}},
      separators=(',', ':'),
    )
    # An argument list (no shell) keeps quotes/metacharacters in the bucket or
    # file name from being interpreted by a shell.
    command = [
      'aws', 'textract', 'start-document-text-detection',
      '--document-location', aws_object,
    ]

    try:
      completed = subprocess.run(
        command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=True)
    except OSError as e:
      # e.g. the `aws` executable is not installed or not on PATH.
      log.error('Could not run the AWS CLI: {}'.format(e))
      return ''
    except subprocess.CalledProcessError as e:
      output = ((e.stdout or b'') + (e.stderr or b'')).decode('utf-8', errors='replace')
      if 'InvalidS3ObjectException' in output:
        log.error('InvalidS3ObjectException (could not fetch object metadata from S3).\n Check the document name, AWS CLI configuration region (run `aws configure list`), permissions, and the S3 Bucket name & region.')
      elif 'ProvisionedThroughputExceededException' in output:
        log.error("ProvisionedThroughputExceededException (provisioned rate exceeded). You're doing that too much.")
      else:
        log.error('Starting Textract failed. Error: {}'.format(output))
      return ''

    # Only stdout is parsed, so warnings printed on stderr cannot corrupt the JSON.
    try:
      return json.loads(completed.stdout.decode('utf-8'))['JobId']
    except (ValueError, KeyError, TypeError) as e:
      log.error('Could not read JobId from the Textract response: {}'.format(e))
      return ''

  def _fetch_textract_output(self, job_id):
    """Writes the job's CLI response to self.output_filename; returns success."""
    command = ['aws', 'textract', 'get-document-text-detection', '--job-id', job_id]
    try:
      # Opening with 'w' creates/truncates the file, like `touch` + `>` did.
      with open(self.output_filename, 'w') as output_file:
        subprocess.run(
          command, stdout=output_file, stderr=subprocess.PIPE, check=True)
    except OSError as e:
      log.error('Could not write Textract output: {}'.format(e))
      return False
    except subprocess.CalledProcessError as e:
      log.error('Fetching Textract output failed. Error: {}'.format(
        (e.stderr or b'').decode('utf-8', errors='replace')))
      return False
    return True


# Script entry point: run the parser against the hard-coded sample document
# 'test1.pdf' in the 'test-bucket' S3 bucket.
if __name__ == '__main__':
  sample_document = 'test1.pdf'
  sample_bucket = 'test-bucket'
  InitialParser(filename=sample_document, s3_bucket=sample_bucket).run()