Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- import json
- import logging as log
- import os
- import sys
- import subprocess
- import time
# Configure the root logger once at import time: emit everything from DEBUG
# upwards, timestamped, to both the console and a "<module name>.log" file.
log.basicConfig(
    level=log.DEBUG,
    format='%(asctime)s [%(levelname)s]: %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
    handlers=[log.StreamHandler(), log.FileHandler('{}.log'.format(__name__))],
)
class InitialParser:
    """Run AWS Textract text detection on an S3 document via the AWS CLI.

    The parser starts an asynchronous text-detection job for ``filename`` in
    ``s3_bucket``, waits a fixed interval, then writes the CLI's response to
    a local ``.txt`` file named after the document.

    Attributes:
        filename: Name (key) of the document inside the S3 bucket.
        s3_bucket: Name of the S3 bucket that holds the document.
        version: Stored as given; not otherwise used by this class.
        file: File object last opened by ``load_file_data`` (closed by the
            time the method returns), or ``None``.
        filepath: Path last used by ``load_file_data``, or ``None``.
        data: JSON content loaded by ``load_file_data``, or ``None``.
        output_filename: Local file that receives the Textract output.
    """

    # Seconds to wait between starting a job and fetching its output.
    # NOTE(review): this is a fixed delay, not a status poll; the fetched
    # response may still describe an unfinished job -- confirm whether
    # polling the job status is needed for large documents.
    TEXTRACT_WAIT_SECONDS = 10

    def __init__(self, filename, s3_bucket, version=1):
        """Store the document location and derive the output file name.

        Args:
            filename: Name (key) of the document inside the S3 bucket.
            s3_bucket: Name of the S3 bucket that holds the document.
            version: Optional version marker; only stored.
        """
        self.filename = filename
        self.s3_bucket = s3_bucket
        self.version = version
        self.file = None
        self.filepath = None
        self.data = None
        # Swap only the final extension. A plain str.replace('.pdf', ...)
        # would rewrite every occurrence, miss '.PDF', and leave non-PDF
        # names unchanged.
        self.output_filename = os.path.splitext(self.filename)[0] + '.txt'

    def run(self):
        """Run Textract on the document; exit the process on failure.

        Raises:
            SystemExit: With status 1 when the Textract step fails.
        """
        log.info('Running initial parser on file {}'.format(self.filename))
        if self.run_textract():
            log.info('Textract process successful')
        else:
            log.error('Textract process failed, exiting')
            # A failure must not report success (status 0) to the caller.
            sys.exit(1)

    def load_file_data(self):
        """Load JSON from ``../extracted/<filename>`` into ``self.data``.

        The path is resolved relative to the current working directory.
        A missing or malformed file is logged and leaves ``self.data``
        unchanged rather than raising.
        """
        self.filepath = os.path.join(
            os.getcwd(), '../extracted/{}'.format(self.filename)
        )
        try:
            with open(self.filepath, 'r') as file:
                # Kept for backward compatibility; the handle is closed as
                # soon as the ``with`` block ends.
                self.file = file
                self.data = json.load(file)
        except FileNotFoundError as e:
            log.error('Could not find parsing file: {}'.format(e))
        except ValueError as e:
            # json.JSONDecodeError and UnicodeDecodeError are ValueErrors.
            log.error('Could not parse file {}: {}'.format(self.filepath, e))

    def run_textract(self):
        """Run AWS Textract on the document and save the output locally.

        Starts a text-detection job for ``self.filename`` in
        ``self.s3_bucket``, waits ``TEXTRACT_WAIT_SECONDS``, then writes the
        job's output to ``self.output_filename``.

        Returns:
            True if the job was started and its output was written,
            False otherwise (the reason is logged).
        """
        # note: adding "Version":<str> to the AWS object below breaks the
        # command. Compact separators yield JSON without padding spaces
        # while leaving spaces inside the bucket/object names intact.
        document_location = json.dumps(
            {'S3Object': {'Bucket': self.s3_bucket, 'Name': self.filename}},
            separators=(',', ':'),
        )
        job_id = self._start_textract_job(document_location)
        if not job_id:
            return False
        time.sleep(self.TEXTRACT_WAIT_SECONDS)  # wait for Textract to do its thing
        return self._fetch_textract_output(job_id)

    def _start_textract_job(self, document_location):
        """Start a text-detection job; return its job id, or '' on failure."""
        # An argument list (no shell) keeps bucket and file names from being
        # interpreted by the shell.
        command = [
            'aws', 'textract', 'start-document-text-detection',
            '--document-location', document_location,
        ]
        try:
            raw_output = subprocess.check_output(command, stderr=subprocess.STDOUT)
        except OSError as e:
            # Raised when the AWS CLI executable cannot be launched at all.
            log.error('Starting Textract failed. Error: {}'.format(e))
            return ''
        except subprocess.CalledProcessError as e:
            error_text = e.output.decode('utf-8', errors='replace')
            if 'InvalidS3ObjectException' in error_text:
                log.error('InvalidS3ObjectException (could not fetch object metadata from S3).\n Check the document name, AWS CLI configuration region (run `aws configure list`), permissions, and the S3 Bucket name & region.')
            elif 'ProvisionedThroughputExceededException' in error_text:
                log.error("ProvisionedThroughputExceededException (provisioned rate exceeded). You're doing that too much.")
            else:
                log.error('Starting Textract failed. Error: {}'.format(error_text))
            return ''
        try:
            return json.loads(raw_output.decode('utf-8'))['JobId']
        except (ValueError, KeyError, TypeError) as e:
            log.error('Starting Textract failed. Error: {}'.format(e))
            return ''

    def _fetch_textract_output(self, job_id):
        """Write the job's output to ``self.output_filename``; return success."""
        command = [
            'aws', 'textract', 'get-document-text-detection',
            '--job-id', job_id,
        ]
        try:
            # Opening in write mode creates or truncates the file, which
            # replaces the former ``touch`` plus shell ``>`` redirection.
            with open(self.output_filename, 'wb') as output_file:
                subprocess.run(
                    command,
                    stdout=output_file,
                    stderr=subprocess.PIPE,
                    check=True,
                )
        except subprocess.CalledProcessError as e:
            log.error(e.stderr.decode('utf-8', errors='replace'))
            return False
        except OSError as e:
            log.error('Fetching Textract output failed. Error: {}'.format(e))
            return False
        return True
# Script entry point: run the parser against a sample document and bucket.
if __name__ == '__main__':
    parser = InitialParser(filename='test1.pdf', s3_bucket='test-bucket')
    parser.run()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement