Advertisement
Guest User

Untitled

a guest
Jun 26th, 2019
91
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 3.13 KB | None | 0 0
  1. import json
  2. import logging as log
  3. import os
  4. import sys
  5. import subprocess
  6. import time
  7.  
  # Root-logger setup: DEBUG and above, timestamped, sent to two handlers.
  log.basicConfig(
      level=log.DEBUG,
      format='%(asctime)s [%(levelname)s]: %(message)s',
      datefmt='%Y-%m-%d %H:%M:%S',
      handlers=[
          # Console output (StreamHandler defaults to sys.stderr).
          log.StreamHandler(),
          # Log file named after this module, e.g. "__main__.log" when the
          # file is executed as a script.
          log.FileHandler(__name__ + '.log'),
      ],
  )
  17.  
  18.  
  class InitialParser:
      """Run AWS Textract text detection on an S3 document via the AWS CLI.

      ``run_textract`` starts an asynchronous ``start-document-text-detection``
      job for ``filename`` in ``s3_bucket``, polls the job until it is no
      longer in progress and stores the raw CLI response in
      ``output_filename``.

      Attributes:
          filename: name (key) of the document inside the S3 bucket.
          s3_bucket: name of the S3 bucket that holds the document.
          version: stored as given; no method of this class reads it.
          file: file object last opened by ``load_file_data`` (already closed).
          filepath: path last read by ``load_file_data``.
          data: JSON content loaded by ``load_file_data``.
          output_filename: local file that receives the Textract response.
      """

      # Seconds to wait between two status checks of a running Textract job.
      POLL_INTERVAL_SECONDS = 10
      # Number of status checks before giving up on a job.
      MAX_POLLS = 30
      # JobStatus values for which the response carries usable results.
      _FINISHED_STATUSES = ('SUCCEEDED', 'PARTIAL_SUCCESS')

      def __init__(self, filename, s3_bucket, version=1):
          """Store the document location and derive the output file name.

          Args:
              filename: name (key) of the document in the bucket.
              s3_bucket: name of the S3 bucket.
              version: kept on the instance for callers; unused here.
          """
          self.filename = filename
          self.s3_bucket = s3_bucket
          self.version = version

          self.file = None
          self.filepath = None
          self.data = None
          # Swap only a trailing '.pdf' for '.txt'; a '.pdf' in the middle of
          # the name is left alone and other extensions keep the name as is.
          if filename.endswith('.pdf'):
              self.output_filename = filename[:-len('.pdf')] + '.txt'
          else:
              self.output_filename = filename

      def run(self):
          """Run Textract for the configured document.

          Raises:
              SystemExit: with exit status 1 when the Textract run fails, so
                  that calling shells and schedulers can detect the failure.
          """
          log.info('Running initial parser on file {}'.format(self.filename))
          if self.run_textract():
              log.info('Textract process successful')
          else:
              log.error('Textract process failed, exiting')
              sys.exit(1)

      def load_file_data(self):
          """Load ``../extracted/<filename>`` (relative to the cwd) as JSON.

          On success ``self.data`` holds the parsed content. A missing file is
          logged and leaves ``self.data`` untouched; malformed JSON still
          raises, as before.
          """
          # NOTE(review): the original referenced an undefined bare
          # ``filename``; ``self.filename`` is assumed to be the intended
          # name -- confirm it should not be ``self.output_filename``.
          self.filepath = os.path.join(
              os.getcwd(), '../extracted/{}'.format(self.filename))
          try:
              with open(self.filepath, 'r', encoding='utf-8') as file:
                  # Kept for backward compatibility; the handle is closed as
                  # soon as the ``with`` block ends.
                  self.file = file
                  self.data = json.load(file)
          except FileNotFoundError as e:
              log.error('Could not find parsing file: {}'.format(e))

      def run_textract(self):
          """Run AWS Textract on the document and save the response.

          Starts a text-detection job through the AWS CLI, waits until the job
          has finished and writes the raw JSON response to
          ``self.output_filename``.

          Returns:
              bool: True when the job finished and the output file was
              written, False on any failure (details are logged).
          """
          job_id = self._start_textract_job()
          if job_id is None:
              # Nothing was started, so there is nothing to wait for.
              return False
          return self._save_textract_result(job_id)

      def _start_textract_job(self):
          """Start the text-detection job; return its JobId or None on error."""
          # note: adding "Version":<str> to the AWS object below breaks the command
          # Compact separators keep the JSON free of spaces between tokens
          # without touching spaces inside the bucket or document name.
          document_location = json.dumps(
              {'S3Object': {'Bucket': self.s3_bucket, 'Name': self.filename}},
              separators=(',', ':'),
          )
          # Argument list without a shell: names containing quotes, spaces or
          # shell metacharacters cannot alter the command.
          command = [
              'aws', 'textract', 'start-document-text-detection',
              '--document-location', document_location,
              '--output', 'json',
          ]
          try:
              raw = subprocess.check_output(command, stderr=subprocess.STDOUT)
          except subprocess.CalledProcessError as e:
              error_text = e.output.decode('utf-8', errors='replace')
              if 'InvalidS3ObjectException' in error_text:
                  log.error(
                      'InvalidS3ObjectException (could not fetch object '
                      'metadata from S3).\n Check the document name, AWS CLI '
                      'configuration region (run `aws configure list`), '
                      'permissions, and the S3 Bucket name & region.')
              elif 'ProvisionedThroughputExceededException' in error_text:
                  log.error(
                      'ProvisionedThroughputExceededException (provisioned '
                      "rate exceeded). You're doing that too much.")
              else:
                  log.error(
                      'Starting Textract failed. Error: {}'.format(error_text))
              return None
          except OSError as e:
              # Raised when the ``aws`` executable is missing or not runnable.
              log.error('Starting Textract failed. Error: {}'.format(e))
              return None

          try:
              return json.loads(raw.decode('utf-8'))['JobId']
          except (ValueError, KeyError, TypeError) as e:
              log.error('Starting Textract failed. Error: {}'.format(e))
              return None

      def _save_textract_result(self, job_id):
          """Poll the job until it finishes and write its response to disk."""
          command = [
              'aws', 'textract', 'get-document-text-detection',
              '--job-id', job_id,
              '--output', 'json',
          ]
          for _ in range(self.MAX_POLLS):
              try:
                  raw = subprocess.check_output(
                      command, stderr=subprocess.STDOUT)
              except subprocess.CalledProcessError as e:
                  log.error(e.output.decode('utf-8', errors='replace'))
                  return False
              except OSError as e:
                  log.error('Fetching Textract output failed. Error: {}'.format(e))
                  return False

              try:
                  status = json.loads(raw.decode('utf-8'))['JobStatus']
              except (ValueError, KeyError, TypeError) as e:
                  log.error(
                      'Could not read Textract job status. Error: {}'.format(e))
                  return False

              if status == 'IN_PROGRESS':
                  time.sleep(self.POLL_INTERVAL_SECONDS)  # wait for Textract to do its thing
                  continue
              if status not in self._FINISHED_STATUSES:
                  log.error('Textract job {} ended with status {}'.format(
                      job_id, status))
                  return False

              # NOTE(review): only this single response is saved; if the
              # response carries a NextToken, later pages are not fetched
              # (same as the original behaviour) -- confirm whether needed.
              with open(self.output_filename, 'wb') as output_file:
                  output_file.write(raw)
              return True

          log.error('Textract job {} still in progress after {} checks'.format(
              job_id, self.MAX_POLLS))
          return False
  83.  
  84.  
  85.  
  if __name__ == '__main__':
      # Script entry point: run the parser once on a sample document.
      # ``version`` is left at the constructor default.
      parser_settings = {
          'filename': 'test1.pdf',
          's3_bucket': 'test-bucket',
      }
      initial_parser = InitialParser(**parser_settings)
      initial_parser.run()
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement