Advertisement
Guest User

Untitled

a guest
Jun 24th, 2019
77
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
text 1.81 KB | None | 0 0
  1. #!/usr/bin/env python3
  2.  
  3. # Detects text in every object stored in an S3 bucket and saves words containing "#" to output.csv.
  4. import boto3
  5. import sys
  6. from time import sleep
  7. import math
  8. import pandas as pd
  9.  
  10.  
  11. if __name__ == "__main__":
  12.  
  13. bucket='your_bucket_name'
  14. ACCESS_KEY='your_access_key'
  15. SECRET_KEY='your_secret_key'
  16.  
  17. client = boto3.client('textract',
  18. region_name='your_region',
  19. aws_access_key_id=ACCESS_KEY,
  20. aws_secret_access_key=SECRET_KEY)
  21.  
  22. s3 = boto3.resource('s3',
  23. aws_access_key_id=ACCESS_KEY,
  24. aws_secret_access_key=SECRET_KEY)
  25.  
  26. your_bucket = s3.Bucket(bucket)
  27.  
  28. extracted_data = []
  29. for s3_file in your_bucket.objects.all():
  30. print(s3_file)
  31.  
  32. # use textract to process s3 file
  33. response = client.detect_document_text(
  34. Document={'S3Object': {'Bucket': bucket, 'Name': s3_file.key}})
  35.  
  36. blocks=response['Blocks']
  37.  
  38. for block in blocks:
  39. if block['BlockType'] != 'PAGE':
  40. print('Detected: ' + block['Text'])
  41. print('Confidence: ' + "{:.2f}".format(block['Confidence']) + "%")
  42.  
  43. # Example case where you want to extract words with #
  44. if("#" in block['Text']):
  45. words = block['Text'].split()
  46. for word in words:
  47. if("#" in word):
  48. extracted_data.append({"word" : word, "file" : s3_file.key, "confidence": "{:.2f}".format(block['Confidence']) + "%"})
  49.  
  50. # sleep 2 seconds to prevent ProvisionedThroughputExceededException
  51. sleep(2)
  52.  
  53. df = pd.DataFrame(extracted_data)
  54. df = df.drop_duplicates()
  55. df.to_csv('output.csv')
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement