AWS Textract: How to Detect Signatures and Extract Text from Documents

Published On: 17 September 2024. By Rishabh Jhalani.

General
Cloud

AWS Textract: How to Detect Signatures and Extract Text from Documents

AWS Textract is a fully managed machine learning service by Amazon Web Services (AWS) that automatically extracts text, handwriting, and other data from scanned documents. Unlike traditional OCR (Optical Character Recognition) tools, AWS Textract goes beyond simple text extraction to identify forms, tables, signatures, and other key elements in documents.

Prerequisites

AWS Account: You need an AWS account with access to Textract.
AWS CLI: Install and configure the AWS Command Line Interface (CLI) with your credentials.
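Boto3: Install the AWS SDK for Python, together with the Textract response parser that provides the `trp` module used below, using pip: pip install boto3 amazon-textract-response-parser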

Detecting Form Values in a Document

import boto3
from trp import Document


def detect_text(file_name):
    """Extract form key/value pairs from a local document using AWS Textract.

    The file is read into memory and sent to Textract's AnalyzeDocument
    operation with the FORMS feature enabled. Each detected field is printed
    and also collected into the returned list.

    Args:
        file_name: Path of the document file to read and analyse.

    Returns:
        A list of ``(label, value, bounding_box)`` tuples, one per form field.
        ``label`` / ``value`` are empty strings and ``bounding_box`` is
        ``None`` when Textract reported no key or no value for a field.
    """
    # NOTE: the credential strings are tutorial placeholders. Real code should
    # not hard-code secrets; omit these arguments to let boto3 use its default
    # credential chain (e.g. the profile configured with the AWS CLI).
    textract = boto3.client('textract',
                            aws_access_key_id='your-access-key-id',
                            aws_secret_access_key='your-secret-access-key',
                            region_name='your-region'
                            )
    # Open the document file
    with open(file_name, 'rb') as document:
        image_bytes = document.read()

    # FeatureTypes is only accepted by AnalyzeDocument; DetectDocumentText
    # rejects it during parameter validation.
    response = textract.analyze_document(
        Document={'Bytes': image_bytes},
        FeatureTypes=["FORMS"],
    )
    doc = Document(response)

    fields = []
    for page in doc.pages:
        for field in page.form.fields:
            # A field can come back without a key or without a value
            # (for example an empty form box), so guard both lookups.
            label = field.key.text if field.key is not None else ''
            if field.value is not None:
                value = field.value.text
                bounding_box = field.value.geometry.boundingBox
            else:
                value = ''
                bounding_box = None
            print(f'Label: {label}')
            print(f'Value: {value}')
            print(f'Geometry Bounding Box: {bounding_box}')
            fields.append((label, value, bounding_box))
    return fields

# Usage: replace the placeholder with the path of a local document file;
# detect_text prints the label, value and bounding box of every form field.
text_in_document = detect_text('path_to_your_document')

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

import boto3

from trp import Document

def detect_text(file_name):
    """Extract form key/value pairs from a local document using AWS Textract.

    The file is read into memory and sent to Textract's AnalyzeDocument
    operation with the FORMS feature enabled. Each detected field is printed
    and also collected into the returned list.

    Args:
        file_name: Path of the document file to read and analyse.

    Returns:
        A list of ``(label, value, bounding_box)`` tuples, one per form field.
        ``label`` / ``value`` are empty strings and ``bounding_box`` is
        ``None`` when Textract reported no key or no value for a field.
    """
    # NOTE: the credential strings are tutorial placeholders. Real code should
    # not hard-code secrets; omit these arguments to let boto3 use its default
    # credential chain (e.g. the profile configured with the AWS CLI).
    textract = boto3.client('textract',
                            aws_access_key_id='your-access-key-id',
                            aws_secret_access_key='your-secret-access-key',
                            region_name='your-region'
                            )
    # Open the document file
    with open(file_name, 'rb') as document:
        image_bytes = document.read()

    # FeatureTypes is only accepted by AnalyzeDocument; DetectDocumentText
    # rejects it during parameter validation.
    response = textract.analyze_document(
        Document={'Bytes': image_bytes},
        FeatureTypes=["FORMS"],
    )
    doc = Document(response)

    fields = []
    for page in doc.pages:
        for field in page.form.fields:
            # A field can come back without a key or without a value
            # (for example an empty form box), so guard both lookups.
            label = field.key.text if field.key is not None else ''
            if field.value is not None:
                value = field.value.text
                bounding_box = field.value.geometry.boundingBox
            else:
                value = ''
                bounding_box = None
            print(f'Label: {label}')
            print(f'Value: {value}')
            print(f'Geometry Bounding Box: {bounding_box}')
            fields.append((label, value, bounding_box))
    return fields

# Usage: replace the placeholder with the path of a local document file;
# detect_text prints the label, value and bounding box of every form field.

text_in_document = detect_text('path_to_your_document')

Detecting Signatures in a Document

import boto3

def detect_signatures(file_name):
    """Locate signatures in a local document using AWS Textract.

    The file is read into memory and sent to Textract's AnalyzeDocument
    operation with the SIGNATURES feature enabled. The confidence score and
    bounding box of every SIGNATURE block are printed, and the matching
    blocks are returned.

    Args:
        file_name: Path of the document file to read and analyse.

    Returns:
        A list of the response blocks whose ``BlockType`` is ``"SIGNATURE"``
        (empty when none were detected).
    """
    # NOTE: the credential strings are tutorial placeholders. Real code should
    # not hard-code secrets; omit these arguments to let boto3 use its default
    # credential chain (e.g. the profile configured with the AWS CLI).
    textract = boto3.client('textract',
                            aws_access_key_id='your-access-key-id',
                            aws_secret_access_key='your-secret-access-key',
                            region_name='your-region'
                            )
    # Open the document file
    with open(file_name, 'rb') as document:
        image_bytes = document.read()

    # FeatureTypes is only accepted by AnalyzeDocument; DetectDocumentText
    # rejects it during parameter validation.
    response = textract.analyze_document(
        Document={'Bytes': image_bytes},
        FeatureTypes=["SIGNATURES"],
    )

    signatures = [
        block for block in response["Blocks"]
        if block["BlockType"] == "SIGNATURE"
    ]
    for block in signatures:
        print(f"Confidence Score: {block['Confidence']}")
        print(f"Geometry Bounding Box {block['Geometry']['BoundingBox']}")
    return signatures

# Usage: replace the placeholder with the path of a local document file;
# detect_signatures prints the confidence and bounding box of each signature.
sign_in_document = detect_signatures('path_to_your_document')

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

import boto3

def detect_signatures(file_name):
    """Locate signatures in a local document using AWS Textract.

    The file is read into memory and sent to Textract's AnalyzeDocument
    operation with the SIGNATURES feature enabled. The confidence score and
    bounding box of every SIGNATURE block are printed, and the matching
    blocks are returned.

    Args:
        file_name: Path of the document file to read and analyse.

    Returns:
        A list of the response blocks whose ``BlockType`` is ``"SIGNATURE"``
        (empty when none were detected).
    """
    # NOTE: the credential strings are tutorial placeholders. Real code should
    # not hard-code secrets; omit these arguments to let boto3 use its default
    # credential chain (e.g. the profile configured with the AWS CLI).
    textract = boto3.client('textract',
                            aws_access_key_id='your-access-key-id',
                            aws_secret_access_key='your-secret-access-key',
                            region_name='your-region'
                            )
    # Open the document file
    with open(file_name, 'rb') as document:
        image_bytes = document.read()

    # FeatureTypes is only accepted by AnalyzeDocument; DetectDocumentText
    # rejects it during parameter validation.
    response = textract.analyze_document(
        Document={'Bytes': image_bytes},
        FeatureTypes=["SIGNATURES"],
    )

    signatures = [
        block for block in response["Blocks"]
        if block["BlockType"] == "SIGNATURE"
    ]
    for block in signatures:
        print(f"Confidence Score: {block['Confidence']}")
        print(f"Geometry Bounding Box {block['Geometry']['BoundingBox']}")
    return signatures

# Usage: replace the placeholder with the path of a local document file;
# detect_signatures prints the confidence and bounding box of each signature.

sign_in_document = detect_signatures('path_to_your_document')

Conclusion

AWS Textract, paired with Python, makes it easy to create robust document processing applications. Text detection is pretty straightforward, but signature detection needs a bit of extra logic to identify possible signature areas. With these tools, you can automate a lot of the manual work involved in handling documents.