Hi,
I want to extract data from all pages of a PDF, but I have not been able to. With the code below I can only extract data from the first two pages. As far as I know, setting pages_to_analyze = None should extract data from all pages, but in my case it is not working. Can anyone help me fix this issue?
from azure.core.credentials import AzureKeyCredential
from azure.ai.formrecognizer import DocumentAnalysisClient
from tabulate import tabulate
import json
import os
# Service connection settings and the path of the document to analyze.
# All three values are blank in this listing and must be filled in before use.
endpoint = " "
key = " "
local_document_path = r" "

# Name of the input file with its directory and extension stripped off.
_document_file_name = os.path.basename(local_document_path)
document_name, _document_extension = os.path.splitext(_document_file_name)
def _extract_table(table):
    """Convert one analyzed table into a ``{"Headers": [...], "Rows": [...]}`` dict.

    Row 0 of the table is treated as the header row and every following row
    as data. Cell text is kept in the order in which the cells are listed.
    """
    table_data = table.to_dict()
    row_count = table_data['row_count']

    # Bucket the cell text by row in a single pass instead of rescanning the
    # whole cell list once per row.
    grid = [[] for _ in range(row_count)]
    for cell in table_data['cells']:
        row_index = cell['row_index']
        if 0 <= row_index < row_count:
            grid[row_index].append(cell['content'])

    # Headers carry the header cells' text (not their column indices) so they
    # line up with the text stored in "Rows".
    return {"Headers": grid[0] if grid else [], "Rows": grid[1:]}


def analyze_document(endpoint: str, key: str, local_document_path: str) -> dict:
    """Analyze a local document with the "prebuilt-document" model.

    Args:
        endpoint: Endpoint URL of the Form Recognizer resource.
        key: API key used to build the ``AzureKeyCredential``.
        local_document_path: Path of the document file to upload.

    Returns:
        ``{"Key-Value Pairs": [...], "Tables": [...]}`` on success, where each
        key-value entry is ``{"Key": str, "Value": str}`` and each table is
        ``{"Headers": [...], "Rows": [[...], ...]}``.
        ``{"Error": "<message>"}`` if anything fails; exceptions are reported
        through the return value rather than raised.
    """
    try:
        # Read the whole document into memory; the service receives raw bytes.
        with open(local_document_path, "rb") as f:
            document_content = f.read()

        document_analysis_client = DocumentAnalysisClient(
            endpoint=endpoint, credential=AzureKeyCredential(key)
        )

        # pages=None places no page restriction on the request.
        # NOTE: according to the Azure service-limits documentation, resources
        # on the free (F0) pricing tier only have the first two pages of a
        # document analyzed regardless of this argument; a paid (S0) tier is
        # required to get results for every page.
        pages_to_analyze = None
        poller = document_analysis_client.begin_analyze_document(
            "prebuilt-document", document=document_content, pages=pages_to_analyze
        )
        result = poller.result()

        # Keep only the pairs where both the key and the value were detected.
        key_value_pairs = [
            {"Key": kv_pair.key.content, "Value": kv_pair.value.content}
            for kv_pair in (result.key_value_pairs or [])
            if kv_pair.key and kv_pair.value
        ]

        tables = [_extract_table(table) for table in (result.tables or [])]

        return {"Key-Value Pairs": key_value_pairs, "Tables": tables}
    except Exception as e:
        # The caller serializes whatever is returned, so failures are surfaced
        # as data instead of propagating.
        return {"Error": str(e)}
# Run the analysis; the result is either the extracted data or an error dict.
result_json = analyze_document(endpoint, key, local_document_path)

# Destination of the JSON output (blank in this listing).
json_file_path = f" "

# Serialize first, then write the text out in one go.
serialized_result = json.dumps(result_json, indent=2)
with open(json_file_path, "w") as out_file:
    out_file.write(serialized_result)

# Tell the user where the output went.
print(f"JSON structure saved to {json_file_path}")