Get Searchable PDF from Azure Document Intelligence Service

Fred 75 Reputation points
2025-09-25T23:42:51.96+00:00

I'm trying to use Python to convert a scanned PDF into a searchable PDF. I see here that it is supported, but I am looking for a code example or sample showing how to do this. Where I am struggling is that the request seems to require a request_id, and I am unclear on how to get that.

I have tried using the Document Intelligence service, but the code below is not working:

from loguru import logger
from azure.core.credentials import AzureKeyCredential
from azure.ai.documentintelligence.models import (
    AnalyzeDocumentRequest,
)
from azure.ai.documentintelligence import DocumentIntelligenceClient
import os
def extract_pdf(
    file: str,
    # tables: DBTables,
    use_cache: bool = True,
    text_model: str = "prebuilt-read",
) -> str:
    """Run Azure Document Intelligence on ``file`` and try to save a PDF next to it.

    The output path is the input path with its extension replaced by
    ``.ocr.pdf``.

    Args:
        file: Path of the document to analyze; it is read in binary mode.
        use_cache: When true and the output path already exists, the existing
            file is read and its contents are returned without calling the
            service.
        text_model: Model ID passed to ``begin_analyze_document``.

    Returns:
        On a cache hit, the text-mode contents of the cached file; otherwise
        the output path. NOTE(review): these two return values are not the
        same kind of thing -- confirm which one callers expect.
    """
    logger.info("=======Starting Text Extraction=======")
    # Only the stem is used below; ``file_extension`` is never read.
    filename, file_extension = os.path.splitext(file)
    out = filename + ".ocr.pdf"
    if use_cache:
        logger.debug("Checking cache")
        if os.path.isfile(out):
            logger.debug("Results found in cache, done")
            # NOTE(review): the cached file is named *.pdf but is opened in
            # text mode here; reading binary PDF data this way looks
            # unintended -- confirm.
            with open(out, "r") as f:
                return f.read()
        logger.debug("Results not in cache, extracting")
    logger.debug("running poller")
    # Placeholder key and endpoint; replace with real values before running.
    credential = AzureKeyCredential("MYKEY")
    analysis_client = DocumentIntelligenceClient(
        "MYENDPOINT", credential
    )
    with open(file, "rb") as f:
        # Submit the whole file as bytes and ask the service for PDF output.
        poller = analysis_client.begin_analyze_document(
            text_model,
            AnalyzeDocumentRequest(bytes_source=f.read()),
            output=["pdf"],
        )
    # Blocks until the long-running analysis operation has finished.
    result = poller.result()
    print(result)
    with open(out, "wb") as f:
        # NOTE(review): ``result`` is whatever ``poller.result()`` returns,
        # presumably an analysis result object rather than raw PDF bytes, so
        # writing it to a binary file handle will likely raise TypeError.
        # The generated PDF presumably has to be downloaded in a separate
        # call -- confirm against the SDK documentation.
        f.write(result)
    return out

Any guidance or a working example would be appreciated.

Azure AI Document Intelligence
{count} votes

Answer accepted by question author
  1. Moritz Goeke 395 Reputation points MVP
    2025-09-26T00:52:54.49+00:00

    Hello Fred,

    I attached a code sample that works for the given case (at least for me :D):

    from azure.core.credentials import AzureKeyCredential
    from azure.ai.documentintelligence import DocumentIntelligenceClient
    from azure.ai.documentintelligence.models import AnalyzeOutputOption
    import os
    
    def to_searchable_pdf(src_path: str, dst_path: str, endpoint: str, api_key: str) -> str:
        """Analyze ``src_path`` with ``prebuilt-read`` and save the searchable PDF.

        Args:
            src_path: Path of the input document, opened in binary mode.
            dst_path: Path the downloaded PDF is written to.
            endpoint: Document Intelligence endpoint URL.
            api_key: Key used to build the ``AzureKeyCredential``.

        Returns:
            ``dst_path``, unchanged, once the PDF has been written.
        """
        credential = AzureKeyCredential(api_key)
        di_client = DocumentIntelligenceClient(endpoint=endpoint, credential=credential)

        # Step 1: start the analysis, requesting PDF output alongside the result.
        with open(src_path, "rb") as source_file:
            analyze_poller = di_client.begin_analyze_document(
                model_id="prebuilt-read",
                body=source_file,
                output=[AnalyzeOutputOption.PDF],
            )

        # Step 2: block until the operation finishes. The value returned here is
        # the analysis result, not the PDF bytes.
        analysis = analyze_poller.result()

        # Step 3: the poller's details carry the ID needed to fetch the PDF.
        operation_id = analyze_poller.details["operation_id"]

        # Step 4: fetch the generated PDF and write its chunks to disk.
        pdf_chunks = di_client.get_analyze_result_pdf(
            model_id=analysis.model_id,
            result_id=operation_id,
        )
        with open(dst_path, "wb") as target_file:
            target_file.writelines(pdf_chunks)

        return dst_path
    
    
    if __name__ == "__main__":
        # Quick sanity print so it is obvious the script started.
        print("Hello :)")

        # Sample values only: substitute your own endpoint and key.
        endpoint = "https://<your>.cognitiveservices.azure.com/"
        api_key = "<your_api_key>"

        # Input scan and the location for the searchable copy.
        src_pdf, dst_pdf = "scan.pdf", "scan.searchable.pdf"

        output_file = to_searchable_pdf(
            src_path=src_pdf,
            dst_path=dst_pdf,
            endpoint=endpoint,
            api_key=api_key,
        )
        print(f"Searchable PDF saved at: {output_file}")
    

    Hope that helps a bit :).

    Best regards,
    Moritz

    1 person found this answer helpful.

0 additional answers

Sort by: Most helpful

Your answer

Answers can be marked as 'Accepted' by the question author and 'Recommended' by moderators, which helps users know the answer solved the author's problem.