How to get from which page number of uploaded document. I use Azure AI search index and Azure Open Ai to build model.

Question

How to get from which page number of uploaded document. I use Azure AI search index and Azure Open Ai to build model.

Phuong Anh Dinh 0

I uploaded my document (a PDF file) to Azure Blob Storage. Then I used Azure AI Search to import and vectorize the file, which created a search index. I run the Azure OpenAI model from VS Code; it connects to this search index and retrieves from it. I chose to return the citations for the retrieved documents.

The code I write for azure open ai model:

def main():
    """Send one user question to an Azure OpenAI chat deployment.

    The request attaches an Azure AI Search index as a data source through
    ``extra_body["data_sources"]``. Connection settings are read from
    environment variables after ``load_dotenv()`` is called.
    """
    
    try:
        
        # NOTE(review): not referenced anywhere in the visible part of this snippet.
        show_citations = True

        # Read the Azure OpenAI and Azure AI Search settings from the environment.
        load_dotenv()
        azure_oai_endpoint = os.getenv("AZURE_OAI_ENDPOINT")
        azure_oai_key = os.getenv("AZURE_OAI_KEY")
        azure_oai_deployment = os.getenv("AZURE_OAI_DEPLOYMENT")
        azure_search_endpoint = os.getenv("AZURE_SEARCH_ENDPOINT")
        azure_search_key = os.getenv("AZURE_SEARCH_KEY")
        # NOTE(review): this value is read but the request below passes the
        # literal "test-index" as index_name instead - confirm which is intended.
        azure_search_index = os.getenv("AZURE_SEARCH_INDEX")


              
        client = AzureOpenAI(
            azure_endpoint=azure_oai_endpoint,
            api_key=azure_oai_key,
  
            api_version="2024-05-01-preview",
        )

        # Get the prompt
        text= input('\nEnter a question:\n')
              
        # Single-turn, non-streaming chat completion; the search index is
        # supplied as a data source in extra_body.
        completion = client.chat.completions.create(
            model=azure_oai_deployment,
            messages= [
            {
              "role": "user",
              "content": text
            }],
            max_tokens=800,
            temperature=0.7,
            top_p=0.95,
            frequency_penalty=0,
            presence_penalty=0,
            stop=None,
            stream=False,
            extra_body={
              "data_sources": [{
                  "type": "azure_search",
                  "parameters": {
                    "endpoint": f"{azure_search_endpoint}",
                    "index_name": "test-index",
                    "semantic_configuration": "default",
                    "query_type": "vector_semantic_hybrid",
                    #"query_type": "semantic",
                    # Maps index fields onto the roles the data source expects;
                    # note that the file path role is mapped to the chunk_id field.
                    "fields_mapping": {
                      "content_fields_separator": "\n",
                      "content_fields": [
                        "content"
                      ],
                      "filepath_field": "chunk_id",
                      "title_field": "title",
                      "url_field": "url",
                      "vector_fields": [
                        "contentVector"
                      ]
                    },
                    "in_scope": True,
                    "role_information": "You are a helpful for finding information",
                    "filter": None,
                    "strictness": 3,
                    "top_n_documents": 5,
                    "authentication": {
                      "type": "api_key",
                      "key": f"{azure_search_key}"                                                                                                                                                      
                    },
                    "embedding_dependency": {
                      "type": "deployment_name",
                      "deployment_name": "text-embedding-ada-002"
                    }
                  }
                }]
            }
        )
        # NOTE(review): the snippet ends here; the `try:` above has no matching
        # `except`/`finally` in the visible code, and `completion` is not used yet.

I can see the page number at the end of chunk_id like below:

Chunk ID: 0ffd977ce09f_aHR0cHM6Ly9hbmh0cmFpbnN0b3JhZ2VhY2NvdW50LmJsb2IuY29yZS53aW5kb3dzLm5ldC9maWxldXBsb2FkLXRlc3QtaW5kZXgvQmx1ZV9aZWJyYV9MYW5kbG9yZF9JbnN1cmFuY2VfQWNjaWRlbnRhbF9EYW1hZ2VfUERTXzIwMjMwNzAxLnBkZg2_pages_19

However, when I compare the content of these chunks with the PDF file, the page number is not correct for some chunks. For example, a chunk's content is on page 11 of the PDF, but the number in its chunk ID says page 20.

Therefore, do you know any way or skill to extract the page number separately from the chunk ID, so that the page number matches where the content actually appears in the PDF file? I want to return the correct page number in the citation.

Thank you ,

1 answer

Your answer

Answer 1

Hello Phuong Anh Dinh,

Welcome to the Microsoft Q&A and thank you for posting your questions here.

I understand that you would like to get the page number of the uploaded document that each citation comes from.

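A strategy that works is to extract the text of each PDF page yourself, match each retrieved chunk's content against that text to get the correct page number (rather than relying on the number embedded in the chunk ID), and integrate this with your Azure AI Search and Azure OpenAI setup.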

Use a Python library such as PyMuPDF (imported as `fitz`) to extract the text of each page, which you can then match against the content of your chunks. The library is here: https://github.com/pymupdf/PyMuPDF

   import fitz  # PyMuPDF
   import re
   def extract_text_from_pdf(pdf_path):
       """Return the plain text of every page in a PDF, keyed by page number.

       Args:
           pdf_path: Path of the PDF file to open with PyMuPDF.

       Returns:
           A dict mapping the 1-based page number to the text that
           ``page.get_text("text")`` extracts for that page.
       """
       page_texts = {}
       # Open the document as a context manager so the file handle is released
       # even if text extraction fails part-way through.
       with fitz.open(pdf_path) as doc:
           # start=1 yields 1-based page numbers directly.
           for page_num, page in enumerate(doc, start=1):
               page_texts[page_num] = page.get_text("text")
       return page_texts
   def match_chunk_to_page(chunk_content, page_texts):
       """Return the first page whose text contains ``chunk_content``.

       Runs of whitespace (spaces, tabs, line breaks) are collapsed to a single
       space on both sides before comparing, so a chunk still matches when its
       line breaks or spacing differ from the text extracted from the PDF.
       A chunk that spans a page boundary is not found, because each page is
       searched on its own.

       Args:
           chunk_content: Text of the chunk to locate.
           page_texts: Mapping of page number to page text; pages are searched
               in the mapping's iteration order.

       Returns:
           The key of the first page containing the chunk, or ``None`` when
           the chunk is empty/blank or no page contains it.
       """
       needle = " ".join(chunk_content.split()) if chunk_content else ""
       if not needle:
           # An empty string is a substring of every page; treat it as "no match"
           # instead of reporting the first page.
           return None
       for page_num, text in page_texts.items():
           if needle in " ".join(text.split()):
               return page_num
       return None
   # Example usage: look up one chunk of text in a PDF and report its page.
   pdf_path = "path_to_your_pdf_file.pdf"
   chunk_content = "your_chunk_content_here"
   page_texts = extract_text_from_pdf(pdf_path)
   page_number = match_chunk_to_page(chunk_content, page_texts)
   if not page_number:
       # A falsy result means no page contained the chunk.
       print("Chunk content not found in the PDF")
   else:
       print(f"Chunk content found on page {page_number}")

Use the sample code below to integrate the page number extraction into your existing code:

   import os
   from azure.ai.openai import AzureOpenAI
   from dotenv import load_dotenv
   import fitz  # PyMuPDF
   def extract_text_from_pdf(pdf_path):
       """Return the plain text of every page in a PDF, keyed by page number.

       Args:
           pdf_path: Path of the PDF file to open with PyMuPDF.

       Returns:
           A dict mapping the 1-based page number to the text that
           ``page.get_text("text")`` extracts for that page.
       """
       page_texts = {}
       # Open the document as a context manager so the file handle is released
       # even if text extraction fails part-way through.
       with fitz.open(pdf_path) as doc:
           # start=1 yields 1-based page numbers directly.
           for page_num, page in enumerate(doc, start=1):
               page_texts[page_num] = page.get_text("text")
       return page_texts
   def match_chunk_to_page(chunk_content, page_texts):
       """Return the first page whose text contains ``chunk_content``.

       Runs of whitespace (spaces, tabs, line breaks) are collapsed to a single
       space on both sides before comparing, so a chunk still matches when its
       line breaks or spacing differ from the text extracted from the PDF.
       A chunk that spans a page boundary is not found, because each page is
       searched on its own.

       Args:
           chunk_content: Text of the chunk to locate.
           page_texts: Mapping of page number to page text; pages are searched
               in the mapping's iteration order.

       Returns:
           The key of the first page containing the chunk, or ``None`` when
           the chunk is empty/blank or no page contains it.
       """
       needle = " ".join(chunk_content.split()) if chunk_content else ""
       if not needle:
           # An empty string is a substring of every page; treat it as "no match"
           # instead of reporting the first page.
           return None
       for page_num, text in page_texts.items():
           if needle in " ".join(text.split()):
               return page_num
       return None
   def main():
       """Ask one question against the search index and report citation pages.

       Reads the Azure OpenAI and Azure AI Search settings from environment
       variables, sends the user's question as a single-turn chat completion
       with the search index attached as a data source, prints the answer and,
       when ``show_citations`` is true, looks up each returned citation's
       content in the source PDF to report the page it appears on.

       Any exception is caught and reported on stdout; nothing is re-raised.
       """
       try:
           show_citations = True
           load_dotenv()
           azure_oai_endpoint = os.getenv("AZURE_OAI_ENDPOINT")
           azure_oai_key = os.getenv("AZURE_OAI_KEY")
           azure_oai_deployment = os.getenv("AZURE_OAI_DEPLOYMENT")
           azure_search_endpoint = os.getenv("AZURE_SEARCH_ENDPOINT")
           azure_search_key = os.getenv("AZURE_SEARCH_KEY")
           # Use the configured index; fall back to the name that was previously
           # hard-coded so behaviour is unchanged when AZURE_SEARCH_INDEX is unset.
           azure_search_index = os.getenv("AZURE_SEARCH_INDEX") or "test-index"
           # NOTE(review): the Python client class is normally imported with
           # `from openai import AzureOpenAI` - confirm this file's import line.
           client = AzureOpenAI(
               azure_endpoint=azure_oai_endpoint,
               api_key=azure_oai_key,
               api_version="2024-05-01-preview",
           )
           # Get the prompt
           text = input('\nEnter a question:\n')
           completion = client.chat.completions.create(
               model=azure_oai_deployment,
               messages=[
                   {
                       "role": "user",
                       "content": text
                   }
               ],
               max_tokens=800,
               temperature=0.7,
               top_p=0.95,
               frequency_penalty=0,
               presence_penalty=0,
               stop=None,
               stream=False,
               extra_body={
                   "data_sources": [{
                       "type": "azure_search",
                       "parameters": {
                           "endpoint": f"{azure_search_endpoint}",
                           "index_name": azure_search_index,
                           "semantic_configuration": "default",
                           "query_type": "vector_semantic_hybrid",
                           "fields_mapping": {
                               "content_fields_separator": "\n",
                               "content_fields": [
                                   "content"
                               ],
                               "filepath_field": "chunk_id",
                               "title_field": "title",
                               "url_field": "url",
                               "vector_fields": [
                                   "contentVector"
                               ]
                           },
                           "in_scope": True,
                           "role_information": "You are a helpful for finding information",
                           "filter": None,
                           "strictness": 3,
                           "top_n_documents": 5,
                           "authentication": {
                               "type": "api_key",
                               "key": f"{azure_search_key}"
                           },
                           "embedding_dependency": {
                               "type": "deployment_name",
                               "deployment_name": "text-embedding-ada-002"
                           }
                       }
                   }]
               }
           )
           message = completion.choices[0].message
           print(message.content)
           if show_citations:
               # NOTE(review): the "On Your Data" response is expected to carry the
               # retrieved chunks under message.context["citations"], each with a
               # "content" field - confirm against the API version in use. The
               # lookup is defensive so a missing context simply yields no citations.
               context = getattr(message, "context", None)
               if not isinstance(context, dict):
                   context = {}
               citations = context.get("citations") or []
               # Extract the PDF text once and reuse it for every citation.
               pdf_path = "path_to_your_pdf_file.pdf"
               page_texts = extract_text_from_pdf(pdf_path)
               for index, citation in enumerate(citations, start=1):
                   chunk_content = citation.get("content") or ""
                   page_number = match_chunk_to_page(chunk_content, page_texts)
                   if page_number:
                       print(f"Citation {index}: chunk content found on page {page_number}")
                   else:
                       print(f"Citation {index}: chunk content not found in the PDF")
       except Exception as e:
           # Top-level boundary of the sample: report the failure instead of a traceback.
           print(f"An error occurred: {e}")
   # Run the sample only when this file is executed as a script, not when imported.
   if __name__ == "__main__":
       main()

Review the code to match your logic.

I hope this is helpful! Do not hesitate to let me know if you have any other questions.

Please don't forget to close the thread by upvoting this answer and accepting it if it was helpful.

Share via

How to get from which page number of uploaded document. I use Azure AI search index and Azure Open Ai to build model.

1 answer

Your answer