Hello Phuong Anh Dinh,
Welcome to Microsoft Q&A, and thank you for posting your question here.
I understand that you would like to get the page number of an uploaded document.
The most reliable strategy is to extract the text of each page together with its page number, use that text to resolve the correct page number for each chunk ID, and integrate this lookup into your Azure AI Search and Azure OpenAI setup.
- Use a Python library such as PyMuPDF (imported as `fitz`) to extract the text of each page so that you can match it against the content of your chunks. An example is here: https://github.com/pymupdf/PyMuPDF
import fitz # PyMuPDF
import re
def extract_text_from_pdf(pdf_path):
    """Extract the plain text of every page in a PDF.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF.

    Returns:
        A dict mapping 1-based page numbers to the text of that page.
    """
    # The context manager closes the document even if extraction fails,
    # so the underlying file handle is not leaked.
    with fitz.open(pdf_path) as doc:
        # Iterating a document yields its pages in order; numbering starts
        # at 1 so the keys line up with the page numbers a reader sees.
        return {
            page_num: page.get_text("text")
            for page_num, page in enumerate(doc, start=1)
        }
def match_chunk_to_page(chunk_content, page_texts):
    """Return the first page whose text contains the given chunk.

    Runs of whitespace (spaces, tabs, line breaks) are collapsed to a single
    space on both sides before comparing, because text extracted from a PDF
    is usually wrapped differently from the chunk stored in a search index.

    Args:
        chunk_content: Text of the chunk to locate.
        page_texts: Mapping of page number to the text of that page.

    Returns:
        The key of the first page (in the mapping's iteration order) whose
        text contains the chunk, or None when the chunk is empty,
        whitespace-only, or not found on any single page.
    """
    if not chunk_content:
        return None

    needle = " ".join(chunk_content.split())
    # An empty needle is a substring of every string, so it would otherwise
    # be reported as "found" on the first page.
    if not needle:
        return None

    for page_num, text in page_texts.items():
        if needle in " ".join(text.split()):
            return page_num
    return None
# Example usage: replace both placeholders with your own PDF path and the
# text of the chunk you want to locate.
pdf_path = "path_to_your_pdf_file.pdf"
chunk_content = "your_chunk_content_here"

# Build the page-number -> text lookup, then search it for the chunk.
page_texts = extract_text_from_pdf(pdf_path)
page_number = match_chunk_to_page(chunk_content, page_texts)

# Report the outcome; a falsy result means no page matched.
if not page_number:
    print("Chunk content not found in the PDF")
else:
    print(f"Chunk content found on page {page_number}")
- Use the sample code below to integrate the page number extraction into your existing code:
# Review the code to match your logic.
import os

import fitz  # PyMuPDF
from dotenv import load_dotenv
# AzureOpenAI is exported by the `openai` package (v1+).
from openai import AzureOpenAI


def extract_text_from_pdf(pdf_path):
    """Extract the plain text of every page in a PDF.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF.

    Returns:
        A dict mapping 1-based page numbers to the text of that page.
    """
    # The context manager closes the document even if extraction fails.
    with fitz.open(pdf_path) as doc:
        return {
            page_num: page.get_text("text")
            for page_num, page in enumerate(doc, start=1)
        }


def match_chunk_to_page(chunk_content, page_texts):
    """Return the first page whose text contains the given chunk.

    Whitespace runs are collapsed to a single space on both sides before
    comparing, because text extracted from a PDF is usually wrapped
    differently from the chunk stored in the search index.

    Args:
        chunk_content: Text of the chunk to locate.
        page_texts: Mapping of page number to the text of that page.

    Returns:
        The key of the first matching page, or None when the chunk is empty
        or is not found on any single page.
    """
    if not chunk_content:
        return None

    needle = " ".join(chunk_content.split())
    if not needle:
        return None

    for page_num, text in page_texts.items():
        if needle in " ".join(text.split()):
            return page_num
    return None


def main():
    """Ask a question against the search index and report citation pages.

    Reads the Azure OpenAI and Azure AI Search settings from environment
    variables (loaded from a .env file), sends the user's question with the
    search index attached as a data source, prints the answer, and then
    tries to locate each returned citation in the source PDF.
    """
    try:
        show_citations = True

        load_dotenv()
        azure_oai_endpoint = os.getenv("AZURE_OAI_ENDPOINT")
        azure_oai_key = os.getenv("AZURE_OAI_KEY")
        azure_oai_deployment = os.getenv("AZURE_OAI_DEPLOYMENT")
        azure_search_endpoint = os.getenv("AZURE_SEARCH_ENDPOINT")
        azure_search_key = os.getenv("AZURE_SEARCH_KEY")
        azure_search_index = os.getenv("AZURE_SEARCH_INDEX")

        client = AzureOpenAI(
            azure_endpoint=azure_oai_endpoint,
            api_key=azure_oai_key,
            api_version="2024-05-01-preview",
        )

        # Get the prompt
        text = input('\nEnter a question:\n')

        completion = client.chat.completions.create(
            model=azure_oai_deployment,
            messages=[
                {
                    "role": "user",
                    "content": text,
                }
            ],
            max_tokens=800,
            temperature=0.7,
            top_p=0.95,
            frequency_penalty=0,
            presence_penalty=0,
            stop=None,
            stream=False,
            extra_body={
                "data_sources": [{
                    "type": "azure_search",
                    "parameters": {
                        "endpoint": azure_search_endpoint,
                        # Use the configured index; fall back to the sample
                        # name when AZURE_SEARCH_INDEX is not set.
                        "index_name": azure_search_index or "test-index",
                        "semantic_configuration": "default",
                        "query_type": "vector_semantic_hybrid",
                        "fields_mapping": {
                            "content_fields_separator": "\n",
                            "content_fields": ["content"],
                            "filepath_field": "chunk_id",
                            "title_field": "title",
                            "url_field": "url",
                            "vector_fields": ["contentVector"],
                        },
                        "in_scope": True,
                        "role_information": "You are a helpful for finding information",
                        "filter": None,
                        "strictness": 3,
                        "top_n_documents": 5,
                        "authentication": {
                            "type": "api_key",
                            "key": azure_search_key,
                        },
                        "embedding_dependency": {
                            "type": "deployment_name",
                            "deployment_name": "text-embedding-ada-002",
                        },
                    },
                }]
            },
        )

        message = completion.choices[0].message
        print(f"Response: {message.content}\n")

        if show_citations:
            # NOTE(review): the "On Your Data" response is expected to carry
            # the retrieved chunks under message.context["citations"], each
            # with "content" and "title" keys -- confirm for your API version.
            context = getattr(message, "context", None) or {}
            citations = context.get("citations", [])

            # Placeholder: point this at the PDF that was indexed. If the
            # index holds several documents, pick the path per citation.
            pdf_path = "path_to_your_pdf_file.pdf"
            # Only open the PDF when there is something to look up.
            page_texts = extract_text_from_pdf(pdf_path) if citations else {}

            for citation in citations:
                print(f"Citation: {citation.get('title')}")
                page_number = match_chunk_to_page(
                    citation.get("content", ""), page_texts
                )
                if page_number is not None:
                    print(f"Chunk content found on page {page_number}")
                else:
                    print("Chunk content not found in the PDF")
    except Exception as e:
        # Top-level boundary of the sample script: report and exit cleanly.
        print(f"An error occurred: {e}")


if __name__ == "__main__":
    main()
I hope this is helpful! Do not hesitate to let me know if you have any other questions.
Please don't forget to close the thread by upvoting this reply and accepting it as the answer if it is helpful.