@Karim Alameh Thanks for asking this question.
I can configure the indexer with imageAction = generateNormalizedImagePerPage
to generate an array of normalized images, where each page in the PDF is rendered to one output image. Each normalized image also includes the original PDF page number (pageNumber). Passing these images to the OcrSkill, and then passing the extracted text to the AzureOpenAIEmbeddingSkill, gives me the results I want. Keep in mind that there’s an additional cost associated with OCR and image extraction.
For reference, see this sample skillset:
{
"@odata.context": "https://something-something.search.windows.net/$metadata#skillsets/$entity",
"@odata.etag": "\"0x8DC9DDEAB0DAC43\"",
"name": "something-something-skillset",
"description": "Skillset to chunk documents and generate embeddings",
"skills": [
{
"@odata.type": "#Microsoft.Skills.Vision.OcrSkill",
"name": "#1",
"description": null,
"context": "/document/normalized_images/*",
"textExtractionAlgorithm": null,
"lineEnding": "Space",
"defaultLanguageCode": "en",
"detectOrientation": true,
"inputs": [
{
"name": "image",
"source": "/document/normalized_images/*"
}
],
"outputs": [
{
"name": "text",
"targetName": "text"
}
]
},
{
"@odata.type": "#Microsoft.Skills.Text.AzureOpenAIEmbeddingSkill",
"name": "#4",
"description": null,
"context": "/document/normalized_images/*",
"resourceUri": "https://dion-test-aoai.openai.azure.com",
"apiKey": "<redacted>",
"deploymentId": "text-embedding-ada-002",
"dimensions": 1536,
"modelName": "text-embedding-ada-002",
"inputs": [
{
"name": "text",
"source": "/document/normalized_images/*/text"
}
],
"outputs": [
{
"name": "embedding",
"targetName": "text_vector"
}
],
"authIdentity": null
}
],
"indexProjections": {
"selectors": [
{
"targetIndexName": "something-something",
"parentKeyFieldName": "parent_id",
"sourceContext": "/document/normalized_images/*",
"mappings": [
{
"name": "text_vector",
"source": "/document/normalized_images/*/text_vector",
"sourceContext": null,
"inputs": []
},
{
"name": "chunk",
"source": "/document/normalized_images/*/text",
"sourceContext": null,
"inputs": []
},
{
"name": "pageNumber",
"source": "/document/normalized_images/*/pageNumber",
"sourceContext": null,
"inputs": []
},
{
"name": "metadata_storage_path",
"source": "/document/metadata_storage_path",
"sourceContext": null,
"inputs": []
},
{
"name": "title",
"source": "/document/title",
"sourceContext": null,
"inputs": []
}
]
}
],
"parameters": {
"projectionMode": "skipIndexingParentDocuments"
}
},
"encryptionKey": null
}