How to sequence OCR, Image Analysis, Merge, and Embedding skills in an Azure AI Search skillset?

Matteo Doni 60 Reputation points
2025-03-18T14:49:44.85+00:00

I am writing an Azure AI Search skillset that uses OcrSkill, ImageAnalysisSkill, SplitSkill, and AzureOpenAIEmbeddingSkill, and I am not sure of the correct order for these skills. In my skillset I first run OcrSkill and ImageAnalysisSkill, then use MergeSkill twice to merge their outputs with the document content. After that I chunk the merged output and finally apply AzureOpenAIEmbeddingSkill. Is this logic correct?


{
  "@odata.etag": "\"mioetag\"",
  "name": "mioskillset",
  "skills": [
    {
      "@odata.type": "#Microsoft.Skills.Util.DocumentExtractionSkill",
      "name": "#1",
      "description": "It extracts text and metadata from documents and applies OCR to images",
      "context": "/document",
      "parsingMode": "default",
      "dataToExtract": "contentAndMetadata",
      "inputs": [
        {
          "name": "file_data",
          "source": "/document/file_data",
          "inputs": []
        }
      ],
      "outputs": [
        {
          "name": "content",
          "targetName": "extracted_text"
        },
        {
          "name": "normalized_images",
          "targetName": "extracted_normalized_images"
        }
      ],
      "configuration": {
        "imageAction": "generateNormalizedImages",
        "normalizedImageMaxWidth@odata.type": "#Int64",
        "normalizedImageMaxWidth": 2000,
        "normalizedImageMaxHeight@odata.type": "#Int64",
        "normalizedImageMaxHeight": 2000
      }
    },
    {
      "description": "Extract text (plain and structured) from image.",
      "@odata.type": "#Microsoft.Skills.Vision.OcrSkill",
      "context": "/document/extracted_normalized_images/*",
      "defaultLanguageCode": "en",
      "detectOrientation": true,
      "inputs": [
        {
          "name": "image",
          "source": "/document/extracted_normalized_images/*"
        }
      ],
      "outputs": [
        {
          "name": "text"
        }
      ]
    },
    {
      "@odata.type": "#Microsoft.Skills.Text.MergeSkill",
      "description": "Create merged_text, which includes all the textual representation of each image inserted at the right location in the content field.",
      "context": "/document",
      "insertPreTag": " ",
      "insertPostTag": " ",
      "inputs": [
        {
          "name": "text",
          "source": "/document/content"
        },
        {
          "name": "itemsToInsert",
          "source": "/document/extracted_normalized_images/*/text"
        },
        {
          "name": "offsets",
          "source": "/document/extracted_normalized_images/*/contentOffset"
        }
      ],
      "outputs": [
        {
          "name": "mergedText",
          "targetName": "merged_text"
        },
        {
          "name": "mergedOffsets",
          "targetName": "first_mergedOffsets"
        }
      ]
    },
    {
      "@odata.type": "#Microsoft.Skills.Vision.ImageAnalysisSkill",
      "context": "/document/extracted_normalized_images/*",
      "visualFeatures": [
        "tags",
        "description"
      ],
      "inputs": [
        {
          "name": "image",
          "source": "/document/extracted_normalized_images/*"
        }
      ],
      "outputs": [
        {
          "name": "adult"
        },
        {
          "name": "brands"
        },
        {
          "name": "categories"
        },
        {
          "name": "description"
        },
        {
          "name": "faces"
        },
        {
          "name": "objects"
        },
        {
          "name": "tags"
        }
      ]
    },
    {
      "@odata.type": "#Microsoft.Skills.Text.MergeSkill",
      "description": "Create final_merged_text, which includes the textual description of each image inserted at the right location in merged_text.",
      "context": "/document",
      "insertPreTag": " ",
      "insertPostTag": " ",
      "inputs": [
        {
          "name": "text",
          "source": "/document/merged_test"
        },
        {
          "name": "itemsToInsert",
          "source": "/document/extracted_normalized_images/*/description"
        },
        {
          "name": "offsets",
          "source": "/document/first_mergedOffsets"
        }
      ],
      "outputs": [
        {
          "name": "mergedText",
          "targetName": "final_merged_text"
        }
      ]
    },
    {
      "@odata.type": "#Microsoft.Skills.Text.SplitSkill",
      "name": "#2",
      "description": "It splits the text into overlapping segments for vectorization, with maximumPageLength set for Ada and textSplitMode configured to avoid breaking pages",
      "context": "/document/merged_text",
      "defaultLanguageCode": "en",
      "textSplitMode": "pages",
      "maximumPageLength": 3000,
      "pageOverlapLength": 100,
      "maximumPagesToTake": 0,
      "unit": "characters",
      "inputs": [
        {
          "name": "text",
          "source": "/document/final_merged_text",
          "inputs": []
        }
      ],
      "outputs": [
        {
          "name": "textItems",
          "targetName": "pages"
        }
      ]
    },
    {
      "@odata.type": "#Microsoft.Skills.Text.AzureOpenAIEmbeddingSkill",
      "name": "#3",
      "description": "It vectorizes the text for semantic search",
      "context": "/document/merged_text/pages/*",
      "resourceUri": "https://mioservizio.openai.azure.com",
      "apiKey": "<redacted>",
      "deploymentId": "text-embedding-ada-002",
      "dimensions": 1536,
      "modelName": "text-embedding-ada-002",
      "inputs": [
        {
          "name": "text",
          "source": "/document/merged_text/pages/*",
          "inputs": []
        }
      ],
      "outputs": [
        {
          "name": "embedding",
          "targetName": "embeddingVec"
        }
      ]
    }
  ],
  "indexProjections": {
    "selectors": [
      {
        "targetIndexName": "mioindice",
        "parentKeyFieldName": "parent_id",
        "sourceContext": "/document/merged_text/pages/*",
        "mappings": [
          {
            "name": "content",
            "source": "/document/merged_text/pages/*",
            "inputs": []
          },
          {
            "name": "contentVector",
            "source": "/document/merged_text/pages/*/embeddingVec",
            "inputs": []
          },
          {
            "name": "title",
            "source": "/document/metadata_storage_name",
            "inputs": []
          },
          {
            "name": "url",
            "source": "/document/metadata_storage_path",
            "inputs": []
          },
          {
            "name": "filepath",
            "source": "/document/metadata_storage_path",
            "inputs": []
          },
          {
            "name": "timestamp",
            "source": "/document/metadata_storage_last_modified",
            "inputs": []
          },
          {
            "name": "chat_id",
            "source": "/document/chat_id",
            "inputs": []
          },
          {
            "name": "sas_token",
            "source": "/document/sas_token",
            "inputs": []
          }
        ]
      }
    ],
    "parameters": {
      "projectionMode": "skipIndexingParentDocuments"
    }
  }
}

I appreciate any suggestions.

Accepted answer
  Amira Bedhiafi 33,071 Reputation points Volunteer Moderator
    2025-03-18T21:44:20.1166667+00:00

    Hello Matteo!

    Thank you for posting on Microsoft Learn.

    Your skillset's logical flow is mostly correct, but there are a few issues to fix:

    The second MergeSkill's text input references /document/merged_test, which doesn't exist in your JSON; it should reference /document/merged_text.
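
    For example, the corrected text input of the second MergeSkill:

        {
          "name": "text",
          "source": "/document/merged_text"
        }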

    Your SplitSkill sets "context": "/document/merged_text", but it splits /document/final_merged_text. Setting the context to /document makes the split output land at /document/pages.

    Consequently, the embedding skill and the index projections should reference "/document/pages/*" instead of "/document/merged_text/pages/*", since the SplitSkill then produces /document/pages.
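
    A minimal sketch of the corrected SplitSkill and embedding skill, keeping your own names (final_merged_text, pages, embeddingVec) and omitting the optional parameters:

        {
          "@odata.type": "#Microsoft.Skills.Text.SplitSkill",
          "context": "/document",
          "textSplitMode": "pages",
          "maximumPageLength": 3000,
          "pageOverlapLength": 100,
          "inputs": [
            { "name": "text", "source": "/document/final_merged_text" }
          ],
          "outputs": [
            { "name": "textItems", "targetName": "pages" }
          ]
        },
        {
          "@odata.type": "#Microsoft.Skills.Text.AzureOpenAIEmbeddingSkill",
          "context": "/document/pages/*",
          "resourceUri": "https://mioservizio.openai.azure.com",
          "apiKey": "<redacted>",
          "deploymentId": "text-embedding-ada-002",
          "modelName": "text-embedding-ada-002",
          "dimensions": 1536,
          "inputs": [
            { "name": "text", "source": "/document/pages/*" }
          ],
          "outputs": [
            { "name": "embedding", "targetName": "embeddingVec" }
          ]
        }

    The index projections selector then needs "sourceContext": "/document/pages/*", with the content and contentVector mappings reading from "/document/pages/*" and "/document/pages/*/embeddingVec".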

    With those fixes, the order is (the sketch after this list shows where each output lands in the enrichment tree):

    • OcrSkill → extract text from images.
    • ImageAnalysisSkill → extract descriptions, tags, and objects.
    • MergeSkill → merge the OCR-extracted text into the document content.
    • MergeSkill → merge the image descriptions into the previously merged text.
    • SplitSkill → chunk the final merged text for embedding.
    • AzureOpenAIEmbeddingSkill → generate embeddings for search.
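
    Putting it together, the enrichment tree ends up like this:

        /document
          /content                           extracted document text
          /extracted_normalized_images/*
            /text                            OcrSkill output
            /description                     ImageAnalysisSkill output
          /merged_text                       MergeSkill #1: content + OCR text
          /final_merged_text                 MergeSkill #2: + image descriptions
          /pages/*                           SplitSkill chunks
            /embeddingVec                    AzureOpenAIEmbeddingSkill vectors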
