How to sequence OCR, Image Analysis, Merge, and Embedding skills in an Azure AI Search skillset?

Matteo Doni 60 Reputation points
2025-03-18T14:49:44.85+00:00

I am writing an Azure AI Search skillset that uses OcrSkill, ImageAnalysisSkill, SplitSkill, and AzureOpenAIEmbeddingSkill, and I am not sure of the correct order for these skills. In my skillset I first run OcrSkill and ImageAnalysisSkill, then use MergeSkill twice to merge their outputs with the document content. After that I chunk the merged output and finally apply AzureOpenAIEmbeddingSkill. Is this logic correct?


{
  "@odata.etag": "\"mioetag\"",
  "name": "mioskillset",
  "skills": [
    {
      "@odata.type": "#Microsoft.Skills.Util.DocumentExtractionSkill",
      "name": "#1",
      "description": "It extracts text and metadata from documents and applies OCR to images",
      "context": "/document",
      "parsingMode": "default",
      "dataToExtract": "contentAndMetadata",
      "inputs": [
        {
          "name": "file_data",
          "source": "/document/file_data",
          "inputs": []
        }
      ],
      "outputs": [
        {
          "name": "content",
          "targetName": "extracted_text"
        },
        {
          "name": "normalized_images",
          "targetName": "extracted_normalized_images"
        }
      ],
      "configuration": {
        "imageAction": "generateNormalizedImages",
        "normalizedImageMaxWidth@odata.type": "#Int64",
        "normalizedImageMaxWidth": 2000,
        "normalizedImageMaxHeight@odata.type": "#Int64",
        "normalizedImageMaxHeight": 2000
      }
    },
    {
      "description": "Extract text (plain and structured) from image.",
      "@odata.type": "#Microsoft.Skills.Vision.OcrSkill",
      "context": "/document/extracted_normalized_images/*",
      "defaultLanguageCode": "en",
      "detectOrientation": true,
      "inputs": [
        {
          "name": "image",
          "source": "/document/extracted_normalized_images/*"
        }
      ],
      "outputs": [
        {
          "name": "text"
        }
      ]
    },
    {
      "@odata.type": "#Microsoft.Skills.Text.MergeSkill",
      "description": "Create merged_text, which includes all the textual representation of each image inserted at the right location in the content field.",
      "context": "/document",
      "insertPreTag": " ",
      "insertPostTag": " ",
      "inputs": [
        {
          "name": "text",
          "source": "/document/content"
        },
        {
          "name": "itemsToInsert",
          "source": "/document/extracted_normalized_images/*/text"
        },
        {
          "name": "offsets",
          "source": "/document/extracted_normalized_images/*/contentOffset"
        }
      ],
      "outputs": [
        {
          "name": "mergedText",
          "targetName": "merged_text"
        },
        {
          "name": "mergedOffsets",
          "targetName": "first_mergedOffsets"
        }
      ]
    },
    {
      "@odata.type": "#Microsoft.Skills.Vision.ImageAnalysisSkill",
      "context": "/document/extracted_normalized_images/*",
      "visualFeatures": [
        "tags",
        "description"
      ],
      "inputs": [
        {
          "name": "image",
          "source": "/document/extracted_normalized_images/*"
        }
      ],
      "outputs": [
        {
          "name": "adult"
        },
        {
          "name": "brands"
        },
        {
          "name": "categories"
        },
        {
          "name": "description"
        },
        {
          "name": "faces"
        },
        {
          "name": "objects"
        },
        {
          "name": "tags"
        }
      ]
    },
    {
      "@odata.type": "#Microsoft.Skills.Text.MergeSkill",
      "description": "Create final_merged_text, which includes the textual description of each image inserted at the right location in merged_text.",
      "context": "/document",
      "insertPreTag": " ",
      "insertPostTag": " ",
      "inputs": [
        {
          "name": "text",
          "source": "/document/merged_test"
        },
        {
          "name": "itemsToInsert",
          "source": "/document/extracted_normalized_images/*/description"
        },
        {
          "name": "offsets",
          "source": "/document/first_mergedOffsets"
        }
      ],
      "outputs": [
        {
          "name": "mergedText",
          "targetName": "final_merged_text"
        }
      ]
    },
    {
      "@odata.type": "#Microsoft.Skills.Text.SplitSkill",
      "name": "#2",
      "description": "It splits the text into overlapping segments for vectorization, with maximumPageLength set for Ada and textSplitMode configured to avoid breaking pages",
      "context": "/document/merged_text",
      "defaultLanguageCode": "en",
      "textSplitMode": "pages",
      "maximumPageLength": 3000,
      "pageOverlapLength": 100,
      "maximumPagesToTake": 0,
      "unit": "characters",
      "inputs": [
        {
          "name": "text",
          "source": "/document/final_merged_text",
          "inputs": []
        }
      ],
      "outputs": [
        {
          "name": "textItems",
          "targetName": "pages"
        }
      ]
    },
    {
      "@odata.type": "#Microsoft.Skills.Text.AzureOpenAIEmbeddingSkill",
      "name": "#3",
      "description": "It vectorizes the text for semantic search",
      "context": "/document/merged_text/pages/*",
      "resourceUri": "https://mioservizio.openai.azure.com",
      "apiKey": "<redacted>",
      "deploymentId": "text-embedding-ada-002",
      "dimensions": 1536,
      "modelName": "text-embedding-ada-002",
      "inputs": [
        {
          "name": "text",
          "source": "/document/merged_text/pages/*",
          "inputs": []
        }
      ],
      "outputs": [
        {
          "name": "embedding",
          "targetName": "embeddingVec"
        }
      ]
    }
  ],
  "indexProjections": {
    "selectors": [
      {
        "targetIndexName": "mioindice",
        "parentKeyFieldName": "parent_id",
        "sourceContext": "/document/merged_text/pages/*",
        "mappings": [
          {
            "name": "content",
            "source": "/document/merged_text/pages/*",
            "inputs": []
          },
          {
            "name": "contentVector",
            "source": "/document/merged_text/pages/*/embeddingVec",
            "inputs": []
          },
          {
            "name": "title",
            "source": "/document/metadata_storage_name",
            "inputs": []
          },
          {
            "name": "url",
            "source": "/document/metadata_storage_path",
            "inputs": []
          },
          {
            "name": "filepath",
            "source": "/document/metadata_storage_path",
            "inputs": []
          },
          {
            "name": "timestamp",
            "source": "/document/metadata_storage_last_modified",
            "inputs": []
          },
          {
            "name": "chat_id",
            "source": "/document/chat_id",
            "inputs": []
          },
          {
            "name": "sas_token",
            "source": "/document/sas_token",
            "inputs": []
          }
        ]
      }
    ],
    "parameters": {
      "projectionMode": "skipIndexingParentDocuments"
    }
  }
}

I appreciate any suggestions.

Accepted answer
  Amira Bedhiafi 33,071 Reputation points Volunteer Moderator
    2025-03-18T21:44:20.1166667+00:00

    Hello Matteo!

    Thank you for posting on Microsoft Learn.

    Your skillset's logical flow is mostly correct, but there are a few issues to fix:

    The second MergeSkill's text input references /document/merged_test, which doesn't exist in your JSON; it should reference /document/merged_text.
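
    For example, the corrected text input of the second MergeSkill:

        {
          "name": "text",
          "source": "/document/merged_text"
        }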

    Your SplitSkill sets "context": "/document/merged_text", but it splits /document/final_merged_text. Setting the context to /document makes the split output land at /document/pages.

    Consequently, the embedding skill and the index projections should reference "/document/pages/*" instead of "/document/merged_text/pages/*", since the SplitSkill then produces /document/pages.
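
    A minimal sketch of the corrected SplitSkill and embedding skill, keeping your own names (final_merged_text, pages, embeddingVec) and omitting the optional parameters:

        {
          "@odata.type": "#Microsoft.Skills.Text.SplitSkill",
          "context": "/document",
          "textSplitMode": "pages",
          "maximumPageLength": 3000,
          "pageOverlapLength": 100,
          "inputs": [
            { "name": "text", "source": "/document/final_merged_text" }
          ],
          "outputs": [
            { "name": "textItems", "targetName": "pages" }
          ]
        },
        {
          "@odata.type": "#Microsoft.Skills.Text.AzureOpenAIEmbeddingSkill",
          "context": "/document/pages/*",
          "resourceUri": "https://mioservizio.openai.azure.com",
          "apiKey": "<redacted>",
          "deploymentId": "text-embedding-ada-002",
          "modelName": "text-embedding-ada-002",
          "dimensions": 1536,
          "inputs": [
            { "name": "text", "source": "/document/pages/*" }
          ],
          "outputs": [
            { "name": "embedding", "targetName": "embeddingVec" }
          ]
        }

    The index projections selector then needs "sourceContext": "/document/pages/*", with the content and contentVector mappings reading from "/document/pages/*" and "/document/pages/*/embeddingVec".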

    With those fixes, the order is (the sketch after this list shows where each output lands in the enrichment tree):

    • OcrSkill → extract text from images.
    • ImageAnalysisSkill → extract descriptions, tags, and objects.
    • MergeSkill → merge the OCR-extracted text into the document content.
    • MergeSkill → merge the image descriptions into the previously merged text.
    • SplitSkill → chunk the final merged text for embedding.
    • AzureOpenAIEmbeddingSkill → generate embeddings for search.
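
    Putting it together, the enrichment tree ends up like this:

        /document
          /content                           extracted document text
          /extracted_normalized_images/*
            /text                            OcrSkill output
            /description                     ImageAnalysisSkill output
          /merged_text                       MergeSkill #1: content + OCR text
          /final_merged_text                 MergeSkill #2: + image descriptions
          /pages/*                           SplitSkill chunks
            /embeddingVec                    AzureOpenAIEmbeddingSkill vectors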
