Error in Azure AI Search with OCR Skill

Violet Zeng 30 Reputation points
2024-09-27T04:44:38.2233333+00:00

Hi, I was creating an AI Search index to on my png files. I followed the instruction from the below sources, but still getting errors. Please help me out on where I missed in the code.

I got the following errors when running the indexer.

The data field 'myLayoutText' in the document with key 'xxxxx' has an invalid value of type 'Collection(Edm.ComplexType)' ('JSON arrays with element type 'Object' map to Collection(Edm.ComplexType)'). The expected type was 'Edm.ComplexType'.

Here is my code:

headers = {
    'Content-Type':'application/json',
    'api-key':search_key
}

data_source_name = 'document-bot-files-datasource'


index_name = 'ocr-index'
index_data = {
        "name" : index_name,
        "fields": [
            { "name": "ID", "type": "Edm.String", "key": True, "searchable": True,"filterable": True, "sortable": True,"facetable":True,"analyzer":"keyword" },
            { "name": "parent_id", "type": "Edm.String", "searchable": True, "filterable": True, "sortable": False,"facetable":False},
        
            { "name": "myText", "type": "Edm.String", "searchable": True, "filterable": False, "sortable": False,"facetable":False},
            { "name": "myLayoutText", "type": "Edm.ComplexType","fields":[
                { "name": "language", "type": "Edm.String", "searchable": False, "filterable": False, "sortable": False,"facetable":False},
                { "name": "text", "type": "Edm.String", "searchable": True, "filterable": False, "sortable": False,"facetable":False},
                { "name": "lines", "type": "Collection(Edm.ComplexType)","fields":[
                    { "name": "boundingBox", "type": "Collection(Edm.ComplexType)", "fields":[
                        { "name": "x", "type": "Edm.Int32", "searchable": False, "filterable": False, "sortable": False,"facetable":False},
                        { "name": "y", "type": "Edm.Int32", "searchable": False, "filterable": False, "sortable": False,"facetable":False},
                    ]},
                    { "name": "text", "type": "Edm.String", "searchable": True, "filterable": False, "sortable": False,"facetable":False},
                ]},
                { "name": "words", "type": "Collection(Edm.ComplexType)","fields":[
                    { "name": "boundingBox", "type": "Collection(Edm.ComplexType)", "fields":[
                        { "name": "x", "type": "Edm.Int32", "searchable": False, "filterable": False, "sortable": False,"facetable":False},
                        { "name": "y", "type": "Edm.Int32", "searchable": False, "filterable": False, "sortable": False,"facetable":False},
                    ]},                    
                    { "name": "text", "type": "Edm.String", "searchable": True, "filterable": False, "sortable": False,"facetable":False},
                ]}
            ]},
        ],
        "vectorSearch": {
        "profiles": [
        {
            "name": "myHnswProfile",
            "algorithm": "myHnsw"
        }
        ],
        "algorithms": [
        {
            "name": "myHnsw",
            "kind": "hnsw",
            "hnswParameters": {
            "m": 4,
            "metric": "cosine"
            }
        }
        ]
    }
    }
index_delete_response = requests.delete(f"{search_endpoint}/indexes/{index_name}?api-version=2020-06-30",headers=headers)
index_response = requests.put(f"{search_endpoint}/indexes('{index_name}')?api-version=2023-11-01",json=index_data,headers=headers)
print("index creation:", index_response.text)

skill_name = 'ocr-skillset'
skill_data = {
  "skills": [
    {
      "description": "Extracts text (plain and structured) from image.",
      "@odata.type": "#Microsoft.Skills.Vision.OcrSkill",
      "context": "/document/normalized_images/*",
      "defaultLanguageCode": '',
      "detectOrientation": True,
      "inputs": [
        {
          "name": "image",
          "source": "/document/normalized_images/*"
        }
      ],
      "outputs": [
        {
          "name": "text",
          "targetName": "myText"
        },
        {
          "name": "layoutText",
          "targetName": "myLayoutText"
        }
      ],
    }
  ]
}
skill_update_response = requests.put(f"{search_endpoint}/skillsets('{skill_name}')?api-version=2023-10-01-Preview",json=skill_data,headers=headers)
print("skill creation update code:", skill_update_response.status_code)
# print("skill creation update code:", skill_update_response.text)

indexer_name = 'ocr-indexer'
indexer_data = {
    "name" : indexer_name,
    "dataSourceName" : data_source_name,
    "targetIndexName" : index_name,
    "skillsetName":skill_name,
    "parameters": {
        "configuration": {
            "indexedFileNameExtensions" : ".pdf,.docx,.txt,.png,.jpeg",
            "dataToExtract": "contentAndMetadata",
            "parsingMode": "default",
            "imageAction": "generateNormalizedImages"
        }
    },
    "schedule" : { },
    "fieldMappings" : [
    ],
    "outputFieldMappings": [
        {
            "sourceFieldName": "/document/normalized_images/*/myText",
            "targetFieldName": "myText"
        },
        {
            "sourceFieldName": "/document/normalized_images/*/myLayoutText",
            "targetFieldName": "myLayoutText"
        },
    ]
    }
indexer_update_response = requests.put(f"{search_endpoint}/indexers/{indexer_name}?api-version=2020-06-30",json=indexer_data,headers=headers)
print("indexer update  code: ",indexer_update_response.status_code)
indexer_reset_response = requests.post(f"{search_endpoint}/indexers/{indexer_name}/reset?api-version=2020-06-30",headers=headers)
print("indexer reset code: ",indexer_reset_response.status_code)
# time.sleep(3)
indexer_run_response = requests.post(f"{search_endpoint}/indexers/{indexer_name}/run?api-version=2020-06-30",headers=headers)
print("indexer rerun code: ",indexer_run_response.status_code)
Azure AI Search
Azure AI Search
An Azure search service with built-in artificial intelligence capabilities that enrich information to help identify and explore relevant content at scale.
991 questions
0 comments No comments
{count} votes

Your answer

Answers can be marked as Accepted Answers by the question author, which helps users to know the answer solved the author's problem.