Hi, I was creating an AI Search index to on my png files. I followed the instruction from the below sources, but still getting errors. Please help me out on where I missed in the code.
I got the following errors when running the indexer.
The data field 'myLayoutText' in the document with key 'xxxxx' has an invalid value of type 'Collection(Edm.ComplexType)' ('JSON arrays with element type 'Object' map to Collection(Edm.ComplexType)'). The expected type was 'Edm.ComplexType'.
Here is my code:
headers = {
'Content-Type':'application/json',
'api-key':search_key
}
data_source_name = 'document-bot-files-datasource'
index_name = 'ocr-index'
index_data = {
"name" : index_name,
"fields": [
{ "name": "ID", "type": "Edm.String", "key": True, "searchable": True,"filterable": True, "sortable": True,"facetable":True,"analyzer":"keyword" },
{ "name": "parent_id", "type": "Edm.String", "searchable": True, "filterable": True, "sortable": False,"facetable":False},
{ "name": "myText", "type": "Edm.String", "searchable": True, "filterable": False, "sortable": False,"facetable":False},
{ "name": "myLayoutText", "type": "Edm.ComplexType","fields":[
{ "name": "language", "type": "Edm.String", "searchable": False, "filterable": False, "sortable": False,"facetable":False},
{ "name": "text", "type": "Edm.String", "searchable": True, "filterable": False, "sortable": False,"facetable":False},
{ "name": "lines", "type": "Collection(Edm.ComplexType)","fields":[
{ "name": "boundingBox", "type": "Collection(Edm.ComplexType)", "fields":[
{ "name": "x", "type": "Edm.Int32", "searchable": False, "filterable": False, "sortable": False,"facetable":False},
{ "name": "y", "type": "Edm.Int32", "searchable": False, "filterable": False, "sortable": False,"facetable":False},
]},
{ "name": "text", "type": "Edm.String", "searchable": True, "filterable": False, "sortable": False,"facetable":False},
]},
{ "name": "words", "type": "Collection(Edm.ComplexType)","fields":[
{ "name": "boundingBox", "type": "Collection(Edm.ComplexType)", "fields":[
{ "name": "x", "type": "Edm.Int32", "searchable": False, "filterable": False, "sortable": False,"facetable":False},
{ "name": "y", "type": "Edm.Int32", "searchable": False, "filterable": False, "sortable": False,"facetable":False},
]},
{ "name": "text", "type": "Edm.String", "searchable": True, "filterable": False, "sortable": False,"facetable":False},
]}
]},
],
"vectorSearch": {
"profiles": [
{
"name": "myHnswProfile",
"algorithm": "myHnsw"
}
],
"algorithms": [
{
"name": "myHnsw",
"kind": "hnsw",
"hnswParameters": {
"m": 4,
"metric": "cosine"
}
}
]
}
}
index_delete_response = requests.delete(f"{search_endpoint}/indexes/{index_name}?api-version=2020-06-30",headers=headers)
index_response = requests.put(f"{search_endpoint}/indexes('{index_name}')?api-version=2023-11-01",json=index_data,headers=headers)
print("index creation:", index_response.text)
skill_name = 'ocr-skillset'
skill_data = {
"skills": [
{
"description": "Extracts text (plain and structured) from image.",
"@odata.type": "#Microsoft.Skills.Vision.OcrSkill",
"context": "/document/normalized_images/*",
"defaultLanguageCode": '',
"detectOrientation": True,
"inputs": [
{
"name": "image",
"source": "/document/normalized_images/*"
}
],
"outputs": [
{
"name": "text",
"targetName": "myText"
},
{
"name": "layoutText",
"targetName": "myLayoutText"
}
],
}
]
}
skill_update_response = requests.put(f"{search_endpoint}/skillsets('{skill_name}')?api-version=2023-10-01-Preview",json=skill_data,headers=headers)
print("skill creation update code:", skill_update_response.status_code)
# print("skill creation update code:", skill_update_response.text)
indexer_name = 'ocr-indexer'
indexer_data = {
"name" : indexer_name,
"dataSourceName" : data_source_name,
"targetIndexName" : index_name,
"skillsetName":skill_name,
"parameters": {
"configuration": {
"indexedFileNameExtensions" : ".pdf,.docx,.txt,.png,.jpeg",
"dataToExtract": "contentAndMetadata",
"parsingMode": "default",
"imageAction": "generateNormalizedImages"
}
},
"schedule" : { },
"fieldMappings" : [
],
"outputFieldMappings": [
{
"sourceFieldName": "/document/normalized_images/*/myText",
"targetFieldName": "myText"
},
{
"sourceFieldName": "/document/normalized_images/*/myLayoutText",
"targetFieldName": "myLayoutText"
},
]
}
indexer_update_response = requests.put(f"{search_endpoint}/indexers/{indexer_name}?api-version=2020-06-30",json=indexer_data,headers=headers)
print("indexer update code: ",indexer_update_response.status_code)
indexer_reset_response = requests.post(f"{search_endpoint}/indexers/{indexer_name}/reset?api-version=2020-06-30",headers=headers)
print("indexer reset code: ",indexer_reset_response.status_code)
# time.sleep(3)
indexer_run_response = requests.post(f"{search_endpoint}/indexers/{indexer_name}/run?api-version=2020-06-30",headers=headers)
print("indexer rerun code: ",indexer_run_response.status_code)