JSON Embeddings not fully uploaded to Index
Hello,
I am trying to upload a JSON embedding file that has content and the embedding vectors to it. I can succesfully upload it but only the first content line is being added.
This is the part of the code to generate embeddings and write it to a JSON embedding file.Skärmbild 2023-08-16 015504.pngSkärmbild 2023-08-16 015529.png
EDIT: I have come to understand and confirmed that the upload to the index is only uploading the last portion of my JSON-file, regardless of the unique ID I set for the key identifier in the index. What could be the reason for it?
def Create_Search_Index_Client():
index_client = SearchIndexClient(
endpoint=service_endpoint, credential=credential)
fields = [
SimpleField(name="document_title", type=SearchFieldDataType.String, sortable=True, filterable=True, facetable=True),
SearchableField(name="decision_date", type=SearchFieldDataType.String),
SearchableField(name="unique_id", type=SearchFieldDataType.String, key=True),
SearchableField(name="chapter_title", type=SearchFieldDataType.String, filterable=True),
SearchableField(name="subchapter_title", type=SearchFieldDataType.String, filterable=True),
SearchableField(name="content_text", type=SearchFieldDataType.String, filterable=True),
SearchField(name="content_vector", type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
searchable=True, vector_search_dimensions=1536, vector_search_configuration="my-vector-config"),
SearchField(name="subchapter_vector", type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
searchable=True, vector_search_dimensions=1536, vector_search_configuration="my-vector-config"),
]
vector_search = VectorSearch(
algorithm_configurations=[
HnswVectorSearchAlgorithmConfiguration(
name="my-vector-config",
kind="hnsw",
parameters={
"m": 4,
"efConstruction": 400,
"efSearch": 500,
"metric": "cosine"
}
)
]
)
semantic_config = SemanticConfiguration(
name="my-semantic-config",
prioritized_fields=PrioritizedFields(
title_field=SemanticField(field_name="subchapter_title"),
prioritized_keywords_fields=[SemanticField(field_name="chapter_title")],
prioritized_content_fields=[SemanticField(field_name="content_text")]
)
)
# Create the semantic settings with the configuration
semantic_settings = SemanticSettings(configurations=[semantic_config])
# Create the search index with the semantic settings
index = SearchIndex(name=index_name, fields=fields,
vector_search=vector_search, semantic_settings=semantic_settings)
result = index_client.create_or_update_index(index)
print(f' {result.name} created')
And this is the upload to the Index
def Upload_all_json_embeddings_files_to_index(embeddings_directory):
print(os.getcwd())
# Create a search client
search_client = SearchClient(endpoint=service_endpoint, index_name=index_name, credential=credential)
# Iterate over all files in the specified folder
total_uploaded = 0
for filename in os.listdir(embeddings_directory):
# Check if the file is a JSON file
if filename.endswith('_embeddings.json'):
json_path = os.path.join(embeddings_directory, filename)
# Read the .json file
with open(json_path, 'r', encoding='utf-8-sig') as file:
documents = json.load(file)
# Upload the documents to the Azure Search index
result = search_client.upload_documents(documents)
# Update the total number of uploaded documents
total_uploaded += len(documents)
print(f"Uploaded {total_uploaded} documents from {embeddings_directory}")