In Azure AI Search, how do I use both the Text Split Skill and the DocumentExtractionSkill?

Vivek 10 Reputation points
2024-07-19T10:30:58.6933333+00:00

I'm trying to build a tool that uses Azure OpenAI over private data. There are a lot of documents to be ingested and indexed, and I'm wondering whether both the DocumentExtractionSkill and the Split Skill are required to read the documents and then split the text into smaller chunks before the embedding model is applied. Does the DocumentExtractionSkill improve the quality of content extraction from documents, or is the Split Skill alone sufficient?

I'm using a modified version of the integrated vectorization sample code (https://github.com/Azure/azure-search-vector-samples/blob/main/demo-python/code/integrated-vectorization/azure-search-integrated-vectorization-sample.ipynb), which doesn't use the DocumentExtractionSkill. So I'm wondering whether the DocumentExtractionSkill is necessary to read different types of documents, and if so, how do I write Python code that uses both the DocumentExtractionSkill and the Split Skill? My code sample is below.

If I go to the Indexes section of the Azure AI Search service, the document count, vector index size, and total storage size of the index are identical whether the indexer runs with or without the DocumentExtractionSkill. What might be wrong below?

from azure.search.documents.indexes import SearchIndexerClient
from azure.search.documents.indexes.models import (
    SplitSkill,
    InputFieldMappingEntry,
    OutputFieldMappingEntry,
    AzureOpenAIEmbeddingSkill,
    OcrSkill,
    MergeSkill,
    SearchIndexerIndexProjections,
    SearchIndexerIndexProjectionSelector,
    SearchIndexerIndexProjectionsParameters,
    IndexProjectionMode,
    SearchIndexerSkillset,
    CognitiveServicesAccountKey,
    DocumentExtractionSkill
)
# Create a skillset  
skillset_name = f"{index_name}-skillset"
doc_extract_skill = DocumentExtractionSkill(
    name="documentExtractionSkill",
    description="Extract text from different types of documents",
    context="/document",
    inputs=[InputFieldMappingEntry(name="file_data", source="/document/file_data")],
    outputs=[OutputFieldMappingEntry(name="content", target_name="/document/content")]
)
ocr_skill = OcrSkill(
    description="OCR skill to scan PDFs and other images with text",
    context="/document/normalized_images/*",
    line_ending="Space",
    default_language_code="en",
    should_detect_orientation=True,
    inputs=[
        InputFieldMappingEntry(name="image", source="/document/normalized_images/*")
    ],
    outputs=[
        OutputFieldMappingEntry(name="text", target_name="text"),
        OutputFieldMappingEntry(name="layoutText", target_name="layoutText")
    ]
)
merge_skill = MergeSkill(
    description="Merge skill for combining OCR'd and regular text",
    context="/document",
    inputs=[
        InputFieldMappingEntry(name="text", source="/document/content"),
        InputFieldMappingEntry(name="itemsToInsert", source="/document/normalized_images/*/text"),
        InputFieldMappingEntry(name="offsets", source="/document/normalized_images/*/contentOffset")
    ],
    outputs=[
        OutputFieldMappingEntry(name="mergedText", target_name="merged_content")
    ]
)
# If an AI Services key is provided, use the OCR text as the source text for chunking
# Otherwise, use the normal document content.
split_skill_text_source = "/document/content" if not use_ocr else "/document/merged_content"
split_skill = SplitSkill(  
    description="Split skill to chunk documents",  
    text_split_mode="pages",  
    context="/document",  
    maximum_page_length=2000,  
    page_overlap_length=500,  
    inputs=[  
        InputFieldMappingEntry(name="text", source=split_skill_text_source),  
    ],  
    outputs=[  
        OutputFieldMappingEntry(name="textItems", target_name="pages")  
    ],  
)  
  
embedding_skill = AzureOpenAIEmbeddingSkill(  
    description="Skill to generate embeddings via Azure OpenAI",  
    context="/document/pages/*",  
    resource_uri=azure_openai_endpoint,  
    deployment_id=azure_openai_embedding_deployment,  
    model_name=azure_openai_model_name,
    dimensions=azure_openai_model_dimensions,
    api_key=azure_openai_key,  
    inputs=[  
        InputFieldMappingEntry(name="text", source="/document/pages/*"),  
    ],  
    outputs=[  
        OutputFieldMappingEntry(name="embedding", target_name="vector")  
    ],  
)  
  
index_projections = SearchIndexerIndexProjections(  
    selectors=[  
        SearchIndexerIndexProjectionSelector(  
            target_index_name=index_name,  
            parent_key_field_name="parent_id",  
            source_context="/document/pages/*",  
            mappings=[  
                InputFieldMappingEntry(name="chunk", source="/document/pages/*"),  
                InputFieldMappingEntry(name="vector", source="/document/pages/*/vector"),  
                InputFieldMappingEntry(name="title", source="/document/metadata_storage_name"),  
            ],  
        ),  
    ],  
    parameters=SearchIndexerIndexProjectionsParameters(  
        projection_mode=IndexProjectionMode.SKIP_INDEXING_PARENT_DOCUMENTS  
    ),  
) 
cognitive_services_account = CognitiveServicesAccountKey(key=azure_ai_services_key) if use_ocr else None
skills = [doc_extract_skill, split_skill, embedding_skill]
if use_ocr:
    skills.extend([ocr_skill, merge_skill])
skillset = SearchIndexerSkillset(  
    name=skillset_name,  
    description="Skillset to chunk documents and generating embeddings",  
    skills=skills,  
    index_projections=index_projections,
    cognitive_services_account=cognitive_services_account
)
  
client = SearchIndexerClient(endpoint, credential)  
client.create_or_update_skillset(skillset)  
print(f"{skillset.name} created")  

1 answer

  1. brtrach-MSFT 16,586 Reputation points Microsoft Employee
    2024-07-22T03:52:59.72+00:00

    @Vivek To address your questions:

    1. DocumentExtractionSkill vs. Split Skill:
      • DocumentExtractionSkill is used to extract text from various document types (e.g., PDFs, Word documents). It ensures that the content is accurately extracted from different formats.
      • Split Skill is used to break down the extracted text into smaller chunks, which is useful for processing and embedding.
    2. Enhancing Quality:
      • Using DocumentExtractionSkill can enhance the quality of content extraction, especially when dealing with diverse document formats. It ensures that the text is correctly extracted before any further processing.
      • If your documents are already in a text-friendly format (e.g., plain text), you might not need DocumentExtractionSkill. However, for PDFs, scanned images, or complex formats, it is beneficial. A minimal sketch of chaining the two skills follows this list.
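
    For illustration, here is a minimal sketch of chaining the two skills. The node name extracted_content is just an illustrative choice for this example; writing the extracted text to its own node, rather than back to /document/content (which the indexer's document cracking already fills), makes it easier to confirm that the extraction skill actually ran.

    doc_extract_skill = DocumentExtractionSkill(
        name="documentExtractionSkill",
        description="Extract text from the raw file data",
        context="/document",
        inputs=[InputFieldMappingEntry(name="file_data", source="/document/file_data")],
        # Plain node name; the enriched text becomes /document/extracted_content
        outputs=[OutputFieldMappingEntry(name="content", target_name="extracted_content")]
    )

    split_skill = SplitSkill(
        description="Split skill to chunk the extracted text",
        text_split_mode="pages",
        context="/document",
        maximum_page_length=2000,
        page_overlap_length=500,
        # Read from the node the extraction skill wrote, not /document/content
        inputs=[InputFieldMappingEntry(name="text", source="/document/extracted_content")],
        outputs=[OutputFieldMappingEntry(name="textItems", target_name="pages")]
    )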

    Python Code Integration:

    • Your code sample looks mostly correct. However, ensure that the context and input/output mappings are correctly set up for each skill.
      • The document count, vector index size, and total storage size being identical might indicate that the DocumentExtractionSkill is not correctly configured or not being triggered. Double-check the skill configuration and ensure that the documents being indexed actually require text extraction; one configuration setting worth checking is sketched after this list.
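
    One thing to verify (an assumption on my part, since your indexer definition isn't shown): /document/file_data is only populated when the indexer is allowed to pass the raw file through to the skillset. A minimal sketch of that indexer configuration, reusing the sample's variable names and assuming an existing blob data source, could look like this:

    from azure.search.documents.indexes import SearchIndexerClient
    from azure.search.documents.indexes.models import (
        SearchIndexer,
        IndexingParameters,
        IndexingParametersConfiguration
    )

    # Without allow_skillset_to_read_file_data=True, /document/file_data stays
    # empty and DocumentExtractionSkill has nothing to extract from.
    indexer = SearchIndexer(
        name=f"{index_name}-indexer",
        data_source_name=data_source_name,  # assumed: the name of your existing blob data source
        target_index_name=index_name,
        skillset_name=skillset_name,
        parameters=IndexingParameters(
            configuration=IndexingParametersConfiguration(
                allow_skillset_to_read_file_data=True,
                query_timeout=None  # some SDK versions need this cleared for blob data sources
            )
        )
    )

    indexer_client = SearchIndexerClient(endpoint, credential)
    indexer_client.create_or_update_indexer(indexer)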

    Here’s your skillset code again, with minor adjustments:

    from azure.search.documents.indexes import SearchIndexerClient
    from azure.search.documents.indexes.models import (
        SplitSkill,
        InputFieldMappingEntry,
        OutputFieldMappingEntry,
        AzureOpenAIEmbeddingSkill,
        OcrSkill,
        MergeSkill,
        SearchIndexerIndexProjections,
        SearchIndexerIndexProjectionSelector,
        SearchIndexerIndexProjectionsParameters,
        IndexProjectionMode,
        SearchIndexerSkillset,
        CognitiveServicesAccountKey,
        DocumentExtractionSkill
    )
    
    # Create a skillset
    skillset_name = f"{index_name}-skillset"
    doc_extract_skill = DocumentExtractionSkill(
        name="documentExtractionSkill",
        description="Extract text from different types of documents",
        context="/document",
        inputs=[InputFieldMappingEntry(name="file_data", source="/document/file_data")],
        outputs=[OutputFieldMappingEntry(name="content", target_name="/document/content")]
    )
    
    ocr_skill = OcrSkill(
        description="OCR skill to scan PDFs and other images with text",
        context="/document/normalized_images/*",
        line_ending="Space",
        default_language_code="en",
        should_detect_orientation=True,
        inputs=[
            InputFieldMappingEntry(name="image", source="/document/normalized_images/*")
        ],
        outputs=[
            OutputFieldMappingEntry(name="text", target_name="text"),
            OutputFieldMappingEntry(name="layoutText", target_name="layoutText")
        ]
    )
    
    merge_skill = MergeSkill(
        description="Merge skill for combining OCR'd and regular text",
        context="/document",
        inputs=[
            InputFieldMappingEntry(name="text", source="/document/content"),
            InputFieldMappingEntry(name="itemsToInsert", source="/document/normalized_images/*/text"),
            InputFieldMappingEntry(name="offsets", source="/document/normalized_images/*/contentOffset")
        ],
        outputs=[
            OutputFieldMappingEntry(name="mergedText", target_name="merged_content")
        ]
    )
    
    split_skill_text_source = "/document/content" if not use_ocr else "/document/merged_content"
    split_skill = SplitSkill(
        description="Split skill to chunk documents",
        text_split_mode="pages",
        context="/document",
        maximum_page_length=2000,
        page_overlap_length=500,
        inputs=[
            InputFieldMappingEntry(name="text", source=split_skill_text_source),
        ],
        outputs=[
            OutputFieldMappingEntry(name="textItems", target_name="pages")
        ],
    )
    
    embedding_skill = AzureOpenAIEmbeddingSkill(
        description="Skill to generate embeddings via Azure OpenAI",
        context="/document/pages/*",
        resource_uri=azure_openai_endpoint,
        deployment_id=azure_openai_embedding_deployment,
        model_name=azure_openai_model_name,
        dimensions=azure_openai_model_dimensions,
        api_key=azure_openai_key,
        inputs=[
            InputFieldMappingEntry(name="text", source="/document/pages/*"),
        ],
        outputs=[
            OutputFieldMappingEntry(name="embedding", target_name="vector")
        ],
    )
    
    index_projections = SearchIndexerIndexProjections(
        selectors=[
            SearchIndexerIndexProjectionSelector(
                target_index_name=index_name,
                parent_key_field_name="parent_id",
                source_context="/document/pages/*",
                mappings=[
                    InputFieldMappingEntry(name="chunk", source="/document/pages/*"),
                    InputFieldMappingEntry(name="vector", source="/document/pages/*/vector"),
                    InputFieldMappingEntry(name="title", source="/document/metadata_storage_name"),
                ],
            ),
        ],
        parameters=SearchIndexerIndexProjectionsParameters(
            projection_mode=IndexProjectionMode.SKIP_INDEXING_PARENT_DOCUMENTS
        ),
    )
    
    cognitive_services_account = CognitiveServicesAccountKey(key=azure_ai_services_key) if use_ocr else None
    skills = [doc_extract_skill, split_skill, embedding_skill]
    if use_ocr:
        skills.extend([ocr_skill, merge_skill])
    
    skillset = SearchIndexerSkillset(
        name=skillset_name,
        description="Skillset to chunk documents and generating embeddings",
        skills=skills,
        index_projections=index_projections,
        cognitive_services_account=cognitive_services_account
    )
    
    client = SearchIndexerClient(endpoint, credential)
    client.create_or_update_skillset(skillset)
    print(f"{skillset.name} created")
    
    
    

    Make sure to test the skillset with a variety of document types to ensure the DocumentExtractionSkill is functioning as expected. If you continue to see identical index sizes, it might be worth checking the logs or diagnostics to see if the skill is being applied correctly.
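
    To confirm whether the skills are executing at all, the indexer execution history is the quickest check. A small sketch, assuming the indexer follows the sample's f"{index_name}-indexer" naming convention:

    # Print the most recent indexer run, including per-document errors and
    # warnings (e.g. messages about missing skill inputs such as file_data).
    indexer_client = SearchIndexerClient(endpoint, credential)
    status = indexer_client.get_indexer_status(f"{index_name}-indexer")
    print("Overall status:", status.status)
    if status.last_result:
        print("Last run:", status.last_result.status)
        for error in status.last_result.errors:
            print("Error:", error.error_message)
        for warning in status.last_result.warnings:
            print("Warning:", warning.message)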

