How to get the [nnnn].word.json file when using batch synthesis for avatar text-to-speech?

Question

How to get the [nnnn].word.json file when using batch synthesis for avatar text-to-speech?

d m 5

I've tried to enable word boundary output to get the time-stamped "script" when generating an avatar text-to-speech video.

Everything is running fine, but the only outputs I'm getting are the video and the standard summary.json file.


# Function to submit a batch synthesis job
def submit_synthesis(text, voice, character, style, output_container_url, video_crop, timeout=30):
    """Submit a batch avatar synthesis job and return its job ID.

    Args:
        text: Plain text to be spoken by the avatar.
        voice: Voice name placed in ``synthesisConfig.voice``.
        character: Value for ``avatarConfig.talkingAvatarCharacter``.
        style: Value for ``avatarConfig.talkingAvatarStyle``.
        output_container_url: Blob container URL (with SAS token) sent as
            ``properties.destinationContainerUrl``.
        video_crop: Dict sent as ``avatarConfig.videoCrop``.
        timeout: Seconds to wait for the HTTP request before giving up.
            Without a timeout a stalled connection would block forever.

    Returns:
        The ``id`` field of the service response when the service answers
        with HTTP 201, otherwise ``None`` (the failure is logged).
    """
    synthesis_id = str(uuid.uuid4())  # Generate a unique synthesis ID
    url = f'https://{SERVICE_REGION}.api.cognitive.microsoft.com/avatar/batchsyntheses/{synthesis_id}?api-version=2024-08-01'

    headers = {
        'Ocp-Apim-Subscription-Key': SUBSCRIPTION_KEY,
        'Content-Type': 'application/json'
    }

    # Payload with avatarConfig for video generation
    payload = {
        "inputKind": "PlainText",
        "synthesisConfig": {
            "voice": voice  # Voice chosen by the user
        },
        "inputs": [
            {
                "content": text
            }
        ],
        "avatarConfig": {
            "talkingAvatarCharacter": character,  # Character chosen by user
            "talkingAvatarStyle": style,  # Style chosen by user
            "videoFormat": "webm",  # Specify video format
            "videoCodec": "vp9",  # Codec for video
            "subtitleType": "soft_embedded",  # Subtitles option
            "backgroundColor": "transparent",  # Background for avatar
            "videoCrop": video_crop  # Set the appropriate video crop based on the user's selection
        },
        "properties": {
            "destinationContainerUrl": output_container_url,  # Blob storage URL with SAS token
            "timeToLiveInHours": 744,  # How long to keep the job
            # NOTE(review): intended to request the [nnnn].word.json output;
            # confirm that the avatar batch synthesis endpoint honours this
            # flag (TODO verify against the avatar property reference).
            "wordBoundaryEnabled": True
        }
    }

    try:
        response = requests.put(url, headers=headers, json=payload, timeout=timeout)
    except requests.RequestException as exc:
        # Connection errors and timeouts are reported the same way as a
        # rejected request, so callers only have to handle ``None``.
        logger.error(f'Failed to submit batch avatar synthesis job: {exc}')
        return None

    if response.status_code == 201:
        logger.info('Batch avatar synthesis job submitted successfully')
        job_id = response.json()["id"]
        return job_id

    logger.error(f'Failed to submit batch avatar synthesis job: {response.text}')
    return None

# Function to check the synthesis job status
def get_synthesis_status(job_id, timeout=30):
    """Fetch the current state of a batch avatar synthesis job.

    Args:
        job_id: Identifier of the job, as returned when it was submitted.
        timeout: Seconds to wait for the HTTP request before giving up.
            Without a timeout a stalled connection would block forever.

    Returns:
        The parsed JSON body of the response when the service answers with
        HTTP 200, otherwise ``None`` (the failure is logged).
    """
    url = f'https://{SERVICE_REGION}.api.cognitive.microsoft.com/avatar/batchsyntheses/{job_id}?api-version=2024-08-01'
    headers = {
        'Ocp-Apim-Subscription-Key': SUBSCRIPTION_KEY
    }

    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        # A transient network error must not crash a caller that is polling;
        # report it like any other failed lookup.
        logger.error(f'Failed to get batch synthesis job status: {exc}')
        return None

    if response.status_code != 200:
        logger.error(f'Failed to get batch synthesis job status: {response.text}')
        return None

    # Log the full response for debugging purposes
    job_status = response.json()
    logger.info(f'Job status response: {job_status}')
    return job_status

# Streamlit interface
def _join_container_url(container_url, relative_path):
    """Return the URL of ``relative_path`` inside ``container_url``.

    The path is inserted before any query string, so a SAS token on the
    container URL stays at the end of the result instead of being followed
    by the blob path.
    """
    from urllib.parse import urlsplit, urlunsplit

    parts = urlsplit(container_url)
    blob_path = f"{parts.path.rstrip('/')}/{relative_path.lstrip('/')}"
    return urlunsplit(parts._replace(path=blob_path))


def main():
    """Render the Streamlit page, submit an avatar job and show its result.

    Collects language, text, avatar character/style, output container URL
    and avatar view from the user. When "Submit Job" is pressed it submits
    the job, polls its status every few seconds, and on success displays a
    download link and the video.
    """
    poll_interval_seconds = 5
    # Give up after this many status lookups in a row return nothing, so a
    # permanent failure (e.g. bad credentials) cannot spin forever.
    max_consecutive_failures = 12

    st.title("Azure Talking Avatar Video Generator")

    # Language and voice selection
    lang = st.selectbox('Choose the language', list(lang_voices.keys()))
    voice = lang_voices[lang][1]

    # User input for text
    text_input = st.text_area(f'Type the text in {lang}:')

    # Character selection; the style list depends on the chosen character
    selected_character = st.selectbox('Choose the avatar character:', list(character_styles.keys()))
    available_styles = character_styles[selected_character]
    selected_style = st.selectbox('Choose the avatar style:', available_styles)

    output_container_url = st.text_input("Enter Output Container URL (with SAS Token)")

    # Avatar view determines the crop rectangle sent with the job
    avatar_view = st.selectbox('Choose avatar view', ["Full Body", "Top Half", "Head"], index=0)
    crop_by_view = {
        "Full Body": {
            "topLeft": {"x": 0, "y": 0},
            "bottomRight": {"x": 1920, "y": 1080},
        },
        "Top Half": {
            "topLeft": {"x": 0, "y": 0},
            "bottomRight": {"x": 1920, "y": 540},
        },
        "Head": {
            "topLeft": {"x": 700, "y": 0},
            "bottomRight": {"x": 1220, "y": 500},
        },
    }
    video_crop = crop_by_view[avatar_view]

    if not st.button("Submit Job"):
        return

    if not (text_input.strip() and output_container_url.strip()):
        st.error("Please provide the text and output container URL.")
        return

    if selected_style not in available_styles:
        # Show a warning if the style is not compatible with the selected character
        st.warning(f'The character "{selected_character}" cannot use the chosen style. '
                f'Please choose from the available styles: {", ".join(available_styles)}')
        return

    with st.spinner("Processing..."):
        job_id = submit_synthesis(text_input, voice, selected_character, selected_style, output_container_url, video_crop)
        if not job_id:
            # Details are in the log; tell the user instead of failing silently.
            st.error("Failed to submit the avatar synthesis job. Check the logs for details.")
            return

        st.success(f'Job submitted! Job ID: {job_id}')
        st.info("Checking job status. Please wait...")

        consecutive_failures = 0
        while True:
            job_status = get_synthesis_status(job_id)

            if not job_status:
                consecutive_failures += 1
                if consecutive_failures >= max_consecutive_failures:
                    st.error("Could not retrieve the job status. Please try again later.")
                    break
                time.sleep(poll_interval_seconds)
                continue
            consecutive_failures = 0

            status = job_status.get("status")
            if status == "Succeeded":
                st.success("Avatar video generated successfully!")

                # Log the full response to ensure correct output
                logger.info(f'Full job status response: {job_status}')

                # Prefer the container URL echoed by the service; fall back
                # to the one the user entered if the response omits it.
                properties = job_status.get('properties') or {}
                outputs = job_status.get('outputs') or {}
                destination_container_url = properties.get('destinationContainerUrl') or output_container_url
                relative_video_path = outputs.get('result', '')

                video_url = ''
                if destination_container_url and relative_video_path:
                    video_url = _join_container_url(destination_container_url, relative_video_path)

                # Only display the video when the URL looks usable
                if video_url.startswith("http"):
                    st.markdown(f"Download the avatar video [here]({video_url})")
                    st.video(video_url)
                else:
                    st.error("Video URL seems invalid or incomplete.")
                break

            if status == "Failed":
                st.error("Failed to generate the avatar video.")
                break

            time.sleep(poll_interval_seconds)  # Wait before checking again

YutongTie-MSFT 53,971 Reputation points Moderator

2024-09-15T08:14:24.17+00:00

d m Thanks for reaching out to us. Could you please confirm that you are looking for word-level timestamps, but your code only produces the summary.json file? Could you also share the document you are referring to, if there is one, so that we can reproduce the issue?

d m 5

@YutongTie-MSFT thank you for the answer. When I say I'm not getting the [nnnn].word.json file, I'm referring to this:
"""
If sentence boundary data was requested ("sentenceBoundaryEnabled": true), then a corresponding [nnnn].sentence.json file is included in the results. Likewise, if word boundary data was requested ("wordBoundaryEnabled": true), then a corresponding [nnnn].word.json file is included in the results.

Here's an example word data file with both audio offset and duration in milliseconds:

JSONCopy

[   {     "Text": "The",     "AudioOffset": 50,     "Duration": 137   },   {     "Text": "rainbow",     "AudioOffset": 200,     "Duration": 350   },   {     "Text": "has",     "AudioOffset": 562,     "Duration": 175   },   {     "Text": "seven",     "AudioOffset": 750,     "Duration": 300   },   {     "Text": "colors",     "AudioOffset": 1062,     "Duration": 625   },   {     "Text": ".",     "AudioOffset": 1700,     "Duration": 100   } ]

"""
which I found here: https://learn.microsoft.com/en-us/azure/ai-services/speech-service/batch-synthesis#batch-synthesis-results

As you can see in my code, I included "wordBoundaryEnabled": True in the payload properties, yet I'm not getting the "[nnnn].word.json" file, just the video and the summary.json file, which is just this:
"

{
  "jobID": "9efb93a4-0f1d-4a02-8374-74cf640b9f16",
  "status": "Succeeded",
  "results": [
    {
      "contents": [
        "A recente onda de calor em Portugal levou a um aumento significativo de alertas de incêndio em várias regiões do país. O Instituto Português do Mar e da Atmosfera (IPMA) emitiu avisos vermelhos para os distritos de Castelo Branco, Santarém e Portalegre, devido às temperaturas que ultrapassaram os 40 graus Celsius. "
      ],
      "status": "Succeeded",
      "videoFileName": "4bbb121d760b4701bbf01aa7b4fdbea7/ceaa743a-db15-4f29-ad36-cb2627fc3f58/0001.webm",
      "TalkingAvatarCharacter": "max",
      "TalkingAvatarStyle": "formal"
    }
  ]
}

"

Your answer

YutongTie-MSFT 53,971 Reputation points Moderator

2024-09-15T08:14:24.17+00:00

d m Thanks for reaching out to us. Could you please confirm that you are looking for word-level timestamps, but your code only produces the summary.json file? Could you also share the document you are referring to, if there is one, so that we can reproduce the issue?
d m 5 Reputation points

2024-09-16T13:47:09.99+00:00

@YutongTie-MSFT thank you for the answer. When I say I'm not getting the [nnnn].word.json file, I'm referring to this:
"""
If sentence boundary data was requested ("sentenceBoundaryEnabled": true), then a corresponding [nnnn].sentence.json file is included in the results. Likewise, if word boundary data was requested ("wordBoundaryEnabled": true), then a corresponding [nnnn].word.json file is included in the results.

Here's an example word data file with both audio offset and duration in milliseconds:

JSONCopy

[ { "Text": "The", "AudioOffset": 50, "Duration": 137 }, { "Text": "rainbow", "AudioOffset": 200, "Duration": 350 }, { "Text": "has", "AudioOffset": 562, "Duration": 175 }, { "Text": "seven", "AudioOffset": 750, "Duration": 300 }, { "Text": "colors", "AudioOffset": 1062, "Duration": 625 }, { "Text": ".", "AudioOffset": 1700, "Duration": 100 } ]

"""
which I found here: https://learn.microsoft.com/en-us/azure/ai-services/speech-service/batch-synthesis#batch-synthesis-results

As you can see in my code, I included "wordBoundaryEnabled": True in the payload properties, yet I'm not getting the "[nnnn].word.json" file, just the video and the summary.json file, which is just this:
"

{ "jobID": "9efb93a4-0f1d-4a02-8374-74cf640b9f16", "status": "Succeeded", "results": [ { "contents": [ "A recente onda de calor em Portugal levou a um aumento significativo de alertas de incêndio em várias regiões do país. O Instituto Português do Mar e da Atmosfera (IPMA) emitiu avisos vermelhos para os distritos de Castelo Branco, Santarém e Portalegre, devido às temperaturas que ultrapassaram os 40 graus Celsius. " ], "status": "Succeeded", "videoFileName": "4bbb121d760b4701bbf01aa7b4fdbea7/ceaa743a-db15-4f29-ad36-cb2627fc3f58/0001.webm", "TalkingAvatarCharacter": "max", "TalkingAvatarStyle": "formal" } ] }

"

Share via

How to get the [nnnn].word.json file when using batch synthesis for avatar text-to-speech?

Your answer