How to get the [nnnn].word.json file when using batch synthesis for avatar text-to-speech?

d m 5 Reputation points
2024-09-13T16:42:42.83+00:00

I've tried to enable word boundary data to get the time-stamped "script" when generating an avatar text-to-speech video.

Everything runs fine, but the only outputs I'm getting are the video and the standard summary.json file.


# Function to submit a batch synthesis job
def submit_synthesis(text, voice, character, style, output_container_url, video_crop):
    """Submit a batch avatar synthesis job and return its ID.

    Args:
        text: Plain text to be spoken by the avatar.
        voice: Voice name placed in ``synthesisConfig.voice``.
        character: Value for ``avatarConfig.talkingAvatarCharacter``.
        style: Value for ``avatarConfig.talkingAvatarStyle``.
        output_container_url: Blob container URL (with SAS token) sent as
            ``properties.destinationContainerUrl``.
        video_crop: Dict with ``topLeft`` / ``bottomRight`` points sent as
            ``avatarConfig.videoCrop``.

    Returns:
        The ``id`` field of the service response when the job is created
        (HTTP 201), otherwise ``None``. ``None`` is also returned when the
        request cannot be sent or times out.
    """
    synthesis_id = str(uuid.uuid4())  # Generate a unique synthesis ID
    url = (
        f'https://{SERVICE_REGION}.api.cognitive.microsoft.com'
        f'/avatar/batchsyntheses/{synthesis_id}?api-version=2024-08-01'
    )

    headers = {
        'Ocp-Apim-Subscription-Key': SUBSCRIPTION_KEY,
        'Content-Type': 'application/json',
    }

    # Payload with avatarConfig for video generation
    payload = {
        "inputKind": "PlainText",
        "synthesisConfig": {
            "voice": voice,  # Voice chosen by the user
        },
        "inputs": [
            {
                "content": text,
            },
        ],
        "avatarConfig": {
            "talkingAvatarCharacter": character,  # Character chosen by user
            "talkingAvatarStyle": style,  # Style chosen by user
            "videoFormat": "webm",  # Specify video format
            "videoCodec": "vp9",  # Codec for video
            "subtitleType": "soft_embedded",  # Subtitles option
            "backgroundColor": "transparent",  # Background for avatar
            "videoCrop": video_crop,  # Crop rectangle matching the user's selected view
        },
        "properties": {
            "destinationContainerUrl": output_container_url,  # Blob storage URL with SAS token
            "timeToLiveInHours": 744,  # How long to keep the job
            # NOTE(review): verify that the avatar batch synthesis endpoint
            # supports word-boundary output for this api-version.
            "wordBoundaryEnabled": True,
        },
    }

    try:
        # Without a timeout a stalled connection would block the app forever.
        response = requests.put(url, headers=headers, json=payload, timeout=30)
    except requests.RequestException as exc:
        # Transport failures follow the same "log and return None" path as
        # a rejected request, so callers only have one failure mode to handle.
        logger.error(f'Failed to submit batch avatar synthesis job: {exc}')
        return None

    if response.status_code == 201:
        logger.info('Batch avatar synthesis job submitted successfully')
        return response.json()["id"]

    logger.error(f'Failed to submit batch avatar synthesis job: {response.text}')
    return None

# Function to check the synthesis job status
def get_synthesis_status(job_id):
    """Fetch the current state of a batch avatar synthesis job.

    Args:
        job_id: ID of the job, as returned by ``submit_synthesis``.

    Returns:
        The decoded JSON body of the status response on HTTP 200, otherwise
        ``None``. ``None`` is also returned when the request cannot be sent
        or times out.
    """
    url = (
        f'https://{SERVICE_REGION}.api.cognitive.microsoft.com'
        f'/avatar/batchsyntheses/{job_id}?api-version=2024-08-01'
    )
    headers = {
        'Ocp-Apim-Subscription-Key': SUBSCRIPTION_KEY,
    }

    try:
        # This function is polled in a loop; a timeout prevents one stalled
        # request from freezing the whole polling cycle.
        response = requests.get(url, headers=headers, timeout=30)
    except requests.RequestException as exc:
        logger.error(f'Failed to get batch synthesis job status: {exc}')
        return None

    if response.status_code != 200:
        logger.error(f'Failed to get batch synthesis job status: {response.text}')
        return None

    # Log the full response for debugging purposes
    job_status = response.json()
    logger.info(f'Job status response: {job_status}')
    return job_status

# Streamlit interface
def main():
    """Render the Streamlit UI, submit an avatar synthesis job and show the result.

    Collects language, text, avatar character/style, output container URL and
    avatar view from the user. When "Submit Job" is pressed with valid input,
    the job is submitted, polled until it succeeds or fails, and the resulting
    video is linked and embedded in the page.
    """
    st.title("Azure Talking Avatar Video Generator")

    # Language and voice selection
    lang = st.selectbox('Choose the language', list(lang_voices.keys()))
    # NOTE(review): index 1 presumably holds the voice name in each
    # lang_voices entry - confirm against the lang_voices definition.
    voice = lang_voices[lang][1]

    # User input for text
    text_input = st.text_area(f'Type the text in {lang}:')

    # Character selection
    selected_character = st.selectbox('Choose the avatar character:', list(character_styles.keys()))

    # Dynamically update styles based on character selection
    available_styles = character_styles[selected_character]
    selected_style = st.selectbox('Choose the avatar style:', available_styles)

    output_container_url = st.text_input("Enter Output Container URL (with SAS Token)")

    # Add an option to select avatar view (Full Body, Top Half, Head)
    avatar_view = st.selectbox('Choose avatar view', ["Full Body", "Top Half", "Head"], index=0)
    video_crop = _video_crop_for_view(avatar_view)

    if not st.button("Submit Job"):
        return

    if not (text_input.strip() and output_container_url.strip()):
        st.error("Please provide the text and output container URL.")
        return

    if selected_style not in character_styles[selected_character]:
        # Show a warning if the style is not compatible with the selected character
        st.warning(f'The character "{selected_character}" cannot use the chosen style. '
                   f'Please choose from the available styles: {", ".join(available_styles)}')
        return

    with st.spinner("Processing..."):
        job_id = submit_synthesis(
            text_input, voice, selected_character, selected_style,
            output_container_url, video_crop,
        )
        if not job_id:
            # Previously a failed submission was only logged, leaving the
            # user with no feedback in the UI.
            st.error("Failed to submit the avatar synthesis job. Check the logs for details.")
            return

        st.success(f'Job submitted! Job ID: {job_id}')
        st.info("Checking job status. Please wait...")

        job_status = _poll_until_finished(job_id)
        if job_status is None:
            st.error("Could not retrieve the job status. Check the logs for details.")
            return

        if job_status.get("status") == "Failed":
            st.error("Failed to generate the avatar video.")
            return

        st.success("Avatar video generated successfully!")

        # Log the full response to ensure correct output
        logger.info(f'Full job status response: {job_status}')

        video_url = _build_video_url(job_status)
        if video_url:
            st.markdown(f"Download the avatar video [here]({video_url})")
            st.video(video_url)
        else:
            st.error("Video URL seems invalid or incomplete.")


def _video_crop_for_view(avatar_view):
    """Return the ``videoCrop`` rectangle for the selected avatar view."""
    if avatar_view == "Full Body":
        return {
            "topLeft": {"x": 0, "y": 0},
            "bottomRight": {"x": 1920, "y": 1080},
        }
    if avatar_view == "Top Half":
        return {
            "topLeft": {"x": 0, "y": 0},
            "bottomRight": {"x": 1920, "y": 540},
        }
    # Any other value is treated as "Head"
    return {
        "topLeft": {"x": 700, "y": 0},
        "bottomRight": {"x": 1220, "y": 500},
    }


def _poll_until_finished(job_id, poll_interval=5, max_failed_polls=12):
    """Poll the job until its status is "Succeeded" or "Failed".

    Returns the final status dict, or ``None`` after ``max_failed_polls``
    consecutive polls that produced no usable response. Jobs that are still
    in progress are waited on without an overall time limit.
    """
    failed_polls = 0
    while True:
        job_status = get_synthesis_status(job_id)
        if job_status:
            failed_polls = 0
            if job_status.get("status") in ("Succeeded", "Failed"):
                return job_status
        else:
            # A persistent error (e.g. bad credentials) would otherwise keep
            # this loop spinning forever.
            failed_polls += 1
            if failed_polls >= max_failed_polls:
                return None
        time.sleep(poll_interval)  # Wait for a few seconds before checking again


def _build_video_url(job_status):
    """Combine the destination container URL and result path into one URL.

    Returns ``None`` when either part is missing or the result does not
    start with "http".
    """
    from urllib.parse import urlsplit, urlunsplit

    container_url = (job_status.get('properties') or {}).get('destinationContainerUrl', '')
    relative_path = (job_status.get('outputs') or {}).get('result', '')
    if not (container_url and relative_path):
        return None

    # Insert the blob path before any query string: appending it to the raw
    # URL would place it after a SAS token's "?..." and yield a broken link.
    parts = urlsplit(container_url)
    joined_path = f"{parts.path.rstrip('/')}/{relative_path.lstrip('/')}"
    video_url = urlunsplit(parts._replace(path=joined_path))

    return video_url if video_url.startswith("http") else None

Azure AI Speech
Azure AI Speech
An Azure service that integrates speech processing into apps and services.
2,069 questions
{count} votes

Your answer

Answers can be marked as Accepted Answers by the question author, which helps users to know the answer solved the author's problem.