How to get the [nnnn].word.json file when using batch synthesis for avatar text-to-speech?
d m · 5 Reputation points
I've tried to enable word boundary to get the time-stamped "script" when generating an avatar text-to-speech video. Everything runs fine, but the only outputs I'm getting are the video and the standard summary.json file. Here is my code:
```python
import logging
import time
import uuid

import requests
import streamlit as st

logger = logging.getLogger(__name__)

# SUBSCRIPTION_KEY, SERVICE_REGION, lang_voices and character_styles are defined
# elsewhere in the script (omitted here).

# Function to submit a batch synthesis job
def submit_synthesis(text, voice, character, style, output_container_url, video_crop):
    synthesis_id = str(uuid.uuid4())  # Generate a unique synthesis ID
    url = f'https://{SERVICE_REGION}.api.cognitive.microsoft.com/avatar/batchsyntheses/{synthesis_id}?api-version=2024-08-01'
    headers = {
        'Ocp-Apim-Subscription-Key': SUBSCRIPTION_KEY,
        'Content-Type': 'application/json'
    }
    # Payload with avatarConfig for video generation
    payload = {
        "inputKind": "PlainText",
        "synthesisConfig": {
            "voice": voice  # Voice chosen by the user
        },
        "inputs": [
            {
                "content": text
            }
        ],
        "avatarConfig": {
            "talkingAvatarCharacter": character,  # Character chosen by user
            "talkingAvatarStyle": style,  # Style chosen by user
            "videoFormat": "webm",  # Specify video format
            "videoCodec": "vp9",  # Codec for video
            "subtitleType": "soft_embedded",  # Subtitles option
            "backgroundColor": "transparent",  # Background for avatar
            "videoCrop": video_crop  # Video crop based on the user's selection
        },
        "properties": {
            "destinationContainerUrl": output_container_url,  # Blob storage URL with SAS token
            "timeToLiveInHours": 744,  # How long to keep the job
            "wordBoundaryEnabled": True
        }
    }
    response = requests.put(url, headers=headers, json=payload)
    if response.status_code == 201:
        logger.info('Batch avatar synthesis job submitted successfully')
        job_id = response.json()["id"]
        return job_id
    else:
        logger.error(f'Failed to submit batch avatar synthesis job: {response.text}')
        return None
```
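One check I've tried is reading the job back right after submission to see whether the service echoes the wordBoundaryEnabled property. This is only a minimal sketch reusing the constants above; confirm_word_boundary is a hypothetical helper, and I'm assuming the GET response echoes the submitted properties back.

```python
# Minimal sketch (hypothetical helper): read the job back and check whether the
# service echoes "wordBoundaryEnabled" in its properties. If it comes back as
# False/None, the property was not applied as submitted.
def confirm_word_boundary(job_id):
    url = f'https://{SERVICE_REGION}.api.cognitive.microsoft.com/avatar/batchsyntheses/{job_id}?api-version=2024-08-01'
    response = requests.get(url, headers={'Ocp-Apim-Subscription-Key': SUBSCRIPTION_KEY})
    response.raise_for_status()
    props = response.json().get('properties', {})
    logger.info(f"wordBoundaryEnabled echoed by service: {props.get('wordBoundaryEnabled')}")
    return bool(props.get('wordBoundaryEnabled'))
```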
```python
# Function to check the synthesis job status
def get_synthesis_status(job_id):
    url = f'https://{SERVICE_REGION}.api.cognitive.microsoft.com/avatar/batchsyntheses/{job_id}?api-version=2024-08-01'
    headers = {
        'Ocp-Apim-Subscription-Key': SUBSCRIPTION_KEY
    }
    response = requests.get(url, headers=headers)
    if response.status_code == 200:
        # Check and print the full response for debugging purposes
        job_status = response.json()
        logger.info(f'Job status response: {job_status}')
        return job_status
    else:
        logger.error(f'Failed to get batch synthesis job status: {response.text}')
        return None
```
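Since the video and summary.json are the only artifacts I can see, it also seemed worth logging every entry the service reports under outputs, not just result. This is a minimal sketch (log_job_outputs is a hypothetical helper); the only assumption is that outputs is the same name-to-URL mapping my code below already reads result from.

```python
# Minimal sketch (hypothetical helper): dump everything listed under "outputs"
# for a finished job, to see whether anything besides the video and summary.json
# (e.g. a word boundary artifact) is reported there.
def log_job_outputs(job_status):
    outputs = (job_status or {}).get('outputs', {})
    for name, value in outputs.items():
        logger.info(f'output "{name}": {value}')
    return outputs
```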
```python
# Streamlit interface
def main():
    st.title("Azure Talking Avatar Video Generator")

    # Language and voice selection
    lang = st.selectbox('Choose the language', list(lang_voices.keys()))
    voice = lang_voices[lang][1]

    # User input for text
    text_input = st.text_area(f'Type the text in {lang}:')

    # Character selection
    selected_character = st.selectbox('Choose the avatar character:', list(character_styles.keys()))

    # Dynamically update styles based on character selection
    available_styles = character_styles[selected_character]
    selected_style = st.selectbox('Choose the avatar style:', available_styles)

    output_container_url = st.text_input("Enter Output Container URL (with SAS Token)")

    # Add an option to select avatar view (Full Body, Top Half, Head)
    avatar_view = st.selectbox('Choose avatar view', ["Full Body", "Top Half", "Head"], index=0)

    # Adjust video cropping based on avatar view selection
    if avatar_view == "Full Body":
        video_crop = {
            "topLeft": {"x": 0, "y": 0},
            "bottomRight": {"x": 1920, "y": 1080}
        }
    elif avatar_view == "Top Half":
        video_crop = {
            "topLeft": {"x": 0, "y": 0},
            "bottomRight": {"x": 1920, "y": 540}
        }
    else:  # Head
        video_crop = {
            "topLeft": {"x": 700, "y": 0},
            "bottomRight": {"x": 1220, "y": 500}
        }

    submit_button = st.button("Submit Job")
    if submit_button:
        if text_input.strip() and output_container_url.strip():
            # Check if selected style is valid for the character
            if selected_style in character_styles[selected_character]:
                with st.spinner("Processing..."):
                    job_id = submit_synthesis(text_input, voice, selected_character, selected_style, output_container_url, video_crop)
                    if job_id:
                        st.success(f'Job submitted! Job ID: {job_id}')
                        st.info("Checking job status. Please wait...")
                        while True:
                            job_status = get_synthesis_status(job_id)
                            # Check for a valid response and process it
                            if job_status and job_status["status"] == "Succeeded":
                                st.success("Avatar video generated successfully!")
                                # Log the full response to ensure correct output
                                logger.info(f'Full job status response: {job_status}')

                                # Extract video URL from the response
                                destination_container_url = job_status['properties'].get('destinationContainerUrl', '')
                                relative_video_path = job_status['outputs'].get('result', '')

                                # Construct the full video URL
                                if destination_container_url and relative_video_path:
                                    # Ensure no double slashes in the final URL
                                    if destination_container_url.endswith('/'):
                                        destination_container_url = destination_container_url[:-1]
                                    video_url = f"{destination_container_url}/{relative_video_path}"

                                    # If the video URL is valid, display the video
                                    if video_url.startswith("http"):
                                        st.markdown(f"Download the avatar video [here]({video_url})")
                                        st.video(video_url)
                                    else:
                                        st.error("Video URL seems invalid or incomplete.")
                                else:
                                    st.error("Video URL seems invalid or incomplete.")
                                break
                            elif job_status and job_status["status"] == "Failed":
                                st.error("Failed to generate the avatar video.")
                                break
                            else:
                                time.sleep(5)  # Wait a few seconds before checking again
            else:
                # Show a warning if the style is not compatible with the selected character
                st.warning(f'The character "{selected_character}" cannot use the chosen style. '
                           f'Please choose from the available styles: {", ".join(available_styles)}')
        else:
            st.error("Please provide the text and output container URL.")
```
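To rule out the possibility that the [nnnn].word.json files are being written to the container but just not surfaced anywhere I'm looking, I also list the destination container directly. This is a minimal sketch; it assumes the azure-storage-blob package is installed and that the SAS token on the container URL grants List permission, and list_word_boundary_blobs is a hypothetical helper.

```python
# Minimal sketch (hypothetical helper), assuming azure-storage-blob is installed
# and the SAS token grants List permission on the destination container.
from azure.storage.blob import ContainerClient

def list_word_boundary_blobs(container_sas_url):
    container = ContainerClient.from_container_url(container_sas_url)
    word_files = [b.name for b in container.list_blobs() if b.name.endswith('.word.json')]
    if word_files:
        print('Word boundary files found:', word_files)
    else:
        print('No .word.json files found in the destination container.')
    return word_files
```

So far only the .webm video and summary.json ever show up, which matches what I described above. Any pointers on what else is needed to get the [nnnn].word.json output would be appreciated.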
Azure AI Speech