Stream azure tts audio byte output directly to cloud storage?

LeetGPT 95 Reputation points
2024-02-14T05:40:57.7333333+00:00
def speech_synthesis_to_push_audio_output_stream():
    """Synthesize console input and push the audio to a callback-backed stream.

    Reads lines of text from standard input until EOF, synthesizes each line
    with a ``SpeechSynthesizer`` whose audio output is a
    ``PushAudioOutputStream``, and finally prints the total number of audio
    bytes the callback received.

    Relies on ``speechsdk``, ``speech_key`` and ``service_region`` being
    defined by the enclosing module.
    """

    class PushAudioOutputStreamSampleCallback(speechsdk.audio.PushAudioOutputStreamCallback):
        """Callback that accumulates every pushed audio chunk in memory."""

        def __init__(self) -> None:
            super().__init__()
            # A bytearray grows in place; appending to an immutable ``bytes``
            # object would re-copy everything received so far on each chunk.
            self._audio_data = bytearray()
            self._closed = False

        def write(self, audio_buffer: memoryview) -> int:
            """Append one synthesized audio chunk and return the number of bytes consumed."""
            self._audio_data += audio_buffer
            print("{} bytes received.".format(audio_buffer.nbytes))
            return audio_buffer.nbytes

        def close(self) -> None:
            """Record that the synthesizer has closed the stream."""
            self._closed = True
            print("Push audio output stream closed.")

        def get_audio_data(self) -> bytes:
            """Return an immutable copy of all audio bytes received so far."""
            # NOTE(review): whether these bytes include a container header
            # depends on the output format in use - confirm before storing
            # them under a ``.wav`` name.
            return bytes(self._audio_data)

        def get_audio_size(self) -> int:
            """Return the number of audio bytes received so far."""
            return len(self._audio_data)

    # Creates an instance of a speech config with specified subscription key and service region.
    speech_config = speechsdk.SpeechConfig(subscription=speech_key, region=service_region)
    # Creates customized instance of PushAudioOutputStreamCallback
    stream_callback = PushAudioOutputStreamSampleCallback()
    # Creates audio output stream from the callback
    push_stream = speechsdk.audio.PushAudioOutputStream(stream_callback)
    # Creates a speech synthesizer using push stream as audio output.
    stream_config = speechsdk.audio.AudioOutputConfig(stream=push_stream)
    speech_synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config, audio_config=stream_config)
    # Receives a text from console input and synthesizes it to stream output.
    while True:
        print("Enter some text that you want to synthesize, Ctrl-Z to exit")
        try:
            text = input()
        except EOFError:
            break
        result = speech_synthesizer.speak_text_async(text).get()
        # Check result
        if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
            print("Speech synthesized for text [{}], and the audio was written to output stream.".format(text))
        elif result.reason == speechsdk.ResultReason.Canceled:
            cancellation_details = result.cancellation_details
            print("Speech synthesis canceled: {}".format(cancellation_details.reason))
            if cancellation_details.reason == speechsdk.CancellationReason.Error:
                print("Error details: {}".format(cancellation_details.error_details))
        # Destroys result which is necessary for destroying speech synthesizer
        del result
    # Destroys the synthesizer in order to close the output stream.
    del speech_synthesizer
    print("Totally {} bytes received.".format(stream_callback.get_audio_size()))


Hi team, I was following this sample code to use the output stream. However, when I upload the audio bytes directly to cloud storage, the resulting audio cannot be played. I ended up saving the audio to a local file with the proper encoding first and then uploading that file to cloud storage. However, this creates some I/O overhead and extra latency. Is there a way to stream-upload the audio bytes to cloud storage without first saving them to a local file with the proper encoding? Thank you so much!!

Azure AI Speech
Azure AI Speech
An Azure service that integrates speech processing into apps and services.
2,061 questions
0 comments No comments
{count} votes

Accepted answer
  1. dupammi 8,615 Reputation points Microsoft External Staff
    2024-02-14T09:21:59.7566667+00:00

    Hi @LeetGPT

    Thank you for reaching out to the Microsoft Q&A forum and for providing your code snippet.

    I understand that you're looking for a more streamlined approach to stream the Azure Text-to-Speech (TTS) audio output directly to cloud storage without saving it locally first.

    While your current implementation effectively synthesizes speech and uploads it to Azure Blob Storage, it involves an intermediate step of saving the audio file locally before uploading it to the cloud storage, which introduces additional I/O overhead and latency.

    To achieve a more direct streaming of audio data to cloud storage without saving it locally, you can leverage Azure Blob Storage's ability to accept byte data directly. Below, I've outlined a modified approach that worked for me.

    import os
    import azure.cognitiveservices.speech as speechsdk
    from azure.storage.blob import BlobServiceClient, BlobClient, ContainerClient
    # Azure Speech Service Configuration
    # Secrets are read from the environment so they need not be committed to
    # source; the original placeholders remain as fallbacks when a variable is
    # not set.
    speech_key = os.environ.get("SPEECH_KEY", "YOUR_SPEECH_KEY")
    service_region = os.environ.get("SPEECH_REGION", "YOUR_SERVICE_REGION")
    from_language = 'en-US'
    
    # Azure Storage Configuration
    storage_connection_string = os.environ.get(
        "AZURE_STORAGE_CONNECTION_STRING", "YOUR_STORAGE_CONNECTION_STRING"
    )
    container_name = os.environ.get("AZURE_STORAGE_CONTAINER_NAME", 'YOUR_CONTAINER_NAME')
    def synthesize_text_to_speech(text="Hello, how are you?"):
        """Synthesize *text* in memory and upload the resulting audio to Blob Storage.

        Args:
            text: The text to synthesize. Defaults to the sample sentence.

        On success the audio bytes from the synthesis result are passed to
        ``upload_audio_to_storage``. On failure the reason is printed (with
        cancellation details when the request was canceled) and nothing is
        uploaded.
        """
        # Azure Speech Service Configuration
        speech_config = speechsdk.SpeechConfig(subscription=speech_key, region=service_region)
        # audio_config=None keeps the audio in the result object only; omitting
        # it would also play the audio on the default output device.
        speech_synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config, audio_config=None)
        # Synthesize speech
        result = speech_synthesizer.speak_text_async(text).get()
        if result.reason != speechsdk.ResultReason.SynthesizingAudioCompleted:
            print("Failed to synthesize speech:", result.reason)
            if result.reason == speechsdk.ResultReason.Canceled:
                cancellation_details = result.cancellation_details
                print("Speech synthesis canceled: {}".format(cancellation_details.reason))
                if cancellation_details.reason == speechsdk.CancellationReason.Error:
                    print("Error details: {}".format(cancellation_details.error_details))
            return
        print("Speech synthesized successfully.")
        upload_audio_to_storage(result.audio_data)
    def upload_audio_to_storage(audio_data, blob_name='synthesized_audio.wav', content_type='audio/wav'):
        """Upload in-memory audio bytes to Azure Blob Storage.

        Args:
            audio_data: The audio bytes to store.
            blob_name: Name of the destination blob inside ``container_name``.
            content_type: MIME type recorded on the blob, so clients that
                fetch it can treat it as audio rather than a generic binary.

        An existing blob with the same name is overwritten.
        """
        # Imported locally so this function does not depend on edits to the
        # module's import line.
        from azure.storage.blob import ContentSettings

        blob_service_client = BlobServiceClient.from_connection_string(storage_connection_string)
        blob_client = blob_service_client.get_blob_client(container=container_name, blob=blob_name)
        # Upload audio data to Azure Blob Storage
        blob_client.upload_blob(
            audio_data,
            overwrite=True,
            content_settings=ContentSettings(content_type=content_type),
        )
        print("Audio uploaded to Azure Blob Storage.")
    # Run the synthesize-and-upload flow only when executed as a script, so
    # importing this module does not trigger a synthesis request.
    if __name__ == "__main__":
        synthesize_text_to_speech()
    

    This modified approach directly uploads the audio data obtained from the Azure Text-to-Speech service to Azure Blob Storage without the need for saving it locally first. By utilizing the upload_blob method directly with the audio data, you can effectively streamline the process and reduce unnecessary I/O operations and latency.

    Please ensure you replace the placeholder values (e.g., "YOUR_SPEECH_KEY", "YOUR_SERVICE_REGION", "YOUR_STORAGE_CONNECTION_STRING") with your actual Azure Speech service key, service region, and storage connection string respectively.

    Blob Upload result.
    [User's image]

    I hope this helps. Thank you.


    If this answers your query, please click "Accept Answer" and select "Yes" for "Was this answer helpful?".


0 additional answers

Sort by: Most helpful

Your answer

Answers can be marked as Accepted Answers by the question author, which helps users to know the answer solved the author's problem.