Python Streamlit Real-time Speech-to-Text with Azure SDK Issues

Raffaele Aldrigo 20 Reputation points
2025-03-19T16:12:36.9+00:00

A Python Streamlit app is being developed to provide live transcription using streamlit_webrtc and the Azure Speech SDK. The current implementation can save and play back audio recorded in the browser, but live transcription is not working.

Here is a snippet of the code being used:

import queue
import tempfile
import time

import pydub
import streamlit as st
import azure.cognitiveservices.speech as speechsdk
from azure.cognitiveservices.speech.audio import PushAudioInputStream
from azure.cognitiveservices.speech.transcription import ConversationTranscriptionEventArgs
from streamlit_webrtc import WebRtcMode, webrtc_streamer

webrtc_ctx = webrtc_streamer(
    key="speech-to-text",
    mode=WebRtcMode.SENDONLY,
    media_stream_constraints={"video": False, "audio": True},
    audio_receiver_size=256,
)

while webrtc_ctx.state.playing:
    if not st.session_state["recording"]:
        st.session_state.r = []

        stream = PushAudioInputStream()
        ###
        audio_input = speechsdk.AudioConfig(stream=stream)
        speech_config = speechsdk.SpeechConfig(env["SPEECH_KEY"], env["SPEECH_REGION"])
        speech_config.speech_recognition_language = "it-IT"
        if "proxy_host" in env and "proxy_port" in env:
            speech_config.set_proxy(env["proxy_host"], int(env["proxy_port"]))
        conversation_transcriber = speechsdk.transcription.ConversationTranscriber(speech_config, audio_input)

        def addsentence(evt: ConversationTranscriptionEventArgs):
            if evt.result.speaker_id == "Unknown":
                logger.debug("Unknown speaker: " + str(evt))
                return
            logger.info(f"Detected **{evt.result.speaker_id}**: {evt.result.text}")
            st.session_state.r.append(f"**{evt.result.speaker_id}**: {evt.result.text}")

        conversation_transcriber.transcribed.connect(addsentence)
        ###

        st.session_state.fullwav = pydub.AudioSegment.empty()
        with st.chat_message("assistant"):
            with st.spinner("Trascrizione in corso..."):
                stream_placeholder = st.expander("Trascrizione", icon="📝").empty()

        conversation_transcriber.start_transcribing_async()
        logger.info("Transcribing started!")
        st.session_state["recording"] = True

    try:
        audio_frames = webrtc_ctx.audio_receiver.get_frames(timeout=1)
    except queue.Empty:
        time.sleep(0.1)
        logger.debug("No frame arrived.")
        continue

    stream_placeholder.markdown("## Trascrizione:\n\n" + "\\\n".join(st.session_state.r))

    #sound_chunk = pydub.AudioSegment.empty()
    for audio_frame in audio_frames:
        sound = pydub.AudioSegment(
            data=audio_frame.to_ndarray().tobytes(),
            sample_width=audio_frame.format.bytes,
            frame_rate=audio_frame.sample_rate,
            channels=len(audio_frame.layout.channels),
        )
        #sound_chunk += sound
        st.session_state.fullwav += sound

    #if len(sound_chunk) >0:
        #stream.write(sound_chunk.get_array_of_samples())

if st.session_state["recording"]:
    logger.info("stopped listening")
    wav_file_path = tempfile.NamedTemporaryFile(suffix='.wav', delete=False).name
    st.session_state.fullwav.export(wav_file_path, format="wav")

Any insights or suggestions on what might be causing the issue with live transcription?

Azure AI Speech
An Azure service that integrates speech processing into apps and services.

Accepted answer
  1. JAYA SHANKAR G S 4,035 Reputation points Microsoft External Staff Moderator
    2025-03-28T12:01:44.8733333+00:00

    Hello @Raffaele Aldrigo ,

    Thanks for the update.

    Since you cannot accept your own answer, I am posting it as a solution. Please accept it so that it helps the community find the solution.

    Issue: Live transcription using the Azure Speech SDK with streamlit_webrtc is not working in a Python Streamlit app.

    Solution:

    The original loop never wrote the received audio into the PushAudioInputStream (the stream.write call was commented out), so the transcriber had no audio to recognize. The updated code below concatenates the received frames, downmixes them to mono, resamples to 16 kHz (the default format a push stream expects), and writes the raw bytes to the stream.

    Updated code:

    def transcribe_webrtc(self, webrtc_ctx: WebRtcStreamerContext) -> str:
        push_stream = PushAudioInputStream()
        audio_config = AudioConfig(stream=push_stream)
        transcriber = self.setup_transcriber(audio_config)
        transcriber.start_transcribing_async()
        logger.info("Started WebRTC transcription")

        try:
            while webrtc_ctx.state.playing:
                try:
                    audio_frames = webrtc_ctx.audio_receiver.get_frames(timeout=1)
                except queue.Empty:
                    logger.debug("No audio frames received")
                    continue

                # Concatenate the received frames into one pydub segment
                frame = pydub.AudioSegment.empty()
                for audio_frame in audio_frames:
                    sound = pydub.AudioSegment(
                        data=audio_frame.to_ndarray().tobytes(),
                        sample_width=audio_frame.format.bytes,
                        frame_rate=audio_frame.sample_rate,
                        channels=len(audio_frame.layout.channels),
                    )
                    frame += sound

                if len(frame) > 0:
                    logger.debug(f"Processing audio frame of length {len(frame.raw_data)} bytes")
                    # Downmix to mono and resample to 16 kHz, the default
                    # format expected by a PushAudioInputStream
                    frame = frame.set_channels(1).set_frame_rate(16000)
                    push_stream.write(frame.raw_data)

                if self.on_transcribed:
                    self.on_transcribed("\\\n".join(self.results))
                time.sleep(0.1)
        finally:
            # Stop the transcriber and release the stream when the session ends
            transcriber.stop_transcribing_async().get()
            push_stream.close()

        return "\\\n".join(self.results)
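
    The snippet calls self.setup_transcriber, which is not shown. A minimal sketch of such a helper, assuming the same SpeechConfig and ConversationTranscriber wiring as in the question, and assuming self.results is initialized to an empty list elsewhere in the class (names here are illustrative), could look like this:

    def setup_transcriber(self, audio_config: AudioConfig) -> speechsdk.transcription.ConversationTranscriber:
        # Build the speech configuration from the subscription key and region
        speech_config = speechsdk.SpeechConfig(env["SPEECH_KEY"], env["SPEECH_REGION"])
        speech_config.speech_recognition_language = "it-IT"
        transcriber = speechsdk.transcription.ConversationTranscriber(speech_config, audio_config)

        def on_transcribed_event(evt: ConversationTranscriptionEventArgs):
            # Ignore results without an identified speaker, as in the question
            if evt.result.speaker_id == "Unknown":
                return
            self.results.append(f"**{evt.result.speaker_id}**: {evt.result.text}")

        transcriber.transcribed.connect(on_transcribed_event)
        return transcriber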
    
    

    Thank you


1 additional answer

  1. Raffaele Aldrigo 20 Reputation points
    2025-03-28T11:07:25.4266667+00:00

    I've come up with a working solution:

    def transcribe_webrtc(self, webrtc_ctx: WebRtcStreamerContext) -> str:
        push_stream = PushAudioInputStream()
        audio_config = AudioConfig(stream=push_stream)
        transcriber = self.setup_transcriber(audio_config)
        transcriber.start_transcribing_async()
        logger.info("Started WebRTC transcription")

        try:
            while webrtc_ctx.state.playing:
                try:
                    audio_frames = webrtc_ctx.audio_receiver.get_frames(timeout=1)
                except queue.Empty:
                    logger.debug("No audio frames received")
                    continue

                # Concatenate the received frames into one pydub segment
                frame = pydub.AudioSegment.empty()
                for audio_frame in audio_frames:
                    sound = pydub.AudioSegment(
                        data=audio_frame.to_ndarray().tobytes(),
                        sample_width=audio_frame.format.bytes,
                        frame_rate=audio_frame.sample_rate,
                        channels=len(audio_frame.layout.channels),
                    )
                    frame += sound

                if len(frame) > 0:
                    logger.debug(f"Processing audio frame of length {len(frame.raw_data)} bytes")
                    # Downmix to mono and resample to 16 kHz, the default
                    # format expected by a PushAudioInputStream
                    frame = frame.set_channels(1).set_frame_rate(16000)
                    push_stream.write(frame.raw_data)

                if self.on_transcribed:
                    self.on_transcribed("\\\n".join(self.results))
                time.sleep(0.1)
        finally:
            # Stop the transcriber and release the stream when the session ends
            transcriber.stop_transcribing_async().get()
            push_stream.close()

        return "\\\n".join(self.results)
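
    For reference, a hypothetical way to wire this method into the Streamlit page, with on_transcribed pointed at a placeholder so the transcript refreshes live (WebRtcTranscriber is an illustrative name for the class holding this method, not part of the answer):

    webrtc_ctx = webrtc_streamer(
        key="speech-to-text",
        mode=WebRtcMode.SENDONLY,
        media_stream_constraints={"video": False, "audio": True},
        audio_receiver_size=256,
    )

    placeholder = st.empty()
    app = WebRtcTranscriber()  # hypothetical class containing transcribe_webrtc
    # Re-render the accumulated transcript each time a sentence is added
    app.on_transcribed = lambda text: placeholder.markdown(text)

    if webrtc_ctx.state.playing:
        transcript = app.transcribe_webrtc(webrtc_ctx)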
    
    
