The Azure Cognitive Services Speech is repeating the audio twice

Question

The Azure Cognitive Services Speech is repeating the audio twice

Vijay Sheth 0

While I am using the Azure Cognitive Services Speech is repeating the audio twice ,

santoshkc 15,435 Reputation points Microsoft External Staff Moderator

2024-06-19T12:04:45.4466667+00:00

Hi @Vijay Sheth,

Did you got any chance to check the below response?
Vijay Sheth 0 Reputation points

2024-06-19T12:15:07.22+00:00

No , the issues was not resolved I have posted the code below, please see and see if it there is an issues

3 answers

Your answer

santoshkc 15,435 Reputation points Microsoft External Staff Moderator

2024-06-19T12:04:45.4466667+00:00

Hi @Vijay Sheth,

Did you got any chance to check the below response?
Vijay Sheth 0 Reputation points

2024-06-19T12:15:07.22+00:00

No , the issues was not resolved I have posted the code below, please see and see if it there is an issues

Answer 1

santoshkc 15,435 Microsoft External Staff Moderator

Hi @Vijay Sheth,

Thank you for reaching out to Microsoft Q&A forum!

I understand that you are experiencing an issue with the Azure Speech service. Before we proceed, may I confirm if you are using the Azure Text-to-Speech service to generate audio with the given text? If not, could you please provide more details about the resource and region you are working with?

Regarding the issue you mentioned, I tried using Azure TTS to reproduce it on my end, but it seems to be working fine. It's possible that this is an intermittent issue, so I suggest trying again after some time. If the issue persists, please let me know and we can work together to find a solution.

I hope you understand. Do let us know if you any further queries.

Do not forget to click Accept Answer and Yes for was this answer helpful.

Vijay Sheth 0 Reputation points

2024-06-24T10:20:58.4833333+00:00

Have provided my answers below , and it is working, Thanks for help
santoshkc 15,435 Reputation points Microsoft External Staff Moderator

2024-06-24T12:25:39.59+00:00

Hi @Vijay Sheth,

Great to hear that your issue has been resolved and that my response was helpful! I request you to retake the survey on the earlier above response. Thank you.

Answer 2

Here is the code which i am using

import streamlit as st
import speech_recognition as sr
import openai
import pandas as pd
import docx
import PyPDF2
import azure.cognitiveservices.speech as speechsdk
import tempfile
import os
from pydub import AudioSegment
from pydub.playback import play

# Set your OpenAI API key
openai.api_key = ''


# Initialize session state
if "stop_execution" not in st.session_state:
    st.session_state.stop_execution = False
if "tmp_file_path" not in st.session_state:
    st.session_state.tmp_file_path = None
if "playing" not in st.session_state:
    st.session_state.playing = False
if "audio_thread" not in st.session_state:
    st.session_state.audio_thread = None

# Azure Cognitive Services configuration
AZURE_SPEECH_KEY = ''
AZURE_SERVICE_REGION = 'australiaeast'

# Function to capture voice input
def capture_voice():
    recognizer = sr.Recognizer()
    with sr.Microphone() as source:
        st.write("Listening...")
        audio = recognizer.listen(source)

    try:
        st.write("Recognizing...")
        text = recognizer.recognize_google(audio)
        st.write(f"You said: {text}")
        return text
    except sr.UnknownValueError:
        st.write("Sorry, I could not understand the audio.")
        return None
    except sr.RequestError:
        st.write("Could not request results; check your network connection.")
        return None

# Function to query OpenAI's GPT-3
def query_chat_model(prompt):
    try:
        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": prompt},
            ],
            max_tokens=150
        )
        return response.choices[0].message['content'].strip()
    except openai.error.RateLimitError:
        st.write("Rate limit exceeded. Please wait and try again later.")
        return "Rate limit exceeded. Please try again later."
    except Exception as e:
        st.write(f"Error querying the OpenAI API: {e}")
        return "An error occurred while querying the OpenAI API."

# Function to read content from an Excel file
def read_excel_file(file):
    try:
        df = pd.read_excel(file, engine='openpyxl')
        return df
    except Exception as e:
        st.write(f"Error reading the Excel file: {e}")
        return None

# Function to read content from a PDF file
def read_pdf_file(file):
    try:
        reader = PyPDF2.PdfReader(file)
        text = ""
        for page in reader.pages:
            text += page.extract_text()
        return text
    except Exception as e:
        st.write(f"Error reading the PDF file: {e}")
        return None

# Function to read content from a Word file
def read_word_file(file):
    try:
        doc = docx.Document(file)
        text = ""
        for para in doc.paragraphs:
            text += para.text + "\n"
        return text
    except Exception as e:
        st.write(f"Error reading the Word file: {e}")
        return None

# Function to read content from a TXT file
def read_txt_file(file):
    try:
        text = file.read().decode("utf-8")
        return text
    except Exception as e:
        st.write(f"Error reading the TXT file: {e}")
        return None

# Function to generate a response using GPT-3 and the provided content
def generate_response(query, content):
    prompt = f"Using the following data: {content}, answer the question: {query}"
    response = query_chat_model(prompt)
    return response

# Function to convert text to speech using Azure Cognitive Services and play it
def text_to_speech(text, lang='en'):
    if not text:
        st.write("No text to speak.")
        return

    tmp_file_path = None
    try:
        speech_config = speechsdk.SpeechConfig(subscription=AZURE_SPEECH_KEY, region=AZURE_SERVICE_REGION)
        audio_config = speechsdk.audio.AudioOutputConfig(use_default_speaker=True)  # Explicitly use default speaker

        synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config, audio_config=audio_config)
        result = synthesizer.speak_text_async(text).get()

        if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted:
            with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as tmp_file:
                tmp_file.write(result.audio_data)
                tmp_file_path = tmp_file.name

            # Convert the WAV file to MP3 using pydub
            audio = AudioSegment.from_wav(tmp_file_path)
            mp3_file_path = tmp_file_path.replace('.wav', '.mp3')
            audio.export(mp3_file_path, format='mp3')

            # Play sound using pydub
            play_audio(mp3_file_path)

        else:
            st.write(f"Speech synthesis failed: {result.reason}")

    except Exception as e:
        st.write(f"Error in text-to-speech conversion: {e}")
    finally:
        return tmp_file_path

# Function to play audio file
def play_audio(file_path):
    if file_path and os.path.exists(file_path):
        audio = AudioSegment.from_mp3(file_path)
        st.session_state.playing = True
        play(audio)
        st.session_state.playing = False
        os.remove(file_path)

# Function to stop the playback
def stop_playback():
    if st.session_state.playing:
        st.session_state.playing = False
        st.write("Playback stopped")

# Main function to integrate all functionalities
def main():
    st.title("Voice Controlled GPT-3 with File Data")

    uploaded_file = st.file_uploader("Upload a file", type=["xlsx", "pdf", "docx", "txt"])
    file_content = None

    if uploaded_file is not None:
        file_type = uploaded_file.name.split('.')[-1]
        if file_type == "xlsx":
            df = read_excel_file(uploaded_file)
            if df is not None:
                st.write("Excel Data:")
                st.dataframe(df)
                file_content = df.to_string(index=False)
        elif file_type == "pdf":
            file_content = read_pdf_file(uploaded_file)
        elif file_type == "docx":
            file_content = read_word_file(uploaded_file)
        elif file_type == "txt":
            file_content = read_txt_file(uploaded_file)

    col1, col2 = st.columns(2)
    with col1:
        if st.button("Speak", key="speak_button"):
            st.session_state.stop_execution = False
            voice_input = capture_voice()
            if st.session_state.stop_execution:
                return
            if voice_input and file_content:
                response_text = generate_response(voice_input, file_content)
                st.write(f"Response: {response_text}")
                text_to_speech(response_text)
            elif voice_input:
                st.write("Please upload a file first.")
            else:
                st.write("Failed to capture voice input.")
    
    with col2:
        if st.button("Stop", key="stop_button"):
            stop_playback()

if __name__ == "__main__":
    main()

santoshkc 15,435 Reputation points Microsoft External Staff Moderator

2024-06-24T05:46:32.1833333+00:00

Hi @Vijay Sheth,

Following up to see if the above response was helpful. In case if you have any resolution, please do share that same with the community as it can be helpful to others.

I kindly request you to retake the survey on the earlier response. Thank you for your time and consideration.

Answer 3

# File: app.py
import streamlit as st
import speech_recognition as sr
import openai
import pandas as pd
import docx
import PyPDF2
from azure.cognitiveservices.speech import SpeechConfig, SpeechSynthesizer, AudioDataStream, ResultReason
import tempfile
import os
import base64

# Set your OpenAI API key
openai.api_key = "
"
azure_speech_key = ""
azure_service_region = ""

# Function to capture voice input
def capture_voice():
    recognizer = sr.Recognizer()
    with sr.Microphone() as source:
        st.write("Listening...")
        audio = recognizer.listen(source)

    try:
        st.write("Recognizing...")
        text = recognizer.recognize_google(audio)
        st.write(f"You said: {text}")
        return text.lower()  # Convert to lower case for easier comparison
    except sr.UnknownValueError:
        st.error("Sorry, I could not understand the audio.")
        return None
    except sr.RequestError:
        st.error("Could not request results; check your network connection.")
        return None

# Function to query OpenAI's GPT-3
def query_chat_model(prompt):
    try:
        response = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": prompt},
            ],
            max_tokens=150
        )
        return response.choices[0].message['content'].strip()
    except openai.error.RateLimitError:
        st.error("Rate limit exceeded. Please wait and try again later.")
        return "Rate limit exceeded. Please try again later."
    except Exception as e:
        st.error(f"Error querying the OpenAI API: {e}")
        return "An error occurred while querying the OpenAI API."

# Function to read content from an Excel file
def read_excel_file(file):
    try:
        df = pd.read_excel(file, engine='openpyxl')
        return df
    except Exception as e:
        st.error(f"Error reading the Excel file: {e}")
        return None

# Function to read content from a PDF file
def read_pdf_file(file):
    try:
        reader = PyPDF2.PdfReader(file)
        text = ""
        for page in reader.pages:
            text += page.extract_text()
        return text
    except Exception as e:
        st.error(f"Error reading the PDF file: {e}")
        return None

# Function to read content from a Word file
def read_word_file(file):
    try:
        doc = docx.Document(file)
        text = ""
        for para in doc.paragraphs:
            text += para.text + "\n"
        return text
    except Exception as e:
        st.error(f"Error reading the Word file: {e}")
        return None

# Function to read content from a TXT file
def read_txt_file(file):
    try:
        text = file.read().decode("utf-8")
        return text
    except Exception as e:
        st.error(f"Error reading the TXT file: {e}")
        return None

# Function to generate a response using GPT-3 and the provided content
def generate_response(query, content):
    prompt = f"Using the following data: {content}, answer the question: {query}"
    response = query_chat_model(prompt)
    return response

# Function to convert text to speech and get audio file path
def text_to_speech(text):
    if not text:
        st.error("No text to speak.")
        return None

    try:
        speech_config = SpeechConfig(subscription=azure_speech_key, region=azure_service_region)
        speech_config.speech_synthesis_voice_name = "en-US-JennyNeural"
        synthesizer = SpeechSynthesizer(speech_config=speech_config, audio_config=None)

        result = synthesizer.speak_text_async(text).get()
        if result.reason == ResultReason.SynthesizingAudioCompleted:
            audio_stream = AudioDataStream(result)
            temp_audio_path = os.path.join(tempfile.gettempdir(), "output.wav")
            audio_stream.save_to_wav_file(temp_audio_path)
            return temp_audio_path
        else:
            st.error("Error during text-to-speech conversion")
            return None
    except Exception as e:
        st.error(f"Error in text-to-speech conversion: {e}")
        return None

# Function to play audio automatically
def play_audio(file_path):
    try:
        audio_file = open(file_path, 'rb').read()
        audio_base64 = base64.b64encode(audio_file).decode('utf-8')
        audio_html = f'<audio autoplay="true" controls><source src="data:audio/wav;base64,{audio_base64}" type="audio/wav"></audio>'
        st.markdown(audio_html, unsafe_allow_html=True)
    except Exception as e:
        st.error(f"Error playing audio: {e}")

# Main function to integrate all functionalities
def main():
    st.title("Voice Controlled GPT-3 with File Data")

    response_container = st.container()
    upload_container = st.container()

    with response_container:
        if st.button("Speak"):
            voice_input = capture_voice()
            st.session_state.voice_input = voice_input

        if "response_text" in st.session_state:
            st.write(f"Response: {st.session_state.response_text}")
            audio_file_path = text_to_speech(st.session_state.response_text)
            if audio_file_path:
                play_audio(audio_file_path)

    with upload_container:
        uploaded_file = st.file_uploader("Upload a file", type=["xlsx", "pdf", "docx", "txt"])
        file_content = None

        if uploaded_file is not None:
            file_type = uploaded_file.name.split('.')[-1]
            if file_type == "xlsx":
                df = read_excel_file(uploaded_file)
                if df is not None:
                    file_content = df.to_string(index=False)
                    st.session_state.file_content = file_content
                    st.write("Excel Data:")
                    st.dataframe(df)
            elif file_type == "pdf":
                file_content = read_pdf_file(uploaded_file)
                if file_content:
                    st.session_state.file_content = file_content
                    st.write("PDF Content:")
                    st.write(file_content)
            elif file_type == "docx":
                file_content = read_word_file(uploaded_file)
                if file_content:
                    st.session_state.file_content = file_content
                    st.write("Word Document Content:")
                    st.write(file_content)
            elif file_type == "txt":
                file_content = read_txt_file(uploaded_file)
                if file_content:
                    st.session_state.file_content = file_content
                    st.write("Text File Content:")
                    st.write(file_content)

    if "voice_input" in st.session_state and "file_content" in st.session_state:
        response_text = generate_response(st.session_state.voice_input, st.session_state.file_content)
        st.session_state.response_text = response_text

        with response_container:
            st.write(f"Response: {response_text}")
            audio_file_path = text_to_speech(response_text)
            if audio_file_path:
                play_audio(audio_file_path)

if __name__ == "__main__":
    main()

This is working now , voice is not getting repeated.

Share via

The Azure Cognitive Services Speech is repeating the audio twice

3 answers

Your answer