Handle the user interruption

Kirollos Saleh 0 Reputation points
2025-04-06T14:03:00.08+00:00

In this front-end code, I’m using TTS (Text-to-Speech) and STT (Speech-to-Text) to enable live voice-to-voice chat. However, I’m facing an issue: when the user asks a new question while the TTS is still speaking the response to a previous one, both audios overlap and play simultaneously. I want to handle this kind of user interruption properly.

The code:
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>Live Carrier Chat</title>
<script src="https://aka.ms/csspeech/jsbrowserpackageraw"></script>
<style>
body { font-family: sans-serif; text-align: center; }
button {
  margin: 20px;
  padding: 15px 30px;
  font-size: 1.2em;
  border-radius: 25px;
  cursor: pointer;
  background: #2196F3;
  color: white;
  border: none;
  transition: all 0.3s ease;
}
button:hover { background: #1976D2; transform: scale(1.05); }
button:active { transform: scale(0.95); }
</style>
</head>
<body>
<h1>Live Carrier Chat</h1>
<button id="start-btn">Start Call</button>
<script>
const SPEECH_KEY = "<your-speech-key>";       // placeholder
const SPEECH_REGION = "<your-speech-region>"; // placeholder

// Initialize speech components
const speechConfig = SpeechSDK.SpeechConfig.fromSubscription(SPEECH_KEY, SPEECH_REGION);
speechConfig.speechRecognitionLanguage = "ar-EG";
speechConfig.speechSynthesisVoiceName = "ar-EG-ShakirNeural";
const audioInput = SpeechSDK.AudioConfig.fromDefaultMicrophoneInput();
let recognizer = new SpeechSDK.SpeechRecognizer(speechConfig, audioInput);

// State management
let synthesisResources = { player: null, synthesizer: null };
let recognitionState = { isProcessing: false, isSpeaking: false };
let eventSource = null;
const startBtn = document.getElementById('start-btn');

async function handleUserInput(query) {
  if (recognitionState.isProcessing) return;
  recognitionState.isProcessing = true;
  try {
    // Cleanup previous session
    eventSource?.close();
    await cleanupAudioResources();

    // Start new session
    eventSource = new EventSource(`/stream?query=${encodeURIComponent(query)}`);

    eventSource.onmessage = async (event) => {
      const data = JSON.parse(event.data);
      if (data.error) throw new Error(data.error);
      if (data.response) await handleServerResponse(data.response);
    };
  } catch (error) {
    console.error('Processing error:', error);
    recognitionState.isProcessing = false;
  }
}

async function handleServerResponse(response) {
  eventSource.close();
  await synthesizeSpeech(response);
  await restartRecognition();
}

async function synthesizeSpeech(text) {
  recognitionState.isSpeaking = true;

  synthesisResources.player = new SpeechSDK.SpeakerAudioDestination();
  const audioConfig = SpeechSDK.AudioConfig.fromSpeakerOutput(synthesisResources.player);
  synthesisResources.synthesizer = new SpeechSDK.SpeechSynthesizer(speechConfig, audioConfig);
  return new Promise((resolve) => {
    synthesisResources.synthesizer.speakTextAsync(text,
      result => {
        cleanupSynthesisResources();
        resolve();
      },
      error => {
        console.error('Synthesis error:', error);
        cleanupSynthesisResources();
        resolve();
      }
    );
  });
}

async function restartRecognition() {
  recognitionState.isProcessing = false;
  recognitionState.isSpeaking = false;
  await new Promise(resolve => setTimeout(resolve, 500));
  await startRecognition();
}

async function cleanupAudioResources() {
  await stopSynthesis();
  await stopRecognition();
  cleanupSynthesisResources();
}

async function stopSynthesis() {
  if (synthesisResources.synthesizer) {
    await synthesisResources.synthesizer.close();
  }
  if (synthesisResources.player) {
    synthesisResources.player.pause();
  }
}

function cleanupSynthesisResources() {
  synthesisResources.player = null;
  synthesisResources.synthesizer = null;
  recognitionState.isSpeaking = false;
}

async function stopRecognition() {
  await recognizer.stopContinuousRecognitionAsync();
}

async function startRecognition() {
  await recognizer.startContinuousRecognitionAsync();
}

recognizer.recognized = async (s, e) => {
  if (e.result.reason === SpeechSDK.ResultReason.RecognizedSpeech) {
    const text = e.result.text.trim().toLowerCase();
    if (!text) return;
    if (text === "خروج") {
      await shutdownSystem();
      return;
    }
    if (!recognitionState.isProcessing) {
      await handleUserInput(text);
    }
  }
};

async function shutdownSystem() {
  await cleanupAudioResources();
  startBtn.textContent = "Start Call";
  const goodbyeSynth = new SpeechSDK.SpeechSynthesizer(speechConfig);
  goodbyeSynth.speakTextAsync("مع السلامة");
}

startBtn.addEventListener('click', async () => {
  if (startBtn.textContent === "Start Call") {
    startBtn.textContent = "Stop Call";
    await startRecognition();
  } else {
    await shutdownSystem();
  }
});

// Audio context initialization
(function initializeAudio() {
  new (window.AudioContext || window.webkitAudioContext)();
})();
</script>
</body>
</html>


2 answers

  1. Amira Bedhiafi 31,391 Reputation points
    2025-04-07T11:34:37.0666667+00:00

    Hello Kirollos!

    Thank you for posting on Microsoft Learn.

    You need to stop the ongoing speech synthesis the moment the user starts speaking again. Right now, your code overlaps the audio because it doesn’t interrupt synthesizeSpeech() when STT (speech recognition) picks up new input.

    Modify handleUserInput() to force-stop any active speech synthesis:

    async function handleUserInput(query) {
      if (recognitionState.isProcessing || recognitionState.isSpeaking) {
        console.log("Interrupting previous speech...");
        await cleanupAudioResources(); // <-- stop any active TTS
      }
    
      recognitionState.isProcessing = true;
      try {
        eventSource?.close();
        eventSource = new EventSource(`/stream?query=${encodeURIComponent(query)}`);
        eventSource.onmessage = async (event) => {
          const data = JSON.parse(event.data);
          if (data.error) throw new Error(data.error);
          if (data.response) await handleServerResponse(data.response);
        };
      } catch (error) {
        console.error('Processing error:', error);
        recognitionState.isProcessing = false;
      }
    }
    
    

    Make sure stopSynthesis() completely shuts off audio playback:

    async function stopSynthesis() {
      if (synthesisResources.synthesizer) {
        synthesisResources.synthesizer.close();
        synthesisResources.synthesizer = null;
      }
    
      if (synthesisResources.player) {
        synthesisResources.player.pause();
        synthesisResources.player = null;
      }
    
      recognitionState.isSpeaking = false;
    }
    
    

    I also recommend debouncing the STT input slightly so it isn't too aggressive with interruptions:

    let lastSpeechTime = 0;
    
    recognizer.recognized = async (s, e) => {
      if (e.result.reason === SpeechSDK.ResultReason.RecognizedSpeech) {
        const now = Date.now();
        if (now - lastSpeechTime < 1000) return; // ignore if less than 1 sec since last input
        lastSpeechTime = now;
    
        const text = e.result.text.trim().toLowerCase();
        if (!text) return;
    
        if (text === "خروج") {
          await shutdownSystem();
          return;
        }
    
        await handleUserInput(text);
      }
    };
    
    

    You can also briefly flash a message or even say something like “حسنًا، جاري الإجابة على سؤالك الجديد…” (“Okay, answering your new question now…”) to confirm the interruption.
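
    For example, here is a minimal sketch of that confirmation step; the showInterruptionNotice helper and the status element are assumptions (they are not part of your page), and it reuses the speechConfig you already created:

    // Sketch: briefly acknowledge the interruption before answering the new question.
    // Assumes a <div id="status"></div> somewhere in the page (not in the original markup).
    function showInterruptionNotice() {
      const statusDiv = document.getElementById('status');
      if (statusDiv) statusDiv.textContent = "جاري الإجابة على سؤالك الجديد…"; // "Answering your new question…"
      // Speak a very short confirmation with a throwaway synthesizer.
      const ackSynth = new SpeechSDK.SpeechSynthesizer(speechConfig);
      ackSynth.speakTextAsync(
        "حسنًا", // "Okay"
        () => ackSynth.close(),
        err => { console.error('Ack synthesis error:', err); ackSynth.close(); }
      );
    }

    You could call it inside handleUserInput() right after cleanupAudioResources(), so the previous audio is already silenced before the confirmation plays.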


  2. Manas Mohanty 3,005 Reputation points Microsoft External Staff
    2025-04-09T10:22:27.3666667+00:00

    Hi Kirollos Saleh,

    We have noticed that you rated an answer as not helpful. We appreciate your feedback and are committed to improving your experience with the Q&A.

    I have gone through the code you shared and finally got it working with full voice-to-voice functionality. Initially it only converted voice to text input; the changes build on the minor modifications suggested above by @santoshkc.

    The problem I have not fixed yet is that the browser asks for microphone permission repeatedly, but the voice overlap problem is fixed in the code below.
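
    One possible direction for the repeated permission prompt (untested here; getMicStream and createRecognizer are assumed names, not part of the code below) is to request the microphone once and reuse the resulting MediaStream through AudioConfig.fromStreamInput instead of fromDefaultMicrophoneInput:

    // Sketch: acquire the microphone once and reuse the stream for every recognizer,
    // so the browser only prompts for permission on the first getUserMedia call.
    let micStream = null;

    async function getMicStream() {
      if (!micStream) {
        micStream = await navigator.mediaDevices.getUserMedia({ audio: true });
      }
      return micStream;
    }

    async function createRecognizer() {
      const stream = await getMicStream();
      const audioConfig = SpeechSDK.AudioConfig.fromStreamInput(stream); // accepts a browser MediaStream
      return new SpeechSDK.SpeechRecognizer(speechConfig, audioConfig);
    }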

    Please check the attached code snippet for review.

    <!DOCTYPE html>
    <html lang="en">
    <head>
    <meta charset="UTF-8">
    <title>Live Carrier Chat</title>
    <script src="https://aka.ms/csspeech/jsbrowserpackageraw"></script>
    <style>
    body { font-family: sans-serif; text-align: center; }
    button { 
      margin: 20px; 
      padding: 15px 30px;
      font-size: 1.2em;
      border-radius: 25px;
      cursor: pointer;
      background: #2196F3;
      color: white;
      border: none;
      transition: all 0.3s ease;
    }
    button:hover { background: #1976D2; transform: scale(1.05); }
    button:active { transform: scale(0.95); }
    #recognized-text {
      margin-top: 20px;
      padding: 10px;
      border: 1px solid #ccc;
      width: 80%;
      margin-left: auto;
      margin-right: auto;
      font-size: 1.2em;
      text-align: left;
      height: 300px; /* Adjusted height to fit approximately 15 lines */
      overflow-y: auto; /* Add scroll bar if content exceeds the height */
    }
    </style>
    </head>
    <body>
    <h1>Live Carrier Chat</h1>
    <button id="start-btn">Start Call</button>
    <div id="recognized-text">Recognized text will appear here...</div>
    <script>
    const SPEECH_KEY = "<apikey>";
    const SPEECH_REGION = "<region>";
    // Initialize speech components
    const speechConfig = SpeechSDK.SpeechConfig.fromSubscription(SPEECH_KEY, SPEECH_REGION);
    speechConfig.speechRecognitionLanguage = "en-US";
    speechConfig.speechSynthesisVoiceName = "en-US-JennyNeural";
    let audioInput = null;
    let recognizer = null;
    // State management
    let synthesisResources = { player: null, synthesizer: null };
    let recognitionState = { isProcessing: false, isSpeaking: false };
    const startBtn = document.getElementById('start-btn');
    const recognizedTextDiv = document.getElementById('recognized-text');
    // Function to synthesize and speak text
    async function synthesizeSpeech(text) {
      recognitionState.isSpeaking = true;
      synthesisResources.player = new SpeechSDK.SpeakerAudioDestination();
      const audioConfig = SpeechSDK.AudioConfig.fromSpeakerOutput(synthesisResources.player);
      synthesisResources.synthesizer = new SpeechSDK.SpeechSynthesizer(speechConfig, audioConfig);
      return new Promise((resolve) => {
        synthesisResources.synthesizer.speakTextAsync(text,
          result => {
            cleanupSynthesisResources();
            resolve();
          },
          error => {
            console.error('Synthesis error:', error);
            cleanupSynthesisResources();
            resolve();
          }
        );
      });
    }
    // Function to restart recognition
    async function restartRecognition() {
      recognitionState.isProcessing = false;
      recognitionState.isSpeaking = false;
      await new Promise(resolve => setTimeout(resolve, 500));
      await startRecognition();
    }
    // Function to cleanup audio resources
    async function cleanupAudioResources() {
      await stopSynthesis();
      await stopRecognition();
      cleanupSynthesisResources();
    }
    // Function to stop synthesis
    async function stopSynthesis() {
      if (synthesisResources.synthesizer) {
        await synthesisResources.synthesizer.close();
      }
      if (synthesisResources.player) {
        synthesisResources.player.pause();
      }
    }
    // Function to cleanup synthesis resources
    function cleanupSynthesisResources() {
      synthesisResources.player = null;
      synthesisResources.synthesizer = null;
      recognitionState.isSpeaking = false;
    }
    // Function to stop recognition
    async function stopRecognition() {
      if (recognizer) {
        await recognizer.stopContinuousRecognitionAsync();
      }
    }
    // Function to start recognition
    async function startRecognition() {
      if (!recognizer) {
        audioInput = SpeechSDK.AudioConfig.fromDefaultMicrophoneInput();
        recognizer = new SpeechSDK.SpeechRecognizer(speechConfig, audioInput);
        recognizer.recognized = async (s, e) => {
          if (e.result.reason === SpeechSDK.ResultReason.RecognizedSpeech) {
            const text = e.result.text.trim();
            recognizedTextDiv.textContent += " " + text; // Append recognized text
            if (!text) return;
            if (text.toLowerCase() === "exit") {
              await shutdownSystem();
              return;
            }
            if (!recognitionState.isProcessing) {
              await stopRecognition(); // Stop recognition before synthesizing speech
              await synthesizeSpeech(text); // Synthesize and speak back the recognized text
              await restartRecognition(); // Restart recognition after synthesis
            }
          }
        };
      }
      await recognizer.startContinuousRecognitionAsync();
    }
    // Function to shutdown the system
    async function shutdownSystem() {
      await cleanupAudioResources();
      startBtn.textContent = "Start Call";
      const goodbyeSynth = new SpeechSDK.SpeechSynthesizer(speechConfig);
      goodbyeSynth.speakTextAsync("Goodbye");
    }
    // Start button event handler
    startBtn.addEventListener('click', async () => {
      if (startBtn.textContent === "Start Call") {
        startBtn.textContent = "Stop Call";
        await startRecognition();
      } else {
        await shutdownSystem();
      }
    });
    // Request microphone access on page load
    navigator.mediaDevices.getUserMedia({ audio: true })
      .then(stream => {
        console.log('Microphone access granted');
      })
      .catch(error => {
        console.error('Microphone access denied', error);
      });
    // Audio context initialization
    (function initializeAudio() {
      new (window.AudioContext || window.webkitAudioContext)();
    })();
    </script>
    </body>
    </html>
    

    It converts my voice input to text, shows the recognized text in the text window, and then speaks it back.

    The process is as follows:

    1. Request Microphone Access: The script requests microphone access when the page loads using navigator.mediaDevices.getUserMedia({ audio: true }).
    2. Initialize Recognizer Once: The recognizer is initialized only once when startRecognition is called for the first time.
    3. Stop Recognition Before Synthesis: The script stops recognition before synthesizing speech to prevent sound mixing.
    4. Restart Recognition After Synthesis: Recognition is restarted after speech synthesis to ensure smooth transitions between speaking and listening.
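
    If you instead want to keep listening while the assistant is speaking and cut playback off as soon as the user starts talking (true barge-in, which is what the original question asks for), a minimal sketch could hook the recognizer's recognizing event. This assumes headphones or echo cancellation so the microphone does not pick up the TTS output, and that you skip the stopRecognition()/restartRecognition() calls around synthesizeSpeech() so recognition keeps running during playback:

    // Sketch: barge-in - stop playback as soon as partial speech is detected.
    // Attach this next to the existing recognizer.recognized handler inside startRecognition().
    recognizer.recognizing = (s, e) => {
      if (recognitionState.isSpeaking && e.result.text) {
        console.log('Barge-in detected, stopping playback...');
        if (synthesisResources.player) synthesisResources.player.pause();           // stop audio output
        if (synthesisResources.synthesizer) synthesisResources.synthesizer.close(); // stop synthesis
        cleanupSynthesisResources();
      }
    };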

    Please acknowledge if it helps.

    Thank you.

