In this front-end code, I'm using TTS (Text-to-Speech) and STT (Speech-to-Text) from the Azure Speech SDK to enable live voice-to-voice chat. However, I'm facing an issue: when the user asks a new question while the TTS is still speaking the answer to the previous one, both audio streams overlap and play simultaneously. I want to handle this kind of user interruption (barge-in) properly.
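Conceptually, the behavior I'm after is sketched below (onUserUtterance is just a hypothetical wrapper to illustrate the idea, not part of my code): cancel any in-flight playback first, then process the new question.

async function onUserUtterance(text) {
// hypothetical barge-in wrapper: silence the old answer before handling the new one
if (recognitionState.isSpeaking) {
await stopSynthesis(); // cut off the current TTS playback
cleanupSynthesisResources();
}
await handleUserInput(text); // then process the new question
}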
The code:
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>Live Carrier Chat</title>
<script src="https://aka.ms/csspeech/jsbrowserpackageraw"></script>
<style>
body { font-family: sans-serif; text-align: center; }
button {
margin: 20px;
padding: 15px 30px;
font-size: 1.2em;
border-radius: 25px;
cursor: pointer;
background: #2196F3;
color: white;
border: none;
transition: all 0.3s ease;
}
button:hover { background: #1976D2; transform: scale(1.05); }
button:active { transform: scale(0.95); }
</style>
</head>
<body>
<h1>Live Carrier Chat</h1>
<button id="start-btn">Start Call</button>
<script>
const SPEECH_KEY = "<YOUR_SPEECH_KEY>"; // redacted
const SPEECH_REGION = "<YOUR_SPEECH_REGION>"; // redacted
// Initialize speech components
const speechConfig = SpeechSDK.SpeechConfig.fromSubscription(SPEECH_KEY, SPEECH_REGION);
speechConfig.speechRecognitionLanguage = "ar-EG";
speechConfig.speechSynthesisVoiceName = "ar-EG-ShakirNeural";
const audioInput = SpeechSDK.AudioConfig.fromDefaultMicrophoneInput();
let recognizer = new SpeechSDK.SpeechRecognizer(speechConfig, audioInput);
// State management
let synthesisResources = { player: null, synthesizer: null };
let recognitionState = { isProcessing: false, isSpeaking: false };
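// NOTE: isSpeaking is written in several places below but never read,
// so nothing currently blocks a new query (or stops playback) while TTS audio is still playing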
let eventSource = null;
const startBtn = document.getElementById('start-btn');
async function handleUserInput(query) {
if (recognitionState.isProcessing) return;
recognitionState.isProcessing = true;
try {
// Cleanup previous session
eventSource?.close();
await cleanupAudioResources();
// Start new session
eventSource = new EventSource(`/stream?query=${encodeURIComponent(query)}`);
eventSource.onmessage = async (event) => {
// NB: exceptions thrown inside this async callback never reach the outer try/catch,
// so errors have to be handled here as well
try {
const data = JSON.parse(event.data);
if (data.error) throw new Error(data.error);
if (data.response) await handleServerResponse(data.response);
} catch (error) {
console.error('Stream error:', error);
eventSource.close();
recognitionState.isProcessing = false;
}
};
} catch (error) {
console.error('Processing error:', error);
recognitionState.isProcessing = false;
}
}
async function handleServerResponse(response) {
eventSource.close();
await synthesizeSpeech(response);
await restartRecognition();
}
async function synthesizeSpeech(text) {
recognitionState.isSpeaking = true;
synthesisResources.player = new SpeechSDK.SpeakerAudioDestination();
const audioConfig = SpeechSDK.AudioConfig.fromSpeakerOutput(synthesisResources.player);
synthesisResources.synthesizer = new SpeechSDK.SpeechSynthesizer(speechConfig, audioConfig);
return new Promise((resolve) => {
synthesisResources.synthesizer.speakTextAsync(text,
result => {
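// As far as I can tell this fires when synthesis completes,
// not when the SpeakerAudioDestination finishes *playing* the audio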
cleanupSynthesisResources();
resolve();
},
error => {
console.error('Synthesis error:', error);
cleanupSynthesisResources();
resolve();
}
);
});
}
async function restartRecognition() {
recognitionState.isProcessing = false;
recognitionState.isSpeaking = false;
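// brief grace period before reopening the mic, presumably so the tail of the TTS isn't re-captured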
await new Promise(resolve => setTimeout(resolve, 500));
await startRecognition();
}
async function cleanupAudioResources() {
await stopSynthesis();
await stopRecognition();
cleanupSynthesisResources();
}
async function stopSynthesis() {
if (synthesisResources.player) {
synthesisResources.player.pause();
}
if (synthesisResources.synthesizer) {
// close() is callback-based in the JS SDK; wrap it so await actually waits
await new Promise(resolve => synthesisResources.synthesizer.close(resolve, resolve));
}
}
function cleanupSynthesisResources() {
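// NB: nulling the player here loses the handle to audio that may still be playing,
// so a later stopSynthesis() has nothing left to pause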
synthesisResources.player = null;
synthesisResources.synthesizer = null;
recognitionState.isSpeaking = false;
}
// The SDK's stop/start methods are callback-based and return void,
// so they are wrapped in Promises to make await meaningful
function stopRecognition() {
return new Promise((resolve, reject) => recognizer.stopContinuousRecognitionAsync(resolve, reject));
}
function startRecognition() {
return new Promise((resolve, reject) => recognizer.startContinuousRecognitionAsync(resolve, reject));
}
recognizer.recognized = async (s, e) => {
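// Since restartRecognition() runs once synthesis (not playback) is done,
// this handler can fire while the previous answer is still playing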
if (e.result.reason === SpeechSDK.ResultReason.RecognizedSpeech) {
const text = e.result.text.trim().toLowerCase();
if (!text) return;
if (text === "خروج") { // "exit"
await shutdownSystem();
return;
}
if (!recognitionState.isProcessing) {
await handleUserInput(text);
}
}
};
async function shutdownSystem() {
await cleanupAudioResources();
startBtn.textContent = "Start Call";
const goodbyeSynth = new SpeechSDK.SpeechSynthesizer(speechConfig);
goodbyeSynth.speakTextAsync("مع السلامة", // "Goodbye"
() => goodbyeSynth.close(),
() => goodbyeSynth.close());
}
startBtn.addEventListener('click', async () => {
if (startBtn.textContent === "Start Call") {
startBtn.textContent = "Stop Call";
await startRecognition();
} else {
await shutdownSystem();
}
});
// Audio context initialization — note this context is created and immediately discarded,
// and browsers keep AudioContexts suspended until a user gesture anyway
(function initializeAudio() {
new (window.AudioContext || window.webkitAudioContext)();
})();
</script>
</body>
</html>