question

KennethArakelian-8284 avatar image
1 Vote"
KennethArakelian-8284 asked 22580266 commented

Using Azure speech to text with nodejs converting mulaw to pcm

I'm using twilio <stream> to send mulaw audio via websocket to my nodejs app and trying to convert mulaw to pcm for azure speech to text. my code is below. the app connects and sends audio but the speech to text results are empty or nomatch.

 #!/usr/bin/env node
 const WebSocket = require("ws");
 const alawmulaw = require("alawmulaw");
 const base64 = require("js-base64");
 const express = require("express");
 const app = express();
 const server = require("http").createServer(app);
 const wss = new WebSocket.Server({ server });
 const subscriptionKey = "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxx";
 const serviceRegion = "eastus"; // e.g., "westus"
 const language = "en-US";
    
 const sdk = require("microsoft-cognitiveservices-speech-sdk");
 const azurePusher = sdk.AudioInputStream.createPushStream(sdk.AudioStreamFormat.getWaveFormatPCM(8000, 16, 1));
 const audioConfig = sdk.AudioConfig.fromStreamInput(azurePusher);
    
 const speechConfig = sdk.SpeechConfig.fromSubscription(subscriptionKey,serviceRegion);
    
 speechConfig.speechRecognitionLanguage = language;
 var recognizer = new sdk.SpeechRecognizer(speechConfig, audioConfig);
    
 recognizer.recognizing = (s, e) => {
     console.log(`RECOGNIZING: Text=${e.result.text}`);
 };
    
 recognizer.recognized = (s, e) => {
     if (e.result.reason == sdk.ResultReason.RecognizedSpeech) {
         console.log(`RECOGNIZED: Text=${e.result.text}`);
     }
     else if (e.result.reason == sdk.ResultReason.NoMatch) {
         console.log("NOMATCH: Speech could not be recognized.");
     }
 };
    
 recognizer.canceled = (s, e) => {
     console.log(`CANCELED: Reason=${e.reason}`);
    
     if (e.reason == sdk.CancellationReason.Error) {
         console.log(`"CANCELED: ErrorCode=${e.errorCode}`);
         console.log(`"CANCELED: ErrorDetails=${e.errorDetails}`);
         console.log("CANCELED: Did you update the key and location/region info?");
     }
    
     recognizer.stopContinuousRecognitionAsync();
 };
    
 recognizer.sessionStopped = (s, e) => {
     console.log("\n    Session stopped event.");
     recognizer.stopContinuousRecognitionAsync();
 };
    
 recognizer.startContinuousRecognitionAsync(() => {
     console.log("Continuous Reco Started");
 },
     err => {
         console.trace("err - " + err);
         recognizer.close();
         recognizer = undefined;
     });
    
 // Handle Web Socket Connection
 wss.on("connection", function connection(ws) {
     console.log("New Connection Initiated");
    
     ws.on("message", function incoming(message) {
         const msg = JSON.parse(message);
         switch (msg.event) {
             case "connected":
                 console.log(`A new call has connected.`);
    
                 break;
             case "start":
                 console.log(`Starting Media Stream ${msg.streamSid}`);
                 break;
             case "media":
                 //process.stdout.write(msg.media.payload + "|");
                 //console.log("\n---------new media----------");
                 //console.log(msg.media.payload);
                 let streampayload = base64.decode(msg.media.payload);
                 //console.log(streampayload);
                 let mulawdata = Buffer.from(streampayload);
                 //console.log("\n---------mulaw " + mulawdata.length.toString() + "----------");
                 for (let a of mulawdata.values()) {
                     //process.stdout.write(a.toString() + "|");
                 }
                 //console.log(mulawdata.slice());
                 let pcmdata = Buffer.from(alawmulaw.mulaw.decode(mulawdata));
                 //console.log("\n---------pcm " + pcmdata.length.toString() + "----------");
                 azurePusher.write(pcmdata);
                 break;
             case "stop":
                 console.log(`Call Has Ended`);
                 azurePusher.close();
                 recognizer.stopContinuousRecognitionAsync();
                 break;
         }
     });
    
 });
    
 //Handle HTTP Request
 app.get("/", (req, res) => res.send("Hello World"));
    
 console.log("Listening at Port 8080");
 server.listen(8080);
azure-speech
· 3
5 |1600 characters needed characters left characters exceeded

Up to 10 attachments (including images) can be used with a maximum of 3.0 MiB each and 30.0 MiB total.

@KennethArakelian-8284 I have come across an issue on speech SDK repo that discusses the support of Twilio stream with speech SDK. It looks like the support for this scenario is still in the pipeline. However a workaround is provided by the user using gstreamer on their repo for .NET, I will check internally if a similar solution is available for nodejs or you could post an issue or feature on speech SDK repo for the same. Thanks!!


0 Votes 0 ·

Thank you - I really appreciate it!

0 Votes 0 ·
22580266 avatar image 22580266 KennethArakelian-8284 ·

Hi.
I'm encountering the same phenomenon, but what should I do with this problem after all?
I would be grateful if you could tell me.

0 Votes 0 ·

0 Answers