Azure TTS generating garbled result when requesting Opus encoding
The following sample code (C++, Linux, x64) uses the Microsoft Speech SDK to request text-to-speech synthesis of a single sentence in Opus format with no container, and then uses libopus to decode the result to raw PCM. Everything runs without errors, but the output sounds garbled, as if some of the audio is missing. The final message, "Done, got 14880 bytes, decoded to 24000 bytes", suggests that this may be a decoding issue rather than an Azure issue, since I would expect a much higher compression ratio.
Can someone explain what I might be doing wrong?
Note that this generates a raw PCM file; play it back with: aplay out.raw -f S16_LE -r 24000 -c 1
#include <stdio.h>
#include <string>
#include <assert.h>
#include <vector>
#include <speechapi_cxx.h>
#include <opus.h>
using namespace Microsoft::CognitiveServices::Speech;
// Azure Speech credentials/region and the sentence to synthesize.
static const std::string subscription_key = "abcd1234"; // insert valid key here
static const std::string service_region = "westus";
static const std::string text = "Hi, this is Azure";
// Sample rate (Hz) handed to opus_decoder_create() in main().
static const int sample_rate = 24000;
// Upper bound used to size the decode buffer in main().
// Parenthesized so the macro expands correctly inside larger expressions
// (e.g. `x / MAX_FRAME_SIZE`).
#define MAX_FRAME_SIZE (6*960) // from Opus trivial_example.c
int main(int argc, char **argv) {
// create Opus decoder
int err;
OpusDecoder* opus_decoder = opus_decoder_create(sample_rate, 1, &err);
assert(err == OPUS_OK);
// create Azure client
auto azure_speech_config = SpeechConfig::FromSubscription(subscription_key, service_region);
azure_speech_config->SetSpeechSynthesisVoiceName("en-US-JennyNeural");
azure_speech_config->SetSpeechSynthesisOutputFormat(SpeechSynthesisOutputFormat::Audio24Khz16Bit48KbpsMonoOpus);
auto azure_synth = SpeechSynthesizer::FromConfig(azure_speech_config, NULL);
FILE* fp = fopen("out.raw", "w");
int in_bytes=0, decoded_bytes=0;
// callback to capture incoming packets
azure_synth->Synthesizing += [&in_bytes, &decoded_bytes, fp, opus_decoder](const SpeechSynthesisEventArgs& e) {
printf("Synthesizing event received with audio chunk of %zu bytes\n", e.Result->GetAudioData()->size());
auto audio_data = e.Result->GetAudioData();
in_bytes += audio_data->size();
// confirm that this is exactly one valid Opus packet
assert(opus_packet_get_nb_frames((const unsigned char*)audio_data->data(), audio_data->size()) == 1);
// decode the packet
std::vector<uint8_t> decoded_data(MAX_FRAME_SIZE);
int decoded_frame_size = opus_decode(opus_decoder, (const unsigned char*)audio_data->data(), audio_data->size(),
(opus_int16*)decoded_data.data(), decoded_data.size()/sizeof(opus_int16), 0);
assert(decoded_frame_size > 0); // confirm no decode error
decoded_frame_size *= sizeof(opus_int16); // result size is in samples, convert to bytes
printf("Decoded to %d bytes\n", decoded_frame_size);
assert(decoded_frame_size <= (int)decoded_data.size());
fwrite(decoded_data.data(), 1, decoded_frame_size, fp);
decoded_bytes += decoded_frame_size;
};
// perform TTS
auto result = azure_synth->SpeakText(text);
printf("Done, got %d bytes, decoded to %d bytes\n", in_bytes, decoded_bytes);
// cleanup
fclose(fp);
opus_decoder_destroy(opus_decoder);
}