I'm trying to use the Pronunciation Assessment feature
of the Azure Speech SDK, but I cannot get a reasonable result.
I've tested this with the word "school" and with other words as well, but I always get an accuracy score of 0,
no matter whether the word was spoken correctly or not. I generated the audio files using this Text-to-Audio tool, so this should be easily reproducible.
Does anyone have any idea why the accuracy score is always 0, or what I might be missing?
namespace PronunciationAssessmentDemo
{
    /// <summary>
    /// Console demo that runs Azure Speech pronunciation assessment on a
    /// pre-recorded audio file and prints the resulting accuracy score.
    /// </summary>
    class Program
    {
        /// <summary>
        /// Wraps in-memory audio in an <see cref="AudioConfig"/> backed by a push stream.
        /// </summary>
        /// <remarks>
        /// Per the Speech SDK documentation, a push stream created without an explicit
        /// format is interpreted as 16 kHz, 16-bit, mono PCM. Pushing a complete .wav
        /// file into such a stream feeds the RIFF header to the recognizer as if it were
        /// audio and, when the file uses another sample rate or channel count,
        /// misinterprets every sample. To avoid that, a RIFF/WAVE container holding
        /// uncompressed PCM is detected here: the stream is created with the format
        /// declared in the file and only the samples of the "data" chunk are pushed.
        /// Anything that is not recognised as PCM WAV is pushed unchanged into a
        /// default-format stream, exactly as before.
        /// NOTE(review): which sample rates the service accepts depends on the SDK
        /// version in use - confirm against the SDK documentation.
        /// </remarks>
        /// <param name="audioBytes">Raw PCM samples or the full contents of a PCM .wav file.</param>
        /// <returns>An audio configuration reading from the (already closed) push stream.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="audioBytes"/> is null.</exception>
        public static AudioConfig CreateAudioConfigFromBytes(byte[] audioBytes)
        {
            if (audioBytes == null)
            {
                throw new ArgumentNullException(nameof(audioBytes));
            }

            PushAudioInputStream pushStream;
            if (TryLocatePcmData(audioBytes, out uint sampleRate, out byte bitsPerSample,
                                 out byte channels, out int dataOffset, out int dataLength))
            {
                // Describe the audio using the values declared in the file's "fmt " chunk.
                var format = AudioStreamFormat.GetWaveFormatPCM(sampleRate, bitsPerSample, channels);
                pushStream = AudioInputStream.CreatePushStream(format);

                if (dataLength > 0)
                {
                    // Push only the samples; the container header is not audio.
                    var samples = new byte[dataLength];
                    Array.Copy(audioBytes, dataOffset, samples, 0, dataLength);
                    pushStream.Write(samples);
                }
            }
            else
            {
                // Not a PCM WAV container: keep the previous behaviour and hand the
                // bytes over as-is to a default-format stream.
                pushStream = AudioInputStream.CreatePushStream();
                pushStream.Write(audioBytes);
            }

            // Closing signals end-of-stream so single-shot recognition can finish.
            pushStream.Close();
            return AudioConfig.FromStreamInput(pushStream);
        }

        /// <summary>
        /// Scans a RIFF/WAVE byte buffer for its "fmt " and "data" chunks.
        /// </summary>
        /// <returns>
        /// True when the buffer is a WAVE container with format tag 1 (uncompressed PCM)
        /// and a "data" chunk following the "fmt " chunk; false otherwise.
        /// </returns>
        private static bool TryLocatePcmData(byte[] bytes, out uint sampleRate, out byte bitsPerSample,
                                             out byte channels, out int dataOffset, out int dataLength)
        {
            sampleRate = 0;
            bitsPerSample = 0;
            channels = 0;
            dataOffset = 0;
            dataLength = 0;

            // Container layout: "RIFF" <uint32 size> "WAVE" followed by chunks.
            if (bytes.Length < 12 || !HasTag(bytes, 0, "RIFF") || !HasTag(bytes, 8, "WAVE"))
            {
                return false;
            }

            bool formatSeen = false;
            long position = 12;
            while (position + 8 <= bytes.Length)
            {
                int header = (int)position;
                long chunkSize = ReadUInt32LittleEndian(bytes, header + 4);
                int body = header + 8;

                if (HasTag(bytes, header, "fmt "))
                {
                    if (chunkSize < 16 || body + 16 > bytes.Length)
                    {
                        return false;
                    }

                    int formatTag = ReadUInt16LittleEndian(bytes, body);
                    int channelCount = ReadUInt16LittleEndian(bytes, body + 2);
                    int bits = ReadUInt16LittleEndian(bytes, body + 14);

                    // Only plain PCM (tag 1) can be described by GetWaveFormatPCM, whose
                    // channel and bit-depth parameters are single bytes.
                    if (formatTag != 1 || channelCount < 1 || channelCount > byte.MaxValue
                        || bits < 1 || bits > byte.MaxValue)
                    {
                        return false;
                    }

                    sampleRate = ReadUInt32LittleEndian(bytes, body + 4);
                    channels = (byte)channelCount;
                    bitsPerSample = (byte)bits;
                    formatSeen = true;
                }
                else if (HasTag(bytes, header, "data"))
                {
                    if (!formatSeen)
                    {
                        return false;
                    }

                    dataOffset = body;
                    // Clamp: a truncated file may declare more data than it contains.
                    dataLength = (int)Math.Min(chunkSize, (long)bytes.Length - body);
                    return true;
                }

                // Chunk bodies are padded to an even number of bytes.
                position = body + chunkSize + (chunkSize & 1);
            }

            return false;
        }

        /// <summary>Checks whether the four bytes at <paramref name="offset"/> spell the ASCII <paramref name="tag"/>.</summary>
        private static bool HasTag(byte[] bytes, int offset, string tag)
        {
            for (int i = 0; i < 4; i++)
            {
                if (bytes[offset + i] != (byte)tag[i])
                {
                    return false;
                }
            }

            return true;
        }

        /// <summary>Reads an unsigned 16-bit little-endian value.</summary>
        private static int ReadUInt16LittleEndian(byte[] bytes, int offset)
        {
            return bytes[offset] | (bytes[offset + 1] << 8);
        }

        /// <summary>Reads an unsigned 32-bit little-endian value.</summary>
        private static uint ReadUInt32LittleEndian(byte[] bytes, int offset)
        {
            return (uint)(bytes[offset] | (bytes[offset + 1] << 8) | (bytes[offset + 2] << 16))
                   | ((uint)bytes[offset + 3] << 24);
        }

        /// <summary>
        /// Runs a single-shot pronunciation assessment of <paramref name="audioBytes"/>
        /// against <paramref name="referenceText"/>.
        /// </summary>
        /// <param name="audioBytes">Audio to assess; see <see cref="CreateAudioConfigFromBytes"/>.</param>
        /// <param name="referenceText">The text the speaker was expected to say.</param>
        /// <returns>
        /// The accuracy score reported by the service, or 0 when the result reason is
        /// anything other than <see cref="ResultReason.RecognizedSpeech"/>. A returned 0
        /// is therefore ambiguous; the console output tells the two cases apart.
        /// </returns>
        /// <exception cref="InvalidOperationException">The STT_API_KEY environment variable is missing or blank.</exception>
        public static async Task<float> AssessPronunciation(byte[] audioBytes, string referenceText)
        {
            string subscriptionKey = Environment.GetEnvironmentVariable("STT_API_KEY");
            if (string.IsNullOrWhiteSpace(subscriptionKey))
            {
                // Fail with an actionable message instead of an SDK argument error.
                throw new InvalidOperationException("The STT_API_KEY environment variable is not set.");
            }

            string region = "eastus";
            var pronunciationAssessmentConfig = new PronunciationAssessmentConfig(
                referenceText: referenceText,
                gradingSystem: GradingSystem.HundredMark,
                granularity: Granularity.Phoneme,
                enableMiscue: false);
            var speechConfig = SpeechConfig.FromSubscription(subscriptionKey, region);

            // Both objects are disposable; the recognizer is released before the audio config.
            using (var audioConfig = CreateAudioConfigFromBytes(audioBytes))
            using (var speechRecognizer = new SpeechRecognizer(speechConfig, audioConfig))
            {
                pronunciationAssessmentConfig.ApplyTo(speechRecognizer);
                var speechRecognitionResult = await speechRecognizer.RecognizeOnceAsync();

                if (speechRecognitionResult.Reason == ResultReason.RecognizedSpeech)
                {
                    Console.WriteLine("Recognized: " + speechRecognitionResult.Text);
                    var pronunciationAssessmentResult = PronunciationAssessmentResult.FromResult(speechRecognitionResult);
                    Console.WriteLine($"Accuracy Score: {pronunciationAssessmentResult.AccuracyScore}");
                    return (float)pronunciationAssessmentResult.AccuracyScore;
                }

                Console.WriteLine($"Recognition failed: {speechRecognitionResult.Reason}");
                if (speechRecognitionResult.Reason == ResultReason.Canceled)
                {
                    // Surface the underlying cause (bad key, wrong region, network, ...)
                    // so that a failure is not mistaken for a genuine score of 0.
                    var cancellation = CancellationDetails.FromResult(speechRecognitionResult);
                    Console.WriteLine($"Cancellation reason: {cancellation.Reason}, error code: {cancellation.ErrorCode}, details: {cancellation.ErrorDetails}");
                }

                return 0;
            }
        }

        /// <summary>
        /// Entry point: loads the sample recording, assesses it against the word
        /// "school" and prints the final accuracy score.
        /// </summary>
        static async Task Main(string[] args)
        {
            string audioFilePath = "...wwwroot\\audio\\school.wav";
            string referenceText = "school";
            byte[] audioBytes = File.ReadAllBytes(audioFilePath);
            float accuracyScore = await AssessPronunciation(audioBytes, referenceText);
            Console.WriteLine($"Final Accuracy Score: {accuracyScore}");
        }
    }
}
using Microsoft.CognitiveServices.Speech.Audio;
namespace PronunciationAssessmentDemo
{
class Program
{
/// <summary>
/// Wraps in-memory audio in an <see cref="AudioConfig"/> backed by a push stream.
/// </summary>
/// <remarks>
/// Per the Speech SDK documentation, a push stream created without an explicit
/// format is interpreted as 16 kHz, 16-bit, mono PCM. Pushing a complete .wav file
/// into such a stream feeds the RIFF header to the recognizer as if it were audio
/// and, when the file uses another sample rate or channel count, misinterprets
/// every sample. To avoid that, a RIFF/WAVE container holding uncompressed PCM is
/// detected here: the stream is created with the format declared in the file and
/// only the samples of the "data" chunk are pushed. Anything that is not recognised
/// as PCM WAV is pushed unchanged into a default-format stream, exactly as before.
/// NOTE(review): which sample rates the service accepts depends on the SDK version
/// in use - confirm against the SDK documentation.
/// </remarks>
/// <param name="audioBytes">Raw PCM samples or the full contents of a PCM .wav file.</param>
/// <returns>An audio configuration reading from the (already closed) push stream.</returns>
/// <exception cref="ArgumentNullException"><paramref name="audioBytes"/> is null.</exception>
public static AudioConfig CreateAudioConfigFromBytes(byte[] audioBytes)
{
    if (audioBytes == null)
    {
        throw new ArgumentNullException(nameof(audioBytes));
    }

    PushAudioInputStream pushStream;
    if (TryFindWavPcmPayload(audioBytes, out uint sampleRate, out byte bitsPerSample,
                             out byte channels, out int dataOffset, out int dataLength))
    {
        // Describe the audio using the values declared in the file's "fmt " chunk.
        var format = AudioStreamFormat.GetWaveFormatPCM(sampleRate, bitsPerSample, channels);
        pushStream = AudioInputStream.CreatePushStream(format);

        if (dataLength > 0)
        {
            // Push only the samples; the container header is not audio.
            var samples = new byte[dataLength];
            Array.Copy(audioBytes, dataOffset, samples, 0, dataLength);
            pushStream.Write(samples);
        }
    }
    else
    {
        // Not a PCM WAV container: keep the previous behaviour and hand the bytes
        // over as-is to a default-format stream.
        pushStream = AudioInputStream.CreatePushStream();
        pushStream.Write(audioBytes);
    }

    // Closing signals end-of-stream so single-shot recognition can finish.
    pushStream.Close();
    return AudioConfig.FromStreamInput(pushStream);
}

/// <summary>
/// Scans a RIFF/WAVE byte buffer for its "fmt " and "data" chunks.
/// </summary>
/// <returns>
/// True when the buffer is a WAVE container with format tag 1 (uncompressed PCM)
/// and a "data" chunk following the "fmt " chunk; false otherwise.
/// </returns>
private static bool TryFindWavPcmPayload(byte[] bytes, out uint sampleRate, out byte bitsPerSample,
                                         out byte channels, out int dataOffset, out int dataLength)
{
    sampleRate = 0;
    bitsPerSample = 0;
    channels = 0;
    dataOffset = 0;
    dataLength = 0;

    // Container layout: "RIFF" <uint32 size> "WAVE" followed by chunks.
    if (bytes.Length < 12 || !MatchesWavTag(bytes, 0, "RIFF") || !MatchesWavTag(bytes, 8, "WAVE"))
    {
        return false;
    }

    bool formatSeen = false;
    long position = 12;
    while (position + 8 <= bytes.Length)
    {
        int header = (int)position;
        // Chunk size is an unsigned 32-bit little-endian value.
        long chunkSize = (uint)(bytes[header + 4] | (bytes[header + 5] << 8) | (bytes[header + 6] << 16))
                         | ((uint)bytes[header + 7] << 24);
        int body = header + 8;

        if (MatchesWavTag(bytes, header, "fmt "))
        {
            if (chunkSize < 16 || body + 16 > bytes.Length)
            {
                return false;
            }

            int formatTag = bytes[body] | (bytes[body + 1] << 8);
            int channelCount = bytes[body + 2] | (bytes[body + 3] << 8);
            int bits = bytes[body + 14] | (bytes[body + 15] << 8);

            // Only plain PCM (tag 1) can be described by GetWaveFormatPCM, whose
            // channel and bit-depth parameters are single bytes.
            if (formatTag != 1 || channelCount < 1 || channelCount > byte.MaxValue
                || bits < 1 || bits > byte.MaxValue)
            {
                return false;
            }

            sampleRate = (uint)(bytes[body + 4] | (bytes[body + 5] << 8) | (bytes[body + 6] << 16))
                         | ((uint)bytes[body + 7] << 24);
            channels = (byte)channelCount;
            bitsPerSample = (byte)bits;
            formatSeen = true;
        }
        else if (MatchesWavTag(bytes, header, "data"))
        {
            if (!formatSeen)
            {
                return false;
            }

            dataOffset = body;
            // Clamp: a truncated file may declare more data than it contains.
            dataLength = (int)Math.Min(chunkSize, (long)bytes.Length - body);
            return true;
        }

        // Chunk bodies are padded to an even number of bytes.
        position = body + chunkSize + (chunkSize & 1);
    }

    return false;
}

/// <summary>Checks whether the four bytes at <paramref name="offset"/> spell the ASCII <paramref name="tag"/>.</summary>
private static bool MatchesWavTag(byte[] bytes, int offset, string tag)
{
    for (int i = 0; i < 4; i++)
    {
        if (bytes[offset + i] != (byte)tag[i])
        {
            return false;
        }
    }

    return true;
}