Pronunciation Assessment SDK returns low score with ErrorType: 'Mispronunciation' compared to Speech Studio.
In my Node.js app, which uses the microsoft-cognitiveservices-speech-sdk
package, Azure's Pronunciation Assessment for Japanese always returns ErrorType: 'Mispronunciation' and a very low score (around 50), even though it usually returns around 90-100 when tested in Speech Studio. (It works fine when I test it in English.)
I tried making the audio format match the one used in Speech Studio:
channels = 1, bitsPerSample = 16, samplesPerSecond = 16000
I also tried downloading the audio file from Speech Studio and testing it with my app, which returns the same result.
{
Word: 'こんにちは',
Offset: 5700000,
Duration: 11700000,
PronunciationAssessment: { AccuracyScore: 53, ErrorType: 'Mispronunciation' },
Phonemes: [
[Object], [Object],
[Object], [Object],
[Object], [Object],
[Object], [Object],
[Object]
]
}
{
accuracyScore: 54,
fluencyScore: 100,
completenessScore: 0,
pronunciationScore: 30.8
}
completenessScore is always 0 as well.
Here is my code.
async azureGradeSpeech(
input: GradeSpeechInput
): Promise<SpeechScore> {
const { text, audioFilePath } = input;
const speechConfig = SpeechSDK.SpeechConfig.fromSubscription(
azureSubscriptionKey,
azureRegion
);
const buffer = await this.readLocalFileAsBuffer(audioFilePath);
const audioConfig = SpeechSDK.AudioConfig.fromWavFileInput(buffer);
const audAudioConfig =
SpeechSDK.AutoDetectSourceLanguageConfig.fromLanguages(["ja-JP"]);
const speechRecognizer = SpeechSDK.SpeechRecognizer.FromConfig(
speechConfig,
audAudioConfig,
audioConfig
);
const resultConfig = {
referenceText: text,
gradingSystem: "HundredMark",
granularity: "Phoneme", // Phoneme, Syllable, Word, FullText
// EnableMiscue: true,
};
const pronunciationAssessmentConfig =
SpeechSDK.PronunciationAssessmentConfig.fromJSON(
JSON.stringify(resultConfig)
);
pronunciationAssessmentConfig.applyTo(speechRecognizer);
const result: SpeechScore = await new Promise((resolve, reject) => {
speechRecognizer.recognizeOnceAsync(
(speechRecognitionResult: SpeechSDK.SpeechRecognitionResult) => {
// The pronunciation assessment result as a Speech SDK object
const pronunciationAssessmentResult =
SpeechSDK.PronunciationAssessmentResult.fromResult(
speechRecognitionResult
);
const pronunciationAssessmentResultJson =
speechRecognitionResult.properties.getProperty(
SpeechSDK.PropertyId.SpeechServiceResponse_JsonResult
);
const jsonResult = JSON.parse(pronunciationAssessmentResultJson);
const words = jsonResult.NBest[0].Words;
const length = words.length;
const totalAccuracyScore = words.reduce(
(accumulator, curr) =>
curr.PronunciationAssessment.AccuracyScore + accumulator,
0
);
const average = Number((totalAccuracyScore / length).toFixed(0));
const score: SpeechScore = {
averageAccuracyScore: average,
accuracyScore: pronunciationAssessmentResult.accuracyScore,
fluencyScore: pronunciationAssessmentResult.fluencyScore,
completenessScore: pronunciationAssessmentResult.completenessScore,
pronunciationScore:
pronunciationAssessmentResult.pronunciationScore,
};
resolve(score);
},
(error) => {
reject(error);
}
);
});
console.log("result", result);
return result;
}