Hello,
I want to evaluate the reference text "He's your father-in-law, isn't he?" against its standard AI-generated audio file:
https://storage.googleapis.com/eng-speaking-data-test/audio/material/65fab4a069755e2a0d04f71c/65fab8879965f7179048b223/65fab8879965f7179048b223_question.wav
When I evaluate this AI-generated audio file with Azure's pronunciation assessment, the accuracy score I receive is significantly lower than expected.
Since the audio was generated by AI, I would expect a high accuracy score. The lower score appears to come from the hyphenated word "father-in-law", which is scored 45 and flagged as "Mispronunciation", while every other word scores 100.
Could this be caused by a mistake in how I'm using the tool?
Here is the word-level ("Words") portion of the detailed JSON result:
"Words": [
{
"Word": "he's",
"Offset": 500000,
"Duration": 2100000,
"Confidence": 0,
"PronunciationAssessment": {
"AccuracyScore": 100,
"ErrorType": "None"
},
"Syllables": [
{
"Syllable": "hiz",
"Grapheme": "he's",
"PronunciationAssessment": {
"AccuracyScore": 100
},
"Offset": 500000,
"Duration": 2100000
}
],
"Phonemes": [
{
"Phoneme": "h",
"PronunciationAssessment": {
"AccuracyScore": 100,
"NBestPhonemes": [
{
"Phoneme": "h",
"Score": 100
}
]
},
"Offset": 500000,
"Duration": 900000
},
{
"Phoneme": "i",
"PronunciationAssessment": {
"AccuracyScore": 100,
"NBestPhonemes": [
{
"Phoneme": "i",
"Score": 100
}
]
},
"Offset": 1500000,
"Duration": 500000
},
{
"Phoneme": "z",
"PronunciationAssessment": {
"AccuracyScore": 100,
"NBestPhonemes": [
{
"Phoneme": "z",
"Score": 100
}
]
},
"Offset": 2100000,
"Duration": 500000
}
]
},
{
"Word": "your",
"Offset": 2700000,
"Duration": 1700000,
"Confidence": 0,
"PronunciationAssessment": {
"AccuracyScore": 100,
"ErrorType": "None"
},
"Syllables": [
{
"Syllable": "jər",
"Grapheme": "your",
"PronunciationAssessment": {
"AccuracyScore": 100
},
"Offset": 2700000,
"Duration": 1700000
}
],
"Phonemes": [
{
"Phoneme": "j",
"PronunciationAssessment": {
"AccuracyScore": 100,
"NBestPhonemes": [
{
"Phoneme": "j",
"Score": 100
}
]
},
"Offset": 2700000,
"Duration": 500000
},
{
"Phoneme": "ə",
"PronunciationAssessment": {
"AccuracyScore": 100,
"NBestPhonemes": [
{
"Phoneme": "ə",
"Score": 100
}
]
},
"Offset": 3300000,
"Duration": 400000
},
{
"Phoneme": "r",
"PronunciationAssessment": {
"AccuracyScore": 100,
"NBestPhonemes": [
{
"Phoneme": "r",
"Score": 100
}
]
},
"Offset": 3800000,
"Duration": 600000
}
]
},
{
"Word": "father-in-law",
"Offset": 4500000,
"Duration": 9500000,
"Confidence": 0,
"PronunciationAssessment": {
"AccuracyScore": 45,
"ErrorType": "Mispronunciation"
},
"Syllables": [
{
"Syllable": "fɔ",
"Grapheme": "fa",
"PronunciationAssessment": {
"AccuracyScore": 25
},
"Offset": 4500000,
"Duration": 2100000
},
{
"Syllable": "ðə",
"Grapheme": "ther",
"PronunciationAssessment": {
"AccuracyScore": 64
},
"Offset": 6700000,
"Duration": 1300000
},
{
"Syllable": "rɪn",
"Grapheme": "in",
"PronunciationAssessment": {
"AccuracyScore": 100
},
"Offset": 8100000,
"Duration": 1900000
},
{
"Syllable": "lɔ",
"Grapheme": "law",
"PronunciationAssessment": {
"AccuracyScore": 100
},
"Offset": 10100000,
"Duration": 3900000
}
],
"Phonemes": [
{
"Phoneme": "f",
"PronunciationAssessment": {
"AccuracyScore": 69,
"NBestPhonemes": [
{
"Phoneme": "f",
"Score": 100
}
]
},
"Offset": 4500000,
"Duration": 500000
},
{
"Phoneme": "ɔ",
"PronunciationAssessment": {
"AccuracyScore": 8,
"NBestPhonemes": [
{
"Phoneme": "ɑ",
"Score": 100
}
]
},
"Offset": 5100000,
"Duration": 1500000
},
{
"Phoneme": "ð",
"PronunciationAssessment": {
"AccuracyScore": 28,
"NBestPhonemes": [
{
"Phoneme": "ð",
"Score": 100
}
]
},
"Offset": 6700000,
"Duration": 600000
},
{
"Phoneme": "ə",
"PronunciationAssessment": {
"AccuracyScore": 100,
"NBestPhonemes": [
{
"Phoneme": "ə",
"Score": 100
}
]
},
"Offset": 7400000,
"Duration": 600000
},
{
"Phoneme": "r",
"PronunciationAssessment": {
"AccuracyScore": 100,
"NBestPhonemes": [
{
"Phoneme": "r",
"Score": 100
}
]
},
"Offset": 8100000,
"Duration": 500000
},
{
"Phoneme": "ɪ",
"PronunciationAssessment": {
"AccuracyScore": 100,
"NBestPhonemes": [
{
"Phoneme": "ɪ",
"Score": 100
}
]
},
"Offset": 8700000,
"Duration": 500000
},
{
"Phoneme": "n",
"PronunciationAssessment": {
"AccuracyScore": 100,
"NBestPhonemes": [
{
"Phoneme": "n",
"Score": 100
}
]
},
"Offset": 9300000,
"Duration": 700000
},
{
"Phoneme": "l",
"PronunciationAssessment": {
"AccuracyScore": 100,
"NBestPhonemes": [
{
"Phoneme": "l",
"Score": 100
}
]
},
"Offset": 10100000,
"Duration": 700000
},
{
"Phoneme": "ɔ",
"PronunciationAssessment": {
"AccuracyScore": 100,
"NBestPhonemes": [
{
"Phoneme": "ɔ",
"Score": 100
}
]
},
"Offset": 10900000,
"Duration": 3100000
}
]
},
{
"Word": "isn't",
"Offset": 14300000,
"Duration": 3800000,
"Confidence": 0,
"PronunciationAssessment": {
"AccuracyScore": 100,
"ErrorType": "None"
},
"Syllables": [
{
"Syllable": "ɪ",
"PronunciationAssessment": {
"AccuracyScore": 100
},
"Offset": 14300000,
"Duration": 1700000
},
{
"Syllable": "zənt",
"PronunciationAssessment": {
"AccuracyScore": 100
},
"Offset": 16100000,
"Duration": 2000000
}
],
"Phonemes": [
{
"Phoneme": "ɪ",
"PronunciationAssessment": {
"AccuracyScore": 100,
"NBestPhonemes": [
{
"Phoneme": "ɪ",
"Score": 100
}
]
},
"Offset": 14300000,
"Duration": 1700000
},
{
"Phoneme": "z",
"PronunciationAssessment": {
"AccuracyScore": 100,
"NBestPhonemes": [
{
"Phoneme": "z",
"Score": 100
}
]
},
"Offset": 16100000,
"Duration": 700000
},
{
"Phoneme": "ə",
"PronunciationAssessment": {
"AccuracyScore": 100,
"NBestPhonemes": [
{
"Phoneme": "ə",
"Score": 100
}
]
},
"Offset": 16900000,
"Duration": 500000
},
{
"Phoneme": "n",
"PronunciationAssessment": {
"AccuracyScore": 100,
"NBestPhonemes": [
{
"Phoneme": "n",
"Score": 100
}
]
},
"Offset": 17500000,
"Duration": 200000
},
{
"Phoneme": "t",
"PronunciationAssessment": {
"AccuracyScore": 100,
"NBestPhonemes": [
{
"Phoneme": "t",
"Score": 100
}
]
},
"Offset": 17800000,
"Duration": 300000
}
]
},
{
"Word": "he",
"Offset": 18200000,
"Duration": 2600000,
"Confidence": 0,
"PronunciationAssessment": {
"AccuracyScore": 100,
"ErrorType": "None"
},
"Syllables": [
{
"Syllable": "hi",
"Grapheme": "he",
"PronunciationAssessment": {
"AccuracyScore": 100
},
"Offset": 18200000,
"Duration": 2600000
}
],
"Phonemes": [
{
"Phoneme": "h",
"PronunciationAssessment": {
"AccuracyScore": 100,
"NBestPhonemes": [
{
"Phoneme": "h",
"Score": 100
}
]
},
"Offset": 18200000,
"Duration": 400000
},
{
"Phoneme": "i",
"PronunciationAssessment": {
"AccuracyScore": 100,
"NBestPhonemes": [
{
"Phoneme": "i",
"Score": 100
}
]
},
"Offset": 18700000,
"Duration": 2100000
}
]
}
]