Azure pronunciation assessment

Question

In Azure pronunciation assessment for scripted speech, why is a word that I say but that does not exist in the script not reported as an insertion in the pronunciation assessment result?

Let's take the following script: "life is beautiful". In the speech, the user said "life is magnificent". In the JSON result, at the word level, I don't get an Insertion error for "magnificent", but I do get an Omission error for "beautiful" (which is expected). This is the JSON:

[

{

    "Id": "1463e36a1cf84134b5658939a3d267ba",

    "RecognitionStatus": 0,

    "Offset": 3600000,

    "Duration": 14400000,

    "Channel": 0,

    "DisplayText": "Life is beautiful.",

    "SNR": 34.084476,

    "NBest": [

        {

            "Confidence": 0.8474877,

            "Lexical": "life is beautiful",

            "ITN": "life is beautiful",

            "MaskedITN": "life is beautiful",

            "Display": "Life is beautiful.",

            "PronunciationAssessment": {

                "AccuracyScore": 63,

                "FluencyScore": 51,

                "ProsodyScore": 69.9,

                "CompletenessScore": 67,

                "PronScore": 60.4

            },

            "Words": [

                {

                    "Word": "life",

                    "Offset": 3600000,

                    "Duration": 5000000,

                    "PronunciationAssessment": {

                        "AccuracyScore": 100,

                        "ErrorType": "None",

                        "Feedback": {

                            "Prosody": {

                                "Break": {

                                    "ErrorTypes": [

                                        "None"

                                    ],

                                    "BreakLength": 0

                                },

                                "Intonation": {

                                    "ErrorTypes": [],

                                    "Monotone": {

                                        "SyllablePitchDeltaConfidence": 0.343404

                                    }

                                }

                            }

                        }

                    },

                    "Syllables": [

                        {

                            "Syllable": "laɪf",

                            "Grapheme": "life",

                            "PronunciationAssessment": {

                                "AccuracyScore": 100

                            },

                            "Offset": 3600000,

                            "Duration": 5000000

                        }

                    ],

                    "Phonemes": [

                        {

                            "Phoneme": "l",

                            "PronunciationAssessment": {

                                "AccuracyScore": 100,

                                "NBestPhonemes": [

                                    {

                                        "Phoneme": "l",

                                        "Score": 100

                                    },

                                    {

                                        "Phoneme": "aɪ",

                                        "Score": 24

                                    },

                                    {

                                        "Phoneme": "n",

                                        "Score": 5

                                    },

                                    {

                                        "Phoneme": "r",

                                        "Score": 3

                                    },

                                    {

                                        "Phoneme": "ɪ",

                                        "Score": 3

                                    }

                                ]

                            },

                            "Offset": 3600000,

                            "Duration": 2700000

                        },

                        {

                            "Phoneme": "aɪ",

                            "PronunciationAssessment": {

                                "AccuracyScore": 100,

                                "NBestPhonemes": [

                                    {

                                        "Phoneme": "aɪ",

                                        "Score": 100

                                    },

                                    {

                                        "Phoneme": "f",

                                        "Score": 51

                                    },

                                    {

                                        "Phoneme": "æ",

                                        "Score": 5

                                    },

                                    {

                                        "Phoneme": "ɛ",

                                        "Score": 2

                                    },

                                    {

                                        "Phoneme": "eɪ",

                                        "Score": 1

                                    }

                                ]

                            },

                            "Offset": 6400000,

                            "Duration": 1100000

                        },

                        {

                            "Phoneme": "f",

                            "PronunciationAssessment": {

                                "AccuracyScore": 100,

                                "NBestPhonemes": [

                                    {

                                        "Phoneme": "f",

                                        "Score": 100

                                    },

                                    {

                                        "Phoneme": "ɪ",

                                        "Score": 29

                                    },

                                    {

                                        "Phoneme": "v",

                                        "Score": 4

                                    },

                                    {

                                        "Phoneme": "aɪ",

                                        "Score": 4

                                    },

                                    {

                                        "Phoneme": "i",

                                        "Score": 2

                                    }

                                ]

                            },

                            "Offset": 7600000,

                            "Duration": 1000000

                        }

                    ]

                },

                {

                    "Word": "is",

                    "Offset": 8700000,

                    "Duration": 2200000,

                    "PronunciationAssessment": {

                        "AccuracyScore": 86,

                        "ErrorType": "None",

                        "Feedback": {

                            "Prosody": {

                                "Break": {

                                    "ErrorTypes": [

                                        "None"

                                    ],

                                    "UnexpectedBreak": {

                                        "Confidence": 3.448276e-8

                                    },

                                    "MissingBreak": {

                                        "Confidence": 1

                                    },

                                    "BreakLength": 0

                                },

                                "Intonation": {

                                    "ErrorTypes": [],

                                    "Monotone": {

                                        "SyllablePitchDeltaConfidence": 0.343404

                                    }

                                }

                            }

                        }

                    },

                    "Syllables": [

                        {

                            "Syllable": "ɪz",

                            "Grapheme": "is",

                            "PronunciationAssessment": {

                                "AccuracyScore": 78

                            },

                            "Offset": 8700000,

                            "Duration": 2200000

                        }

                    ],

                    "Phonemes": [

                        {

                            "Phoneme": "ɪ",

                            "PronunciationAssessment": {

                                "AccuracyScore": 83,

                                "NBestPhonemes": [

                                    {

                                        "Phoneme": "ɪ",

                                        "Score": 100

                                    },

                                    {

                                        "Phoneme": "z",

                                        "Score": 78

                                    },

                                    {

                                        "Phoneme": "f",

                                        "Score": 31

                                    },

                                    {

                                        "Phoneme": "i",

                                        "Score": 7

                                    },

                                    {

                                        "Phoneme": "h",

                                        "Score": 1

                                    }

                                ]

                            },

                            "Offset": 8700000,

                            "Duration": 1400000

                        },

                        {

                            "Phoneme": "z",

                            "PronunciationAssessment": {

                                "AccuracyScore": 69,

                                "NBestPhonemes": [

                                    {

                                        "Phoneme": "z",

                                        "Score": 100

                                    },

                                    {

                                        "Phoneme": "m",

                                        "Score": 74

                                    },

                                    {

                                        "Phoneme": "s",

                                        "Score": 2

                                    },

                                    {

                                        "Phoneme": "ə",

                                        "Score": 2

                                    },

                                    {

                                        "Phoneme": "n",

                                        "Score": 1

                                    }

                                ]

                            },

                            "Offset": 10200000,

                            "Duration": 700000

                        }

                    ]

                },

                {

                    "Word": "beautiful",

                    "Offset": 11000000,

                    "Duration": 7000000,

                    "PronunciationAssessment": {

                        "AccuracyScore": 3,

                        "ErrorType": "Omnission",

Accepted Answer

Hi @Iheb Jandoubi

Thank you for the question.

Azure Pronunciation Assessment for scripted speech focuses on comparing spoken words to the script, evaluating accuracy, fluency, prosody, and completeness. The system marks words from the script that are not spoken as omissions, but it does not flag additional words not in the script as errors. This approach prioritizes the detection of omitted and mispronounced words, ensuring that the spoken input matches the provided script. Consequently, words like "magnificent," which are not in the script, are not marked as insertion errors; the assessment concentrates on how faithfully the spoken words follow the script rather than on identifying every deviation.

To summarize, in scripted speech assessment, Azure Pronunciation Assessment prioritizes identifying omissions and pronunciation errors for the words in the script. It does not provide explicit errors for inserted words not in the script, which is why "magnificent" is not flagged separately.

I hope this helps. Thank you.

Please don't forget to click "Accept Answer" and "Yes" for "Was this answer helpful?".

Share via

azure prononciation assessment

0 additional answers

Your answer