Azure pronunciation assessment

Iheb Jandoubi 25

In Azure pronunciation assessment for scripted speech, when I insert a word in my speech that does not exist in the script, why don't I get that word marked as an insertion in the pronunciation assessment result?

YutongTie-MSFT 52,596

Hello Iheb,

Do you mean that you are looking at the error type at the word level, but it shows "None"? The JSON below is an example showing the error type:

Could you please share the input context and your JSON result?

Thanks.

{

"Id": "bbb42ea51bdb46d19a1d685e635fe173",

"RecognitionStatus": 0,

"Offset": 7500000,

"Duration": 13800000,

"DisplayText": "Hello.",

"NBest": [

    {

        "Confidence": 0.975003,

        "Lexical": "hello",

        "ITN": "hello",

        "MaskedITN": "hello",

        "Display": "Hello.",

        "PronunciationAssessment": {

            "AccuracyScore": 100,

            "FluencyScore": 100,

            "CompletenessScore": 100,

            "PronScore": 100

        },

        "Words": [

            {

                "Word": "hello",

                "Offset": 7500000,

                "Duration": 13800000,

                "PronunciationAssessment": {

                    "AccuracyScore": 99.0,

                    "ErrorType": "None"

                },

                "Syllables": [

                    {

                        "Syllable": "hɛ",

                        "PronunciationAssessment": {

                            "AccuracyScore": 91.0

                        },

                        "Offset": 7500000,

                        "Duration": 4100000

                    },

                    {

                        "Syllable": "loʊ",

                        "PronunciationAssessment": {

                            "AccuracyScore": 100.0

                        },

                        "Offset": 11700000,

                        "Duration": 9600000

                    }

                ],

                "Phonemes": [

                    {

                        "Phoneme": "h",

                        "PronunciationAssessment": {

                            "AccuracyScore": 98.0,

                            "NBestPhonemes": [

                                {

                                    "Phoneme": "h",

                                    "Score": 100.0

                                },

                                {

                                    "Phoneme": "oʊ",

                                    "Score": 52.0

                                },

                                {

                                    "Phoneme": "ə",

                                    "Score": 35.0

                                },

                                {

                                    "Phoneme": "k",

                                    "Score": 23.0

                                },

                                {

                                    "Phoneme": "æ",

                                    "Score": 20.0

                                }

                            ]

                        },

                        "Offset": 7500000,

                        "Duration": 3500000

                    },

                    {

                        "Phoneme": "ɛ",

                        "PronunciationAssessment": {

                            "AccuracyScore": 47.0,

                            "NBestPhonemes": [

                                {

                                    "Phoneme": "ə",

                                    "Score": 100.0

                                },

                                {

                                    "Phoneme": "l",

                                    "Score": 52.0

                                },

                                {

                                    "Phoneme": "ɛ",

                                    "Score": 47.0

                                },

                                {

                                    "Phoneme": "h",

                                    "Score": 17.0

                                },

                                {

                                    "Phoneme": "æ",

                                    "Score": 2.0

                                }

                            ]

                        },

                        "Offset": 11100000,

                        "Duration": 500000

                    },

                    {

                        "Phoneme": "l",

                        "PronunciationAssessment": {

                            "AccuracyScore": 100.0,

                            "NBestPhonemes": [

                                {

                                    "Phoneme": "l",

                                    "Score": 100.0

                                },

                                {

                                    "Phoneme": "oʊ",

                                    "Score": 46.0

                                },

                                {

                                    "Phoneme": "ə",

                                    "Score": 5.0

                                },

                                {

                                    "Phoneme": "ɛ",

                                    "Score": 3.0

                                },

                                {

                                    "Phoneme": "u",

                                    "Score": 1.0

                                }

                            ]

                        },

                        "Offset": 11700000,

                        "Duration": 1100000

                    },

                    {

                        "Phoneme": "oʊ",

                        "PronunciationAssessment": {

                            "AccuracyScore": 100.0,

                            "NBestPhonemes": [

                                {

                                    "Phoneme": "oʊ",

                                    "Score": 100.0

                                },

                                {

                                    "Phoneme": "d",

                                    "Score": 29.0

                                },

                                {

                                    "Phoneme": "t",

                                    "Score": 24.0

                                },

                                {

                                    "Phoneme": "n",

                                    "Score": 22.0

                                },

                                {

                                    "Phoneme": "l",

                                    "Score": 18.0

                                }

                            ]

                        },

                        "Offset": 12900000,

                        "Duration": 8400000

                    }

                ]

            }

        ]

    }

]

}

Iheb Jandoubi 25

@YutongTie-MSFT Let's take the following script: "life is beautiful". In the speech, the user said "life is magnificent". In the JSON result, at the word level, I don't get "magnificent" with the error type "Insertion", and I get "beautiful" with the error type "Omission" (which is obvious). This is the JSON:

[
    {
        "Id": "1463e36a1cf84134b5658939a3d267ba",
        "RecognitionStatus": 0,
        "Offset": 3600000,
        "Duration": 14400000,
        "Channel": 0,
        "DisplayText": "Life is beautiful.",
        "SNR": 34.084476,
        "NBest": [
            {
                "Confidence": 0.8474877,
                "Lexical": "life is beautiful",
                "ITN": "life is beautiful",
                "MaskedITN": "life is beautiful",
                "Display": "Life is beautiful.",
                "PronunciationAssessment": {
                    "AccuracyScore": 63,
                    "FluencyScore": 51,
                    "ProsodyScore": 69.9,
                    "CompletenessScore": 67,
                    "PronScore": 60.4
                },
                "Words": [
                    {
                        "Word": "life",
                        "Offset": 3600000,
                        "Duration": 5000000,
                        "PronunciationAssessment": {
                            "AccuracyScore": 100,
                            "ErrorType": "None",
                            "Feedback": {
                                "Prosody": {
                                    "Break": {
                                        "ErrorTypes": [
                                            "None"
                                        ],
                                        "BreakLength": 0
                                    },
                                    "Intonation": {
                                        "ErrorTypes": [],
                                        "Monotone": {
                                            "SyllablePitchDeltaConfidence": 0.343404
                                        }
                                    }
                                }
                            }
                        },
                        "Syllables": [
                            {
                                "Syllable": "laɪf",
                                "Grapheme": "life",
                                "PronunciationAssessment": {
                                    "AccuracyScore": 100
                                },
                                "Offset": 3600000,
                                "Duration": 5000000
                            }
                        ],
                        "Phonemes": [
                            {
                                "Phoneme": "l",
                                "PronunciationAssessment": {
                                    "AccuracyScore": 100,
                                    "NBestPhonemes": [
                                        {
                                            "Phoneme": "l",
                                            "Score": 100
                                        },
                                        {
                                            "Phoneme": "aɪ",
                                            "Score": 24
                                        },
                                        {
                                            "Phoneme": "n",
                                            "Score": 5
                                        },
                                        {
                                            "Phoneme": "r",
                                            "Score": 3
                                        },
                                        {
                                            "Phoneme": "ɪ",
                                            "Score": 3
                                        }
                                    ]
                                },
                                "Offset": 3600000,
                                "Duration": 2700000
                            },
                            {
                                "Phoneme": "aɪ",
                                "PronunciationAssessment": {
                                    "AccuracyScore": 100,
                                    "NBestPhonemes": [
                                        {
                                            "Phoneme": "aɪ",
                                            "Score": 100
                                        },
                                        {
                                            "Phoneme": "f",
                                            "Score": 51
                                        },
                                        {
                                            "Phoneme": "æ",
                                            "Score": 5
                                        },
                                        {
                                            "Phoneme": "ɛ",
                                            "Score": 2
                                        },
                                        {
                                            "Phoneme": "eɪ",
                                            "Score": 1
                                        }
                                    ]
                                },
                                "Offset": 6400000,
                                "Duration": 1100000
                            },
                            {
                                "Phoneme": "f",
                                "PronunciationAssessment": {
                                    "AccuracyScore": 100,
                                    "NBestPhonemes": [
                                        {
                                            "Phoneme": "f",
                                            "Score": 100
                                        },
                                        {
                                            "Phoneme": "ɪ",
                                            "Score": 29
                                        },
                                        {
                                            "Phoneme": "v",
                                            "Score": 4
                                        },
                                        {
                                            "Phoneme": "aɪ",
                                            "Score": 4
                                        },
                                        {
                                            "Phoneme": "i",
                                            "Score": 2
                                        }
                                    ]
                                },
                                "Offset": 7600000,
                                "Duration": 1000000
                            }
                        ]
                    },
                    {
                        "Word": "is",
                        "Offset": 8700000,
                        "Duration": 2200000,
                        "PronunciationAssessment": {
                            "AccuracyScore": 86,
                            "ErrorType": "None",
                            "Feedback": {
                                "Prosody": {
                                    "Break": {
                                        "ErrorTypes": [
                                            "None"
                                        ],
                                        "UnexpectedBreak": {
                                            "Confidence": 3.448276e-8
                                        },
                                        "MissingBreak": {
                                            "Confidence": 1
                                        },
                                        "BreakLength": 0
                                    },
                                    "Intonation": {
                                        "ErrorTypes": [],
                                        "Monotone": {
                                            "SyllablePitchDeltaConfidence": 0.343404
                                        }
                                    }
                                }
                            }
                        },
                        "Syllables": [
                            {
                                "Syllable": "ɪz",
                                "Grapheme": "is",
                                "PronunciationAssessment": {
                                    "AccuracyScore": 78
                                },
                                "Offset": 8700000,
                                "Duration": 2200000
                            }
                        ],
                        "Phonemes": [
                            {
                                "Phoneme": "ɪ",
                                "PronunciationAssessment": {
                                    "AccuracyScore": 83,
                                    "NBestPhonemes": [
                                        {
                                            "Phoneme": "ɪ",
                                            "Score": 100
                                        },
                                        {
                                            "Phoneme": "z",
                                            "Score": 78
                                        },
                                        {
                                            "Phoneme": "f",
                                            "Score": 31
                                        },
                                        {
                                            "Phoneme": "i",
                                            "Score": 7
                                        },
                                        {
                                            "Phoneme": "h",
                                            "Score": 1
                                        }
                                    ]
                                },
                                "Offset": 8700000,
                                "Duration": 1400000
                            },
                            {
                                "Phoneme": "z",
                                "PronunciationAssessment": {
                                    "AccuracyScore": 69,
                                    "NBestPhonemes": [
                                        {
                                            "Phoneme": "z",
                                            "Score": 100
                                        },
                                        {
                                            "Phoneme": "m",
                                            "Score": 74
                                        },
                                        {
                                            "Phoneme": "s",
                                            "Score": 2
                                        },
                                        {
                                            "Phoneme": "ə",
                                            "Score": 2
                                        },
                                        {
                                            "Phoneme": "n",
                                            "Score": 1
                                        }
                                    ]
                                },
                                "Offset": 10200000,
                                "Duration": 700000
                            }
                        ]
                    },
                    {
                        "Word": "beautiful",
                        "Offset": 11000000,
                        "Duration": 7000000,
                        "PronunciationAssessment": {
                            "AccuracyScore": 3,
                            "ErrorType": "Omnission",

YutongTie-MSFT 52,596 Reputation points

2024-07-28T00:39:06.03+00:00

Thanks for your example. We will discuss this case with the product team to see how the system deals with it.

Share via

Azure pronunciation assessment

Your answer