Hi, I was running a dual-channel Hindi audio file through batch speech-to-text, both in Speech Studio and via the API, but I am getting different results from the two, and the difference is consistent. The Speech Studio version is the correct one, as it recognises the phrases separately within a channel, while the API (for which I took the parameters from Speech Studio as well) returns all the recognised phrases of a channel together instead of separately. For example, in a conversation: Channel_1: "Hello" Channel_0: "Hi" Channel_1: "How are you" Channel_0: "I'm good thanks". It comes out like this in Speech Studio, but from the API it comes out as: Channel_1: "Hello How are you" Channel_0: "Hi I'm good thanks". I tried to combine it with the offset, but it is still not working; the results are clearly different between Speech Studio and the API. I also tried with both v3.2 and the new api-version=2024-11-15.
Speech Studio output:
"recognizedPhrases": [
{
"recognitionStatus": "Success",
"channel": 0,
"offset": "PT1.71S",
"duration": "PT0.36S",
"offsetInTicks": 17100000,
"durationInTicks": 3600000,
"durationMilliseconds": 360,
"offsetMilliseconds": 1710,
"nBest": [
{
"confidence": 0.6302351,
"lexical": "हैलो",
"itn": "हैलो",
"maskedITN": "हैलो।",
"display": "हैलो।",
"words": [
{
"word": "हैलो",
"offset": "PT1.71S",
"duration": "PT0.36S",
"offsetInTicks": 17100000,
"durationInTicks": 3600000,
"durationMilliseconds": 360,
"offsetMilliseconds": 1710,
"confidence": 0.6302351
}
],
"displayWords": [
{
"displayText": "हैलो।",
"offset": "PT1.71S",
"duration": "PT0.36S",
"offsetInTicks": 17100000,
"durationInTicks": 3600000,
"durationMilliseconds": 360,
"offsetMilliseconds": 1710
}
]
},
{
"confidence": 0.6302351,
"lexical": "हैलो",
"itn": "हैलो",
"maskedITN": "हैलो",
"display": "हैलो",
"words": [
{
"word": "हैलो",
"offset": "PT1.71S",
"duration": "PT0.36S",
"offsetInTicks": 17100000,
"durationInTicks": 3600000,
"durationMilliseconds": 360,
"offsetMilliseconds": 1710,
"confidence": 0.6302351
}
]
},
{
"confidence": 0.6302351,
"lexical": "हैलो",
"itn": "हैलो",
"maskedITN": "हैलो",
"display": "हैलो",
"words": [
{
"word": "हैलो",
"offset": "PT1.71S",
"duration": "PT0.36S",
"offsetInTicks": 17100000,
"durationInTicks": 3600000,
"durationMilliseconds": 360,
"offsetMilliseconds": 1710,
"confidence": 0.6302351
}
]
},
{
"confidence": 0.6302351,
"lexical": "हैलो",
"itn": "हैलो",
"maskedITN": "हैलो",
"display": "हैलो",
"words": [
{
"word": "हैलो",
"offset": "PT0.23S",
"duration": "PT0.16S",
"offsetInTicks": 2300000,
"durationInTicks": 1600000,
"durationMilliseconds": 160,
"offsetMilliseconds": 230,
"confidence": 0.6302351
}
]
},
{
"confidence": 0.37863052,
"lexical": "हैलो हैलो",
"itn": "हैलो हैलो",
"maskedITN": "हैलो हैलो",
"display": "हैलो हैलो",
"words": [
{
"word": "हैलो",
"offset": "PT0.23S",
"duration": "PT0.16S",
"offsetInTicks": 2300000,
"durationInTicks": 1600000,
"durationMilliseconds": 160,
"offsetMilliseconds": 230,
"confidence": 0.0028616383
},
{
"word": "हैलो",
"offset": "PT1.71S",
"duration": "PT0.36S",
"offsetInTicks": 17100000,
"durationInTicks": 3600000,
"durationMilliseconds": 360,
"offsetMilliseconds": 1710,
"confidence": 0.7543994
}
]
}
]
}
API output:
"recognizedPhrases": [
{
"recognitionStatus": "Success",
"channel": 0,
"offset": "PT1.72S",
"duration": "PT14.72S",
"offsetInTicks": 17200000,
"durationInTicks": 147200000,
"durationMilliseconds": 14720,
"offsetMilliseconds": 1720,
"nBest": [
{
"confidence": 0.5169891,
"lexical": "हैलो हैलो जी जी हैलो मैडम हैलो",
"itn": "हैलो हैलो जी जी हैलो मैडम हैलो",
"maskedITN": "हैलो हैलो जी जी हैलो मैडम हैलो",
"display": "हैलो हैलो जी जी हैलो मैडम हैलो?",
"displayWords": [
{
"displayText": "हैलो",
"offset": "PT1.72S",
"duration": "PT0.32S",
"offsetInTicks": 17200000,
"durationInTicks": 3200000,
"durationMilliseconds": 320,
"offsetMilliseconds": 1720
},
{
"displayText": "हैलो",
"offset": "PT4.52S",
"duration": "PT0.52S",
"offsetInTicks": 45200000,
"durationInTicks": 5200000,
"durationMilliseconds": 520,
"offsetMilliseconds": 4520
},
{
"displayText": "जी",
"offset": "PT8.12S",
"duration": "PT0.48S",
"offsetInTicks": 81200000,
"durationInTicks": 4800000,
"durationMilliseconds": 480,
"offsetMilliseconds": 8120
},
{
"displayText": "जी",
"offset": "PT11.28S",
"duration": "PT0.44S",
"offsetInTicks": 112800000,
"durationInTicks": 4400000,
"durationMilliseconds": 440,
"offsetMilliseconds": 11280
},
{
"displayText": "हैलो",
"offset": "PT14.32S",
"duration": "PT0.52S",
"offsetInTicks": 143200000,
"durationInTicks": 5200000,
"durationMilliseconds": 520,
"offsetMilliseconds": 14320
},
{
"displayText": "मैडम",
"offset": "PT14.84S",
"duration": "PT0.24S",
"offsetInTicks": 148400000,
"durationInTicks": 2400000,
"durationMilliseconds": 240,
"offsetMilliseconds": 14840
},
{
"displayText": "हैलो?",
"offset": "PT15.92S",
"duration": "PT0.52S",
"offsetInTicks": 159200000,
"durationInTicks": 5200000,
"durationMilliseconds": 520,
"offsetMilliseconds": 15920
}
]
}