We are trying out the Speech-to-Text offering from Azure and need word-level timestamps to create a VTT file from audio. When using the SDK we get word-level timestamps, but when using API v3.0 that is not the case. Below are sample outputs from the SDK (we are using the Python SDK), followed by the output from the API.
'RecognitionStatus': 'Success',
'Offset': 154600000,
'Duration': 77900000,
'DisplayText': "I bet those men are gonna get quite a reception when they get better. Yes, I'll be so glad when they land back now, but I think that's",
'NBest': [{'Confidence': 0.6943235,
'Lexical': "i bet those men are gonna get quite a reception when they get better yes i'll be so glad when they land back now but i think that's",
'ITN': "i bet those men are gonna get quite a reception when they get better yes i'll be so glad when they land back now but i think that's",
'MaskedITN': "i bet those men are gonna get quite a reception when they get better yes i'll be so glad when they land back now but i think that's",
'Display': "I bet those men are gonna get quite a reception when they get better. Yes, I'll be so glad when they land back now, but I think that's",
'Words': [{'Word': 'i', 'Offset': 154600000, 'Duration': 1900000},
{'Word': 'bet', 'Offset': 156600000, 'Duration': 2100000},
{'Word': 'those', 'Offset': 158800000, 'Duration': 3300000},
{'Word': 'men', 'Offset': 162200000, 'Duration': 1900000},
{'Word': 'are', 'Offset': 164200000, 'Duration': 1100000},
{'Word': 'gonna', 'Offset': 165400000, 'Duration': 2300000},
{'Word': 'get', 'Offset': 167800000, 'Duration': 1700000},
{'Word': 'quite', 'Offset': 169600000, 'Duration': 2300000},
{'Word': 'a', 'Offset': 172000000, 'Duration': 500000},
{'Word': 'reception', 'Offset': 172600000, 'Duration': 6300000},
{'Word': 'when', 'Offset': 179000000, 'Duration': 1700000},
{'Word': 'they', 'Offset': 180800000, 'Duration': 1300000},
{'Word': 'get', 'Offset': 182200000, 'Duration': 2100000},
{'Word': 'better', 'Offset': 184400000, 'Duration': 4800000},
{'Word': 'yes', 'Offset': 189300000, 'Duration': 4600000},
{'Word': "i'll", 'Offset': 194000000, 'Duration': 1400000},
{'Word': 'be', 'Offset': 195500000, 'Duration': 1400000},
{'Word': 'so', 'Offset': 197000000, 'Duration': 1700000},
{'Word': 'glad', 'Offset': 198800000, 'Duration': 3300000},
{'Word': 'when', 'Offset': 202200000, 'Duration': 1700000},
{'Word': 'they', 'Offset': 204000000, 'Duration': 900000},
{'Word': 'land', 'Offset': 205000000, 'Duration': 4000000},
{'Word': 'back', 'Offset': 209100000, 'Duration': 3400000},
{'Word': 'now', 'Offset': 212600000, 'Duration': 3300000},
{'Word': 'but', 'Offset': 216000000, 'Duration': 2100000},
{'Word': 'i', 'Offset': 221600000, 'Duration': 1900000},
{'Word': 'think', 'Offset': 223600000, 'Duration': 2700000},
{'Word': "that's", 'Offset': 226400000, 'Duration': 6100000}]},
{'Confidence': 0.6926312,
'Lexical': "i bet those men are going to get quite a reception when they get better yes i'll be so glad when they land back now but i think that's",
'ITN': "i bet those men are going to get quite a reception when they get better yes i'll be so glad when they land back now but i think that's",
'MaskedITN': "i bet those men are going to get quite a reception when they get better yes i'll be so glad when they land back now but i think that's",
'Display': "i bet those men are going to get quite a reception when they get better yes i'll be so glad when they land back now but i think that's",
'Words': [{'Word': 'i', 'Offset': 154600000, 'Duration': 1900000},
{'Word': 'bet', 'Offset': 156600000, 'Duration': 2100000},
{'Word': 'those', 'Offset': 158800000, 'Duration': 3300000},
{'Word': 'men', 'Offset': 162200000, 'Duration': 1900000},
{'Word': 'are', 'Offset': 164200000, 'Duration': 1100000},
{'Word': 'going', 'Offset': 165400000, 'Duration': 1500000},
{'Word': 'to', 'Offset': 167000000, 'Duration': 700000},
{'Word': 'get', 'Offset': 167800000, 'Duration': 1700000},
{'Word': 'quite', 'Offset': 169600000, 'Duration': 2300000},
{'Word': 'a', 'Offset': 172000000, 'Duration': 500000},
{'Word': 'reception', 'Offset': 172600000, 'Duration': 6300000},
{'Word': 'when', 'Offset': 179000000, 'Duration': 1700000},
{'Word': 'they', 'Offset': 180800000, 'Duration': 1300000},
{'Word': 'get', 'Offset': 182200000, 'Duration': 2100000},
{'Word': 'better', 'Offset': 184400000, 'Duration': 4800000},
{'Word': 'yes', 'Offset': 189300000, 'Duration': 4600000},
{'Word': "i'll", 'Offset': 194000000, 'Duration': 1400000},
{'Word': 'be', 'Offset': 195500000, 'Duration': 1400000},
{'Word': 'so', 'Offset': 197000000, 'Duration': 1700000},
{'Word': 'glad', 'Offset': 198800000, 'Duration': 3300000},
{'Word': 'when', 'Offset': 202200000, 'Duration': 1700000},
{'Word': 'they', 'Offset': 204000000, 'Duration': 900000},
{'Word': 'land', 'Offset': 205000000, 'Duration': 4000000},
{'Word': 'back', 'Offset': 209100000, 'Duration': 3400000},
{'Word': 'now', 'Offset': 212600000, 'Duration': 3300000},
{'Word': 'but', 'Offset': 216000000, 'Duration': 2100000},
{'Word': 'i', 'Offset': 221600000, 'Duration': 1900000},
{'Word': 'think', 'Offset': 223600000, 'Duration': 2700000},
{'Word': "that's", 'Offset': 226400000, 'Duration': 6100000}]},
{'Confidence': 0.69729567,
'Lexical': "i bet those men are gonna get quite a reception when they get better oh yes i'll be so glad when they land back now but i think that's",
'ITN': "i bet those men are gonna get quite a reception when they get better oh yes i'll be so glad when they land back now but i think that's",
'MaskedITN': "i bet those men are gonna get quite a reception when they get better oh yes i'll be so glad when they land back now but i think that's",
'Display': "i bet those men are gonna get quite a reception when they get better oh yes i'll be so glad when they land back now but i think that's",
'Words': [{'Word': 'i', 'Offset': 154600000, 'Duration': 1900000},
{'Word': 'bet', 'Offset': 156600000, 'Duration': 2100000},
{'Word': 'those', 'Offset': 158800000, 'Duration': 3300000},
{'Word': 'men', 'Offset': 162200000, 'Duration': 1900000},
{'Word': 'are', 'Offset': 164200000, 'Duration': 1100000},
{'Word': 'gonna', 'Offset': 165400000, 'Duration': 2300000},
{'Word': 'get', 'Offset': 167800000, 'Duration': 1700000},
{'Word': 'quite', 'Offset': 169600000, 'Duration': 2300000},
{'Word': 'a', 'Offset': 172000000, 'Duration': 500000},
{'Word': 'reception', 'Offset': 172600000, 'Duration': 6300000},
{'Word': 'when', 'Offset': 179000000, 'Duration': 1700000},
{'Word': 'they', 'Offset': 180800000, 'Duration': 1300000},
{'Word': 'get', 'Offset': 182200000, 'Duration': 2100000},
{'Word': 'better', 'Offset': 184400000, 'Duration': 2700000},
{'Word': 'oh', 'Offset': 187200000, 'Duration': 1900000},
{'Word': 'yes', 'Offset': 189200000, 'Duration': 4700000},
{'Word': "i'll", 'Offset': 194000000, 'Duration': 1300000},
{'Word': 'be', 'Offset': 195400000, 'Duration': 1500000},
{'Word': 'so', 'Offset': 197000000, 'Duration': 1700000},
{'Word': 'glad', 'Offset': 198800000, 'Duration': 3300000},
{'Word': 'when', 'Offset': 202200000, 'Duration': 1700000},
{'Word': 'they', 'Offset': 204000000, 'Duration': 900000},
{'Word': 'land', 'Offset': 205000000, 'Duration': 4000000},
{'Word': 'back', 'Offset': 209100000, 'Duration': 3400000},
{'Word': 'now', 'Offset': 212600000, 'Duration': 3300000},
{'Word': 'but', 'Offset': 216000000, 'Duration': 2100000},
{'Word': 'i', 'Offset': 221600000, 'Duration': 1900000},
{'Word': 'think', 'Offset': 223600000, 'Duration': 2700000},
{'Word': "that's", 'Offset': 226400000, 'Duration': 6100000}]},
{'Confidence': 0.6956034,
'Lexical': "i bet those men are going to get quite a reception when they get better oh yes i'll be so glad when they land back now but i think that's",
'ITN': "i bet those men are going to get quite a reception when they get better oh yes i'll be so glad when they land back now but i think that's",
'MaskedITN': "i bet those men are going to get quite a reception when they get better oh yes i'll be so glad when they land back now but i think that's",
'Display': "i bet those men are going to get quite a reception when they get better oh yes i'll be so glad when they land back now but i think that's",
'Words': [{'Word': 'i', 'Offset': 154600000, 'Duration': 1900000},
{'Word': 'bet', 'Offset': 156600000, 'Duration': 2100000},
{'Word': 'those', 'Offset': 158800000, 'Duration': 3300000},
{'Word': 'men', 'Offset': 162200000, 'Duration': 1900000},
{'Word': 'are', 'Offset': 164200000, 'Duration': 1100000},
{'Word': 'going', 'Offset': 165400000, 'Duration': 1500000},
{'Word': 'to', 'Offset': 167000000, 'Duration': 700000},
{'Word': 'get', 'Offset': 167800000, 'Duration': 1700000},
{'Word': 'quite', 'Offset': 169600000, 'Duration': 2300000},
{'Word': 'a', 'Offset': 172000000, 'Duration': 500000},
{'Word': 'reception', 'Offset': 172600000, 'Duration': 6300000},
{'Word': 'when', 'Offset': 179000000, 'Duration': 1700000},
{'Word': 'they', 'Offset': 180800000, 'Duration': 1300000},
{'Word': 'get', 'Offset': 182200000, 'Duration': 2100000},
{'Word': 'better', 'Offset': 184400000, 'Duration': 2700000},
{'Word': 'oh', 'Offset': 187200000, 'Duration': 1900000},
{'Word': 'yes', 'Offset': 189200000, 'Duration': 4700000},
{'Word': "i'll", 'Offset': 194000000, 'Duration': 1300000},
{'Word': 'be', 'Offset': 195400000, 'Duration': 1500000},
{'Word': 'so', 'Offset': 197000000, 'Duration': 1700000},
{'Word': 'glad', 'Offset': 198800000, 'Duration': 3300000},
{'Word': 'when', 'Offset': 202200000, 'Duration': 1700000},
{'Word': 'they', 'Offset': 204000000, 'Duration': 900000},
{'Word': 'land', 'Offset': 205000000, 'Duration': 4000000},
{'Word': 'back', 'Offset': 209100000, 'Duration': 3400000},
{'Word': 'now', 'Offset': 212600000, 'Duration': 3300000},
{'Word': 'but', 'Offset': 216000000, 'Duration': 2100000},
{'Word': 'i', 'Offset': 221600000, 'Duration': 1900000},
{'Word': 'think', 'Offset': 223600000, 'Duration': 2700000},
{'Word': "that's", 'Offset': 226400000, 'Duration': 6100000}]}
{
"recognitionStatus": "Success",
"channel": 0,
"speaker": 2,
"offset": "PT15.64S",
"duration": "PT2.79S",
"offsetInTicks": 156400000.0,
"durationInTicks": 27900000.0,
"nBest": [
{
"confidence": 0.6904604,
"lexical": "said those men are gonna get quite a reception when they get",
"itn": "said those men are gonna get quite a reception when they get",
"maskedITN": "said those men are gonna get quite a reception when they get",
"display": "Said those men are gonna get quite a reception when they get.",
"words": [
{
"word": "said",
"offset": "PT15.64S",
"duration": "PT0.23S",
"offsetInTicks": 156400000.0,
"durationInTicks": 2300000.0,
"confidence": 0.08547592
},
{
"word": "those",
"offset": "PT15.88S",
"duration": "PT0.33S",
"offsetInTicks": 158800000.0,
"durationInTicks": 3300000.0,
"confidence": 0.27528304
},
{
"word": "men",
"offset": "PT16.22S",
"duration": "PT0.19S",
"offsetInTicks": 162200000.0,
"durationInTicks": 1900000.0,
"confidence": 0.7574552
},
{
"word": "are",
"offset": "PT16.42S",
"duration": "PT0.11S",
"offsetInTicks": 164200000.0,
"durationInTicks": 1100000.0,
"confidence": 0.84245914
},
{
"word": "gonna",
"offset": "PT16.54S",
"duration": "PT0.23S",
"offsetInTicks": 165400000.0,
"durationInTicks": 2300000.0,
"confidence": 0.58642036
},
{
"word": "get",
"offset": "PT16.78S",
"duration": "PT0.17S",
"offsetInTicks": 167800000.0,
"durationInTicks": 1700000.0,
"confidence": 0.91722536
},
{
"word": "quite",
"offset": "PT16.96S",
"duration": "PT0.23S",
"offsetInTicks": 169600000.0,
"durationInTicks": 2300000.0,
"confidence": 0.5793023
},
{
"word": "a",
"offset": "PT17.2S",
"duration": "PT0.05S",
"offsetInTicks": 172000000.0,
"durationInTicks": 500000.0,
"confidence": 0.75620925
},
{
"word": "reception",
"offset": "PT17.26S",
"duration": "PT0.63S",
"offsetInTicks": 172600000.0,
"durationInTicks": 6300000.0,
"confidence": 0.75127023
},
{
"word": "when",
"offset": "PT17.9S",
"duration": "PT0.17S",
"offsetInTicks": 179000000.0,
"durationInTicks": 1700000.0,
"confidence": 0.90499866
},
{
"word": "they",
"offset": "PT18.08S",
"duration": "PT0.13S",
"offsetInTicks": 180800000.0,
"durationInTicks": 1300000.0,
"confidence": 0.98178667
},
{
"word": "get",
"offset": "PT18.22S",
"duration": "PT0.21S",
"offsetInTicks": 182200000.0,
"durationInTicks": 2100000.0,
"confidence": 0.9517029
}
]
}
]
},