An Azure service that integrates speech processing into apps and services.
Facing a problem with conversation transcriber with long audio file: canceled event after transcribing 1600 seconds.
Hello, I am facing an issue with the ConversationTranscriber class of the Speech SDK (speechsdk). It works for some audio files, but for one of them (almost 2 hours of Japanese audio) the session is canceled after transcribing ~1600 seconds. When I checked the logs, I found this error:
[162383]: 1276687ms SPX_DBG_TRACE_VERBOSE: named_properties.h:479 ISpxPropertyBagImpl::SetStringValue: this=0x0x007ff9f80ea530; name='RESULT-ErrorDetails'; value='Due to service inactivity, the client buffer exceeded maximum size. Resetting the buffer.'[162383]: 1276692ms SPX_DBG_TRACE_VERBOSE: audio_stream_session.cpp:1397 [0xa23bda0]CSpxAudioStreamSession::WaitForRecognition_Complete: ...[162383]: 1276711ms SPX_DBG_TRACE_VERBOSE: audio_stream_session.cpp:1581 [0xa23bda0]CSpxAudioStreamSession::FireResultEvent[162383]: 1276724ms SPX_DBG_TRACE_VERBOSE: named_properties.h:479 ISpxNamedProperties::GetStringValue: this=0x0x007ff9f80ea530; name='RESULT-ErrorDetails'; value='Due to service inactivity, the client buffer exceeded maximum size. Resetting the buffer.'[382078]: 1276723ms SPX_DBG_TRACE_VERBOSE: audio_pump.cpp:446 [0xa06a880]CSpxAudioPump::PumpThread(): read frame duration: 9 ms => sending audio buffer size 6400[162383]: 1276733ms SPX_DBG_TRACE_VERBOSE: named_properties.h:479 ISpxPropertyBagImpl::SetStringValue: this=0x0x007ff9f80ea530; name='RESULT-ErrorDetails'; value='Due to service inactivity, the client buffer exceeded maximum size. Resetting the buffer. 
SessionId: b11e4d04e668435882fd7b33d438f897'[162383]: 1276741ms SPX_DBG_TRACE_VERBOSE: audio_stream_session.cpp:1472 [0xa23bda0]CSpxAudioStreamSession::FireSessionStoppedEvent: Firing SessionStopped event: SessionId: b11e4d04e668435882fd7b33d438f897[264420]: 1276741ms SPX_DBG_TRACE_SCOPE_ENTER: audio_stream_session.cpp:1724 DispatchEvent task started...[162383]: 1276748ms SPX_DBG_TRACE_SCOPE_ENTER: audio_stream_session.cpp:1560 EnsureFireResultEvent[162383]: 1276753ms SPX_DBG_TRACE_VERBOSE: audio_stream_session.cpp:1561 [0xa23bda0]CSpxAudioStreamSession::EnsureFireResultEvent[162383]: 1276758ms SPX_DBG_TRACE_SCOPE_EXIT: audio_stream_session.cpp:1560 EnsureFireResultEvent[162383]: 1276766ms SPX_DBG_TRACE_SCOPE_ENTER: audio_stream_session.cpp:1277 StopRecognizing[162383]: 1276773ms SPX_DBG_TRACE_VERBOSE: audio_stream_session.cpp:1278 [0xa23bda0]CSpxAudioStreamSession::StopRecognizing ...[162383]: 1276778ms SPX_DBG_TRACE_VERBOSE: audio_stream_session.cpp:3581 [0xa23bda0]CSpxAudioStreamSession::TryChangeState: recoKind/sessionState: 4/2 => 4/4[162383]: 1276784ms SPX_DBG_TRACE_VERBOSE: audio_stream_session.cpp:1356 [0xa23bda0]CSpxAudioStreamSession::StopRecognizing: We've been asked to stop whatever it is we're doing, while we're actively processing audio ...[162383]: 1276790ms SPX_DBG_TRACE_VERBOSE: audio_stream_session.cpp:1359 [0xa23bda0]CSpxAudioStreamSession::StopRecognizing: Now StoppingPump[0x9f9ec58] ...Here is the code used :
# Module-level state for the transcription script.
# A `global` statement has no effect at module scope, so plain assignments
# are all that is needed here.
AZURE_OUTPUTS = []  # transcribed segments are collected in this list
SEG_COUNTER = 1  # segment counter, initialised to 1
def conversation_transcriber_recognition_canceled_cb(evt: speechsdk.SessionEventArgs):
    """Report a cancellation, including why the service canceled.

    Prints the 'Canceled event' marker and, when the event carries
    cancellation details, the reason, error code and error text, so that a
    canceled session can be diagnosed from the script's output.

    Args:
        evt: Event object passed by the transcriber's ``canceled`` signal.
    """
    print('Canceled event')
    # Looked up defensively: the annotation is the generic session event
    # type, which is not guaranteed to expose ``cancellation_details``.
    details = getattr(evt, 'cancellation_details', None)
    if details is not None:
        print('\tReason={}'.format(details.reason))
        print('\tErrorCode={}'.format(details.code))
        print('\tErrorDetails={}'.format(details.error_details))
def conversation_transcriber_session_stopped_cb(evt: speechsdk.SessionEventArgs):
    """Print a marker line when the session-stopped event fires.

    Args:
        evt: Event object passed by the signal; not inspected here.
    """
    marker = "SessionStopped event"
    print(marker)
def processWords(paraDict):
    """Extract the speaker label and text from a segment dictionary.

    Args:
        paraDict: Mapping that may contain the keys 'SpeakerName' and 'Text'.

    Returns:
        A ``(speaker, text)`` tuple. ``speaker`` is 'Unknown' and ``text`` is
        '' when the corresponding key is missing or holds ``None`` or an
        empty value.
    """
    # `or` rather than a .get() default: a key that is present but None/empty
    # would bypass the default and break string handling in the caller.
    speaker = paraDict.get('SpeakerName') or 'Unknown'
    text = paraDict.get('Text') or ''
    return speaker, text
def conversation_transcriber_transcribed_cb(evt: speechsdk.SpeechRecognitionEventArgs):
    """Handle a final transcription result and record it in AZURE_OUTPUTS.

    For recognized speech, parses the result JSON, prints the text, speaker
    and timing, and appends a dict with the keys 'text', 'speaker',
    'startValue' and 'endValue' to the module-level AZURE_OUTPUTS list.
    For a no-match result, prints the no-match details.

    Errors raised while handling the event are printed and logged instead of
    being propagated out of the callback.

    Args:
        evt: Event object whose ``result`` exposes ``reason``, ``json`` and
            ``no_match_details``.
    """
    try:
        print('TRANSCRIBED:')
        reason = evt.result.reason
        if reason == speechsdk.ResultReason.RecognizedSpeech:
            results = json.loads(evt.result.json)
            # Offset and Duration are divided by 1e7 to obtain seconds
            # (the Speech SDK documents these fields as 100-nanosecond ticks).
            startValue = results['Offset'] / 1e7
            duration = results['Duration'] / 1e7
            endValue = duration + startValue
            paraDict = {
                # A missing or null SpeakerId is treated as an unknown speaker
                # rather than discarding the whole segment.
                'SpeakerName': results.get('SpeakerId') or 'Unknown',
                'Text': results['DisplayText'],
            }
            speaker, text = processWords(paraDict=paraDict)
            if speaker != "Unknown":
                try:
                    # e.g. "Guest-1" -> "SPEAKER_01"
                    speaker = 'SPEAKER_' + str(int(speaker.split('-')[1])).zfill(2)
                except (IndexError, ValueError):
                    # Id is not shaped like "<label>-<number>": keep the raw
                    # label so the transcribed text is still recorded.
                    pass
            print('\tText={}'.format(text))
            print('\tSpeaker ID={}'.format(speaker))
            print('\tOffset={}'.format(startValue))
            print('\tDuration={}'.format(duration))
            print('\tEndValue={}'.format(endValue))
            output = {'text': text, 'speaker': speaker, 'startValue': startValue, 'endValue': endValue}
            AZURE_OUTPUTS.append(output)
        elif reason == speechsdk.ResultReason.NoMatch:
            print('\tNOMATCH: Speech could not be TRANSCRIBED: {}'.format(evt.result.no_match_details))
    except Exception:
        # `Exception` rather than a bare except, so KeyboardInterrupt and
        # SystemExit are not swallowed by the callback.
        print(traceback.format_exc())
        logging.error(traceback.format_exc())
def conversation_transcriber_session_started_cb(evt: speechsdk.SessionEventArgs):
    """Print a marker line when the session-started event fires.

    Args:
        evt: Event object passed by the signal; not inspected here.
    """
    marker = "SessionStarted event"
    print(marker)
def recognize_from_file(filename, language):
    """Transcribe a WAV file with ConversationTranscriber and wait until done.

    Reads the file's length with the ``wave`` module, configures the Speech
    SDK from the SPEECH_KEY and SPEECH_REGION environment variables, connects
    the module's callbacks, starts transcription and blocks until a
    session-stopped or canceled event is received.

    Args:
        filename: Path to a WAV file readable by ``wave.open``.
        language: Recognition language assigned to
            ``speech_recognition_language`` (for example 'ja-JP').

    Returns:
        None. Results are handled by the connected callbacks.
    """
    import threading
    import wave

    # wave.open() returns a context manager, so the file is closed on exit.
    with wave.open(filename, 'r') as wav_file:
        frames = wav_file.getnframes()
        rate = wav_file.getframerate()
    duration = frames / float(rate)
    print(f'duration (seconds) : {duration}')

    # This example requires environment variables named "SPEECH_KEY" and "SPEECH_REGION"
    # (run `source .env` before running the script). Credentials must come
    # from the environment only; never hard-code a subscription key in source.
    speech_config = speechsdk.SpeechConfig(
        subscription=os.environ.get('SPEECH_KEY'),
        region=os.environ.get('SPEECH_REGION'),
    )
    speech_config.speech_recognition_language = language
    speech_config.enable_dictation()

    # Timeout derived from the audio length (in seconds, sent as milliseconds).
    # NOTE(review): the property name below is the literal text
    # "speechsdk.PropertyId...", not the enum member, so it is sent as an
    # arbitrary URI query parameter; confirm the service honors it. If the
    # segmentation silence timeout is what is intended, use
    # speech_config.set_property(
    #     speechsdk.PropertyId.Speech_SegmentationSilenceTimeoutMs, ...)
    # with a value inside the documented range.
    conversation_ending_detection_timeout = duration
    speech_config.set_service_property(
        "speechsdk.PropertyId.Speech_SegmentationSilenceTimeoutMs",
        str(conversation_ending_detection_timeout * 1000),
        speechsdk.ServicePropertyChannel.UriQueryParameter,
    )

    audio_config = speechsdk.audio.AudioConfig(filename=filename)
    conversation_transcriber = speechsdk.transcription.ConversationTranscriber(
        speech_config=speech_config, audio_config=audio_config)

    # Set from an SDK callback and awaited below; an Event is the standard
    # primitive for signalling between threads.
    transcribing_done = threading.Event()

    def stop_cb(evt: speechsdk.SessionEventArgs):
        """Signal that transcription has ended upon receiving an event `evt`."""
        print('CLOSING on {}'.format(evt))
        transcribing_done.set()

    # Connect callbacks to the events fired by the conversation transcriber
    conversation_transcriber.transcribed.connect(conversation_transcriber_transcribed_cb)
    conversation_transcriber.session_started.connect(conversation_transcriber_session_started_cb)
    conversation_transcriber.session_stopped.connect(conversation_transcriber_session_stopped_cb)
    conversation_transcriber.canceled.connect(conversation_transcriber_recognition_canceled_cb)
    # Stop transcribing on either session stopped or canceled events
    conversation_transcriber.session_stopped.connect(stop_cb)
    conversation_transcriber.canceled.connect(stop_cb)

    # .get() waits for the returned future, so start-up has completed before
    # the wait loop begins.
    conversation_transcriber.start_transcribing_async().get()

    # Wait for completion. The short timeout keeps the same 0.5 s polling
    # cadence as before and lets the main thread react to Ctrl+C.
    while not transcribing_done.wait(0.5):
        pass

    # Wait for the stop to finish before returning.
    conversation_transcriber.stop_transcribing_async().get()
# Audio file to transcribe; placeholder name for a Japanese recording.
filename = "xxxxxxxx.wav"  # JP

# Run the transcription only when this file is executed as a script.
if __name__ == "__main__":
    recognize_from_file(filename, language='ja-JP')