Hi
I have written Python code to convert speech to text using Azure Cognitive Services.
I used the following code because I need speaker diarization and word-level timestamps.
Note:
I tried "transcriber.start_transcribing_async()" instead of "transcriber.start_continuous_recognition_async()".
The reason: I get a 'NotImplementedError' when I use start_continuous_recognition_async(); maybe it is not implemented for "speechsdk.transcription.ConversationTranscriber".
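In short, the swap looks like this (shown out of context):

# raises NotImplementedError with ConversationTranscriber (in my setup):
# transcriber.start_continuous_recognition_async()
# works:
transcriber.start_transcribing_async()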
Functionally, everything works fine.
However, my concern is that I get the response (the transcribed content) as whole sentences or paragraphs. I expect the converted text immediately, in real time, with an acceptable latency of, say, 1 or 2 seconds: words or phrases arriving in the response instead of one big sentence or paragraph.
I went through the Azure Cognitive Services documentation, but I did not find a specific property or code sample to cut down the response latency.
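To make the expectation concrete, here is a minimal sketch of the kind of intermediate-result handler I am after; it assumes the 'transcribing' event (the hookup I currently have commented out in my code below) delivers partial hypotheses while a phrase is still being recognized:

def conversation_transcriber_transcribing_cb(evt):
    # sketch only: print the partial hypothesis as it grows word by word
    print('TRANSCRIBING (partial): ' + evt.result.text)

# hooked up alongside the final-result handler:
# transcriber.transcribing.connect(conversation_transcriber_transcribing_cb)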
Here is my code:
Note: I am sending the frames from a WAV file to the API and expecting the text for those frames.
Any directions would be much appreciated.
import json
import threading
import time
import traceback
import wave
from datetime import datetime

import azure.cognitiveservices.speech as speechsdk

# logger, subscription_key, region, audio_file, resultsJSONFileObj and
# processWords are defined elsewhere in my script.


def conversation_transcriber_recognition_canceled_cb(evt: speechsdk.SessionEventArgs):
    print('Canceled event' + str(evt.result))
    logger.info('Canceled event' + str(evt.result))

def conversation_transcriber_session_stopped_cb(evt: speechsdk.SessionEventArgs):
    print('SessionStopped event')
    logger.info('SessionStopped event')

def conversation_transcriber_transcribed_cb(evt: speechsdk.transcription.ConversationTranscriptionEventArgs):
    print('TRANSCRIBED:')
    logger.info("Fetching the 'TRANSCRIBED content'...")
    try:
        paraDict = dict()
        results = json.loads(evt.result.json)
        displayText = results['DisplayText']
        print("displayText-->" + displayText)
        speakerName = results['SpeakerId']
        paraDict['SpeakerName'] = speakerName
        paraDict['Text'] = displayText
        fileFormat = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
        logger.info(fileFormat + " - ")
        logger.info(paraDict)
        processWords(paraDict=paraDict)
        # write the results JSON to a file for later processing
        resultsJSONFileObj.write(json.dumps(results) + ",\n")
        resultsJSONFileObj.flush()
    except Exception:
        print(traceback.format_exc())
        logger.error(traceback.format_exc())

def conversation_transcriber_session_started_cb(evt: speechsdk.SessionEventArgs):
    print('SessionStarted event')
    logger.info('SessionStarted event')

def push_stream_writer(stream):
    # the number of bytes to push per buffer
    n_bytes = 3200 * 4
    # push data until all of it has been read from the file
    wav_fh = wave.open(audio_file)
    try:
        while True:
            # readframes() takes a frame count; with 16-bit mono audio one
            # frame is 2 bytes, so n_bytes // 2 frames is n_bytes of data
            # (~0.4 s of 16 kHz audio pushed per 0.1 s sleep, i.e. faster
            # than real time)
            frames = wav_fh.readframes(n_bytes // 2)
            # print('read {} bytes'.format(len(frames)))
            if not frames:
                # end of file reached; stop pushing
                break
            stream.write(frames)
            time.sleep(.1)
    finally:
        wav_fh.close()
        stream.close()  # must be done to signal the end of stream

def conversation_transcription():
    """Transcribes a conversation."""
    # create the speech configuration from the subscription information
    speech_config = speechsdk.SpeechConfig(
        subscription=subscription_key, region=region)
    speech_config.enable_dictation()
    speech_config.output_format = speechsdk.OutputFormat.Detailed
    speech_config.request_word_level_timestamps()

    channels = 1
    bits_per_sample = 16
    samples_per_second = 16000

    # create the audio configuration using a push stream
    wave_format = speechsdk.audio.AudioStreamFormat(
        samples_per_second, bits_per_sample, channels)
    stream = speechsdk.audio.PushAudioInputStream(stream_format=wave_format)
    audio_config = speechsdk.audio.AudioConfig(stream=stream)

    # set the conversation-ending silence timeout (4 hours, in seconds)
    conversation_ending_detection_timeout = 4 * 60 * 60
    # speech_config.set_service_property("conversationEndSilenceTimeoutMs", str(
    #     conversation_ending_detection_timeout * 1000), speechsdk.ServicePropertyChannel.UriQueryParameter)
    # OR via a PropertyId (set_property takes a PropertyId; set_service_property expects a plain name string)
    speech_config.set_property(speechsdk.PropertyId.Speech_SegmentationSilenceTimeoutMs, str(
        conversation_ending_detection_timeout * 1000))

    # I also tried the two latency properties below, although as far as I can
    # tell they are metrics reported on speech synthesis (TTS) results rather
    # than settings for recognition
    valueHere = speechsdk.PropertyId.SpeechServiceResponse_SynthesisFirstByteLatencyMs
    logger.info("speechsdk.PropertyId.SpeechServiceResponse_SynthesisFirstByteLatencyMs-->" + str(valueHere))
    speech_config.set_property(speechsdk.PropertyId.SpeechServiceResponse_SynthesisFirstByteLatencyMs, str(500))
    speech_config.set_property(speechsdk.PropertyId.SpeechServiceResponse_SynthesisFinishLatencyMs, str(500))

    transcriber = speechsdk.transcription.ConversationTranscriber(
        speech_config, audio_config)

    # start the push-stream writer thread
    push_stream_writer_thread = threading.Thread(
        target=push_stream_writer, args=[stream])
    push_stream_writer_thread.start()
    time.sleep(.1)

    done = False

    def stop_cb(evt: speechsdk.SessionEventArgs):
        """Callback that signals to stop continuous transcription upon receiving an event `evt`."""
        print('CLOSING {}'.format(evt))
        nonlocal done
        done = True

    # subscribe to the events fired by the conversation transcriber
    # transcriber.transcribing.connect(conversation_transcriber_transcribing_cb)
    transcriber.transcribed.connect(conversation_transcriber_transcribed_cb)
    transcriber.session_started.connect(
        conversation_transcriber_session_started_cb)
    transcriber.session_stopped.connect(
        conversation_transcriber_session_stopped_cb)
    transcriber.canceled.connect(
        conversation_transcriber_recognition_canceled_cb)
    # stop continuous transcription on either a session-stopped or canceled event
    transcriber.session_stopped.connect(stop_cb)
    transcriber.canceled.connect(stop_cb)

    transcriber.start_transcribing_async().get()
    # wait for completion
    while not done:
        time.sleep(.1)
    transcriber.stop_transcribing_async().get()
    push_stream_writer_thread.join()
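
For completeness, I invoke it like this:

if __name__ == '__main__':
    conversation_transcription()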