Hi,
I am trying to use the speech-to-text API from within Azure Machine Learning Studio on an MP3 file.
I installed GStreamer using the Ubuntu/Debian instructions at https://learn.microsoft.com/en-us/azure/cognitive-services/speech-service/how-to-use-codec-compressed-audio-input-streams?tabs=linux%2Cdebian%2Cjava-android%2Cterminal&pivots=programming-language-python
However, when I then run my code, it throws an error. My error log and code block are below.
On Stack Overflow I found a solution for Windows that had to do with the installed GStreamer version. Do the Ubuntu instructions on your site also refer to a wrong version, or is this error related to something else I'm missing?
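As a quick sanity check of my own (not from your docs), something I could run from the notebook kernel to see whether the process can locate the GStreamer shared library at all, and which version it is, would be along these lines:
import ctypes, ctypes.util
# Look up libgstreamer-1.0 on the loader path; None suggests the Speech SDK won't find it either
libname = ctypes.util.find_library("gstreamer-1.0")
print("libgstreamer-1.0:", libname)
if libname:
    gst = ctypes.CDLL(libname)
    major, minor, micro, nano = ctypes.c_uint(), ctypes.c_uint(), ctypes.c_uint(), ctypes.c_uint()
    # gst_version(guint *major, guint *minor, guint *micro, guint *nano)
    gst.gst_version(ctypes.byref(major), ctypes.byref(minor), ctypes.byref(micro), ctypes.byref(nano))
    print("GStreamer version: {}.{}.{}".format(major.value, minor.value, micro.value))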
Could it, for example, be that I still need to build an authentication token as well, as explained at https://learn.microsoft.com/en-us/azure/cognitive-services/speech-service/how-to-configure-azure-ad-auth?tabs=portal&pivots=programming-language-python:
resourceId = "Your Resource ID"
region = "Your Region"
# You need to include the "aad#" prefix and the "#" (hash) separator between resource ID and AAD access token.
authorizationToken = "aad#" + resourceId + "#" + aadToken.token
speechConfig = speechsdk.SpeechConfig(auth_token=authorizationToken, region=region)
before running:
speech_config = speechsdk.SpeechConfig(subscription=key, region=regio)
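For completeness, this is roughly how I understand the AAD token flow from that page (untested on my side; the resource ID and region values below are placeholders, and DefaultAzureCredential is just one example way to obtain the token):
from azure.identity import DefaultAzureCredential
import azure.cognitiveservices.speech as speechsdk

resourceId = "/subscriptions/<sub>/resourceGroups/<rg>/providers/Microsoft.CognitiveServices/accounts/<speech-resource>"  # placeholder
region = "<my speech services region>"  # placeholder

# Get an AAD access token for the Cognitive Services scope
aadToken = DefaultAzureCredential().get_token("https://cognitiveservices.azure.com/.default")

# "aad#" prefix + resource ID + "#" separator + AAD access token
authorizationToken = "aad#" + resourceId + "#" + aadToken.token
speech_config = speechsdk.SpeechConfig(auth_token=authorizationToken, region=region)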
Error log:
Input In [4], in pull_audio_input_stream_compressed_mp3(mp3_file_path, taal, key, regio)
1 def pull_audio_input_stream_compressed_mp3(mp3_file_path: str, taal, key, regio):
2 # Create a compressed format
3 compressed_format = speechsdk.audio.AudioStreamFormat(compressed_stream_format=speechsdk.AudioStreamContainerFormat.MP3)
----> 4 result=compressed_stream_helper(compressed_format, mp3_file_path, taal, key, regio)
6 return result
Input In [3], in compressed_stream_helper(compressed_format, mp3_file_path, taal, key, regio)
6 speech_config.speech_recognition_language=taal
7 audio_config = speechsdk.audio.AudioConfig(stream=stream)
----> 9 speech_recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config, audio_config=audio_config)
11 done = False
13 def stop_cb(evt):
File /anaconda/envs/azureml_py310_sdkv2/lib/python3.10/site-packages/azure/cognitiveservices/speech/speech.py:1004, in SpeechRecognizer.__init__(self, speech_config, audio_config, language, source_language_config, auto_detect_source_language_config)
1002 audio_config_handle = audio_config._handle if audio_config is not None else None
1003 if language is None and source_language_config is None and auto_detect_source_language_config is None:
-> 1004 _call_hr_fn(
1005 fn=_sdk_lib.recognizer_create_speech_recognizer_from_config,
1006 *[ctypes.byref(handle), speech_config._handle, audio_config_handle])
1007 elif language is not None:
1008 source_language_config = languageconfig.SourceLanguageConfig(language)
File /anaconda/envs/azureml_py310_sdkv2/lib/python3.10/site-packages/azure/cognitiveservices/speech/interop.py:62, in _call_hr_fn(fn, *args)
60 fn.restype = _spx_hr
61 hr = fn(*args) if len(args) > 0 else fn()
---> 62 _raise_if_failed(hr)
File /anaconda/envs/azureml_py310_sdkv2/lib/python3.10/site-packages/azure/cognitiveservices/speech/interop.py:55, in _raise_if_failed(hr)
53 def _raise_if_failed(hr: _spx_hr):
54 if hr != 0:
---> 55 __try_get_error(_spx_handle(hr))
56 raise RuntimeError(hr)
File /anaconda/envs/azureml_py310_sdkv2/lib/python3.10/site-packages/azure/cognitiveservices/speech/interop.py:50, in __try_get_error(error_handle)
45 message = "Exception with error code: %s%s" % (
46 callstack if callstack is not None else "",
47 what if what is not None else code
48 )
49 _sdk_lib.error_release(error_handle)
---> 50 raise RuntimeError(message)
RuntimeError: Exception with error code:
[CALL STACK BEGIN]
/anaconda/envs/azureml_py310_sdkv2/lib/python3.10/site-packages/azure/cognitiveservices/speech/libMicrosoft.CognitiveServices.Speech.core.so(+0x1638a5) [.....]
...
[CALL STACK END]
Exception with an error code: 0x29 (SPXERR_GSTREAMER_NOT_FOUND_ERROR)
Code:
import time
import azure.cognitiveservices.speech as speechsdk

class BinaryFileReaderCallback(speechsdk.audio.PullAudioInputStreamCallback):
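    # The body below is the callback as I copied it from the docs page linked above
    # (reproduced here from memory, so treat it as approximate): read() fills the
    # buffer the SDK hands us from the binary file, close() releases the file handle.
    def __init__(self, filename: str):
        super().__init__()
        self._file_h = open(filename, "rb")

    def read(self, buffer: memoryview) -> int:
        try:
            size = buffer.nbytes
            frames = self._file_h.read(size)
            buffer[:len(frames)] = frames
            return len(frames)
        except Exception as ex:
            print('Exception in `read`: {}'.format(ex))
            raise

    def close(self) -> None:
        try:
            self._file_h.close()
        except Exception as ex:
            print('Exception in `close`: {}'.format(ex))
            raise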

def compressed_stream_helper(compressed_format, mp3_file_path, taal, key, regio):
    callback = BinaryFileReaderCallback(mp3_file_path)
    stream = speechsdk.audio.PullAudioInputStream(stream_format=compressed_format, pull_stream_callback=callback)

    # Is Speech authentication via an AAD token still needed here too?
    # https://learn.microsoft.com/en-us/azure/cognitive-services/speech-service/how-to-configure-azure-ad-auth?tabs=portal&pivots=programming-language-python
    speech_config = speechsdk.SpeechConfig(subscription=key, region=regio)
    speech_config.speech_recognition_language = taal
    audio_config = speechsdk.audio.AudioConfig(stream=stream)

    speech_recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config, audio_config=audio_config)

    done = False
    recognized_text = []

    def stop_cb(evt):
        """callback that signals to stop continuous recognition upon receiving an event `evt`"""
        print('CLOSING on {}'.format(evt))
        nonlocal done
        done = True

    def recognized_cb(evt):
        """callback that collects the recognized text so the caller can return it"""
        print('RECOGNIZED: {}'.format(evt))
        recognized_text.append(evt.result.text)

    # Connect callbacks to the events fired by the speech recognizer
    speech_recognizer.recognizing.connect(lambda evt: print('RECOGNIZING: {}'.format(evt)))
    speech_recognizer.recognized.connect(recognized_cb)
    speech_recognizer.session_started.connect(lambda evt: print('SESSION STARTED: {}'.format(evt)))
    speech_recognizer.session_stopped.connect(lambda evt: print('SESSION STOPPED {}'.format(evt)))
    speech_recognizer.canceled.connect(lambda evt: print('CANCELED {}'.format(evt)))

    # Stop continuous recognition on either session stopped or canceled events
    speech_recognizer.session_stopped.connect(stop_cb)
    speech_recognizer.canceled.connect(stop_cb)

    # Start continuous speech recognition and wait until the stream has been processed
    speech_recognizer.start_continuous_recognition()
    while not done:
        time.sleep(.5)
    speech_recognizer.stop_continuous_recognition()

    return ' '.join(recognized_text)
def pull_audio_input_stream_compressed_mp3(mp3_file_path: str, taal, key, regio):
    # Create a compressed stream format for the MP3 container
    compressed_format = speechsdk.audio.AudioStreamFormat(compressed_stream_format=speechsdk.AudioStreamContainerFormat.MP3)
    result = compressed_stream_helper(compressed_format, mp3_file_path, taal, key, regio)
    return result

tekst = pull_audio_input_stream_compressed_mp3("file1.mp3", "fr-BE", "<my speech services key>", "<my speech services region>")