How to create an AudioConfig object given a public link to an audio file

Nikhil Kapse 0 Reputation points
2023-10-30T03:07:51.5466667+00:00

I'm currently using Azure's Speech Pronunciation Assessment service, and I'm receiving my input audio via a public url to an audio file (which is not in azure blob storage). Is there any way for me to create an AudioConfig without having to use local or Azure Blob storage? I'm aware I can't pass the url in directly to the AudioConfig constructor, but are there other ways to accomplish something like this?

Azure AI Speech
Azure AI Speech
An Azure service that integrates speech processing into apps and services.
1,728 questions
0 comments No comments
{count} votes

2 answers

Sort by: Most helpful
  1. dupammi 8,465 Reputation points Microsoft Vendor
    2023-10-30T09:35:10.7466667+00:00

    Hi @Nikhil Kapse ,

    Thank you for using the Microsoft Q&A.

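    The Speech SDK's AudioConfig cannot be created directly from a URL. However, you can get the same pronunciation assessment result without local or Azure Blob storage by streaming the audio from the public link to the Speech-to-text REST API. To do so, follow these steps.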

    Import the necessary libraries and configure your subscription key and region.

    import requests
    import base64
    import json
    import time
    import azure.cognitiveservices.speech as speechsdk
    import os

    # Read the credentials from the environment so the key is not hard-coded in
    # source; the original placeholders remain as fallbacks for a quick manual edit.
    subscriptionKey = os.environ.get("SPEECH_KEY", "YOUR_SUBSCRIPTION_KEY")
    region = os.environ.get("SPEECH_REGION", "YOUR_REGION")
    # Publicly reachable audio file to assess (replace with your own URL).
    public_audio_url = "https://www.example.com/public_audio.wav"
    

    Initialize uploadFinishTime and define the wave header.

    # Timestamp of the end of the upload; 0 means "not recorded yet".
    uploadFinishTime = 0

    # RIFF/WAVE header for 16 kHz, 16-bit, mono PCM with a zero-length data chunk,
    # spelled out field by field (all multi-byte integers are little-endian).
    WaveHeader16K16BitMono = b"".join([
        b"RIFF", bytes([78, 128, 0, 0]),   # RIFF chunk id + declared size
        b"WAVE",
        b"fmt ", bytes([18, 0, 0, 0]),     # fmt chunk id + fmt chunk size (18)
        bytes([1, 0]),                     # format tag: 1
        bytes([1, 0]),                     # channels: 1
        bytes([128, 62, 0, 0]),            # sample rate: 16000
        bytes([0, 125, 0, 0]),             # byte rate: 32000
        bytes([2, 0]),                     # block align: 2
        bytes([16, 0]),                    # bits per sample: 16
        bytes([0, 0]),                     # extension size: 0
        b"data", bytes([0, 0, 0, 0]),      # data chunk id + zero length
    ])
    

    Create a generator function to read audio data chunk by chunk from the URL.

    def get_chunk_from_url(audio_url, chunk_size=1024, timeout=30):
        """Stream the audio at ``audio_url`` as a sequence of byte chunks.

        Yields ``WaveHeader16K16BitMono`` first, then the downloaded bytes in
        pieces of at most ``chunk_size``. When the download is exhausted, the
        module-level ``uploadFinishTime`` is set to the current time.

        Args:
            audio_url: Public URL of the audio file to download.
            chunk_size: Maximum number of bytes per yielded chunk.
            timeout: Timeout in seconds passed to ``requests.get`` so an
                unresponsive host cannot hang the upload forever.

        Raises:
            requests.HTTPError: If the download responds with an error status.
        """
        global uploadFinishTime  # written once the last chunk has been yielded
        with requests.get(audio_url, stream=True, timeout=timeout) as response:
            # Check the download before anything is yielded, so a 404 page is
            # never uploaded to the speech service as if it were audio.
            response.raise_for_status()
            # NOTE(review): the header is always prepended; if the remote file is
            # itself a WAV container, its own header bytes follow as payload --
            # confirm this is acceptable for your audio.
            yield WaveHeader16K16BitMono
            for chunk in response.iter_content(chunk_size=chunk_size):
                # Skip empty chunks: in a chunked upload a zero-length chunk
                # could be taken as the end of the request body.
                if chunk:
                    yield chunk
        # iter_content() finishes by exhaustion rather than by yielding an empty
        # chunk, so the finish time has to be recorded after the loop.
        uploadFinishTime = time.time()
    
    

    Build pronunciation assessment parameters and request headers.

    # Text the speaker is expected to say; the audio is scored against it.
    referenceText = "Perhaps"
    # Build the JSON with json.dumps so quotes, backslashes or non-ASCII
    # characters in referenceText are escaped instead of producing invalid JSON.
    pronAssessmentParamsJson = json.dumps({
        "ReferenceText": referenceText,
        "GradingSystem": "HundredMark",
        "Dimension": "Comprehensive",
    })
    # The parameters travel in an HTTP header, so they are base64-encoded
    # and then turned back into a str.
    pronAssessmentParamsBase64 = base64.b64encode(pronAssessmentParamsJson.encode('utf-8'))
    pronAssessmentParams = pronAssessmentParamsBase64.decode('utf-8')

    url = "https://%s.stt.speech.microsoft.com/speech/recognition/conversation/cognitiveservices/v1?language=en-us" % region
    headers = {
        'Accept': 'application/json;text/xml',
        'Connection': 'Keep-Alive',
        'Content-Type': 'audio/wav; codecs=audio/pcm; samplerate=16000',
        'Ocp-Apim-Subscription-Key': subscriptionKey,
        'Pronunciation-Assessment': pronAssessmentParams,
        'Transfer-Encoding': 'chunked',
        'Expect': '100-continue'
    }
    

    Send the request with chunked data from the public audio URL.

    # Passing a generator as `data` makes requests upload the body in chunks.
    response = requests.post(url=url, data=get_chunk_from_url(public_audio_url), headers=headers)
    getResponseTime = time.time()

    # Surface HTTP errors with the service's own message instead of failing
    # later with an unrelated JSON parsing error.
    if not response.ok:
        raise RuntimeError("Speech request failed with HTTP %s: %s" % (response.status_code, response.text))

    resultJson = json.loads(response.text)
    print(json.dumps(resultJson, indent=4))

    # If the upload finish time was never recorded, fall back to the response
    # time so the reported latency is 0 rather than a negative number.
    if uploadFinishTime == 0:
        uploadFinishTime = getResponseTime
    latency = getResponseTime - uploadFinishTime
    print("Latency = %sms" % int(latency * 1000))
    
    
    

    This code allows you to perform pronunciation assessment with Azure's Speech Pronunciation Assessment service using a public URL for the audio file, without the need for local or Azure Blob storage. The audio data is streamed from the URL, and the pronunciation assessment is performed based on the provided parameters.

    Output

    {
        "RecognitionStatus": "Success",
        "Offset": 190300000,
        "Duration": 3300000,
        "NBest": [
            {
                "Confidence": 0.99992526,
                "Lexical": "Perhaps",
                "ITN": "Perhaps",
                "MaskedITN": "perhaps",
                "Display": "Perhaps.",
                "AccuracyScore": 25.0,
                "FluencyScore": 0.0,
                "CompletenessScore": 0.0,
                "PronScore": 5.0,
                "Words": [
                    {
                        "Word": "Perhaps",
                        "Offset": 190300000,
                        "Duration": 3300000,
                        "Confidence": 0.0,
                        "AccuracyScore": 25.0,
                        "ErrorType": "Mispronunciation",
                        "Syllables": [
                            {
                                "Syllable": "paxr",
                                "Offset": 190300000,
                                "Duration": 1200000,
                                "AccuracyScore": 32.0
                            },
                            {
                                "Syllable": "haeps",
                                "Offset": 191600000,
                                "Duration": 2000000,
                                "AccuracyScore": 20.0
                            }
                        ],
                        "Phonemes": [
                            {
                                "Phoneme": "p",
                                "Offset": 190300000,
                                "Duration": 200000,
                                "AccuracyScore": 0.0
                            },
                            {
                                "Phoneme": "ax",
                                "Offset": 190600000,
                                "Duration": 500000,
                                "AccuracyScore": 31.0
                            },
                            {
                                "Phoneme": "r",
                                "Offset": 191200000,
                                "Duration": 300000,
                                "AccuracyScore": 57.0
                            },
                            {
                                "Phoneme": "h",
                                "Offset": 191600000,
                                "Duration": 500000,
                                "AccuracyScore": 35.0
                            },
                            {
                                "Phoneme": "ae",
                                "Offset": 192200000,
                                "Duration": 500000,
                                "AccuracyScore": 13.0
                            },
                            {
                                "Phoneme": "p",
                                "Offset": 192800000,
                                "Duration": 500000,
                                "AccuracyScore": 23.0
                            },
                            {
                                "Phoneme": "s",
                                "Offset": 193400000,
                                "Duration": 200000,
                                "AccuracyScore": 0.0
                            }
                        ]
                    }
                ]
            }
        ],
        "DisplayText": "Perhaps."
    }
    Latency = 0ms
    

    For more details, see the Speech-to-text REST API documentation and the PronunciationAssessment sample.


  2. dupammi 8,465 Reputation points Microsoft Vendor
    2023-10-30T16:10:32.9433333+00:00

    Hi @Nikhil Kapse ,

    Thank you for the response.

    If you have a public audio URL that points to a .ogg file and you want to perform pronunciation assessment without the need for local file storage, you can still achieve this by converting the .ogg file to the required .wav format in-memory and then proceed with pronunciation assessment using the converted WAV audio data.

    The process is similar to the previous code, but it handles .ogg audio files.

    Pip install:

    pip install soundfile

    Sample Code below.

    import requests
    import base64
    import json
    import time
    import azure.cognitiveservices.speech as speechsdk
    import soundfile as sf #read the .ogg audio and write it to the in-memory WAV audio.
    import io # in-memory 
    import os

    # Azure subscription key and region: read from the environment so the key is
    # not hard-coded in source; the placeholders remain as fallbacks.
    subscriptionKey = os.environ.get("SPEECH_KEY", "YOUR_SUBSCRIPTION_KEY")
    region = os.environ.get("SPEECH_REGION", "YOUR_REGION")

    # Replace with the URL of the public .ogg audio file
    public_audio_url = "https://upload.wikimedia.org/wikipedia/commons/c/c8/Example.ogg"
    
    # Convert the .ogg audio to 16 kHz, 16-bit, mono WAV in memory.
    import numpy as np  # soundfile already hands back NumPy arrays

    target_rate = 16000  # sample rate declared in the request's Content-Type

    response = requests.get(public_audio_url, timeout=30)
    response.raise_for_status()  # do not try to decode an HTTP error page as audio
    wav_audio = io.BytesIO()

    with sf.SoundFile(io.BytesIO(response.content), 'rb') as ogg_audio:
        source_rate = ogg_audio.samplerate
        # always_2d gives shape (frames, channels) even for mono input.
        samples = ogg_audio.read(dtype='float32', always_2d=True)

    # Downmix to a single channel by averaging the channels.
    mono = samples.mean(axis=1)

    # Writing the samples with a 16000 Hz label without resampling would change
    # the playback speed whenever the source uses another rate, so resample.
    # Linear interpolation applies no anti-aliasing filter; use a dedicated
    # resampler if quality matters.
    if source_rate != target_rate and len(mono) > 1:
        target_len = max(1, int(round(len(mono) * target_rate / float(source_rate))))
        source_times = np.arange(len(mono)) / float(source_rate)
        target_times = np.arange(target_len) / float(target_rate)
        mono = np.interp(target_times, source_times, mono)

    sf.write(wav_audio, mono, target_rate, format='WAV', subtype='PCM_16')
    
    # Timestamp of the end of the upload; 0 means "not recorded yet".
    uploadFinishTime = 0

    # RIFF/WAVE header for 16 kHz, 16-bit, mono PCM with a zero-length data chunk,
    # spelled out field by field (all multi-byte integers are little-endian).
    WaveHeader16K16BitMono = b"".join([
        b"RIFF", bytes([78, 128, 0, 0]),   # RIFF chunk id + declared size
        b"WAVE",
        b"fmt ", bytes([18, 0, 0, 0]),     # fmt chunk id + fmt chunk size (18)
        bytes([1, 0]),                     # format tag: 1
        bytes([1, 0]),                     # channels: 1
        bytes([128, 62, 0, 0]),            # sample rate: 16000
        bytes([0, 125, 0, 0]),             # byte rate: 32000
        bytes([2, 0]),                     # block align: 2
        bytes([16, 0]),                    # bits per sample: 16
        bytes([0, 0]),                     # extension size: 0
        b"data", bytes([0, 0, 0, 0]),      # data chunk id + zero length
    ])
    
    # A generator which reads audio data chunk by chunk from the in-memory WAV audio
    def get_chunk_from_audio(wav_audio, chunk_size=1024):
        """Yield the wave header, then the contents of ``wav_audio`` in chunks.

        ``wav_audio`` is rewound to position 0 and read ``chunk_size`` bytes at
        a time. When a read returns nothing, the module-level
        ``uploadFinishTime`` is set to the current time and the generator ends.
        """
        global uploadFinishTime  # written when the buffer is exhausted
        yield WaveHeader16K16BitMono
        wav_audio.seek(0)
        piece = wav_audio.read(chunk_size)
        while piece:
            yield piece
            piece = wav_audio.read(chunk_size)
        uploadFinishTime = time.time()
    
    # Build pronunciation assessment parameters and request headers
    # Text the speaker is expected to say; the audio is scored against it.
    referenceText = "example"
    # Build the JSON with json.dumps so quotes, backslashes or non-ASCII
    # characters in referenceText are escaped instead of producing invalid JSON.
    pronAssessmentParamsJson = json.dumps({
        "ReferenceText": referenceText,
        "GradingSystem": "HundredMark",
        "Dimension": "Comprehensive",
    })
    # The parameters travel in an HTTP header, so they are base64-encoded
    # and then turned back into a str.
    pronAssessmentParamsBase64 = base64.b64encode(pronAssessmentParamsJson.encode('utf-8'))
    pronAssessmentParams = pronAssessmentParamsBase64.decode('utf-8')

    url = "https://%s.stt.speech.microsoft.com/speech/recognition/conversation/cognitiveservices/v1?language=en-us" % region
    headers = {
        'Accept': 'application/json;text/xml',
        'Connection': 'Keep-Alive',
        'Content-Type': 'audio/wav; codecs=audio/pcm; samplerate=16000',
        'Ocp-Apim-Subscription-Key': subscriptionKey,
        'Pronunciation-Assessment': pronAssessmentParams,
        'Transfer-Encoding': 'chunked',
        'Expect': '100-continue'
    }
    
    # Send request with chunked data from the in-memory WAV audio
    # (passing a generator as `data` makes requests upload the body in chunks).
    response = requests.post(url=url, data=get_chunk_from_audio(wav_audio), headers=headers)
    getResponseTime = time.time()

    # Surface HTTP errors with the service's own message instead of failing
    # later with an unrelated JSON parsing error.
    if not response.ok:
        raise RuntimeError("Speech request failed with HTTP %s: %s" % (response.status_code, response.text))

    resultJson = json.loads(response.text)
    print(json.dumps(resultJson, indent=4))

    # If the upload finish time was never recorded, fall back to the response
    # time so the reported latency is 0 rather than a negative number.
    if uploadFinishTime == 0:
        uploadFinishTime = getResponseTime
    latency = getResponseTime - uploadFinishTime
    print("Latency = %sms" % int(latency * 1000))
    

    Hope this helps. Thanks!


    If this answers your query, please click Accept Answer and select Yes for "Was this answer helpful?".


Your answer

Answers can be marked as Accepted Answers by the question author, which helps users to know the answer solved the author's problem.