Share via

How to use Azure Speech-to-Text continuous recognition?

Kota Matuda 21 Reputation points
2022-12-19T14:20:48.38+00:00

Development Environment
HoloLens2
Unity 2020.3.11f
Visual Studio 2019
Microsoft Mixed Reality Toolkit v2.7.3
Microsoft.CognitiveServices.Speech package

I would like to perform speech-to-text conversion of non-user (ambient) audio using the HoloLens 2.
I created a readBytes (byte[]) field that holds audio data captured with MicStream.StreamCategory.ROOM_CAPTURE via the HoloToolkit.Unity.InputModule.MicStream class (MicStream.cs).

using System;  
using System.Collections.Generic;  
using System.IO;  
using System.Text;  
using HoloToolkit.Unity.InputModule;  
using UnityEngine;  
using UnityEngine.UI;  
using System.Linq;  
  
public class Test : MonoBehaviour  
{  
    // Latest snapshot of ALL audio captured since the last start, encoded as
    // 16-bit little-endian PCM bytes. Intended to be consumed by the
    // speech-recognition side (e.g. pushed into a PushAudioInputStream).
    public static byte[] readBytes;  
  
    // ROOM_CAPTURE records ambient/room audio rather than beam-formed user voice.
    public MicStream.StreamCategory StreamType = MicStream.StreamCategory.ROOM_CAPTURE;  
  
    public bool KeepAllData = false;  
  
    public float InputGain = 1;  
  
    // All samples captured since the last OnClickButton1, as 16-bit PCM.
    public readonly List<short> samplingData = new List<short>();  
  
    private bool _isStart = false;  
  
    // PCM bytes accumulated incrementally so the whole sample list is never
    // re-converted on every audio callback (the original approach re-enumerated
    // all of samplingData each frame: O(n^2) over a capture session).
    private readonly List<byte> _pcmBytes = new List<byte>();  
  
    // Private gate object: locking on 'this' would allow any external code
    // holding a reference to this component to deadlock the audio callback.
    private readonly object _gate = new object();  
  
    private void Awake()  
    {  
        // Initialize the mic capture at Unity's output sample rate so MicGetFrame
        // buffers line up with OnAudioFilterRead buffers.
        CheckForErrorOnCall(MicStream.MicInitializeCustomRate((int)StreamType, AudioSettings.outputSampleRate));  
    }  
  
    /// <summary>Starts capturing: clears previously captured data and opens the mic stream.</summary>
    public void OnClickButton1()  
    {  
        // Clear under the lock — OnAudioFilterRead runs on the audio thread and
        // may be appending/enumerating concurrently (the original cleared unlocked).
        lock (_gate)  
        {  
            samplingData.Clear();  
            _pcmBytes.Clear();  
        }  
        CheckForErrorOnCall(MicStream.MicStartStream(KeepAllData, false));  
        CheckForErrorOnCall(MicStream.MicSetGain(InputGain));  
        _isStart = true;  
    }  
  
    /// <summary>Stops capturing.</summary>
    public void OnClickButton2()  
    {  
        _isStart = false;  
        CheckForErrorOnCall(MicStream.MicStopStream());  
    }  
  
    private void OnDestroy()  
    {  
        CheckForErrorOnCall(MicStream.MicDestroy());  
    }  
  
    // Unity audio-thread callback. NOTE(review): 'async void' was removed — the
    // method awaited nothing, and async void makes exceptions unobservable;
    // Unity invokes this by signature, which is unchanged. Allocations here
    // (list growth, ToArray) can still glitch audio — acceptable for a
    // prototype; consider a preallocated ring buffer for production.
    private void OnAudioFilterRead(float[] buffer, int numChannels)  
    {  
        if (!_isStart) return;  
        lock (_gate)  
        {  
            CheckForErrorOnCall(MicStream.MicGetFrame(buffer, buffer.Length, numChannels));  
  
            foreach (var f in buffer)  
            {  
                var s = FloatToInt16(f);  
                samplingData.Add(s);  
                // Append the new sample's two bytes (little-endian, matching
                // BitConverter on HoloLens/Windows) instead of reconverting
                // the whole list, and without a per-sample array allocation.
                _pcmBytes.Add((byte)(s & 0xFF));  
                _pcmBytes.Add((byte)((s >> 8) & 0xFF));  
            }  
  
            // Snapshot inside the lock: the original enumerated samplingData
            // outside it, racing OnClickButton1's Clear() on the main thread.
            readBytes = _pcmBytes.ToArray();  
        }  
    }  
  
    // Kept for API compatibility with any other callers; lazily streams each
    // 16-bit sample as two bytes in machine (little-endian) order.
    private IEnumerable<byte> ConvertBytes(List<short> sampleData)  
    {  
        foreach (var s in sampleData)  
        {  
            var bytes = BitConverter.GetBytes(s);  
            yield return bytes[0];  
            yield return bytes[1];  
        }  
    }  
  
    // Thin wrapper so every native MicStream return code is checked uniformly.
    private void CheckForErrorOnCall(int returnCode)  
    {  
        MicStream.CheckForErrorOnCall(returnCode);  
    }  
  
    // Converts a [-1, 1] float sample to a clamped signed 16-bit PCM value.
    private static short FloatToInt16(float value)  
    {  
        var f = value * short.MaxValue;  
        if (f > short.MaxValue) f = short.MaxValue;  
        if (f < short.MinValue) f = short.MinValue;  
        return (short)f;  
    }  
}  

I would like to implement continuous recognition with the Azure Speech service's speech-to-text feature, following the documentation (how-to-recognize-speech).

Could you show me what the script would look like if continuous recognition were implemented using the readBytes (byte[]) data I created?
Any pointers or example code would be greatly appreciated.

Community Center | Not monitored
{count} votes

Your answer

Answers can be marked as 'Accepted' by the question author and 'Recommended' by moderators, which helps users know the answer solved the author's problem.