Repetitive result issue with Azure Speech Recognition in .NET

Philipp Schleier 0 Reputation points
2024-07-19T17:23:12.1933333+00:00

I'm building a live mic speech-to-text transcription on Azure using .NET. However, a single sentence results in multiple repetitive sentences. I need help identifying the issue.

What I want is live transcription of a sentence that appears once without repetition, exactly how it works on Azure's Speech Studio - Real-time speech to text.

Here's my code:

Program.cs:

using Microsoft.AspNetCore.Hosting;
using Microsoft.Extensions.Configuration;
using Microsoft.Extensions.DependencyInjection;
using Microsoft.Extensions.Hosting;
using Microsoft.Extensions.Logging;
using System;
using System.Linq;
using [ ].Hubs;
using [ ].Services;
using [ ].Data;

public class Program
{
    public static void Main(string[] args)
    {
        CreateHostBuilder(args).Build().Run();
    }

    public static IHostBuilder CreateHostBuilder(string[] args) =>
        Host.CreateDefaultBuilder(args)
            .ConfigureWebHostDefaults(webBuilder =>
            {
                webBuilder.UseUrls("http://localhost:5001");
                webBuilder.ConfigureServices((context, services) =>
                {
                    services.AddControllers();
                    services.AddSignalR();

                    // Configure CORS
                    services.AddCors(options =>
                    {
                        options.AddDefaultPolicy(builder =>
                        {
                            builder.WithOrigins("http://localhost:3000")
                                   .AllowAnyHeader()
                                   .AllowAnyMethod()
                                   .AllowCredentials();
                        });
                    });

                    services.AddSingleton<SpeechService>();
                    services.AddSingleton<TranscriptionService>();

                    // Add DbContext and hosted service
                    services.AddDbContext<CosmosDbContext>();
                    services.AddHostedService<CosmosDbTestService>();
                })
                .Configure((context, app) =>
                {
                    if (context.HostingEnvironment.IsDevelopment())
                    {
                        app.UseDeveloperExceptionPage();
                    }

                    // Enable CORS
                    app.UseCors();

                    app.UseRouting();

                    app.UseEndpoints(endpoints =>
                    {
                        endpoints.MapControllers();
                        endpoints.MapHub<TranscriptionHub>("/transcriptionHub");
                    });
                });
            })
            .ConfigureLogging(logging =>
            {
                logging.ClearProviders();
                logging.AddConsole();
            });
}

SpeechService.cs (/Service):

using Microsoft.CognitiveServices.Speech;
using Microsoft.CognitiveServices.Speech.Audio;

namespace [ ].Services
{
    public class SpeechService
    {
        private readonly SpeechRecognizer _speechRecognizer;
        private bool isRecognizing = false;
        public event Action<string>? OnRecognizing;
        public event Action<string>? OnRecognized;

        public SpeechService()
        {
            var subscriptionKey = Environment.GetEnvironmentVariable("AZURE_SPEECH_KEY");
            var region = Environment.GetEnvironmentVariable("AZURE_SPEECH_REGION");

            if (string.IsNullOrEmpty(subscriptionKey) || string.IsNullOrEmpty(region))
            {
                throw new InvalidOperationException("Azure Speech Service key and region must be provided via environment variables.");
            }

            var speechConfig = SpeechConfig.FromSubscription(subscriptionKey, region);
            speechConfig.SpeechRecognitionLanguage = "de-DE";
            speechConfig.EnableDictation(); // Enable dictation mode for explicit punctuation

            var audioConfig = AudioConfig.FromDefaultMicrophoneInput();
            _speechRecognizer = new SpeechRecognizer(speechConfig, audioConfig);

            _speechRecognizer.Recognizing += (s, e) =>
            {
                if (e.Result.Reason == ResultReason.RecognizingSpeech)
                {
                    OnRecognizing?.Invoke(e.Result.Text);
                }
            };

            _speechRecognizer.Recognized += (s, e) =>
            {
                if (e.Result.Reason == ResultReason.RecognizedSpeech)
                {
                    OnRecognized?.Invoke(e.Result.Text);
                }
            };

            _speechRecognizer.Canceled += (s, e) =>
            {
                isRecognizing = false;
                Console.WriteLine($"Recognition canceled: {e.Reason}, {e.ErrorDetails}");
            };

            _speechRecognizer.SessionStopped += (s, e) =>
            {
                isRecognizing = false;
                Console.WriteLine($"Session stopped: {e.SessionId}");
            };
        }

        public async Task StartRecognitionAsync()
        {
            if (!isRecognizing)
            {
                isRecognizing = true;
                await _speechRecognizer.StartContinuousRecognitionAsync().ConfigureAwait(false);
            }
        }

        public async Task StopRecognitionAsync()
        {
            if (isRecognizing)
            {
                await _speechRecognizer.StopContinuousRecognitionAsync().ConfigureAwait(false);
                isRecognizing = false;
            }
        }
    }
}

TranscriptionService.cs (/Service):

using Microsoft.AspNetCore.SignalR;
using System.Collections.Concurrent;
using [ ].Hubs;

namespace [ ].Services
{
    public class TranscriptionService
    {
        private readonly IHubContext<TranscriptionHub> _hubContext;
        private readonly ConcurrentDictionary<string, string> _connections = new ConcurrentDictionary<string, string>();

        public TranscriptionService(IHubContext<TranscriptionHub> hubContext)
        {
            _hubContext = hubContext;
        }

        public void AddConnection(string connectionId)
        {
            _connections[connectionId] = connectionId;
        }

        public void RemoveConnection(string connectionId)
        {
            _connections.TryRemove(connectionId, out _);
        }

        public async Task BroadcastRecognizing(string text)
        {
            foreach (var connectionId in _connections.Keys)
            {
                await _hubContext.Clients.Client(connectionId).SendAsync("ReceiveRecognizing", text);
            }
        }

        public async Task BroadcastRecognized(string text)
        {
            foreach (var connectionId in _connections.Keys)
            {
                await _hubContext.Clients.Client(connectionId).SendAsync("ReceiveRecognized", text);
            }
        }
    }
}

TranscriptionHub.cs (/Hub):

using Microsoft.AspNetCore.SignalR;
using [ ].Services;

namespace [ ].Hubs
{
    public class TranscriptionHub : Hub
    {
        private readonly SpeechService _speechService;
        private readonly TranscriptionService _transcriptionService;

        public TranscriptionHub(SpeechService speechService, TranscriptionService transcriptionService)
        {
            _speechService = speechService;
            _transcriptionService = transcriptionService;
        }

        public override async Task OnConnectedAsync()
        {
            _transcriptionService.AddConnection(Context.ConnectionId);
            _speechService.OnRecognizing += HandleRecognizing;
            _speechService.OnRecognized += HandleRecognized;
            await base.OnConnectedAsync();
        }

        public override async Task OnDisconnectedAsync(Exception? exception)
        {
            _transcriptionService.RemoveConnection(Context.ConnectionId);
            _speechService.OnRecognizing -= HandleRecognizing;
            _speechService.OnRecognized -= HandleRecognized;
            await base.OnDisconnectedAsync(exception);
        }

        private async void HandleRecognizing(string text)
        {
            await _transcriptionService.BroadcastRecognizing(text);
        }

        private async void HandleRecognized(string text)
        {
            await _transcriptionService.BroadcastRecognized(text);
        }

        public async Task StartTranscription()
        {
            await _speechService.StartRecognitionAsync();
        }

        public async Task StopTranscription()
        {
            await _speechService.StopRecognitionAsync();
        }
    }
}
Azure AI Speech
Azure AI Speech
An Azure service that integrates speech processing into apps and services.
1,555 questions
{count} votes