We are getting false-positive results from speaker separation (diarization) in batch transcription when many speakers talk simultaneously within short time intervals. We are using client SDK version 1.30:
<groupId>com.microsoft.cognitiveservices.speech</groupId>
<artifactId>client-sdk</artifactId>
<version>1.30.0</version>
We use the following code snippet. Do you have any recommendations to minimize these false positives and improve diarization accuracy?
//-------------- 1. Create Transcription Job
// Builds the JSON body for the "create transcription" request, posts it, and
// extracts the transcription ID from the last path segment of the "self" link
// in the response. Throws Exception when the response cannot be parsed.

// Only channel 0 is requested.
JsonArray jaChannels = new JsonArray();
jaChannels.add(0);

// Candidate locales offered to language identification.
JsonArray jaLocales = new JsonArray();
jaLocales.add("en-US");
jaLocales.add("pt-BR");
jaLocales.add("es-ES");

JsonArray jaContentURL = new JsonArray();
jaContentURL.add(this.audioURL);

// Speaker-count bounds passed to the diarization settings.
// NOTE(review): the 1-10 range is wide; if the expected number of participants
// is known, narrowing minCount/maxCount may be worth testing against the
// spurious speaker labels being reported - confirm against the batch
// transcription documentation.
JsonObject joSpeakers = new JsonObject();
joSpeakers.addProperty("minCount", 1);
joSpeakers.addProperty("maxCount", 10);
JsonObject joDiarization = new JsonObject();
joDiarization.add("speakers", joSpeakers);

JsonObject joLanguageIdentification = new JsonObject();
joLanguageIdentification.add("candidateLocales", jaLocales);
joLanguageIdentification.add("speechModelMapping", new JsonObject());

JsonObject joProperties = new JsonObject();
joProperties.addProperty("diarizationEnabled", true);
joProperties.addProperty("wordLevelTimestampsEnabled", false);
joProperties.addProperty("displayFormWordLevelTimestampsEnabled", false);
joProperties.add("channels", jaChannels);
joProperties.add("diarization", joDiarization);
joProperties.add("languageIdentification", joLanguageIdentification);

// Display name and description share the same "<media id>_<start time>" value.
JsonObject jsonPayload = new JsonObject();
jsonPayload.addProperty("locale", this.locale);
jsonPayload.addProperty("displayName", this.media.getIdMid()+"_"+processStartTime);
jsonPayload.addProperty("description", this.media.getIdMid()+"_"+processStartTime);
jsonPayload.add("customProperties", new JsonObject());
jsonPayload.add("contentUrls", jaContentURL);
jsonPayload.add("properties", joProperties);

String request = new Gson().toJson(jsonPayload);
log.info("\n\t\t 1. Creating Transcription... uri: {}", this.speechTranscriptionAPI);
log.info(request);
RestResult result = RestHelper.sendPost(this.speechTranscriptionAPI, request, this.speechSubscriptionKey, new int[] { HttpURLConnection.HTTP_CREATED });

// A response without a usable "self" link is reported with the same parse
// error as a malformed ID, instead of failing with a bare NullPointerException.
JsonElement createdSelfLink = result.getJson().get("self");
if (createdSelfLink == null || createdSelfLink.isJsonNull()) {
    throw new Exception(String.format("Unable to parse response from Create Transcription API:%s%s", System.lineSeparator(), result.getText()));
}
String transcriptionUri_1 = createdSelfLink.getAsString();
String[] transcriptionUri_2 = transcriptionUri_1.split("/");
String transcriptionId = transcriptionUri_2[transcriptionUri_2.length - 1];
try {
    // Parsed only for validation: the transcription ID must be a valid GUID.
    UUID.fromString(transcriptionId);
} catch (IllegalArgumentException exception) {
    // Keep the original parse failure as the cause for easier diagnosis.
    throw new Exception(String.format("Unable to parse response from Create Transcription API:%s%s", System.lineSeparator(), result.getText()), exception);
}
log.info("\n\t\tTranscription ID: {}", transcriptionId);
//-------------- 2. Get Transcription Status
// Polls the transcription resource until its status is "succeeded".
// A "failed" status aborts with an Exception carrying the raw response text.
// NOTE(review): there is no upper bound on attempts; any status other than
// "succeeded"/"failed" keeps the loop polling indefinitely.
String transcriptionStatusUri = this.speechTranscriptionAPI + "/" + transcriptionId;
log.info("\n\n\t\t 2. Getting Transcription Status... uri: {}", transcriptionStatusUri);
boolean done = false;
do {
    log.info("\n\t\tWaiting {} seconds for transcription to complete.", waitSeconds);
    // Long arithmetic so the millisecond value cannot overflow an int.
    Thread.sleep(waitSeconds * 1000L);
    // Get Transcription response
    result = RestHelper.sendGet(transcriptionStatusUri, this.speechSubscriptionKey, new int[] { HttpURLConnection.HTTP_OK });
    // Locale.ROOT keeps the lower-casing independent of the JVM default locale
    // (e.g. Turkish dotless-i rules would otherwise break the comparisons below).
    String status = result.getJson().get("status").getAsString().toLowerCase(java.util.Locale.ROOT);
    log.info("\t\tTranscription Status: {}", status);
    if (status.equals("failed")) {
        throw new Exception(String.format("Unable to transcribe audio input. Response:%s%s", System.lineSeparator(), result.getText()));
    }
    done = status.equals("succeeded");
} while (!done);
//-------------- 3. Get Transcription Files
// Lists the files of the transcription and picks the content URL of the first
// entry whose "kind" is "transcription" (case-insensitive). Throws Exception
// when no such entry is present in the response.
String transcriptionFilesUri = transcriptionStatusUri + "/files";
log.info("\n\t\t 3. Getting Transcription Files... uri: {}", transcriptionFilesUri);
RestResult transcriptionFiles = RestHelper.sendGet(transcriptionFilesUri, speechSubscriptionKey, new int[] { HttpURLConnection.HTTP_OK });
Optional<String> contentUri = Optional.empty();
// A response without a "values" array is treated as an empty listing, so it
// falls through to the descriptive parse error below instead of an NPE.
JsonArray transcriptionFileEntries = transcriptionFiles.getJson().getAsJsonArray("values");
Iterator<JsonElement> iterator = (transcriptionFileEntries == null)
        ? java.util.Collections.<JsonElement>emptyIterator()
        : transcriptionFileEntries.iterator();
while (iterator.hasNext()) {
    JsonObject value = iterator.next().getAsJsonObject();
    // equalsIgnoreCase avoids default-locale case mapping.
    if ("transcription".equalsIgnoreCase(value.get("kind").getAsString())) {
        contentUri = Optional.of(value.getAsJsonObject("links").get("contentUrl").getAsString());
        break;
    }
}
if (!contentUri.isPresent()) {
    throw new Exception (String.format("Unable to parse response from Get Transcription Files API:%s%s", System.lineSeparator(), transcriptionFiles.getText()));
}
final String transcriptionUri = contentUri.get();
//-------------- 4. Get Transcription Content
// Downloads the transcription document from the content URL and parses it as
// JSON, then prepares a MediaServer holding an empty speaker-result list.
log.info("\n\t\t 4. Getting Transcription Content... uri: {}", transcriptionUri);
// The content URL is requested with an empty subscription-key argument.
RestResult transcriptionResult = RestHelper.sendGet(
        transcriptionUri,
        "",
        new int[] { HttpURLConnection.HTTP_OK });
JsonObject transcriptionJson = transcriptionResult.getJson();

// The list is created first and then attached to the new MediaServer.
List<SpeakerIdResult> speakersList = new ArrayList<>();
MediaServer mediaServer = new MediaServer();
mediaServer.setSpeakerIdResults(speakersList);