Share via


Note

Please see Azure Cognitive Services for Speech documentation for the latest supported speech solutions.

EMMA Document Examples

This topic presents two examples of EMMA documents. You can use the first example as input to Simulator. The second example is the output that Simulator generates from the first example's input file. You can use the Simulator output file as input to the Simulator Results Analyzer tool.

Example 1: EMMA Input File

The following is an example of an EMMA file that you can use as input for Simulator. You can find this file under the Samples folder where you installed the Microsoft Speech Platform SDK 11. The audio files and grammars referenced in this document are in the Waves and Grammars folders, respectively.

<?xml version="1.0" encoding="utf-8"?>

<!-- The emma:emma node is the topmost node. It declares the emma namespace and the
supporting ms namespace, which is used for richer recognition result information. -->

<emma:emma version="1.0" xmlns:ms="https://www.microsoft.com/xmlns/webreco"
  xmlns:emma="http://www.w3.org/2003/04/emma" xmlns="http://www.example.com/example">

  <!-- Below the topmost emma:emma node are two kinds of child nodes: emma:grammar
  nodes and one or more emma:group nodes that contain utterance information. -->
  <!-- In this example, the grammars are located in a subfolder of the folder from
  which Simulator.exe is run. Each "id" attribute is the name that the
  emma:grammar-ref attributes under ms:active-grammars use to tell the recognizer
  which grammars to activate (that is, to compare the utterance against) when
  performing a recognition. -->

  <emma:grammar id="grammar0" ref="grammars\menu_choices.grxml" />
  <emma:grammar id="grammar1" ref="grammars\help_choices.grxml" />

  <!-- A set of utterances is contained in an emma:group node. In this example, each
  child node is also an emma:group node because the utterances may not all be from
  the same speaker. -->

  <emma:group id="toplevel">

    <!-- The first utterance in the sequence. -->

    <emma:group id="utterance_1">
      <emma:info>

        <!-- The ms:audio element tells the tool where to find the WAV file. In this
        example, the WAV file is located in a subfolder beneath the tool. The type
        attribute indicates that the input is in WAV format (and not emulation). -->

        <ms:audio ref="waves/utt_1.wav" type="audio/x-wav" />

        <!-- The transcript makes it possible to later compare the recognition
        results to the actual words that we know are contained in the WAV file. -->

        <ms:transcript>
          <ms:original>sports</ms:original>
        </ms:transcript>

        <!-- The ms:active-grammars element tells the recognizer which grammars to
        activate for this utterance. Each emma:grammar-ref value matches the id of
        an emma:grammar element declared above. -->

        <ms:active-grammars>
          <ms:grammar emma:grammar-ref="grammar0" />
          <ms:grammar emma:grammar-ref="grammar1" />
        </ms:active-grammars>
      </emma:info>
    </emma:group>

    <!-- The second utterance in the sequence. -->

    <emma:group id="utterance_2">
      <!-- Information about the second utterance... -->
    </emma:group>

    <!-- The third utterance in the sequence. -->

    <emma:group id="utterance_3">
      <!-- Information about the third utterance... -->
    </emma:group>

    <!-- Potentially, information about additional utterances to be recognized. -->

  </emma:group>
</emma:emma>

Example 2: EMMA Output File

Simulator takes the EMMA input file of the first example and produces the following EMMA output file, which can be used as input to Simulator Results Analyzer. You can find the example file under the Samples folder where you installed the Microsoft Speech Platform SDK 11. The audio files and grammars referenced in this document are in the Waves and Grammars folders, respectively.

<?xml version="1.0" encoding="utf-8"?>

<emma:emma version="1.0" xmlns:ms="https://www.microsoft.com/xmlns/webreco" 
xmlns:emma="http://www.w3.org/2003/04/emma" xmlns="http://www.example.com/example">
  
  <!-- The emma:grammar elements here correspond to the grammars used in each utterance;
  specific recognition results are contained below in each emma:group element. -->
   
  <emma:grammar id="grammar0" ref="grammars\menu_choices.grxml" />
  <emma:grammar id="grammar1" ref="grammars\help_choices.grxml" />
 
  <!-- Note that additional id information may appear here in more emma:grammar tags,
  depending on the recognizer. -->
  
  <emma:group id="toplevel">
    <emma:group id="utterance_1">

      <!-- The recognition results are split between the emma:info element and a 
      sibling emma:one-of element. The former contains emulation information, 
      active grammars, and request IDs for server-based recognizers. The latter 
      contains interpretation information based on the semantics specified in 
      the grammar. -->

      <emma:info ms:status="OK">
        <ms:audio ref="waves/utt_1.wav" type="audio/x-wav" />

        <!-- Because the transcript value was included for this utterance, 
        the result under the ms:transcript element is the emulation result. 
        That is, because "sports" was indicated as the transcript, the 
        recognizer tells us that the text equivalent of "sports" is indeed 
        recognized by the grammar identified in the emma:grammar-ref attribute 
        (reported in the grammar element below as menu_choices.grxml). -->

        <ms:transcript ms:status="OK">
          <ms:original>sports</ms:original>
          <emma:one-of id="utterance_1-transcript-reco-nbest-list" 
            disjunction-type="understanding" emma:medium="acoustic" emma:mode="voice">
            <emma:interpretation emma:time-ref-uri="#reco-nbest-list" 
              emma:tokens="sports" id="utterance_1-transcript-reco-nbest-1" 
              emma:offset-to-start="0" emma:confidence="1" ms:typespace="ECMA-262" 
              ms:dataType="object" emma:grammar-ref="utterance_0.0.0.0-transcript-grammar-0"   
              emma:duration="0" emma:lang="en-us">
              <application emma:confidence="1" ms:actualConfidence="1" ms:dataType="string" 
                ms:valueType="string" >SPORTS</application>
              <grammar emma:confidence="1" ms:actualConfidence="1" ms:dataType="string" 
                ms:valueType="string" >menu_choices.grxml</grammar>
            </emma:interpretation>
          </emma:one-of>
        </ms:transcript>
        <ms:active-grammars>
          <ms:grammar emma:grammar-ref="grammar0" />
          <ms:grammar emma:grammar-ref="grammar1" />
        </ms:active-grammars>
        <ms:requestid>2c25a256-19df-47b9-8b0c2e7e232</ms:requestid>
        <ms:sessionid>a2b68525-e8c6-49b6-9b1e0667cf6</ms:sessionid>
      </emma:info>
  
      <!-- This emma:one-of element includes the recognition result (as opposed to 
      the transcript emulation result inside the emma:info element above). Its 
      emma:derived-from element points to the rule-tree interpretation with the 
      matching id under emma:derivation. -->  
  
      <emma:one-of id="utterance_0.0.0.0-reco-nbest-list" emma:start="1262633376270" 
      disjunction-type="understanding" emma:medium="acoustic" emma:mode="voice">
        <emma:interpretation emma:time-ref-uri="#reco-nbest-list" emma:tokens="sports" 
          id="utterance_1-reco-nbest-1" emma:offset-to-start="300" 
          emma:confidence="0.9641953" ms:typespace="ECMA-262" ms:dataType="object" 
          emma:grammar-ref="utterance_0.0.0.0-grammar-0" emma:duration="850" 
          emma:lang="en-us">
          <emma:derived-from resource="#utterance_1-rule-tree-1" composite="false" />
          <application emma:confidence="0.9445539" ms:actualConfidence="1" 
            ms:dataType="string" ms:valueType="string" >SPORTS</application>
          <grammar emma:confidence="0.9445539" ms:actualConfidence="1" 
            ms:dataType="string" ms:valueType="string" >menu_choices.grxml</grammar>
        </emma:interpretation>
      </emma:one-of>
    </emma:group>

    <emma:group id="utterance_2">
      <!-- Information about the second utterance... -->
    </emma:group>
    
    <emma:group id="utterance_3">
      <!-- Information about the third utterance... -->
    </emma:group>
  </emma:group>

  <!-- The emma:derivation node contains rich recognition result information, including 
  duration, confidence, lattice, and phoneme structure. For each utterance it holds a 
  word-list emma:one-of and a rule-tree-list emma:one-of; the rule-tree interpretation 
  refers back to the word-list interpretation through emma:derived-from. -->

  <emma:derivation>
    <emma:one-of id="utterance_1-word-list" disjunction-type="recognition">
      <emma:interpretation id="utterance_1-word-list-1">
        <emma:lattice emma:time-ref-uri="#reco-nbest-1" initial="1" final="2">
          <emma:arc emma:offset-to-start="0" from="1" emma:confidence="0.9641953" 
            emma:duration="770" to="2">sports<emma:info><ms:sapiPhraseElement 
            ms:displayAttributes="2" ms:lexicalForm="sports" ms:actualConfidence="1" 
            ms:pronunciation="S P AO RA T S" /></emma:info></emma:arc>
        </emma:lattice>
      </emma:interpretation>
    </emma:one-of>
    <emma:one-of id="utterance_1-rule-tree-list" disjunction-type="recognition">
      <emma:interpretation emma:time-ref-uri="#reco-nbest-1" 
        id="utterance_1-rule-tree-1" emma:confidence="0.9641953">
        <emma:derived-from resource="#utterance_1-word-list-1" composite="false" />
        <top emma:offset-to-start="0" emma:confidence="0.9641953" 
          ms:actualConfidence="1" emma:duration="770" >sports</top>
      </emma:interpretation>
    </emma:one-of>
    
    <emma:one-of id="utterance_2-word-list" disjunction-type="recognition">
      <!-- Information about the second utterance... -->
    </emma:one-of>
    
    <emma:one-of id="utterance_2-rule-tree-list" disjunction-type="recognition">
      <!-- Information about the second utterance... -->
    </emma:one-of>

    <emma:one-of id="utterance_3-word-list" disjunction-type="recognition">
      <!-- Information about the third utterance... -->
    </emma:one-of>
    
    <emma:one-of id="utterance_3-rule-tree-list" disjunction-type="recognition">
      <!-- Information about the third utterance... -->
    </emma:one-of>

  </emma:derivation>

</emma:emma>