String Indexing
The System.Globalization.StringInfo class provides methods that allow you to split a string into text elements and iterate through those text elements. A text element is a unit of text that is displayed as a single character, called a grapheme. A text element can be a base character, a surrogate pair, or a combining character sequence. For more information on surrogate pairs and combining character sequences, see Unicode Support for Surrogate Pairs and Combining Character Sequences.
Use the StringInfo.GetTextElementEnumerator method to create an enumerator that can iterate through the elements of a string. Use the StringInfo.ParseCombiningCharacters method to return the indexes of each base character, high surrogate, or control character in a specified string.
In the following code example, a string of Arabic characters containing combining character sequences is created. In strCombining, for example, the Unicode code point U+0625 represents an Arabic base character (Arabic letter Alef with Hamza below), and the Unicode code point U+0650 represents an Arabic combining character (Arabic Kasra). Together, these code points represent a combining character sequence and therefore must be parsed as a single text element. Next, a string containing surrogate pairs is created. In strSurrogates, for example, the code unit U+DACE represents a high surrogate and the code unit U+DEFF represents a low surrogate. Together, these code units represent a surrogate pair and must be parsed as a single text element. Each string is parsed once using the ParseCombiningCharacters method and again using the GetTextElementEnumerator method. Both methods correctly parse the text elements in strCombining at the indexes 0, 2, 3, 5, and 6. Both methods correctly parse the text elements in strSurrogates at the indexes 0, 2, 4, and 5 (the final surrogate pair occupies indexes 5 and 6). The results of the parsing operations are displayed.
Imports System
Imports System.IO
Imports System.Globalization
Imports System.Text
Imports Microsoft.VisualBasic
' Demonstrates two ways of splitting a string into text elements:
' StringInfo.ParseCombiningCharacters, which returns the starting index
' of every text element, and StringInfo.GetTextElementEnumerator, which
' iterates over the text elements themselves.
Public Class StringInfoSample

    ' Builds one string of combining character sequences and one string
    ' of surrogate pairs, then lists the text elements of each.
    Public Shared Sub Main()
        ' Creates a string with text elements at <0;2;3;5;6>.
        ' The Unicode code points specify Arabic
        ' combining character sequences.
        Dim strCombining As String = ChrW(&H625) & ChrW(&H650) & _
            ChrW(&H64A) & ChrW(&H647) & ChrW(&H64E) & ChrW(&H627) & _
            ChrW(&H628) & ChrW(&H64C)

        ' Creates a string with text elements at <0;2;4;5>.
        ' The code units specify surrogate pairs; the last pair
        ' occupies indexes 5 and 6.
        Dim strSurrogates As String = ChrW(&HDACE) & ChrW(&HDEFF) & _
            ChrW(&HDAAF) & ChrW(&HDEFC) & "a" & ChrW(&HD8BF) & ChrW(&HDD99)

        EnumerateTextElements(strCombining)
        EnumerateTextElements(strSurrogates)
    End Sub

    ' Writes every text element of str to the console twice: first using
    ' the index array returned by ParseCombiningCharacters, then using a
    ' TextElementEnumerator. Each line shows the element number, the
    ' inclusive range of Char indexes it covers, and the element text.
    '
    ' str: the string to split into text elements. An empty string
    '      produces only the two header lines.
    ' Throws ArgumentNullException when str is Nothing.
    Public Shared Sub EnumerateTextElements(str As String)
        If str Is Nothing Then
            Throw New ArgumentNullException("str")
        End If

        ' Parses the string using the ParseCombiningCharacters() method.
        ' The message is a single literal: a line continuation is not
        ' allowed inside a string.
        Console.WriteLine(ControlChars.NewLine & _
            "Parsing '{0}' Using ParseCombiningCharacters()...", str)

        Dim indices As Integer() = StringInfo.ParseCombiningCharacters(str)
        For i As Integer = 0 To indices.Length - 1
            Dim startIndex As Integer = indices(i)

            ' An element ends where the next one starts; the last element
            ' runs to the end of the string.
            Dim nextStart As Integer = str.Length
            If i + 1 < indices.Length Then
                nextStart = indices(i + 1)
            End If

            Console.WriteLine("Text Element {0} ({1}..{2})= {3}", i, _
                startIndex, nextStart - 1, _
                str.Substring(startIndex, nextStart - startIndex))
        Next i

        ' Parses the string using the GetTextElementEnumerator method.
        Console.WriteLine(ControlChars.NewLine & _
            "Parsing '{0}' Using TextElementEnumerator...", str)

        Dim elements As TextElementEnumerator = _
            StringInfo.GetTextElementEnumerator(str)

        ' The enumerator starts before the first element, so MoveNext()
        ' must be called before anything is read from it.
        Dim elementNumber As Integer = 0
        While elements.MoveNext()
            ' Both GetTextElement() and Current retrieve the current
            ' text element. The latter returns it as an Object.
            Console.WriteLine("Text Element {0} ({1}..{2})= {3}", _
                elementNumber, elements.ElementIndex, _
                elements.ElementIndex + _
                elements.GetTextElement().Length - 1, elements.Current)
            elementNumber += 1
        End While
    End Sub
End Class
using System;
using System.IO;
using System.Globalization;
using System.Text;
/// <summary>
/// Demonstrates two ways of splitting a string into text elements:
/// <see cref="StringInfo.ParseCombiningCharacters"/>, which returns the
/// starting index of every text element, and
/// <see cref="StringInfo.GetTextElementEnumerator(string)"/>, which
/// iterates over the text elements themselves.
/// </summary>
public class StringInfoSample
{
    /// <summary>
    /// Builds one string of combining character sequences and one string
    /// of surrogate pairs, then lists the text elements of each.
    /// </summary>
    public static void Main()
    {
        // Creates a string with text elements at <0;2;3;5;6>.
        // The Unicode code points specify Arabic
        // combining character sequences.
        string strCombining =
            "\u0625\u0650\u064A\u0647\u064E\u0627\u0628\u064C";

        // Creates a string with text elements at <0;2;4;5>.
        // The code units specify surrogate pairs; the last pair
        // occupies indexes 5 and 6.
        string strSurrogates = "\uDACE\uDEFF\uDAAF\uDEFCa\uD8BF\uDD99";

        EnumerateTextElements(strCombining);
        EnumerateTextElements(strSurrogates);
    }

    /// <summary>
    /// Writes every text element of <paramref name="str"/> to the console
    /// twice: first using the index array returned by
    /// ParseCombiningCharacters, then using a TextElementEnumerator. Each
    /// line shows the element number, the inclusive range of char indexes
    /// it covers, and the element text.
    /// </summary>
    /// <param name="str">
    /// The string to split into text elements. An empty string produces
    /// only the two header lines.
    /// </param>
    /// <exception cref="ArgumentNullException">
    /// <paramref name="str"/> is null.
    /// </exception>
    public static void EnumerateTextElements(string str)
    {
        if (str == null)
        {
            throw new ArgumentNullException("str");
        }

        // Parses the string using the ParseCombiningCharacters() method.
        Console.WriteLine(
            "\r\nParsing '{0}' Using ParseCombiningCharacters()...", str);

        int[] indices = StringInfo.ParseCombiningCharacters(str);
        for (int i = 0; i < indices.Length; i++)
        {
            int start = indices[i];

            // An element ends where the next one starts; the last element
            // runs to the end of the string.
            int nextStart = (i + 1 < indices.Length)
                ? indices[i + 1]
                : str.Length;

            Console.WriteLine(
                "Text Element {0} ({1}..{2})= {3}",
                i, start, nextStart - 1,
                str.Substring(start, nextStart - start));
        }

        // Parses the string using the GetTextElementEnumerator method.
        Console.WriteLine(
            "\r\nParsing '{0}' Using TextElementEnumerator...", str);

        TextElementEnumerator elements =
            StringInfo.GetTextElementEnumerator(str);

        // The enumerator starts before the first element, so MoveNext()
        // must be called before anything is read from it.
        int elementNumber = 0;
        while (elements.MoveNext())
        {
            // Both GetTextElement() and Current retrieve the current
            // text element. The latter returns it as an Object.
            Console.WriteLine(
                "Text Element {0} ({1}..{2})= {3}",
                elementNumber,
                elements.ElementIndex,
                elements.ElementIndex + elements.GetTextElement().Length - 1,
                elements.Current);
            elementNumber++;
        }
    }
}
Note
If you execute this code in a console application, the specified Unicode text elements will not be displayed correctly, because the console environment does not support all Unicode characters.