TextCatalog.ProduceNgrams 方法
定义
重要
一些信息与预发行产品相关,相应产品在发行之前可能会进行重大修改。 对于此处提供的信息,Microsoft 不作任何明示或暗示的担保。
创建一个 NgramExtractingEstimator 生成输入文本中遇到的连续) 单词序列的 n 元克计数 (序列的向量。
public static Microsoft.ML.Transforms.Text.NgramExtractingEstimator ProduceNgrams (this Microsoft.ML.TransformsCatalog.TextTransforms catalog, string outputColumnName, string inputColumnName = default, int ngramLength = 2, int skipLength = 0, bool useAllLengths = true, int maximumNgramsCount = 10000000, Microsoft.ML.Transforms.Text.NgramExtractingEstimator.WeightingCriteria weighting = Microsoft.ML.Transforms.Text.NgramExtractingEstimator+WeightingCriteria.Tf);
static member ProduceNgrams : Microsoft.ML.TransformsCatalog.TextTransforms * string * string * int * int * bool * int * Microsoft.ML.Transforms.Text.NgramExtractingEstimator.WeightingCriteria -> Microsoft.ML.Transforms.Text.NgramExtractingEstimator
<Extension()>
Public Function ProduceNgrams (catalog As TransformsCatalog.TextTransforms, outputColumnName As String, Optional inputColumnName As String = Nothing, Optional ngramLength As Integer = 2, Optional skipLength As Integer = 0, Optional useAllLengths As Boolean = true, Optional maximumNgramsCount As Integer = 10000000, Optional weighting As NgramExtractingEstimator.WeightingCriteria = Microsoft.ML.Transforms.Text.NgramExtractingEstimator+WeightingCriteria.Tf) As NgramExtractingEstimator
参数
- catalog
- TransformsCatalog.TextTransforms
与文本相关的转换的目录。
- inputColumnName
- String
要转换的列的名称。 If set to null
, the value of the outputColumnName
will be used as source.
此估算器对键数据类型的向量进行操作。
- ngramLength
- Int32
Ngram 长度。
- skipLength
- Int32
要在每个 n 元语法之间跳过的令牌数。 默认情况下,不会跳过任何令牌。
- useAllLengths
- Boolean
是否包括所有 n 元语法长度, ngramLength
最大或仅 ngramLength
包含 。
- maximumNgramsCount
- Int32
要在字典中存储的最大 n 元语法数。
用于评估单词或 n 元语法对语料库中的文档的重要性的统计度量值。
如果 maximumNgramsCount
小于遇到的 n 元克总数,则此度量值用于确定要保留的 n 元语法。
返回
示例
using System;
using System.Collections.Generic;
using Microsoft.ML;
using Microsoft.ML.Data;
using Microsoft.ML.Transforms.Text;
namespace Samples.Dynamic
{
public static class ProduceNgrams
{
public static void Example()
{
// Create a new ML context, for ML.NET operations. It can be used for
// exception tracking and logging, as well as the source of randomness.
var mlContext = new MLContext();
// Create a small dataset as an IEnumerable.
var samples = new List<TextData>()
{
new TextData(){ Text = "This is an example to compute n-grams." },
new TextData(){ Text = "N-gram is a sequence of 'N' consecutive " +
"words/tokens." },
new TextData(){ Text = "ML.NET's ProduceNgrams API produces " +
"vector of n-grams." },
new TextData(){ Text = "Each position in the vector corresponds " +
"to a particular n-gram." },
new TextData(){ Text = "The value at each position corresponds " +
"to," },
new TextData(){ Text = "the number of times n-gram occurred in " +
"the data (Tf), or" },
new TextData(){ Text = "the inverse of the number of documents " +
"that contain the n-gram (Idf)," },
new TextData(){ Text = "or compute both and multiply together " +
"(Tf-Idf)." },
};
// Convert training data to IDataView.
var dataview = mlContext.Data.LoadFromEnumerable(samples);
// A pipeline for converting text into numeric n-gram features.
// The following call to 'ProduceNgrams' requires the tokenized
// text /string as input. This is achieved by calling
// 'TokenizeIntoWords' first followed by 'ProduceNgrams'. Please note
// that the length of the output feature vector depends on the n-gram
// settings.
var textPipeline = mlContext.Transforms.Text.TokenizeIntoWords("Tokens",
"Text")
// 'ProduceNgrams' takes key type as input. Converting the tokens
// into key type using 'MapValueToKey'.
.Append(mlContext.Transforms.Conversion.MapValueToKey("Tokens"))
.Append(mlContext.Transforms.Text.ProduceNgrams("NgramFeatures",
"Tokens",
ngramLength: 3,
useAllLengths: false,
weighting: NgramExtractingEstimator.WeightingCriteria.Tf));
// Fit to data.
var textTransformer = textPipeline.Fit(dataview);
var transformedDataView = textTransformer.Transform(dataview);
// Create the prediction engine to get the n-gram features extracted
// from the text.
var predictionEngine = mlContext.Model.CreatePredictionEngine<TextData,
TransformedTextData>(textTransformer);
// Convert the text into numeric features.
var prediction = predictionEngine.Predict(samples[0]);
// Print the length of the feature vector.
Console.WriteLine("Number of Features: " + prediction.NgramFeatures
.Length);
// Preview of the produced n-grams.
// Get the slot names from the column's metadata.
// The slot names for a vector column corresponds to the names
// associated with each position in the vector.
VBuffer<ReadOnlyMemory<char>> slotNames = default;
transformedDataView.Schema["NgramFeatures"].GetSlotNames(ref slotNames);
var NgramFeaturesColumn = transformedDataView.GetColumn<VBuffer<
float>>(transformedDataView.Schema["NgramFeatures"]);
var slots = slotNames.GetValues();
Console.Write("N-grams: ");
foreach (var featureRow in NgramFeaturesColumn)
{
foreach (var item in featureRow.Items())
Console.Write($"{slots[item.Key]} ");
Console.WriteLine();
}
// Print the first 10 feature values.
Console.Write("Features: ");
for (int i = 0; i < 10; i++)
Console.Write($"{prediction.NgramFeatures[i]:F4} ");
// Expected output:
// Number of Features: 52
// N-grams: This|is|an is|an|example an|example|to example|to|compute to|compute|n-grams. N-gram|is|a is|a|sequence a|sequence|of sequence|of|'N' of|'N'|consecutive ...
// Features: 1.0000 1.0000 1.0000 1.0000 1.0000 0.0000 0.0000 0.0000 0.0000 0.0000 ...
}
private class TextData
{
public string Text { get; set; }
}
private class TransformedTextData : TextData
{
public float[] NgramFeatures { get; set; }
}
}
}