TextCatalog.LatentDirichletAllocation 方法
定义
重要
一些信息与预发行产品相关,相应产品在发行之前可能会进行重大修改。 对于此处提供的信息,Microsoft 不作任何明示或暗示的担保。
创建一个 LatentDirichletAllocationEstimator,它使用 LightLDA 将文本 (表示为浮点向量) 转换为一个 Single 向量,用于指示文本与所识别的每个主题的相似度。
public static Microsoft.ML.Transforms.Text.LatentDirichletAllocationEstimator LatentDirichletAllocation (this Microsoft.ML.TransformsCatalog.TextTransforms catalog, string outputColumnName, string inputColumnName = default, int numberOfTopics = 100, float alphaSum = 100, float beta = 0.01, int samplingStepCount = 4, int maximumNumberOfIterations = 200, int likelihoodInterval = 5, int numberOfThreads = 0, int maximumTokenCountPerDocument = 512, int numberOfSummaryTermsPerTopic = 10, int numberOfBurninIterations = 10, bool resetRandomGenerator = false);
static member LatentDirichletAllocation : Microsoft.ML.TransformsCatalog.TextTransforms * string * string * int * single * single * int * int * int * int * int * int * int * bool -> Microsoft.ML.Transforms.Text.LatentDirichletAllocationEstimator
<Extension()>
Public Function LatentDirichletAllocation (catalog As TransformsCatalog.TextTransforms, outputColumnName As String, Optional inputColumnName As String = Nothing, Optional numberOfTopics As Integer = 100, Optional alphaSum As Single = 100, Optional beta As Single = 0.01, Optional samplingStepCount As Integer = 4, Optional maximumNumberOfIterations As Integer = 200, Optional likelihoodInterval As Integer = 5, Optional numberOfThreads As Integer = 0, Optional maximumTokenCountPerDocument As Integer = 512, Optional numberOfSummaryTermsPerTopic As Integer = 10, Optional numberOfBurninIterations As Integer = 10, Optional resetRandomGenerator As Boolean = false) As LatentDirichletAllocationEstimator
参数
- catalog
- TransformsCatalog.TextTransforms
转换的目录。
- outputColumnName
- String
由转换生成的输出列的名称。
- inputColumnName
- String
要转换的列的名称。 如果设置为 null,则将使用 outputColumnName 的值作为源。
此估算器对 Single 向量进行操作。
- numberOfTopics
- Int32
主题数。
- alphaSum
- Single
文档-主题向量上的 Dirichlet 先验。
- beta
- Single
词汇-主题向量 (vocab-topic vectors) 上的 Dirichlet 先验。
- samplingStepCount
- Int32
Metropolis-Hastings 采样步骤的数量。
- maximumNumberOfIterations
- Int32
迭代次数。
- likelihoodInterval
- Int32
按此迭代间隔计算本地数据集上的对数似然 (log likelihood)。
- numberOfThreads
- Int32
训练线程数。 默认值取决于逻辑处理器的数量。
- maximumTokenCountPerDocument
- Int32
每个文档的最大令牌计数的阈值。
- numberOfSummaryTermsPerTopic
- Int32
用于概括每个主题的单词数。
- numberOfBurninIterations
- Int32
预烧 (burn-in) 迭代次数。
- resetRandomGenerator
- Boolean
重置每个文档的随机数生成器。
返回
示例
using System;
using System.Collections.Generic;
using Microsoft.ML;
namespace Samples.Dynamic
{
    /// <summary>
    /// Sample that featurizes short text documents with the
    /// LatentDirichletAllocation transform and prints the resulting
    /// per-topic feature vectors.
    /// </summary>
    public static class LatentDirichletAllocation
    {
        /// <summary>
        /// Builds a text pipeline ending in LatentDirichletAllocation, fits
        /// it on a small in-memory dataset and prints the LDA features of
        /// the first two documents.
        /// </summary>
        public static void Example()
        {
            // The ML context is the entry point for ML.NET operations. It
            // can be used for exception tracking and logging, as well as
            // the source of randomness.
            var mlContext = new MLContext();

            // Small in-memory dataset: two documents per subject.
            var samples = new List<TextData>
            {
                new TextData
                {
                    Text = "ML.NET's LatentDirichletAllocation API computes topic models."
                },
                new TextData
                {
                    Text = "ML.NET's LatentDirichletAllocation API is the best for topic models."
                },
                new TextData { Text = "I like to eat broccoli and bananas." },
                new TextData { Text = "I eat bananas for breakfast." },
                new TextData
                {
                    Text = "This car is expensive compared to last week's price."
                },
                new TextData { Text = "This car was $X last week." },
            };

            // Expose the training data as an IDataView.
            var dataview = mlContext.Data.LoadFromEnumerable(samples);

            // Featurization pipeline. To be more accurate in computing the
            // LDA features, the text is normalized, tokenized into words and
            // stripped of default stop words first; the remaining tokens are
            // mapped to keys and turned into n-grams before being handed to
            // LatentDirichletAllocation.
            var textCatalog = mlContext.Transforms.Text;
            var pipeline = textCatalog
                .NormalizeText("NormalizedText", "Text")
                .Append(textCatalog.TokenizeIntoWords("Tokens", "NormalizedText"))
                .Append(textCatalog.RemoveDefaultStopWords("Tokens"))
                .Append(mlContext.Transforms.Conversion.MapValueToKey("Tokens"))
                .Append(textCatalog.ProduceNgrams("Tokens"))
                .Append(textCatalog.LatentDirichletAllocation(
                    "Features", "Tokens", numberOfTopics: 3));

            // Fit the pipeline to the data.
            var transformer = pipeline.Fit(dataview);

            // The prediction engine extracts the LDA features from a single
            // text sample at a time.
            var predictionEngine = mlContext.Model
                .CreatePredictionEngine<TextData, TransformedTextData>(transformer);

            // Featurize the first two documents and print their vectors.
            for (int docIndex = 0; docIndex < 2; docIndex++)
            {
                PrintLdaFeatures(predictionEngine.Predict(samples[docIndex]));
            }

            // Features obtained post-transformation.
            // The transform was configured with numberOfTopics: 3, hence
            // each prediction is a vector of floats with length 3.
            //
            //  Topic1  Topic2  Topic3
            //  0.6364  0.2727  0.0909
            //  0.5455  0.1818  0.2727
        }

        /// <summary>
        /// Writes every element of the feature vector to the console on a
        /// single line, formatted with four decimals, followed by a newline.
        /// </summary>
        /// <param name="prediction">Transformed sample whose Features are printed.</param>
        private static void PrintLdaFeatures(TransformedTextData prediction)
        {
            foreach (float topicWeight in prediction.Features)
            {
                Console.Write($"{topicWeight:F4} ");
            }

            Console.WriteLine();
        }

        /// <summary>Input row: a single raw text document.</summary>
        private class TextData
        {
            public string Text { get; set; }
        }

        /// <summary>Output row: the input text plus its LDA feature vector.</summary>
        private class TransformedTextData : TextData
        {
            public float[] Features { get; set; }
        }
    }
}