다음을 통해 공유


KMeansClusteringExtensions.KMeans 메서드

정의

오버로드

KMeans(ClusteringCatalog+ClusteringTrainers, KMeansTrainer+Options)

KMeansTrainer를 사용하여 KMeans++ 클러스터링 알고리즘을 학습시킵니다.

KMeans(ClusteringCatalog+ClusteringTrainers, String, String, Int32)

KMeansTrainer를 사용하여 KMeans++ 클러스터링 알고리즘을 학습시킵니다.

KMeans(ClusteringCatalog+ClusteringTrainers, KMeansTrainer+Options)

KMeansTrainer를 사용하여 KMeans++ 클러스터링 알고리즘을 학습시킵니다.

public static Microsoft.ML.Trainers.KMeansTrainer KMeans (this Microsoft.ML.ClusteringCatalog.ClusteringTrainers catalog, Microsoft.ML.Trainers.KMeansTrainer.Options options);
static member KMeans : Microsoft.ML.ClusteringCatalog.ClusteringTrainers * Microsoft.ML.Trainers.KMeansTrainer.Options -> Microsoft.ML.Trainers.KMeansTrainer
<Extension()>
Public Function KMeans (catalog As ClusteringCatalog.ClusteringTrainers, options As KMeansTrainer.Options) As KMeansTrainer

매개 변수

catalog
ClusteringCatalog.ClusteringTrainers

클러스터링 카탈로그 트레이너 개체입니다.

options
KMeansTrainer.Options

알고리즘 고급 옵션입니다.

반환

예제

using System;
using System.Collections.Generic;
using System.Linq;
using Microsoft.ML;
using Microsoft.ML.Data;
using Microsoft.ML.Trainers;

namespace Samples.Dynamic.Trainers.Clustering
{
    // Sample: trains a KMeans clustering model configured through
    // KMeansTrainer.Options, scores a separately generated test set, and
    // prints a few predictions, the evaluation metrics and the centroids.
    public static class KMeansWithOptions
    {
        public static void Example()
        {
            // The MLContext is the entry point for ML.NET operations: exception
            // tracking and logging, the catalog of available operations, and
            // the source of randomness. A fixed seed keeps the printed results
            // deterministic.
            var mlContext = new MLContext(seed: 0);

            // Generate 1000 training points (seed 0) and wrap them in an
            // IDataView, the data representation the ML.NET API consumes.
            IDataView trainingData = mlContext.Data.LoadFromEnumerable(
                GenerateRandomDataPoints(1000, 0));

            // Advanced settings for the trainer.
            var trainerOptions = new KMeansTrainer.Options
            {
                NumberOfClusters = 2,
                OptimizationTolerance = 1e-6f,
                NumberOfThreads = 1
            };

            // Build the trainer from the options and fit it to the training
            // data.
            var trainer = mlContext.Clustering.Trainers.KMeans(trainerOptions);
            var model = trainer.Fit(trainingData);

            // Generate 500 test points with another seed (123) so they differ
            // from the training points.
            var testPoints = GenerateRandomDataPoints(500, seed: 123);
            var testData = mlContext.Data.LoadFromEnumerable(testPoints);

            // Score the test set with the trained model.
            var scoredTestData = model.Transform(testData);

            // Materialize the scored IDataView as a list of Prediction objects.
            List<Prediction> scoredRows = mlContext.Data
                .CreateEnumerable<Prediction>(
                    scoredTestData, reuseRowObject: false)
                .ToList();

            // Print 5 predictions: the first two rows followed by the last
            // three. The label only serves as a reference next to the predicted
            // label; it plays no part in training.
            var preview = scoredRows.Take(2).Concat(scoredRows.TakeLast(3));
            foreach (var row in preview)
            {
                Console.WriteLine(
                    $"Label: {row.Label}, Prediction: {row.PredictedLabel}");
            }

            // Expected output:
            //   Label: 1, Prediction: 1
            //   Label: 1, Prediction: 1
            //   Label: 2, Prediction: 2
            //   Label: 2, Prediction: 2
            //   Label: 2, Prediction: 2

            // Compute the overall clustering metrics on the scored test set.
            var evaluation = mlContext.Clustering.Evaluate(
                scoredTestData, "Label", "Score", "Features");

            PrintMetrics(evaluation);

            // Expected output:
            //   Normalized Mutual Information: 0.92
            //   Average Distance: 4.18
            //   Davies Bouldin Index: 2.87

            // Read the cluster centroids and the number of clusters k from the
            // model's KMeansModelParameters.
            VBuffer<float>[] centroids = default;
            model.Model.GetClusterCentroids(ref centroids, out int k);

            var firstCentroidHead = centroids[0].GetValues().ToArray().Take(3);
            Console.WriteLine(
                "The first 3 coordinates of the first centroid are: " +
                string.Join(", ", firstCentroidHead));

            var secondCentroidHead = centroids[1].GetValues().ToArray().Take(3);
            Console.WriteLine(
                "The first 3 coordinates of the second centroid are: " +
                string.Join(", ", secondCentroidHead));

            // Expected output:
            //   The first 3 coordinates of the first centroid are: 0.5840713, 0.5678288, 0.6221277
            //   The first 3 coordinates of the second centroid are: 0.3705794, 0.4289133, 0.4001645
        }

        // Lazily yields `count` data points with 50 random features each,
        // using a Random created from `seed`. Points in the first half
        // (i < count / 2) get label 0, the remaining points get label 1.
        private static IEnumerable<DataPoint> GenerateRandomDataPoints(int count,
            int seed = 0)
        {
            const int featureCount = 50;
            var generator = new Random(seed);
            int half = count / 2;

            for (int i = 0; i < count; i++)
            {
                uint label = i < half ? 0u : 1u;

                // Two clusters: label 0 shifts each random value up by 0.1
                // (features centered around 0.6), label 1 shifts it down by
                // 0.1 (features centered around 0.4).
                var features = new float[featureCount];
                for (int j = 0; j < featureCount; j++)
                {
                    float sample = (float)generator.NextDouble();
                    if (label == 0)
                    {
                        features[j] = sample + 0.1f;
                    }
                    else
                    {
                        features[j] = sample - 0.1f;
                    }
                }

                yield return new DataPoint
                {
                    Label = label,
                    Features = features
                };
            }
        }

        // One example: a label plus 50 feature values. A data set is a
        // collection of these.
        private class DataPoint
        {
            // Reference label; it is only compared against the predicted label
            // and is not used for training.
            [KeyType(2)]
            public uint Label { get; set; }

            // The 50 feature values of the example.
            [VectorType(50)]
            public float[] Features { get; set; }
        }

        // Shape of a scored row as read back from the transformed data.
        private class Prediction
        {
            // Original label, kept only for comparison (not used in training).
            public uint Label { get; set; }

            // Label predicted by the trainer.
            public uint PredictedLabel { get; set; }
        }

        // Writes the three clustering metrics to the console, each formatted
        // with two decimals.
        private static void PrintMetrics(ClusteringMetrics metrics)
        {
            Console.WriteLine(
                $"Normalized Mutual Information: {metrics.NormalizedMutualInformation:F2}");

            Console.WriteLine(
                $"Average Distance: {metrics.AverageDistance:F2}");

            Console.WriteLine(
                $"Davies Bouldin Index: {metrics.DaviesBouldinIndex:F2}");
        }
    }
}

적용 대상

KMeans(ClusteringCatalog+ClusteringTrainers, String, String, Int32)

KMeansTrainer를 사용하여 KMeans++ 클러스터링 알고리즘을 학습시킵니다.

public static Microsoft.ML.Trainers.KMeansTrainer KMeans (this Microsoft.ML.ClusteringCatalog.ClusteringTrainers catalog, string featureColumnName = "Features", string exampleWeightColumnName = default, int numberOfClusters = 5);
static member KMeans : Microsoft.ML.ClusteringCatalog.ClusteringTrainers * string * string * int -> Microsoft.ML.Trainers.KMeansTrainer
<Extension()>
Public Function KMeans (catalog As ClusteringCatalog.ClusteringTrainers, Optional featureColumnName As String = "Features", Optional exampleWeightColumnName As String = Nothing, Optional numberOfClusters As Integer = 5) As KMeansTrainer

매개 변수

catalog
ClusteringCatalog.ClusteringTrainers

클러스터링 카탈로그 트레이너 개체입니다.

featureColumnName
String

기능 열의 이름입니다.

exampleWeightColumnName
String

예제 가중치 열의 이름(선택 사항)입니다.

numberOfClusters
Int32

KMeans에 사용할 클러스터 수입니다.

반환

예제

using System;
using System.Collections.Generic;
using System.Linq;
using Microsoft.ML;
using Microsoft.ML.Data;

namespace Samples.Dynamic.Trainers.Clustering
{
    // Sample: trains a KMeans clustering model with the simple overload
    // (only the number of clusters is specified), scores a separately
    // generated test set, and prints a few predictions, the evaluation
    // metrics and the centroids.
    public static class KMeans
    {
        public static void Example()
        {
            // The MLContext is the entry point for ML.NET operations: exception
            // tracking and logging, the catalog of available operations, and
            // the source of randomness. A fixed seed keeps the printed results
            // deterministic.
            var mlContext = new MLContext(seed: 0);

            // Generate 1000 training points (seed 123) and wrap them in an
            // IDataView, the data representation the ML.NET API consumes.
            IDataView trainingData = mlContext.Data.LoadFromEnumerable(
                GenerateRandomDataPoints(1000, 123));

            // Build a trainer that looks for two clusters and fit it to the
            // training data.
            var trainer = mlContext.Clustering.Trainers.KMeans(
                numberOfClusters: 2);
            var model = trainer.Fit(trainingData);

            // Generate 500 test points. NOTE: the seed passed here (123) is the
            // same one used for the training points above; the two sets differ
            // in size (500 vs. 1000 points).
            var testPoints = GenerateRandomDataPoints(500, seed: 123);
            var testData = mlContext.Data.LoadFromEnumerable(testPoints);

            // Score the test set with the trained model.
            var scoredTestData = model.Transform(testData);

            // Materialize the scored IDataView as a list of Prediction objects.
            List<Prediction> scoredRows = mlContext.Data
                .CreateEnumerable<Prediction>(
                    scoredTestData, reuseRowObject: false)
                .ToList();

            // Print 5 predictions: the first two rows followed by the last
            // three. The label only serves as a reference next to the predicted
            // label; it plays no part in training.
            var preview = scoredRows.Take(2).Concat(scoredRows.TakeLast(3));
            foreach (var row in preview)
            {
                Console.WriteLine(
                    $"Label: {row.Label}, Prediction: {row.PredictedLabel}");
            }

            // Expected output:
            //   Label: 1, Prediction: 1
            //   Label: 1, Prediction: 1
            //   Label: 2, Prediction: 2
            //   Label: 2, Prediction: 2
            //   Label: 2, Prediction: 2

            // Compute the overall clustering metrics on the scored test set.
            var evaluation = mlContext.Clustering.Evaluate(
                scoredTestData, "Label", "Score", "Features");

            PrintMetrics(evaluation);

            // Expected output:
            //   Normalized Mutual Information: 0.95
            //   Average Distance: 4.17
            //   Davies Bouldin Index: 2.87

            // Read the cluster centroids and the number of clusters k from the
            // model's KMeansModelParameters.
            VBuffer<float>[] centroids = default;
            model.Model.GetClusterCentroids(ref centroids, out int k);

            var firstCentroidHead = centroids[0].GetValues().ToArray().Take(3);
            Console.WriteLine(
                "The first 3 coordinates of the first centroid are: " +
                string.Join(", ", firstCentroidHead));

            var secondCentroidHead = centroids[1].GetValues().ToArray().Take(3);
            Console.WriteLine(
                "The first 3 coordinates of the second centroid are: " +
                string.Join(", ", secondCentroidHead));

            // Expected output similar to:
            //   The first 3 coordinates of the first centroid are: 0.6035213, 0.6017533, 0.5964218
            //   The first 3 coordinates of the second centroid are: 0.4031044, 0.4175443, 0.4082336
        }

        // Lazily yields `count` data points with 50 random features each,
        // using a Random created from `seed`. Points in the first half
        // (i < count / 2) get label 0, the remaining points get label 1.
        private static IEnumerable<DataPoint> GenerateRandomDataPoints(int count,
            int seed = 0)
        {
            const int featureCount = 50;
            var generator = new Random(seed);
            int half = count / 2;

            for (int i = 0; i < count; i++)
            {
                uint label = i < half ? 0u : 1u;

                // Two clusters: label 0 shifts each random value up by 0.1
                // (features centered around 0.6), label 1 shifts it down by
                // 0.1 (features centered around 0.4).
                var features = new float[featureCount];
                for (int j = 0; j < featureCount; j++)
                {
                    float sample = (float)generator.NextDouble();
                    if (label == 0)
                    {
                        features[j] = sample + 0.1f;
                    }
                    else
                    {
                        features[j] = sample - 0.1f;
                    }
                }

                yield return new DataPoint
                {
                    Label = label,
                    Features = features
                };
            }
        }

        // One example: a label plus 50 feature values. A data set is a
        // collection of these.
        private class DataPoint
        {
            // Reference label; it is only compared against the predicted label
            // and is not used for training.
            [KeyType(2)]
            public uint Label { get; set; }

            // The 50 feature values of the example.
            [VectorType(50)]
            public float[] Features { get; set; }
        }

        // Shape of a scored row as read back from the transformed data.
        private class Prediction
        {
            // Original label, kept only for comparison (not used in training).
            public uint Label { get; set; }

            // Label predicted by the trainer.
            public uint PredictedLabel { get; set; }
        }

        // Writes the three clustering metrics to the console, each formatted
        // with two decimals.
        private static void PrintMetrics(ClusteringMetrics metrics)
        {
            Console.WriteLine(
                $"Normalized Mutual Information: {metrics.NormalizedMutualInformation:F2}");

            Console.WriteLine(
                $"Average Distance: {metrics.AverageDistance:F2}");

            Console.WriteLine(
                $"Davies Bouldin Index: {metrics.DaviesBouldinIndex:F2}");
        }
    }
}

적용 대상