I cannot re-use a workable code in 2021 to predict from LbfgsMaximumEntropy taining
Hello this piece of code is not working anymore, it was running well in 2021.
Current setup :
Microsoft Visual Studio Professional 2022 (64-bit) - LTSC 17.4
Version 17.4.21
Windows 10 Entreprise LTSC 21H2
crash when use Training code at ligne 223 :
System.ArgumentOutOfRangeException: 'Could not find input column 'Label'
Nom du paramètre : inputSchema'
codes and input csv file :
////////////////////////////////////////////////
// Predict
////////////////////////////////////////////////
using Microsoft.ML.Data;
using Microsoft.ML;
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
namespace PredictCategorie
{
public class InputObject
{
[LoadColumn(0)]
public string CatSWM { get; set; }
[LoadColumn(1)]
public string ArticleFR { get; set; }
[LoadColumn(2)]
[ColumnName("Label")]
public float IDCategorie { get; set; }
}
/////////////// from Example BEGIN
public class InputObjectDataView : IDataView
{
private readonly IEnumerable<InputObject> _data;
public IEnumerable<InputObject> Data
{
get
{
return _data;
}
}
public DataViewSchema Schema { get; }
public bool CanShuffle => false;
public InputObjectDataView(IEnumerable<InputObject> data)
{
_data = data;
var builder = new DataViewSchema.Builder();
builder.AddColumn("CatSWM", TextDataViewType.Instance);
builder.AddColumn("ArticleFR", TextDataViewType.Instance);
builder.AddColumn("IDCategorie", NumberDataViewType.Single);
Schema = builder.ToSchema();
}
public long? GetRowCount() => null;
public DataViewRowCursor GetRowCursor(
IEnumerable<DataViewSchema.Column> columnsNeeded,
Random rand = null)
=> new Cursor(this, columnsNeeded.Any(c => c.Index == 0),
columnsNeeded.Any(c => c.Index == 1), columnsNeeded.Any(c => c.Index == 2));
public DataViewRowCursor[] GetRowCursorSet(
IEnumerable<DataViewSchema.Column> columnsNeeded, int n,
Random rand = null)
=> new[] { GetRowCursor(columnsNeeded, rand) };
public class Cursor : DataViewRowCursor
{
private bool _disposed;
private long _position;
private readonly IEnumerator<InputObject> _enumerator;
private readonly Delegate[] _getters;
public override long Position => _position;
public override long Batch => 0;
public override DataViewSchema Schema { get; }
public Cursor(InputObjectDataView parent, bool wantsCatSWM, bool wantsArticleFR, bool wantsIDCategorie)
{
Schema = parent.Schema;
_position = -1;
_enumerator = parent.Data.GetEnumerator();
_getters = new Delegate[]
{
wantsCatSWM ?
(ValueGetter<ReadOnlyMemory<char>>)
Text2GetterImplementation : null,
wantsArticleFR ?
(ValueGetter<ReadOnlyMemory<char>>)
Text1GetterImplementation : null,
wantsIDCategorie ?
(ValueGetter<float>)
Text3GetterImplementation : null
};
}
protected override void Dispose(bool disposing)
{
if (_disposed)
return;
if (disposing)
{
_enumerator.Dispose();
_position = -1;
}
_disposed = true;
base.Dispose(disposing);
}
private void Text2GetterImplementation(ref ReadOnlyMemory<char> value)
=> value = _enumerator.Current.CatSWM.AsMemory();
private void Text1GetterImplementation(ref ReadOnlyMemory<char> value)
=> value = _enumerator.Current.ArticleFR.AsMemory();
private void Text3GetterImplementation(ref float value)
=> value = _enumerator.Current.IDCategorie;
private void IdGetterImplementation(ref DataViewRowId id)
=> id = new DataViewRowId((ulong)_position, 0);
public override ValueGetter<TValue> GetGetter<TValue>(
DataViewSchema.Column column)
{
if (!IsColumnActive(column))
throw new ArgumentOutOfRangeException(nameof(column));
return (ValueGetter<TValue>)_getters[column.Index];
}
public override ValueGetter<DataViewRowId> GetIdGetter()
=> IdGetterImplementation;
public override bool IsColumnActive(DataViewSchema.Column column)
=> _getters[column.Index] != null;
public override bool MoveNext()
{
if (_disposed)
return false;
if (_enumerator.MoveNext())
{
_position++;
return true;
}
Dispose();
return false;
}
}
}
/////////////// from Example END
class Program
{
static void Main(string[] args)
{
Console.WriteLine("On va trouver la catégorie!");
// path and file location definition
string file_path = new DirectoryInfo(Environment.CurrentDirectory).Parent.Parent.Parent.Parent.FullName + @"\MLwork\categorie\";
string file_name = "cat_TestOpenIndexOnly_classification_04.10.2024";
// file to write
string filename_out = $"{file_path}{file_name}_out.csv";
if (File.Exists(filename_out))
{
File.Delete(filename_out);
}
using StreamWriter sw = File.CreateText(filename_out);
// the 1st line
sw.WriteLine("CatSWM;ArticleFR;IDcategorie;LibelleCategorie;SMP");
// file to open
Console.WriteLine("Read input file");
string filename_in = $"{file_path}{file_name}.csv";
string[] lines = System.IO.File.ReadAllLines(filename_in);
// read file and store
List<string> art = new List<string>();
List<string> cat = new List<string>();
int count = 0;
foreach (string line in lines)
{
string[] line_elements = line.Split(';');
if (line_elements[0] != "CatSWM")
{
art.Add(line_elements[0]);
cat.Add(line_elements[1]);
count++;
}
}
// https://docs.microsoft.com/fr-fr/dotnet/machine-learning/how-to-guides/save-load-machine-learning-models-ml-net
//Create MLContext
MLContext mlContext = new MLContext();
// Define data preparation and trained model schemas
DataViewSchema dataPrepPipelineSchema, modelSchema;
// Load data preparation pipeline
Console.WriteLine("Load data preparation pipeline");
string data_prep_name = "\\files\\data_preparation_pipeline_categorie";
string data_prep_file = $"{file_path}{data_prep_name}.zip";
ITransformer dataPrepPipeline = mlContext.Model.Load(data_prep_file, out dataPrepPipelineSchema);
// Load Trained Model
Console.WriteLine("Load Trained Model");
//string model_name = "\\ML_categorie";
string model_name = "\\files\\model_lbfgs";
string model_file = $"{file_path}{model_name}.zip";
ITransformer trainedModel = mlContext.Model.Load(model_file, out modelSchema);
Console.WriteLine("Load IDataView");
List<InputObject> categorieData = new List<InputObject>();
for (int i = 0; i < count; i++)
{
categorieData.Add(new InputObject { CatSWM = cat[i], ArticleFR = art[i] });
}
var inputData = new InputObjectDataView(categorieData);
// Predicted Data
Console.WriteLine("Predict");
IDataView predictions = trainedModel.Transform(inputData);
float[] scoreColumn = predictions.GetColumn<float>("PredictedLabel").ToArray();
/////////////////////////////
// output
Console.WriteLine("Write ouptput");
string line_out = "";
for (int i = 0; i < count; i++)
{
if (i % 500 == 0) { Console.WriteLine($"{i}/{count} DONE"); }
line_out = $"{art[i]};{cat[i]};{scoreColumn[i]}";
//Console.WriteLine(line_out);
sw.WriteLine(line_out);
}
}
}
}
////////////////////////////////////////////////
// Train
////////////////////////////////////////////////
using Microsoft.ML.Data;
using Microsoft.ML.Trainers;
using Microsoft.ML;
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
namespace TrainCategorie
{
public class InputObject
{
[LoadColumn(0)]
public string CatSWM { get; set; }
[LoadColumn(1)]
public string ArticleFR { get; set; }
[LoadColumn(2)]
[ColumnName("Label")]
public float IDCategorie { get; set; }
}
class Program
{
static void Main(string[] args)
{
Console.WriteLine("On va entraîner la catégorie!!");
// path and file location definition
//string file_path = @"\\int.ofac.ch\OFAC\Collaborateurs\jamin\Visual Studio 2019\MLwork\categorie\files";
string file_path = new DirectoryInfo(Environment.CurrentDirectory).Parent.Parent.Parent.Parent.FullName + @"\MLwork\categorie\files";
string file_name = "cat_TrainOpenIndex_classification_04.10.2024";
// file to open
Console.WriteLine("Read input file");
string filename_in = $"{file_path}\\{file_name}.csv";
// https://docs.microsoft.com/fr-fr/dotnet/machine-learning/how-to-guides/load-data-ml-net
//Create MLContext
MLContext mlContext = new MLContext();
// Load Trained Model
Console.WriteLine("Load Pipeline");
IDataView raw_data = mlContext.Data.LoadFromTextFile<InputObject>(filename_in, separatorChar: ';', hasHeader: true);
//https://docs.microsoft.com/en-us/dotnet/machine-learning/how-to-guides/prepare-data-ml-net
Console.WriteLine("Convert str->float");
// 1
// Define text transform estimator
var textEstimator1 = mlContext.Transforms.Text.FeaturizeText("ArticleFR");
var textEstimator2 = mlContext.Transforms.Text.FeaturizeText("CatSWM");
// Fit data to estimator
// Fitting generates a transformer that applies the operations of defined by estimator
ITransformer textTransformer1 = textEstimator1.Fit(raw_data);
ITransformer textTransformer2 = textEstimator2.Fit(raw_data);
var fullTransformer = textTransformer1.Append(textTransformer2);
// Transform data
IDataView data = fullTransformer.Transform(raw_data);
//https://docs.microsoft.com/fr-fr/dotnet/machine-learning/how-to-guides/train-machine-learning-model-ml-net
//DataOperationsCatalog.TrainTestData dataSplit = mlContext.Data.TrainTestSplit(data, testFraction: 0.1);
//IDataView trainData = dataSplit.TrainSet;
//IDataView testData = dataSplit.TestSet;
IDataView trainData = data;
IDataView testData = data;
// Define Data Prep Estimator
// 1. Concatenate Size and Historical into a single feature vector output to a new column called Features
// 2. Normalize Features vector
Console.WriteLine("Concatenate Features");
IEstimator<ITransformer> dataPrepEstimator =
mlContext.Transforms.Concatenate("Features", "ArticleFR", "CatSWM")
.Append(mlContext.Transforms.NormalizeMinMax("Features"));
// Create data prep transformer
ITransformer dataPrepTransformer = dataPrepEstimator.Fit(trainData);
// Apply transforms to data
Console.WriteLine("Prepare Train Data");
IDataView transformedTrainingData = dataPrepTransformer.Transform(trainData);
Console.WriteLine("Prepare Test Data");
IDataView transformedTestData = dataPrepTransformer.Transform(testData);
// tuto trainer
//Console.WriteLine("Train Sdca");
//TrainSdca(mlContext, transformedTrainingData, transformedTestData);
// Lbfgs needed Trainer
Console.WriteLine("Train Lbfgs");
TrainLbfgs(mlContext, transformedTrainingData, transformedTestData, file_path);
// Save Data Prep transformer
Console.WriteLine("Save Data Prep transformer");
mlContext.Model.Save(dataPrepTransformer, trainData.Schema, $"{file_path}\\data_preparation_pipeline_categorie.zip");
}
private static void TrainSdca(MLContext mlContext, IDataView transformedTrainingData, IDataView transformedTestData)
{
// Define StochasticDualCoordinateAscent regression algorithm estimator
Console.WriteLine("Build ML");
var sdcaEstimator = mlContext.Regression.Trainers.Sdca();
// Build machine learning model
Console.WriteLine("Train ML");
var trainedModel = sdcaEstimator.Fit(transformedTrainingData);
// extract model parameters
//var trainedModelParameters = trainedModel.Model as LinearRegressionModelParameters;
// Measure trained model performance
// Use trained model to make inferences on test data
IDataView testDataPredictions = trainedModel.Transform(transformedTestData);
// Extract model metrics and get RSquared
Console.WriteLine("Evaluate Test Data");
RegressionMetrics trainedModelMetrics = mlContext.Regression.Evaluate(testDataPredictions);
double rSquared = trainedModelMetrics.RSquared;
Console.WriteLine($"rSquared={rSquared}");
// save model
//mlContext.Model.Save(trainedModel, data.Schema, $"{file_path}\\testmodel.zip");
}
private static void TrainLbfgs(MLContext mlContext, IDataView transformedTrainingData, IDataView transformedTestData, string file_path)
{
// https://docs.microsoft.com/en-us/dotnet/api/microsoft.ml.standardtrainerscatalog.lbfgslogisticregression?view=ml-dotnet
// https://docs.microsoft.com/en-us/dotnet/api/microsoft.ml.standardtrainerscatalog.lbfgsmaximumentropy?view=ml-dotnet#Microsoft_ML_StandardTrainersCatalog_LbfgsMaximumEntropy_Microsoft_ML_MulticlassClassificationCatalog_MulticlassClassificationTrainers_System_String_System_String_System_String_System_Single_System_Single_System_Single_System_Int32_System_Boolean_
Console.WriteLine("Build ML");
var options = new LbfgsMaximumEntropyMulticlassTrainer.Options()
{
//LabelColumnName = "IDCategorie",
//FeatureColumnName = "Features",
HistorySize = 50,
L1Regularization = 0.1f,
NumberOfThreads = 1
};
// Define the trainer.
var pipeline =
// Convert the string labels into key types.
mlContext.Transforms.Conversion.MapValueToKey("Label")
// Apply LbfgsMaximumEntropy multiclass trainer.
.Append(mlContext.MulticlassClassification.Trainers
//.LbfgsMaximumEntropy(options));
.LbfgsMaximumEntropy());
// Train the model.
Console.WriteLine("Train ML");
var trainedModel = pipeline.Fit(transformedTrainingData);
// Use trained model to make inferences on test data
Console.WriteLine("transform trained model");
IDataView testDataPredictions = trainedModel.Transform(transformedTestData);
// Extract model metrics and get accuracy
Console.WriteLine("Evaluate Test Data");
var trainedModelMetrics = mlContext.MulticlassClassification.Evaluate(testDataPredictions);
double accuracy = trainedModelMetrics.MicroAccuracy;
Console.WriteLine($"accuracy={accuracy}");
// Save Trained Model
Console.WriteLine("Save Trained Model");
mlContext.Model.Save(trainedModel, transformedTrainingData.Schema, $"{file_path}\\model_lbfgs.zip");
}
}
}
////////////////////////////////////////////////
// csv file
////////////////////////////////////////////////
CatSWM;ArticleFR;IDCategorie
cat1;name1;1
;name2;2
cat3;nam3;3