I am trying to train my model in the Azure Machine Learning studio. When I train on the compute instance Standard_NC6 everything works as expected, but when I switch to a more powerful instance, Standard_NC64as_T4_v3,
I get the following error:
Execution failed. User process 'python' terminated by signal with name SIGBUS. This could be caused by an out of memory error. Please check log file 'user_logs/std_log.txt' for error detail
It cannot be due to memory, because the new instance has more memory.
Thanks for your help!!
Code:
import argparse
import pandas as pd
import numpy as np
from neuralprophet import NeuralProphet, set_log_level, save
import mlflow
import os
import time
from azureml.core import Run
import logging
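# error_metrics is our own helper module that lives next to this script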
from error_metrics import _calc_mae, _calc_mse, _calc_rmse, _calc_nrmse, _calc_mape, _calc_mase, _calc_msse, _seas_naive_fcst, _calc_metrics
def main():
    """Main function of the script."""
    # input arguments
    parser = argparse.ArgumentParser()
    parser.add_argument("--data_train", type=str, help="path to input train data")
    parser.add_argument("--data_test", type=str, help="path to input test data")
    parser.add_argument("--data_val", type=str, help="path to input validation data")
    parser.add_argument("--name", type=str, help="name of the output")
    parser.add_argument("--num_nodes", type=int, help="number of nodes")
    # output arguments
    parser.add_argument("--results", type=str, help="path to results")
    args = parser.parse_args()
    ##################
    # <load data>
    ##################
    start = time.time()
    # log starting time
    logging.info(f'starting time: {start}')
    print(f'starting time: {start}')
    df_train = pd.read_csv(args.data_train, parse_dates=['ds'])
    df_train['y'] = pd.to_numeric(df_train['y'], errors='coerce')
    df_val = pd.read_csv(args.data_val, parse_dates=['ds'])
    df_val['y'] = pd.to_numeric(df_val['y'], errors='coerce')
    df_test = pd.read_csv(args.data_test, parse_dates=['ds'])
    df_test['y'] = pd.to_numeric(df_test['y'], errors='coerce')
    # limit to first 200 IDs
    IDs = df_train['ID'].unique()[:200]
    df_train = df_train[df_train['ID'].isin(IDs)]
    df_val = df_val[df_val['ID'].isin(IDs)]
    df_test = df_test[df_test['ID'].isin(IDs)]
    print(f'Number of IDs in train: {df_train["ID"].nunique()}, val: {df_val["ID"].nunique()}, test: {df_test["ID"].nunique()}')
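    # NeuralProphet treats the 'ID' column as separate series of one global
    # model, so train/val/test are restricted to the same 200 series here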
    ##################
    # <train the model>
    ##################
    tuned_params = {
        'n_lags': 240,
        'newer_samples_weight': 6.8,
        'n_changepoints': 2,
        'yearly_seasonality': 3,
        'weekly_seasonality': 4,
        'daily_seasonality': 2,
        'trend_global_local': 'global',
        'season_global_local': 'local',
        'batch_size': 1024,
        'ar_layers': [24, 12, 24],
        # not tuned
        'n_forecasts': 33,
        # 'learning_rate': 0.01,
        'epochs': 20,
        'drop_missing': True,
    }
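    # with freq="H" below, n_lags=240 means each sample looks back 10 days and
    # n_forecasts=33 predicts a 33-hour horizon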
    trainer_configs = {
        'accelerator': 'gpu',
        'trainer_config': {
            'num_nodes': args.num_nodes,
            # 'precision': 16,
        }
    }
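    # as far as I understand, NeuralProphet forwards 'accelerator' and the
    # entries of 'trainer_config' to the underlying PyTorch Lightning Trainer,
    # so 'num_nodes' here ends up as Trainer(num_nodes=...)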
    mlflow.pytorch.autolog(log_every_n_epoch=1, log_models=False, log_datasets=False)
    # build and train the model with these hyper-parameters
    m = NeuralProphet(**tuned_params, **trainer_configs)
    set_log_level("INFO")
    print('start training')
    with mlflow.start_run() as run:
        metrics = m.fit(df=df_train, validation_df=df_val, freq="H", num_workers=4, early_stopping=True)
        forecasts_val = m.predict(df_val)
        forecasts_test = m.predict(df_test)
    end = time.time()
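    # note: num_workers=4 makes the DataLoader spawn worker processes, which
    # exchange batches via shared memory inside the job container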
    ##################
    # <evaluate the model>
    ##################
    df_metrics_all_LBAs_val, df_metrics_all_LBAs_averaged_val = _calc_metrics(forecasts=forecasts_val, start_forecast=10, n_forecasts=m.n_forecasts, metric_names=['RMSE', 'MAE', 'MAPE', 'MASE', 'MSSE'])
    df_metrics_all_LBAs_test, df_metrics_all_LBAs_averaged_test = _calc_metrics(forecasts=forecasts_test, start_forecast=10, n_forecasts=m.n_forecasts, metric_names=['RMSE', 'MAE', 'MAPE', 'MASE', 'MSSE'])
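    # _calc_metrics (from our error_metrics module) returns one frame with
    # per-series metrics and one with the averages over all series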
    ##########################
    # <save forecasts and model>
    ##########################
    # resume the run that AzureML created for this job and log to it
    run_id = Run.get_context(allow_offline=True).id
    with mlflow.start_run(run_id=run_id):
        # log parameters
        mlflow.log_params(tuned_params)
        mlflow.log_params(trainer_configs)
        # log metrics
        mlflow.log_metric('duration', end - start)
        mlflow.log_metric("MAE_train", list(metrics['MAE'])[-1])
        mlflow.log_metric("RMSE_train", list(metrics['RMSE'])[-1])
        mlflow.log_metric("Loss_train", list(metrics['Loss'])[-1])
        mlflow.log_metric("MAE_val", list(metrics['MAE_val'])[-1])
        mlflow.log_metric("RMSE_val", list(metrics['RMSE_val'])[-1])
        mlflow.log_metric("Loss_val", list(metrics['Loss_val'])[-1])
        mlflow.log_metric("RMSE_val_final", df_metrics_all_LBAs_averaged_val['RMSE'][0])
        mlflow.log_metric("MAE_val_final", df_metrics_all_LBAs_averaged_val['MAE'][0])
        mlflow.log_metric("MAPE_val_final", df_metrics_all_LBAs_averaged_val['MAPE'][0])
        mlflow.log_metric("MASE_val_final", df_metrics_all_LBAs_averaged_val['MASE'][0])
        mlflow.log_metric("MSSE_val_final", df_metrics_all_LBAs_averaged_val['MSSE'][0])
        mlflow.log_metric("RMSE_test_final", df_metrics_all_LBAs_averaged_test['RMSE'][0])
        mlflow.log_metric("MAE_test_final", df_metrics_all_LBAs_averaged_test['MAE'][0])
        mlflow.log_metric("MAPE_test_final", df_metrics_all_LBAs_averaged_test['MAPE'][0])
        mlflow.log_metric("MASE_test_final", df_metrics_all_LBAs_averaged_test['MASE'][0])
        mlflow.log_metric("MSSE_test_final", df_metrics_all_LBAs_averaged_test['MSSE'][0])
    # save forecasts
    forecasts_val.to_csv(os.path.join(args.results, "forecasts_forecast_val.csv"), index=False)
    forecasts_test.to_csv(os.path.join(args.results, "forecasts_forecast_test.csv"), index=False)
    # save model
    save(m, os.path.join(args.results, "model.np"))
    # save metrics
    metrics.to_csv(os.path.join(args.results, "metrics_metrics_train.csv"), index=False)
    df_metrics_all_LBAs_val.to_csv(os.path.join(args.results, "metrics_metrics_val.csv"), index=False)
    df_metrics_all_LBAs_test.to_csv(os.path.join(args.results, "metrics_metrics_test.csv"), index=False)
if __name__ == "__main__":
    main()
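In case it is relevant, this is roughly how I submit the job (a minimal sketch using the azure-ai-ml v2 SDK; the environment, data-asset, and compute names below are placeholders, not my real ones):

from azure.ai.ml import MLClient, command, Input, Output
from azure.identity import DefaultAzureCredential

# connect to the workspace (reads config.json from the working directory)
ml_client = MLClient.from_config(credential=DefaultAzureCredential())

# command job wiring up the script's argparse inputs and outputs;
# all azureml: names are placeholders
job = command(
    code="./src",
    command=(
        "python train.py"
        " --data_train ${{inputs.data_train}}"
        " --data_test ${{inputs.data_test}}"
        " --data_val ${{inputs.data_val}}"
        " --name my_forecast --num_nodes 1"
        " --results ${{outputs.results}}"
    ),
    inputs={
        "data_train": Input(type="uri_file", path="azureml:train_data:1"),
        "data_test": Input(type="uri_file", path="azureml:test_data:1"),
        "data_val": Input(type="uri_file", path="azureml:val_data:1"),
    },
    outputs={"results": Output(type="uri_folder")},
    environment="azureml:neuralprophet-gpu-env:1",
    compute="gpu-cluster",  # the compute whose VM size I switch between
)
ml_client.create_or_update(job)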