How to fix SIGBUS error?

Leonie Wagner 0 Reputation points
2023-08-22T23:20:29.9066667+00:00

I am trying to train my model in the machine learning studio. When I train on the compute instance Standard-NC6, everything works as expected, but when I switch to a more powerful instance, Standard-NC64as-T4-v3,
I get the following error:

Execution failed. User process 'python' terminated by signal with name SIGBUS. This could be caused by an out of memory error. Please check log file 'user_logs/std_log.txt' for error detail

It cannot be due to memory, because the new instance has more memory.

Thanks for your help!!

Code:

import argparse
import pandas as pd
import numpy as np
from neuralprophet import NeuralProphet, set_log_level, save
import mlflow
import os
import time
from azureml.core import Run
import logging


from error_metrics import _calc_mae, _calc_mse, _calc_rmse, _calc_nrmse, _calc_mape, _calc_mase, _calc_msse, _seas_naive_fcst, _calc_metrics

def _parse_args():
    """Parse and validate the command-line arguments of the training script."""
    parser = argparse.ArgumentParser()

    # input arguments
    # The data paths are required so that a missing argument is reported by
    # argparse immediately instead of surfacing later as a read error.
    parser.add_argument("--data_train", type=str, required=True, help="path to input train data")
    parser.add_argument("--data_test", type=str, required=True, help="path to input test data")
    parser.add_argument("--data_val", type=str, required=True, help="path to input validation data")
    parser.add_argument("--name", type=str, help="name of the output")
    # Parsed as an integer because the value is forwarded as the trainer's
    # `num_nodes`; a string (or None when omitted) is not a valid node count.
    parser.add_argument("--num_nodes", type=int, default=1, help="number of nodes")
    # Default of 4 keeps the previous hard-coded behaviour.
    parser.add_argument("--num_workers", type=int, default=4,
                        help="number of data-loading worker processes (0 loads in the main process)")

    # output arguments
    # Required so that a missing path does not fail only after training.
    parser.add_argument("--results", type=str, required=True, help="path to results")

    return parser.parse_args()


def _load_split(path):
    """Read one CSV split, parsing 'ds' as dates and coercing 'y' to numeric."""
    df = pd.read_csv(path, parse_dates=['ds'])
    # errors='coerce' turns unparsable values into NaN instead of raising
    df['y'] = pd.to_numeric(df['y'], errors='coerce')
    return df


def main():
    """Train a NeuralProphet model and store its forecasts, metrics and weights.

    Steps:
      1. Read the train/validation/test CSV files given on the command line
         and keep only the first 200 IDs found in the training data.
      2. Fit the model with the fixed hyper-parameters below and predict on
         the validation and test splits.
      3. Compute the error metrics with `_calc_metrics`, log parameters and
         metrics to MLflow, and write forecasts, model and metrics into the
         `--results` directory.
    """
    args = _parse_args()

    # Make sure the output directory exists before spending time on training.
    os.makedirs(args.results, exist_ok=True)

    ##################
    #<load data>
    ##################
    start = time.time()

    # log starting time
    logging.info(f'starting time: {start}')

    print(f'starting time: {start}')

    df_train = _load_split(args.data_train)
    df_val = _load_split(args.data_val)
    df_test = _load_split(args.data_test)

    # limit to first 200 IDs
    selected_ids = df_train['ID'].unique()[:200]
    df_train = df_train[df_train['ID'].isin(selected_ids)]
    df_val = df_val[df_val['ID'].isin(selected_ids)]
    df_test = df_test[df_test['ID'].isin(selected_ids)]

    print(f'Number of IDs in train: {df_train["ID"].nunique()}, val: {df_val["ID"].nunique()}, test: {df_test["ID"].nunique()}')

    ##################
    #<train the model>
    ##################

    tuned_params = {
            'n_lags':240,
            'newer_samples_weight': 6.8,
            'n_changepoints': 2,
            'yearly_seasonality': 3,
            'weekly_seasonality': 4,
            'daily_seasonality': 2,
            'trend_global_local': 'global',
            'season_global_local': 'local',
            'batch_size': 1024,
            'ar_layers': [24, 12, 24],

            # not tuned
            'n_forecasts': 33,
            #'learning_rate': 0.01,
            'epochs': 20, 
            'drop_missing': True,
        }

    trainer_configs = {
        'accelerator': 'gpu',
        'trainer_config': {
            'num_nodes': args.num_nodes,
            #'precision': 16,
        }
    }

    mlflow.pytorch.autolog(log_every_n_epoch=1, log_models=False, log_datasets=False)

    # build and train the model with these hyper-parameters
    m = NeuralProphet(**tuned_params, **trainer_configs)

    set_log_level("INFO")

    print('start training')
    with mlflow.start_run():
        # NOTE(review): data-loading worker processes hand batches to the
        # trainer through shared memory; a too-small shared-memory segment in
        # the job's container is a common cause of SIGBUS. If the job dies
        # with SIGBUS, try `--num_workers 0` (or a larger shm size for the
        # environment) to confirm.
        metrics = m.fit(df=df_train, validation_df=df_val, freq="H",
                        num_workers=args.num_workers, early_stopping=True)
        forecasts_val = m.predict(df_val)
        forecasts_test = m.predict(df_test)

    end = time.time()

    ##################
    #<evaluate the model>
    ##################

    final_metric_names = ['RMSE', 'MAE', 'MAPE', 'MASE', 'MSSE']

    df_metrics_all_LBAs_val, df_metrics_all_LBAs_averaged_val = _calc_metrics(
        forecasts=forecasts_val, start_forecast=10, n_forecasts=m.n_forecasts,
        metric_names=final_metric_names)
    df_metrics_all_LBAs_test, df_metrics_all_LBAs_averaged_test = _calc_metrics(
        forecasts=forecasts_test, start_forecast=10, n_forecasts=m.n_forecasts,
        metric_names=final_metric_names)

    ##########################
    #<save forecasts and model>
    ##########################

    # Resume the Azure ML run so that the values below are attached to it.
    run_id = Run.get_context(allow_offline=True).id

    with mlflow.start_run(run_id):
        # log parameters
        mlflow.log_params(tuned_params)
        mlflow.log_params(trainer_configs)

        # log metrics
        mlflow.log_metric('duration', end-start)

        # Last recorded value of each training-history column:
        # "<metric>_train" comes from "<metric>", "<metric>_val" from "<metric>_val".
        for suffix, column_suffix in (('train', ''), ('val', '_val')):
            for metric_name in ('MAE', 'RMSE', 'Loss'):
                history = list(metrics[f'{metric_name}{column_suffix}'])
                mlflow.log_metric(f'{metric_name}_{suffix}', history[-1])

        # Averaged forecast metrics, logged as "<metric>_<split>_final".
        for split, df_averaged in (('val', df_metrics_all_LBAs_averaged_val),
                                   ('test', df_metrics_all_LBAs_averaged_test)):
            for metric_name in final_metric_names:
                mlflow.log_metric(f'{metric_name}_{split}_final', df_averaged[metric_name][0])

    # save forecasts
    forecasts_val.to_csv(os.path.join(args.results, "forecasts_forecast_val.csv"), index=False)
    forecasts_test.to_csv(os.path.join(args.results, "forecasts_forecast_test.csv"), index=False)

    # save model
    save(m, os.path.join(args.results, "model.np"))

    # save metrics
    metrics.to_csv(os.path.join(args.results, "metrics_metrics_train.csv"), index=False)
    df_metrics_all_LBAs_val.to_csv(os.path.join(args.results, "metrics_metrics_val.csv"), index=False)
    df_metrics_all_LBAs_test.to_csv(os.path.join(args.results, "metrics_metrics_test.csv"), index=False)

    

# Run the training pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

Azure Machine Learning
Azure Machine Learning
An Azure machine learning service for building and deploying models.
2,832 questions
{count} votes

1 answer

Sort by: Most helpful
  1. Leonie Wagner 0 Reputation points
    2023-09-22T19:17:05.5333333+00:00

    @romungi-MSFT
    I ran the code without outputting my internal logs, changed the number of workers, decreased the batch size and changed the number of nodes, but the error is still the same. I would really appreciate any help understanding why this instance does not work while every other one does.

    Execution failed. User process 'python' terminated by signal with name SIGBUS. This could be caused by an out of memory error. Please check log file 'user_logs/std_log.txt' for error details. Error: starting time: 1695409800.7414021
    start training
    /azureml-envs/azureml_d868c9873bf0057cff8d7fa90d3d55ca/lib/python3.8/multiprocessing/resource_tracker.py:216: UserWarning: resource_tracker: There appear to be 4 leaked semaphore objects to clean up at shutdown
      warnings.warn('resource_tracker: There appear to be %d '
    
    0 comments No comments

Your answer

Answers can be marked as Accepted Answers by the question author, which helps users to know the answer solved the author's problem.