Why are the user_logs from my Azure ML inference job not streaming in the VS Code terminal through Python SDK v2?

Anonymous
2024-10-07T08:28:08.21+00:00

Hi,
I am running a job on Azure Machine Learning through Python SDK v2.
I created an environment with my custom docker image stored in an ACR as shown below:

# Function to create or get existing Environment
def create_or_get_environment_from_acr(ml_client, acr_image_uri):
    """Return the "res-ing-docker-env" environment, registering it if the lookup fails.

    The latest version of the environment is requested through
    ``ml_client.environments.get``. If that call raises, a new environment
    built from ``acr_image_uri`` is registered through
    ``ml_client.environments.create_or_update``.

    Args:
        ml_client: Client exposing ``environments.get`` and
            ``environments.create_or_update`` (presumably an
            ``azure.ai.ml.MLClient`` -- confirm against the caller).
        acr_image_uri: Image reference passed as ``image`` to ``Environment``.

    Returns:
        The environment that was fetched, or the object returned by
        ``create_or_update`` when a new one had to be registered.

    Raises:
        Exception: Any error raised while building or registering the
            environment is logged at error level and re-raised unchanged.
    """
    try:
        env_docker_image = None
        env_name = "res-ing-docker-env"

        try:
            env_docker_image = ml_client.environments.get(env_name, label="latest")
            logger.info(f"Using existing environment '{env_name}' with latest version: {env_docker_image.version}")
        except Exception as fetch_exception:
            # NOTE(review): every lookup failure (not only "not found") lands
            # here; the underlying reason is logged so it is not lost.
            logger.info(f"Environment '{env_name}' not found. Creating a new one.")
            logger.debug(f"Lookup of environment '{env_name}' failed with: {fetch_exception}")

        if env_docker_image is None:
            new_environment = Environment(
                name=env_name,
                image=acr_image_uri,
                description="Environment created from a docker image",
            )
            # Keep the object returned by the service: the locally built
            # Environment was never given a version, so logging or returning
            # it would not reflect what was actually registered.
            env_docker_image = ml_client.environments.create_or_update(new_environment)
            logger.info(f"Environment with name {env_docker_image.name} is registered to workspace, the environment version is {env_docker_image.version}")

        return env_docker_image

    except Exception as e:
        logger.error(f"Failed to create or fetch environment: {str(e)}")
        raise

I created a compute cluster with Standard_DS3_v2 VM as shown below:

# Function to create or get existing compute cluster
def create_or_get_compute_cluster(ml_client, compute_name):
    """Return the compute cluster named ``compute_name``, creating it if the lookup fails.

    The cluster is first requested through ``ml_client.compute.get``. If that
    call raises, a ``STANDARD_DS3_V2`` ``AmlCompute`` cluster scaling between
    0 and 1 instances, with a system-assigned identity, is created and the
    function blocks until the create operation finishes.

    Args:
        ml_client: Client exposing ``compute.get`` and
            ``compute.begin_create_or_update`` (presumably an
            ``azure.ai.ml.MLClient`` -- confirm against the caller).
        compute_name: Name of the cluster to fetch or create.

    Returns:
        The existing cluster, or the result of the create operation when a
        new cluster was provisioned.

    Raises:
        Exception: Any error raised while creating the cluster is logged at
            error level and re-raised unchanged.
    """
    try:
        compute_cluster = ml_client.compute.get(compute_name)
        logger.info(f"Compute cluster '{compute_name}' already exists.")
        return compute_cluster
    except Exception:
        # NOTE(review): every lookup failure (not only "not found") is
        # treated as "cluster missing" and triggers creation.
        try:
            # Built only on the creation path; it is not needed when the
            # cluster already exists.
            identity_config = IdentityConfiguration(type=ManagedServiceIdentityType.SYSTEM_ASSIGNED)
            cluster_spec = AmlCompute(
                name=compute_name,
                size="STANDARD_DS3_V2",
                idle_time_before_scale_down=120,
                min_instances=0,
                max_instances=1,
                identity=identity_config,
            )
            # Return what the service reports after provisioning rather than
            # the local specification object.
            compute_cluster = ml_client.compute.begin_create_or_update(cluster_spec).result()
            logger.info(f"Compute cluster '{compute_name}' created")
            return compute_cluster
        except Exception as e:
            # Logged at error level so the failure is visible with default
            # logging configuration, matching the environment helper.
            logger.error(f"Failed to create compute cluster: {str(e)}")
            raise

I then attempt to run the job using the function below:

def run_job_with_docker_image(
    ml_client,
    env_vars: dict,
    env: Environment = None,
    compute_cluster=None,
    *,
    download_path: str = "./tmp/ai_integration/azure_ml_test/job_test_logs",
):
    """Submit the resource-ingestion command job, stream it, and download its files.

    Args:
        ml_client: Client exposing ``jobs.create_or_update``, ``jobs.stream``
            and ``jobs.download`` (presumably an ``azure.ai.ml.MLClient`` --
            confirm against the caller).
        env_vars: Mapping passed as ``environment_variables`` to the job.
        env: Environment passed as ``environment`` to the job.
        compute_cluster: Object whose ``name`` attribute identifies the
            compute target. Required despite the ``None`` default.
        download_path: Directory passed to ``jobs.download``; defaults to the
            location that was previously hard-coded.

    Returns:
        The job object returned by ``ml_client.jobs.create_or_update``.

    Raises:
        ValueError: If ``compute_cluster`` is ``None``.
    """
    # Fail early with a clear message instead of an AttributeError on
    # ``None.name`` further down.
    if compute_cluster is None:
        raise ValueError(
            "compute_cluster is required: its name is used as the job's compute target"
        )

    job = command(
        command="/opt/conda/envs/ad/bin/python /home/ad/resource_ingestion/resource_ingestion/main.py",
        environment=env,
        compute=compute_cluster.name,
        instance_count=1,
        environment_variables=env_vars,
        display_name="resource_ingestion_job",
        tags=None,
        experiment_name="resource_ingestion_exp",
        timeout=7200,
    )

    created_job = ml_client.jobs.create_or_update(job)

    logger.info(f"Created job with name: {created_job.name}")

    # The markers delimit whatever ``jobs.stream`` prints to the terminal.
    print("---------- Stream output start ----------")
    ml_client.jobs.stream(created_job.name)
    print("---------- Stream output end ----------")

    # all=True is forwarded to ``jobs.download`` as in the original call.
    ml_client.jobs.download(created_job.name, download_path=download_path, all=True)
    return created_job

As you can see in the code above, I am creating the job using command() and then streaming it.
But during streaming, I only get the output below in the terminal:

---------- Stream output start ----------
RunId: wheat_bee_mts8fxd729
Web View: <URL>
Execution Summary
=================
RunId: wheat_bee_mts8fxd729
Web View: <URL>
---------- Stream output end ----------

The URL in the output takes me to the portal, where I can view the job. There I have two folders of logs, called system_logs and user_logs.
I want the std_log.txt file in the user_logs folder to be streamed to my terminal. As a temporary workaround, I am downloading the logs.
Kindly let me know if anyone has a solution or if I am missing something.
Thank you.

Azure Machine Learning
Azure Machine Learning
An Azure machine learning service for building and deploying models.
3,338 questions
{count} votes

Your answer

Answers can be marked as Accepted Answers by the question author, which helps users to know the answer solved the author's problem.