Why are the user_logs from my Azure ML inference job not streaming in VScode terminal through Python SDK v2?
Hi,
I am running a job on Azure Machine Learning through Python SDK v2.
I created an environment with my custom docker image stored in an ACR as shown below:
import logging

from azure.ai.ml.entities import Environment

logger = logging.getLogger(__name__)

# Function to create or get an existing Environment registered from a custom ACR image
def create_or_get_environment_from_acr(ml_client, acr_image_uri):
    try:
        env_docker_image = None
        env_name = "res-ing-docker-env"
        try:
            env_docker_image = ml_client.environments.get(env_name, label="latest")
            logger.info(f"Using existing environment '{env_name}' with latest version: {env_docker_image.version}")
        except Exception as fetch_exception:
            logger.info(f"Environment '{env_name}' not found. Creating a new one.")
        if env_docker_image is None:
            env_docker_image = Environment(
                name=env_name,
                image=acr_image_uri,
                description="Environment created from a docker image",
            )
            env_docker_image = ml_client.environments.create_or_update(env_docker_image)
            logger.info(f"Environment with name {env_docker_image.name} is registered to workspace, the environment version is {env_docker_image.version}")
        return env_docker_image
    except Exception as e:
        logger.error(f"Failed to create or fetch environment: {str(e)}")
        raise
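For reference, I call this helper roughly as shown below; the registry, repository, and tag in the image URI are placeholders rather than my actual values, and ml_client is the workspace MLClient shown further down:
# Hypothetical ACR image reference in the usual <registry>.azurecr.io/<repository>:<tag> form
acr_image_uri = "myregistry.azurecr.io/resource-ingestion:latest"
env = create_or_get_environment_from_acr(ml_client, acr_image_uri)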
I created a compute cluster with a Standard_DS3_v2 VM as shown below:
from azure.ai.ml.constants import ManagedServiceIdentityType
from azure.ai.ml.entities import AmlCompute, IdentityConfiguration

# Function to create or get an existing compute cluster
def create_or_get_compute_cluster(ml_client, compute_name):
    identity_config = IdentityConfiguration(type=ManagedServiceIdentityType.SYSTEM_ASSIGNED)
    try:
        compute_cluster = ml_client.compute.get(compute_name)
        logger.info(f"Compute cluster '{compute_name}' already exists.")
        return compute_cluster
    except Exception:
        try:
            compute_cluster = AmlCompute(
                name=compute_name,
                size="STANDARD_DS3_V2",
                idle_time_before_scale_down=120,
                min_instances=0,
                max_instances=1,
                identity=identity_config,
            )
            ml_client.compute.begin_create_or_update(compute_cluster).result()
            logger.info(f"Compute cluster '{compute_name}' created")
            return compute_cluster
        except Exception as e:
            logger.debug(f"Failed to create compute cluster: {str(e)}")
            raise
I then attempt to run the job using the function below:
from azure.ai.ml import command

def run_job_with_docker_image(ml_client, env_vars: dict, env: Environment = None, compute_cluster=None):
    job = command(
        command="/opt/conda/envs/ad/bin/python /home/ad/resource_ingestion/resource_ingestion/main.py",
        environment=env,
        compute=compute_cluster.name,
        instance_count=1,
        environment_variables=env_vars,
        display_name="resource_ingestion_job",
        tags=None,
        experiment_name="resource_ingestion_exp",
        timeout=7200,
    )
    created_job = ml_client.jobs.create_or_update(job)
    logger.info(f"Created job with name: {created_job.name}")
    print("---------- Stream output start ----------")
    ml_client.jobs.stream(created_job.name)
    print("---------- Stream output end ----------")
    ml_client.jobs.download(created_job.name, download_path="./tmp/ai_integration/azure_ml_test/job_test_logs", all=True)
    return created_job
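For context, the helpers above are wired together roughly like this; the subscription, resource group, workspace, compute name, and environment variables below are placeholders, not my actual values:
from azure.ai.ml import MLClient
from azure.identity import DefaultAzureCredential

# Build the workspace client used by all the helper functions above
ml_client = MLClient(
    credential=DefaultAzureCredential(),
    subscription_id="<subscription-id>",
    resource_group_name="<resource-group>",
    workspace_name="<workspace-name>",
)

compute_cluster = create_or_get_compute_cluster(ml_client, "res-ing-cluster")
created_job = run_job_with_docker_image(
    ml_client,
    env_vars={"EXAMPLE_VAR": "value"},  # placeholder environment variables
    env=env,                            # Environment returned by the ACR helper above
    compute_cluster=compute_cluster,
)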
As you can see in the code above, I am creating the job using command() and then streaming it. But during streaming, I only get the output below in the terminal:
---------- Stream output start ----------
RunId: wheat_bee_mts8fxd729
Web View: <URL>
Execution Summary
=================
RunId: wheat_bee_mts8fxd729
Web View: <URL>
---------- Stream output end ----------
The URL in the output forwards me to the portal, where I can view the job. There I see two folders of logs, system_logs and user_logs. I want the std_log.txt file in the user_logs folder to be streamed in my terminal. For now, as a temporary workaround, I am downloading the logs instead.
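Concretely, the temporary workaround looks something like the sketch below, which just prints the downloaded std_log.txt after the job has finished; the folder layout under the download path is an assumption and may differ between SDK versions:
from pathlib import Path

# After ml_client.jobs.download(...) has completed, locate and print the user log
download_root = Path("./tmp/ai_integration/azure_ml_test/job_test_logs")
for std_log in download_root.rglob("user_logs/std_log.txt"):
    print(f"----- {std_log} -----")
    print(std_log.read_text())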
Kindly let me know if anyone has a solution or if I am missing something.
Thank you.