Hello! I'm a newbie here, so sorry if I'm asking some silly questions!
I'm trying to submit a job in Azure ML using an Environment based on my Docker image, which is stored in Azure Container Registry. The problem is that my program doesn't start in the chosen Docker image; as far as I can tell, it starts somewhere else. With the code below, Azure ML actually creates an Environment that builds the provided Docker image, and for this Environment (on the Azure Portal) I can see the Docker build logs and so on. If I view this Environment in the code editor, I can also see the correct Docker settings:
{
  "assetId": null,
  "databricks": {
    "eggLibraries": [],
    "jarLibraries": [],
    "mavenLibraries": [],
    "pypiLibraries": [],
    "rcranLibraries": []
  },
  "docker": {  # essentially the Docker image that runs everything locally
    "arguments": [],
    "baseDockerfile": null,
    "baseImage": "somename",
    "baseImageRegistry": {
      "address": "somevalue",
      "password": "somevalue",
      "registryIdentity": null,
      "username": "somevalue"
    },
    "buildContext": null,
    "enabled": true,
    "platform": {
      "architecture": "amd64",
      "os": "Linux"
    },
    "sharedVolumes": true,
    "shmSize": "2g"
  },
  "environmentVariables": {
    "EXAMPLE_ENV_VAR": "EXAMPLE_VALUE"
  },
  "inferencingStackVersion": null,
  "name": "somename",
  "python": {
    "baseCondaEnvironment": null,
    "condaDependencies": {
      "channels": [
        "anaconda",
        "conda-forge"
      ],
      "dependencies": [
        "python=3.8.13",
        {
          "pip": [
            "azureml-defaults"
          ]
        }
      ],
      "name": "project_environment"
    },
    "condaDependenciesFile": null,
    "interpreterPath": "python",
    "userManagedDependencies": false
  },
  "r": null,
  "spark": {
    "packages": [],
    "precachePackages": true,
    "repositories": []
  },
  "version": null
}
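For reference, the same definition can be fetched back from the workspace with the SDK (a minimal sketch, assuming azureml-core SDK v1; "somename" is the placeholder environment name from the JSON above):

from azureml.core import Environment, Workspace

ws = Workspace.from_config()
# Fetch the registered environment and print its definition
env = Environment.get(workspace=ws, name="somename")
print(env)  # prints the serialized environment definition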
I use this container locally to run the code without any problems, but for some reason, when I try to run the code in Azure ML, it doesn't work: the run tells me that pandas is not installed, even though it is installed in the provided Docker image.
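To narrow this down, here is a small diagnostic (a sketch, not part of my original script) that can go at the top of whatever Python entry point ./train_cloud.sh launches, to show which interpreter the remote run actually uses:

import sys
print("interpreter:", sys.executable)  # which Python the run picked up

import pandas  # this is the import that fails on Azure ML
print("pandas:", pandas.__version__)

Below is the code I'm using to register the environment and submit the job: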
from azureml.core import Environment, Experiment, ScriptRunConfig, Workspace
from azureml.core.compute import AmlCompute, ComputeTarget

ws = Workspace.from_config()

# Create the environment from my custom Docker image in ACR
# (docker_image_name, docker_registry_*, cluster_name and script_folder are defined elsewhere)
docker_env = Environment("example")
docker_env.docker.enabled = True
docker_env.docker.base_image = docker_image_name
docker_env.docker.base_image_registry.address = docker_registry_address
docker_env.docker.base_image_registry.username = docker_registry_username
docker_env.docker.base_image_registry.password = docker_registry_password
docker_env.register(workspace=ws)

# Create the compute cluster
compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_NC6s_v3')
compute_target = ComputeTarget.create(ws, cluster_name, compute_config)
compute_target.wait_for_completion(show_output=True)

# Create an experiment
experiment_name = 'example'
experiment = Experiment(ws, name=experiment_name)

# Configure the run: training script, compute target, and the custom environment
src = ScriptRunConfig(
    source_directory=script_folder,
    compute_target=compute_target,
    command='./train_cloud.sh',
    environment=docker_env,
)
run = experiment.submit(src)
print(run.get_details())
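For context, after submitting I stream the logs until the run finishes (standard SDK v1 call):

# Block until the run completes, streaming the driver log to the console
run.wait_for_completion(show_output=True)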