Hi,
We have a process that has been running for over a year and has suddenly started failing, and the error is not very descriptive. In this process, we run a training, evaluation, and registration pipeline from Azure DevOps.
The script that we are running is:
def main():
    """Build, validate and publish the train/evaluate Azure ML pipeline.

    The function:
      1. Authenticates with a service principal and fetches the workspace.
      2. Builds the run environment from ``src/train/conda_dependencies.yml``
         plus every ``*.whl`` found in ``./ml_service/pipelines`` (each wheel
         is uploaded to the workspace as a private pip wheel).
      3. Creates the "Train Model" and "Evaluate Model " script steps, with
         evaluation scheduled after training.
      4. Validates the pipeline and publishes it under ``pipeline_name`` with
         ``build_id`` as the version.

    All configuration (credentials, workspace coordinates, registry settings,
    script paths, ``model_name``, ``pipeline_name``, ``build_id``) is read from
    names defined outside this function.
    """
    # Get Azure machine learning workspace
    login_azure = ServicePrincipalAuthentication(tenant_id, app_id, app_secret)
    aml_workspace = Workspace.get(
        name=workspace_name,
        subscription_id=subscription_id,
        resource_group=resource_group,
        auth=login_azure,
    )

    myenv = Environment(name="myenv")
    CONDA_YAML = "src/train/conda_dependencies.yml"
    load_requirements_into_conda_yml(conda_yml=CONDA_YAML)
    conda_dep = CondaDependencies(conda_dependencies_file_path=CONDA_YAML)
    # We need this pip version
    conda_dep.add_conda_package("pip==20.2.4")

    # The wheels are looked up relative to ./ml_service/pipelines, so switch
    # into that directory and always switch back, even if something raises.
    original_cwd = os.getcwd()
    os.chdir("./ml_service/pipelines")
    try:
        for file_whl in glob.glob("*.whl"):
            try:
                whl_url = Environment.add_private_pip_wheel(
                    workspace=aml_workspace, file_path=file_whl, exist_ok=True
                )
                conda_dep.add_pip_package(whl_url)
            except Exception as err:
                # Best effort: a wheel that cannot be uploaded is skipped, but
                # the cause is reported so the failure can be diagnosed.
                print(f"Not able to add the wheel {file_whl}")
                print(f"Reason: {err!r}")
    finally:
        os.chdir(original_cwd)

    myenv.docker.base_image_registry.address = REGISTRY_CONTAINER_IMAGE
    myenv.docker.base_image_registry.username = REGISTRY_CONTAINER_USERNAME
    myenv.docker.base_image_registry.password = REGISTRY_CONTAINER_PASSWORD
    # Environment Configuration
    myenv.docker.enabled = True
    myenv.python.user_managed_dependencies = False
    myenv.docker.base_image = REGISTRY_BASE_IMAGE
    myenv.python.conda_dependencies = conda_dep

    run_config = RunConfiguration()
    run_config.target = "template-trainc"
    run_config.environment = myenv

    # Pipeline parameters. They get their own local names: assigning to
    # `model_name` here would make it local to main() and the read of the
    # outer `model_name` for the default value would raise UnboundLocalError.
    model_name_param = PipelineParameter(name="model_name", default_value=model_name)
    release_id_param = PipelineParameter(name="release_id", default_value="0")
    step_arguments = [
        "--release_id",
        release_id_param,
        "--model_name",
        model_name_param,
    ]

    train_step = PythonScriptStep(
        name="Train Model",
        script_name=train_script_path,
        compute_target=run_config.target,
        source_directory=sources_directory_train,
        arguments=step_arguments,
        runconfig=run_config,
        allow_reuse=False,
    )
    print("Step Train created")

    evaluate_step = PythonScriptStep(
        name="Evaluate Model ",
        script_name=evaluate_script_path,
        compute_target=run_config.target,
        source_directory=sources_directory_train,
        arguments=step_arguments,
        runconfig=run_config,
        allow_reuse=False,
    )
    print("Step Evaluate created")

    evaluate_step.run_after(train_step)
    # Only the last step is listed; NOTE(review): this presumably relies on
    # the SDK pulling in train_step through the run_after dependency - confirm.
    steps = [evaluate_step]

    train_pipeline = Pipeline(workspace=aml_workspace, steps=steps)
    train_pipeline.validate()
    published_pipeline = train_pipeline.publish(
        name=pipeline_name, description="Model training/retraining pipeline", version=build_id
    )
    print(f"Published pipeline: {published_pipeline.name}")
    print(f"for build {published_pipeline.version}")
The error is produced by this call:
published_pipeline = train_pipeline.publish(
name=pipeline_name, description="Model training/retraining pipeline", version=build_id
)
We are running the process in a container whose image is stored in Azure Container Registry (ACR). The container image that we are using is the one from https://github.com/microsoft/MLOpsPython, with the following requirements:
adal==1.2.2
antlr4-python3-runtime==4.7.2
applicationinsights==0.11.9
argcomplete==1.10.0
asn1crypto==0.24.0
atomicwrites==1.3.0
attrs==19.1.0
azure-batch==7.0.0
azure-cli==2.0.71
azure-cli-command-modules-nspkg==2.0.3
azure-cli-core==2.0.71
azure-cli-nspkg==3.0.4
azure-cli-telemetry==1.0.3
azure-common==1.1.23
azure-cosmos==3.1.1
azure-datalake-store==0.0.47
azure-functions-devops-build==0.0.22
azure-graphrbac==0.60.0
azure-keyvault==1.1.0
azure-mgmt-advisor==2.0.1
azure-mgmt-appconfiguration==0.1.0
azure-mgmt-applicationinsights==0.1.1
azure-mgmt-authorization==0.52.0
azure-mgmt-batch==6.0.0
azure-mgmt-batchai==2.0.0
azure-mgmt-billing==0.2.0
azure-mgmt-botservice==0.2.0
azure-mgmt-cdn==3.1.0
azure-mgmt-cognitiveservices==5.0.0
azure-mgmt-compute==6.0.0
azure-mgmt-consumption==2.0.0
azure-mgmt-containerinstance==1.5.0
azure-mgmt-containerregistry==3.0.0rc5
azure-mgmt-containerservice==5.3.0
azure-mgmt-cosmosdb==0.7.0
azure-mgmt-datalake-analytics==0.2.1
azure-mgmt-datalake-nspkg==3.0.1
azure-mgmt-datalake-store==0.5.0
azure-mgmt-datamigration==0.1.0
azure-mgmt-deploymentmanager==0.1.0
azure-mgmt-devtestlabs==2.2.0
azure-mgmt-dns==2.1.0
azure-mgmt-eventgrid==2.2.0
azure-mgmt-eventhub==2.6.0
azure-mgmt-hdinsight==1.1.0
azure-mgmt-imagebuilder==0.2.1
azure-mgmt-iotcentral==1.0.0
azure-mgmt-iothub==0.8.2
azure-mgmt-iothubprovisioningservices==0.2.0
azure-mgmt-keyvault==1.1.0
azure-mgmt-kusto==0.3.0
azure-mgmt-loganalytics==0.2.0
azure-mgmt-managedservices==1.0.0
azure-mgmt-managementgroups==0.2.0
azure-mgmt-maps==0.1.0
azure-mgmt-marketplaceordering==0.2.1
azure-mgmt-media==1.1.1
azure-mgmt-monitor==0.5.2
azure-mgmt-msi==0.2.0
azure-mgmt-netapp==0.5.0
azure-mgmt-network==3.0.0
azure-mgmt-nspkg==3.0.2
azure-mgmt-policyinsights==0.3.1
azure-mgmt-privatedns==0.1.0
azure-mgmt-rdbms==1.9.0
azure-mgmt-recoveryservices==0.4.0
azure-mgmt-recoveryservicesbackup==0.4.0
azure-mgmt-redis==6.0.0
azure-mgmt-relay==0.1.0
azure-mgmt-reservations==0.3.1
azure-mgmt-resource==2.2.0
azure-mgmt-search==2.1.0
azure-mgmt-security==0.1.0
azure-mgmt-servicebus==0.6.0
azure-mgmt-servicefabric==0.2.0
azure-mgmt-signalr==0.1.1
azure-mgmt-sql==0.13.0
azure-mgmt-sqlvirtualmachine==0.4.0
azure-mgmt-storage==4.0.0
azure-mgmt-trafficmanager==0.51.0
azure-mgmt-web==0.42.0
azure-multiapi-storage==0.2.4
azure-nspkg==3.0.2
azure-storage-blob==1.5.0
azure-storage-common==1.4.2
azureml==0.2.7
azureml-core==1.0.62
azureml-dataprep==1.1.17
azureml-dataprep-native==13.0.3
azureml-pipeline==1.0.62
azureml-pipeline-core==1.0.62
azureml-pipeline-steps==1.0.62
azureml-sdk==1.0.62
azureml-telemetry==1.0.62
azureml-train==1.0.62
azureml-train-core==1.0.62
azureml-train-restclients-hyperdrive==1.0.62
backports.tempfile==1.0
backports.weakref==1.0.post1
bcrypt==3.1.7
certifi==2019.3.9
cffi==1.11.5
chardet==3.0.4
cloudpickle==1.2.2
colorama==0.4.1
conda==4.3.16
contextlib2==0.5.5
cryptography==2.4.2
distro==1.4.0
docker==4.0.2
dotnetcore2==2.1.8.1
entrypoints==0.3
fabric==2.5.0
flake8==3.7.8
flake8-formatter-junit-xml==0.0.6
fusepy==3.0.1
humanfriendly==4.18
idna==2.8
importlib-metadata==0.23
invoke==1.3.0
isodate==0.6.0
javaproperties==0.5.1
jeepney==0.4.1
Jinja2==2.10.1
jmespath==0.9.4
jsondiff==1.2.0
jsonpickle==1.2
junit-xml==1.8
knack==0.6.3
MarkupSafe==1.1.1
mccabe==0.6.1
mock==2.0.0
more-itertools==7.2.0
msrest==0.6.10
msrestazure==0.6.2
ndg-httpsclient==0.5.1
numpy==1.19.4
oauthlib==3.1.0
pandas==1.1.4
paramiko==2.6.0
pathspec==0.5.9
pbr==5.4.3
pip==18.1
pluggy==0.13.0
portalocker==1.5.1
psutil==5.6.3
py==1.8.0
pyasn1==0.4.7
pycodestyle==2.5.0
pycosat==0.6.3
pycparser==2.19
pydocumentdb==2.3.3
pyflakes==2.1.1
Pygments==2.4.2
PyJWT==1.7.1
PyNaCl==1.3.0
pyOpenSSL==18.0.0
PySocks==1.6.8
pytest==4.3.0
python-dateutil==2.8.0
python-dotenv==0.10.3
pytz==2019.1
PyYAML==5.1.2
requests==2.22.0
requests-oauthlib==1.2.0
ruamel.yaml==0.16.12
ruamel.yaml.clib==0.2.2
scp==0.13.2
SecretStorage==3.1.1
setuptools==40.6.3
six==1.12.0
sshtunnel==0.1.5
tabulate==0.8.3
urllib3==1.24.1
vsts==0.1.25
vsts-cd-manager==1.0.2
websocket-client==0.56.0
wheel==0.30.0
xmltodict==0.12.0
zipp==0.6.0
Error:
File "/usr/local/lib/python3.7/site-packages/azureml/pipeline/core/_aeva_provider.py", line 100, in __init__
self.datatype_provider.ensure_default_datatypes()
File "/usr/local/lib/python3.7/site-packages/azureml/pipeline/core/_aeva_provider.py", line 1512, in ensure_default_datatypes
ids = [datatype.id for datatype in self.get_all_datatypes()]
File "/usr/local/lib/python3.7/site-packages/azureml/pipeline/core/_aeva_provider.py", line 1448, in get_all_datatypes
entities = self._service_caller.get_all_datatypes_async()
File "/usr/local/lib/python3.7/site-packages/azureml/pipeline/core/_restclients/aeva/service_caller.py", line 499, in get_all_datatypes_async
workspace_name=self._workspace_name, custom_headers=self._get_custom_headers())
File "/usr/local/lib/python3.7/site-packages/azureml/pipeline/core/_restclients/aeva/aml_pipelines_api10.py", line 813, in api_v10_subscriptions_by_subscription_id_resource_groups_by_resource_group_name_providers_microsoft_machine_learning_services_workspaces_by_workspace_name_data_types_get
response = self._client.send(request, header_parameters, stream=False, **operation_config)
File "/usr/local/lib/python3.7/site-packages/msrest/service_client.py", line 336, in send
pipeline_response = self.config.pipeline.run(request, **kwargs)
File "/usr/local/lib/python3.7/site-packages/msrest/pipeline/__init__.py", line 197, in run
return first_node.send(pipeline_request, **kwargs) # type: ignore
File "/usr/local/lib/python3.7/site-packages/msrest/pipeline/__init__.py", line 150, in send
response = self.next.send(request, **kwargs)
File "/usr/local/lib/python3.7/site-packages/msrest/pipeline/requests.py", line 137, in send
return self.next.send(request, **kwargs)
File "/usr/local/lib/python3.7/site-packages/msrest/pipeline/__init__.py", line 150, in send
response = self.next.send(request, **kwargs)
File "/usr/local/lib/python3.7/site-packages/msrest/pipeline/requests.py", line 193, in send
self.driver.send(request.http_request, **kwargs)
File "/usr/local/lib/python3.7/site-packages/msrest/universal_http/requests.py", line 333, in send
return super(RequestsHTTPSender, self).send(request, **requests_kwargs)
File "/usr/local/lib/python3.7/site-packages/msrest/universal_http/requests.py", line 145, in send
raise_with_traceback(ClientRequestError, msg, err)
File "/usr/local/lib/python3.7/site-packages/msrest/exceptions.py", line 51, in raise_with_traceback
raise error.with_traceback(exc_traceback)
File "/usr/local/lib/python3.7/site-packages/msrest/universal_http/requests.py", line 142, in send
**kwargs)
File "/usr/local/lib/python3.7/site-packages/requests/sessions.py", line 530, in request
resp = self.send(prep, **send_kwargs)
File "/usr/local/lib/python3.7/site-packages/requests/sessions.py", line 643, in send
r = adapter.send(request, **kwargs)
File "/usr/local/lib/python3.7/site-packages/requests/adapters.py", line 507, in send
raise RetryError(e, request=request)
msrest.exceptions.ClientRequestError: Error occurred in request., RetryError: HTTPSConnectionPool(host='westus2.api.azureml.ms', port=443): Max retries exceeded with url: /api/v1.0/subscriptions/01c989d5-4dec-4881-a9df-193efdcc5582/resourceGroups/trpacml01-AML-RG/providers/Microsoft.MachineLearningServices/workspaces/trpacml01-AML-WS/DataTypes (Caused by ResponseError('too many 530 error responses'))
##[error]Bash exited with code '1'.
Is there any new kind of requirement? A proxy? Or is the service unavailable?
Thank you so much!