In the conda environment below,
channels:
- anaconda
- defaults
dependencies:
- argon2-cffi=20.1.0=py37h27cfd23_1
- async_generator=1.10=py37h28b3542_0
- attrs=20.2.0=py_0
- backcall=0.2.0=pyhd3eb1b0_0
- blas=1.0=mkl
- bleach=3.3.1=pyhd3eb1b0_0
- ca-certificates=2021.7.5=h06a4308_1
- certifi=2021.5.30=py37h06a4308_0
- cffi=1.14.6=py37h400218f_0
- cycler=0.10.0=py37_0
- dbus=1.13.18=hb2f20db_0
- defusedxml=0.7.1=pyhd3eb1b0_0
- entrypoints=0.3=py37_0
- expat=2.3.0=h2531618_2
- fontconfig=2.13.1=h6c09931_0
- freetype=2.10.4=h5ab3b9f_0
- glib=2.68.1=h36276a3_0
- gst-plugins-base=1.14.0=h8213a91_2
- gstreamer=1.14.0=h28cd5cc_2
- icu=58.2=he6710b0_3
- importlib_metadata=3.10.0=hd3eb1b0_0
- intel-openmp=2021.2.0=h06a4308_610
- ipykernel=5.3.4=py37h5ca1d4c_0
- ipython_genutils=0.2.0=pyhd3eb1b0_1
- jpeg=9b=h024ee3a_2
- jsonschema=3.2.0=py_2
- jupyter_client=6.1.12=pyhd3eb1b0_0
- jupyter_core=4.7.1=py37h06a4308_0
- jupyterlab_pygments=0.1.2=py_0
- kiwisolver=1.3.1=py37h2531618_0
- lcms2=2.12=h3be6417_0
- ld_impl_linux-64=2.33.1=h53a641e_7
- libedit=3.1.20191231=h14c3975_1
- libffi=3.3=he6710b0_2
- libgcc-ng=9.1.0=hdf63c60_0
- libpng=1.6.37=hbc83047_0
- libsodium=1.0.18=h7b6447c_0
- libstdcxx-ng=9.1.0=hdf63c60_0
- libtiff=4.1.0=h2733197_1
- libuuid=1.0.3=h1bed415_2
- libxcb=1.14=h7b6447c_0
- libxml2=2.9.10=hb55368b_3
- lz4-c=1.9.3=h2531618_0
- markupsafe=1.1.1=py37h14c3975_1
- matplotlib=3.3.4=py37h06a4308_0
- matplotlib-base=3.3.4=py37h62a2d02_0
- mistune=0.8.4=py37h14c3975_1001
- mkl=2021.2.0=h06a4308_296
- mkl-service=2.3.0=py37h27cfd23_1
- mkl_fft=1.3.0=py37h42c9631_2
- mkl_random=1.2.1=py37ha9443f7_2
- nbclient=0.5.3=pyhd3eb1b0_0
- nbconvert=6.1.0=py37h06a4308_0
- nbformat=5.1.3=pyhd3eb1b0_0
- ncurses=6.2=he6710b0_1
- nest-asyncio=1.5.1=pyhd3eb1b0_0
- notebook=6.4.0=py37h06a4308_0
- olefile=0.46=py37_0
- openjpeg=2.3.0=h05c96fa_1
- openssl=1.1.1k=h27cfd23_0
- pandocfilters=1.4.3=py37h06a4308_1
- parso=0.8.2=pyhd3eb1b0_0
- pcre=8.44=he6710b0_0
- pickleshare=0.7.5=pyhd3eb1b0_1003
- pip=20.2.4=py37_0
- prometheus_client=0.11.0=pyhd3eb1b0_0
- ptyprocess=0.7.0=pyhd3eb1b0_2
- pycparser=2.20=py_2
- pyparsing=2.4.7=pyhd3eb1b0_0
- pyqt=5.9.2=py37h05f1152_2
- pyrsistent=0.17.3=py37h7b6447c_0
- python=3.7.9=h7579374_0
- python-dateutil=2.8.1=pyhd3eb1b0_0
- qt=5.9.7=h5867ecd_1
- readline=8.0=h7b6447c_0
- send2trash=1.5.0=pyhd3eb1b0_1
- setuptools=50.3.0=py37hb0f4dca_1
- sip=4.19.8=py37hf484d3e_0
- six=1.15.0=py37h06a4308_0
- sqlite=3.33.0=h62c20be_0
- terminado=0.9.4=py37h06a4308_0
- testpath=0.5.0=pyhd3eb1b0_0
- tk=8.6.10=hbc83047_0
- tornado=6.0.4=py37h7b6447c_1
- traitlets=5.0.5=pyhd3eb1b0_0
- wcwidth=0.2.5=py_0
- webencodings=0.5.1=py37_1
- wheel=0.35.1=py_0
- xz=5.2.5=h7b6447c_0
- zeromq=4.3.4=h2531618_0
- zlib=1.2.11=h7b6447c_3
- zstd=1.4.9=haebb681_0
- pip:
  - absl-py==0.12.0
  - adal==1.2.7
  - alabaster==0.7.12
  - antlr4-python3-runtime==4.8
  - azure-common==1.1.27
  - azure-core==1.16.0
  - azure-graphrbac==0.61.1
  - azure-mgmt-authorization==0.61.0
  - azure-mgmt-containerregistry==8.0.0
  - azure-mgmt-core==1.3.0
  - azure-mgmt-keyvault==9.0.0
  - azure-mgmt-resource==13.0.0
  - azure-mgmt-storage==11.2.0
  - azureml-core==1.32.0
  - babel==2.9.0
  - backports-tempfile==1.0
  - backports-weakref==1.0.post1
  - boto3==1.9.246
  - botocore==1.12.246
  - cachetools==4.2.2
  - chardet==4.0.0
  - coloredlogs==14.0
  - contextlib2==0.6.0.post1
  - cryptography==3.4.7
  - datasets==1.4.1
  - decorator==5.0.7
  - dill==0.3.3
  - docformatter==1.3
  - docker==4.4.4
  - docutils==0.15.2
  - emoji==0.5.4
  - filelock==3.0.12
  - flake8==3.7.8
  - flake8-bugbear==19.8.0
  - fsspec==2021.4.0
  - fvcore==0.1.1.post20200716
  - gitdb2==2.0.5
  - gitpython==3.0.3
  - google-auth==1.30.0
  - google-auth-oauthlib==0.4.4
  - grpcio==1.37.0
  - huggingface-hub==0.0.2
  - humanfriendly==9.1
  - hydra-core==1.0.6
  - idna==2.10
  - imagesize==1.2.0
  - importlib-metadata==4.0.1
  - importlib-resources==5.1.2
  - ipython==7.19.0
  - isodate==0.6.0
  - jedi==0.18.0
  - jeepney==0.7.0
  - jinja2==2.11.3
  - jmespath==0.10.0
  - joblib==0.14.1
  - jsonlines==1.2.0
  - jsonpickle==2.0.0
  - markdown==3.3.4
  - markdown-it-py==0.5.8
  - mccabe==0.6.1
  - more-itertools==8.7.0
  - msrest==0.6.21
  - msrestazure==0.6.4
  - multiprocess==0.70.11.1
  - myst-parser==0.12.10
  - ndg-httpsclient==0.5.1
  - nltk==3.4.5
  - numpy==1.17.5
  - oauthlib==3.1.0
  - omegaconf==2.0.6
  - packaging==20.9
  - pandas==1.1.1
  - pathspec==0.8.1
  - pexpect==4.7.0
  - pillow==8.1.1
  - pluggy==0.13.1
  - portalocker==2.3.0
  - prompt-toolkit==3.0.18
  - protobuf==3.15.8
  - py==1.10.0
  - py-gfm==1.0.2
  - py-rouge==1.1
  - pyarrow==4.0.0
  - pyasn1==0.4.8
  - pyasn1-modules==0.2.8
  - pycodestyle==2.5.0
  - pyflakes==2.1.1
  - pygments==2.8.1
  - pyjwt==2.1.0
  - pyopenssl==20.0.1
  - pytest==5.3.2
  - pytest-datadir==1.3.1
  - pytest-regressions==2.1.1
  - pytz==2021.1
  - pyyaml==5.4
  - pyzmq==18.1.0
  - regex==2020.1.8
  - requests==2.25.1
  - requests-mock==1.7.0
  - requests-oauthlib==1.3.0
  - rsa==4.7.2
  - ruamel-yaml==0.17.4
  - ruamel-yaml-clib==0.2.6
  - s3transfer==0.2.1
  - scikit-learn==0.23.1
  - scipy==1.4.1
  - secretstorage==3.3.1
  - sh==1.12.14
  - smmap==4.0.0
  - smmap2==3.0.1
  - snowballstemmer==2.1.0
  - sphinx==2.2.2
  - sphinx-autodoc-typehints==1.10.3
  - sphinx-rtd-theme==0.4.3
  - sphinxcontrib-applehelp==1.0.2
  - sphinxcontrib-devhelp==1.0.2
  - sphinxcontrib-htmlhelp==1.0.3
  - sphinxcontrib-jsmath==1.0.1
  - sphinxcontrib-qthelp==1.0.3
  - sphinxcontrib-serializinghtml==1.1.4
  - subword-nmt==0.3.7
  - tabulate==0.8.9
  - tensorboard==2.3.0
  - tensorboard-plugin-wit==1.8.0
  - tensorboardx==2.1
  - termcolor==1.1.0
  - threadpoolctl==2.1.0
  - tokenizers==0.10.2
  - torch==1.8.1
  - torchtext==0.9.1
  - tqdm==4.36.1
  - typing-extensions==3.7.4.1
  - unidecode==1.1.1
  - untokenize==0.1.1
  - urllib3==1.25.11
  - websocket-client==0.56.0
  - websocket-server==0.4
  - werkzeug==1.0.1
  - xxhash==2.0.2
  - yacs==0.1.8
  - zipp==3.4.1
I ran `conda install pytorch torchvision torchaudio cudatoolkit=10.2 -c pytorch` and exported the environment to a yml file.
Then, to submit a job to the compute cluster, I wrote the code below:
# A100 version: submit the training script to an Azure ML compute cluster.
cluster_name = 'high-A100'
gpu_name = 'Standard_ND96asr_v4'
experiment_name = 'speaker_identification_training_A100'
# NOTE(review): `hyperparameters` is defined but never passed to the run
# below (the command string is fixed) — confirm whether main.py should
# receive these arguments.
hyperparameters = [
    '--max_train_time', '172800'
]
script_folder = './script_folder'

# workspace
ws = Workspace.from_config()
print(ws.name, ws.location, ws.resource_group, sep='\t')

# compute cluster
# Each setting can be overridden through an environment variable; the
# values defined above are the fallbacks.
compute_name = os.environ.get("AML_COMPUTE_CLUSTER_NAME", cluster_name)
# os.environ.get returns a str whenever the variable is set, so the node
# counts are converted to int to keep their type consistent with the
# integer defaults.
compute_min_nodes = int(os.environ.get("AML_COMPUTE_CLUSTER_MIN_NODES", 0))
compute_max_nodes = int(os.environ.get("AML_COMPUTE_CLUSTER_MAX_NODES", 4))
vm_size = os.environ.get("AML_COMPUTE_CLUSTER_SKU", gpu_name)

if compute_name in ws.compute_targets:
    # Reuse the target that is already registered under this name.
    compute_target = ws.compute_targets[compute_name]
    if compute_target and type(compute_target) is AmlCompute:
        print('found compute target. just use it. ' + compute_name)
else:
    # The `else` belongs to the outer `if`: a new cluster is provisioned
    # only when no target with this name exists in the workspace.
    print('creating a new compute target...')
    provisioning_config = AmlCompute.provisioning_configuration(
        vm_size=vm_size,
        min_nodes=compute_min_nodes,
        max_nodes=compute_max_nodes)
    compute_target = ComputeTarget.create(
        ws, compute_name, provisioning_config)

# Environment definition is loaded from the local ./.azureml6/ directory.
env = Environment.load_from_directory(path="./.azureml6/")
exp = Experiment(workspace=ws, name=experiment_name)
# Shell command executed on the compute node: print the working directory,
# install azure-storage-blob, then start training.
command = "pwd && pip install azure-storage-blob && python main.py"

# run
src = ScriptRunConfig(source_directory=script_folder,
                      command=command,
                      compute_target=compute_target,
                      environment=env)
run = exp.submit(config=src)
I then found that, in order to use the A100, the PyTorch version should be 1.8.1+cu111. But when I ran `conda install pytorch==1.8.1 torchvision==0.9.0 torchaudio==0.8.0 cudatoolkit=11.1 -c pytorch -c conda-forge`, I got the error below:
Collecting package metadata (current_repodata.json): done
Solving environment: failed with initial frozen solve. Retrying with flexible solve.
Solving environment: failed with repodata from current_repodata.json, will retry with next repodata source.
Collecting package metadata (repodata.json): done
Solving environment: failed with initial frozen solve. Retrying with flexible solve.
Solving environment: |
Found conflicts! Looking for incompatible packages.
This can take several minutes. Press CTRL-C to abort.
failed

UnsatisfiableError: The following specifications were found
to be incompatible with the existing python installation in your environment:

Specifications:
- pytorch==1.8.1 -> python[version='2.7.|3.5.|3.6.|3.6.12|3.6.12|3.7.10|3.7.10|>=2.7,<2.8.0a0|>=3.5,<3.6.0a0|>=3.5|>=3.7|>=3.6,<3.7|3.7.9|3.6.9|3.6.9|3.6.9|3.6.9|3.4.',build='1_73_pypy|2_73_pypy|3_73_pypy|4_73_pypy|1_73_pypy|0_73_pypy|5_73_pypy|5_73_pypy|0_73_pypy']
- torchaudio==0.8.0 -> python[version='2.7.|3.5.|3.6.|>=2.7,<2.8.0a0|>=3.5,<3.6.0a0|3.4.|3.9.*']
Your python: python==3.7.9=h7579374_0
If python is on the left-most side of the chain, that's the version you've asked for.
When python appears to the right, that indicates that the thing on the left is somehow
not available for the python version you are constrained to. Note that conda will not
change your python version to a different minor version unless you explicitly specify
that.

The following specifications were found to be incompatible with each other:
Output in format: Requested package -> Available versions
Package cudnn conflicts for:
torchvision==0.9.0 -> pytorch[version='>=1.8.0',build=cuda*] -> cudnn[version='>=8.2.1.32,<9.0a0']
torchvision==0.9.0 -> cudnn[version='>=7.6.5.32,<8.0a0|>=8.1.0.77,<9.0a0']

Package cudatoolkit conflicts for:
torchvision==0.9.0 -> cudatoolkit[version='10.2|10.2.|11.0|11.0.|11.1|11.1.|>=10.1,<10.2|>=10.2,<10.3|>=11.1,<11.2|11.2|11.2.']
torchaudio==0.8.0 -> pytorch==1.8.0 -> cudatoolkit[version='10.2|10.2.|11.0|11.0.|11.1|11.1.|11.2|11.2.|>=10.1,<10.2|>=11.1,<11.2|>=10.2,<10.3']
torchvision==0.9.0 -> cudnn[version='>=8.1.0.77,<9.0a0'] -> cudatoolkit[version='10.0|10.0.|10.1|10.1.|10.2.|11.|>=11.3,<11.4|9.2|9.2.*']
pytorch==1.8.1 -> cudatoolkit[version='>=10.1,<10.2|>=11.1,<11.2|>=10.2,<10.3']

Package libstdcxx-ng conflicts for:
python==3.7.9=h7579374_0 -> libffi[version='>=3.3,<3.4.0a0'] -> libstdcxx-ng[version='>=7.3.0|>=7.5.0']
torchaudio==0.8.0 -> numpy[version='>=1.11'] -> libstdcxx-ng[version='>=4.9|>=7.3.0|>=9.3.0|>=7.5.0|>=7.2.0']
torchvision==0.9.0 -> libstdcxx-ng[version='>=7.5.0']
torchvision==0.9.0 -> cudatoolkit[version='>=11.1,<11.2'] -> libstdcxx-ng[version='>=3.4|>=4.9|>=7.3.0|>=9.3.0|>=7.2.0']
pytorch==1.8.1 -> cudatoolkit[version='>=11.1,<11.2'] -> libstdcxx-ng[version='>=4.9|>=7.3.0|>=9.3.0|>=7.2.0']
pytorch==1.8.1 -> libstdcxx-ng[version='>=7.5.0']
cudatoolkit=11.1 -> libstdcxx-ng[version='>=9.3.0']

Package libgcc-ng conflicts for:
python==3.7.9=h7579374_0 -> libgcc-ng[version='>=7.3.0']
python==3.7.9=h7579374_0 -> libffi[version='>=3.3,<3.4.0a0'] -> libgcc-ng[version='>=4.9|>=7.5.0|>=9.4.0|>=9.3.0|>=7.2.0']

Package _libgcc_mutex conflicts for:
python==3.7.9=h7579374_0 -> libgcc-ng[version='>=7.3.0'] -> _libgcc_mutex[version='|0.1|0.1',build='main|main|conda_forge']
cudatoolkit=11.1 -> libgcc-ng[version='>=9.3.0'] -> _libgcc_mutex[version='|0.1',build='main|main|conda_forge']
torchvision==0.9.0 -> libgcc-ng[version='>=7.5.0'] -> _libgcc_mutex[version='|0.1|0.1',build='main|main|conda_forge']
pytorch==1.8.1 -> _openmp_mutex -> _libgcc_mutex[version='|0.1',build='main|main|conda_forge']

Package pytorch conflicts for:
torchvision==0.9.0 -> pytorch[version='1.8.0|>=1.8.0|>=1.8.0',build='cuda*|cpu*']
torchaudio==0.8.0 -> pytorch==1.8.0

Package nccl conflicts for:
torchvision==0.9.0 -> pytorch==1.8.0 -> nccl[version='>=2.10.3.1,<3.0a0|>=2.7.8.1,<3.0a0|>=2.8.4.1,<3.0a0']
torchaudio==0.8.0 -> pytorch==1.8.0 -> nccl[version='>=2.7.8.1,<3.0a0|>=2.8.4.1,<3.0a0']

Package typing-extensions conflicts for:
pytorch==1.8.1 -> typing-extensions
torchvision==0.9.0 -> pytorch[version='>=1.8.0',build=cpu*] -> typing-extensions

The following specifications were found to be incompatible with your system:
- feature:/linux-64::__glibc==2.27=0
- feature:|@/linux-64::__glibc==2.27=0
- cudatoolkit=11.1 -> __glibc[version='>=2.17,<3.0.a0']
- cudatoolkit=11.1 -> libgcc-ng[version='>=9.3.0'] -> __glibc[version='>=2.17']
- pytorch==1.8.1 -> cudatoolkit[version='>=11.1,<11.2'] -> __glibc[version='>=2.17|>=2.17,<3.0.a0']
- torchaudio==0.8.0 -> pytorch==1.8.0 -> __glibc[version='>=2.17|>=2.17,<3.0.a0']
- torchvision==0.9.0 -> __glibc[version='>=2.17|>=2.17,<3.0.a0']
Your installed version is: 2.27
Can I solve this problem by adjusting the environment, or should I give up on using the A100?
Thank you so much.