Hi. While using a free compute instance in Azure ML Studio with the preinstalled azureml_py38_PT_TF environment, my notebook training fails with a cuDNN error: CUDNN_STATUS_EXECUTION_FAILED.
From what I understand, the error comes from the PyTorch library, and to my knowledge it points to an incompatibility between PyTorch and cuDNN.
The same code runs on my home PC with an RTX 3060 without any trouble.
How could I fix this?
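For reference, this is how I check the framework and driver versions on the instance (a minimal sketch; I assume the exact CUDA/cuDNN build matters for this kind of error):

import torch

# Versions PyTorch was built against vs. the GPU the instance provides
print("torch:", torch.__version__)
print("CUDA (build):", torch.version.cuda)
print("cuDNN:", torch.backends.cudnn.version())
if torch.cuda.is_available():
    print("GPU:", torch.cuda.get_device_name(0))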
Thank you, Jan
The full error output is:
[INFO] training the network...
Epoch 1/10
/mnt/batch/tasks/shared/LS_root/mounts/clusters/escafree/code/Users/jan.kanka/ds_handle.py:175: UserWarning: Using padding='same' with even kernel lengths and odd dilation may require a zero-padded copy of the input be created (Triggered internally at /opt/conda/conda-bld/pytorch_1656352463056/work/aten/src/ATen/native/Convolution.cpp:882.)
outimg = F.conv2d(img[None].float(), kernel.float(), padding="same", groups=3)
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
Input In [5], in <cell line: 1>()
----> 1 escanet.train()
File /mnt/batch/tasks/shared/LS_root/mounts/clusters/escafree/code/Users/jan.kanka/classnet.py:310, in TrainModel.train(self)
308 x, y = x.to(self.device), y.to(self.device)
309 # perform a forward pass and calculate the training loss
--> 310 pred = self.model(x)
311 loss = self.lossBCE(
312 torch.squeeze(pred), torch.squeeze(y.float())
313 ) # + diceloss(torch.squeeze(pred), torch.squeeze(y))jaccard_loss(torch.squeeze(pred), torch.squeeze(y)) + FocLossFn(torch.squeeze(pred), torch.squeeze(y)) #+ lossBCE(torch.squeeze(pred), torch.squeeze(y))
314 pred_bin = ((pred.to("cpu")).detach().numpy() > 0.5) * 1
File /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages/torch/nn/modules/module.py:1130, in Module._call_impl(self, *input, **kwargs)
1126 # If we don't have any hooks, we want to skip the rest of the logic in
1127 # this function, and just call forward.
1128 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
1129 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1130 return forward_call(*input, **kwargs)
1131 # Do not call functions when jit is used
1132 full_backward_hooks, non_full_backward_hooks = [], []
File /mnt/batch/tasks/shared/LS_root/mounts/clusters/escafree/code/Users/jan.kanka/classnet.py:156, in EscaClass.forward(self, Input)
155 def forward(self, Input):
--> 156 x = self.ConvPart(Input)
157 Output = self.DenseTop(x)
159 return Output
File /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages/torch/nn/modules/module.py:1130, in Module._call_impl(self, *input, **kwargs)
1126 # If we don't have any hooks, we want to skip the rest of the logic in
1127 # this function, and just call forward.
1128 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
1129 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1130 return forward_call(*input, **kwargs)
1131 # Do not call functions when jit is used
1132 full_backward_hooks, non_full_backward_hooks = [], []
File /mnt/batch/tasks/shared/LS_root/mounts/clusters/escafree/code/Users/jan.kanka/classnet.py:114, in ConvPart.forward(self, x)
112 x = self.pool1(x)
113 x = self.conv_1(x) # 64 -> 64
--> 114 x = self.BatchNorm1(x)
115 x = self.Activation1(x)
116 x = self.pool2(x)
File /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages/torch/nn/modules/module.py:1130, in Module._call_impl(self, *input, **kwargs)
1126 # If we don't have any hooks, we want to skip the rest of the logic in
1127 # this function, and just call forward.
1128 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
1129 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1130 return forward_call(*input, **kwargs)
1131 # Do not call functions when jit is used
1132 full_backward_hooks, non_full_backward_hooks = [], []
File /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages/torch/nn/modules/batchnorm.py:168, in _BatchNorm.forward(self, input)
161 bn_training = (self.running_mean is None) and (self.running_var is None)
163 r"""
164 Buffers are only updated if they are to be tracked and we are in training mode. Thus they only need to be
165 passed when the update should occur (i.e. in training mode when they are tracked), or when buffer stats are
166 used for normalization (i.e. in eval mode when buffers are not None).
167 """
--> 168 return F.batch_norm(
169 input,
170 # If buffers are not to be tracked, ensure that they won't be updated
171 self.running_mean
172 if not self.training or self.track_running_stats
173 else None,
174 self.running_var if not self.training or self.track_running_stats else None,
175 self.weight,
176 self.bias,
177 bn_training,
178 exponential_average_factor,
179 self.eps,
180 )
File /anaconda/envs/azureml_py38_PT_TF/lib/python3.8/site-packages/torch/nn/functional.py:2438, in batch_norm(input, running_mean, running_var, weight, bias, training, momentum, eps)
2435 if training:
2436 _verify_batch_size(input.size())
-> 2438 return torch.batch_norm(
2439 input, weight, bias, running_mean, running_var, training, momentum, eps, torch.backends.cudnn.enabled
2440 )
RuntimeError: cuDNN error: CUDNN_STATUS_EXECUTION_FAILED
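One diagnostic I have not tried yet is turning cuDNN off for a run; if training then completes, that would point at the cuDNN build on the instance rather than at my model (a minimal sketch, to be executed before the model is created):

import torch

# Fall back to PyTorch's native CUDA kernels instead of cuDNN.
# Slower, but isolates whether cuDNN itself is failing.
torch.backends.cudnn.enabled = False

I could also rerun with CUDA_LAUNCH_BLOCKING=1 so that kernels launch synchronously and the traceback points at the operation that actually fails, e.g.:

import os
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"  # must be set before any CUDA work happens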