Closed
Description
🐛 Bug
`sym_size` support for dynamic shapes fails on `test_mp_sync_batch_norm`. The test fails with the changes made at this commit.
$ python test/test_mp_sync_batch_norm.py
2022-08-07 23:52:36.920806: W 1502404 tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda/lib64:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
2022-08-07 23:52:36.920866: W 1502404 tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)
Traceback (most recent call last):
File "test/test_mp_sync_batch_norm.py", line 146, in <module>
xmp.spawn(_mp_fn, args=())
File "/opt/conda/lib/python3.7/site-packages/torch_xla-1.13-py3.7-linux-x86_64.egg/torch_xla/distributed/xla_multiprocessing.py", line 383, in spawn
return _run_direct(fn, args, nprocs, join, daemon, start_method)
File "/opt/conda/lib/python3.7/site-packages/torch_xla-1.13-py3.7-linux-x86_64.egg/torch_xla/distributed/xla_multiprocessing.py", line 344, in _run_direct
fn(0, *args)
File "test/test_mp_sync_batch_norm.py", line 139, in _mp_fn
sync_bn1d_no_channel_test(index)
File "test/test_mp_sync_batch_norm.py", line 49, in sync_bn1d_no_channel_test
result = run_step(sbn_xla, t_xla)
File "test/test_mp_sync_batch_norm.py", line 20, in run_step
xm.optimizer_step(optimizer)
File "/opt/conda/lib/python3.7/site-packages/torch_xla-1.13-py3.7-linux-x86_64.egg/torch_xla/core/xla_model.py", line 988, in optimizer_step
loss = optimizer.step(**optimizer_args)
File "/opt/conda/lib/python3.7/site-packages/torch/optim/optimizer.py", line 127, in wrapper
return func(*args, **kwargs)
File "/opt/conda/lib/python3.7/site-packages/torch/optim/optimizer.py", line 23, in _use_grad
ret = func(self, *args, **kwargs)
File "/opt/conda/lib/python3.7/site-packages/torch/optim/sgd.py", line 159, in step
foreach=group['foreach'])
File "/opt/conda/lib/python3.7/site-packages/torch/optim/sgd.py", line 209, in sgd
maximize=maximize)
File "/opt/conda/lib/python3.7/site-packages/torch/optim/sgd.py", line 243, in _single_tensor_sgd
param.add_(d_p, alpha=-lr)
RuntimeError: /workspace/pytorch/xla/torch_xla/csrc/data_ops.cpp:116 : Check failed: input_sizes.size() <= output_sizes.size() (2 vs. 1)
*** Begin stack trace ***
tensorflow::CurrentStackTrace[abi:cxx11]()
torch_xla::BuildExpand(xla::XlaOp, absl::lts_20211102::Span<long const>)
torch_xla::InferOutputShape(absl::lts_20211102::Span<xla::Shape const>, std::function<xla::XlaOp (absl::lts_20211102::Span<xla::XlaOp const>)> const&)
torch_xla::XlaNode::GetOpShape(std::function<xla::Shape ()> const&) const
torch_xla::XlaNode::XlaNode(torch::lazy::OpKind, c10::ArrayRef<torch::lazy::Value>, std::function<xla::Shape ()> const&, unsigned long, torch::lazy::hash_t)
torch_xla::Expand::Expand(torch::lazy::Value const&, std::vector<long, std::allocator<long> >)
torch_xla::XLATensor::copy_(c10::intrusive_ptr<torch_xla::XLATensor, c10::detail::intrusive_target_default_null_type<torch_xla::XLATensor> >&, c10::intrusive_ptr<torch_xla::XLATensor, c10::detail::intrusive_target_default_null_type<torch_xla::XLATensor> >&)
torch_xla::XLANativeFunctions::_copy_from(at::Tensor const&, at::Tensor const&, bool)
at::_ops::_copy_from::call(at::Tensor const&, at::Tensor const&, bool)
at::_ops::add__Tensor::call(at::Tensor&, at::Tensor const&, c10::Scalar const&)
_PyMethodDef_RawFastCallKeywords
_PyEval_EvalFrameDefault
_PyEval_EvalCodeWithName
_PyFunction_FastCallKeywords
_PyEval_EvalFrameDefault
_PyEval_EvalCodeWithName
_PyFunction_FastCallKeywords
_PyEval_EvalFrameDefault
_PyEval_EvalCodeWithName
_PyFunction_FastCallDict
_PyEval_EvalFrameDefault
_PyEval_EvalCodeWithName
_PyFunction_FastCallDict
_PyEval_EvalFrameDefault
_PyEval_EvalCodeWithName
_PyObject_FastCallDict
PyObject_Call
_PyEval_EvalFrameDefault
_PyEval_EvalCodeWithName
_PyFunction_FastCallKeywords
_PyEval_EvalFrameDefault
_PyFunction_FastCallKeywords
_PyEval_EvalFrameDefault
_PyFunction_FastCallKeywords
_PyEval_EvalFrameDefault
_PyFunction_FastCallDict
_PyEval_EvalFrameDefault
_PyFunction_FastCallKeywords
_PyEval_EvalFrameDefault
_PyEval_EvalCodeWithName
_PyFunction_FastCallKeywords
_PyEval_EvalFrameDefault
_PyEval_EvalCodeWithName
PyEval_EvalCode
PyRun_FileExFlags
PyRun_SimpleFileExFlags
_Py_UnixMain
__libc_start_main
*** End stack trace ***