diff --git a/src/xccl/Register.cpp b/src/xccl/Register.cpp
index 5e2f54baa..4f955b050 100644
--- a/src/xccl/Register.cpp
+++ b/src/xccl/Register.cpp
@@ -2,8 +2,13 @@
 #include <...>
 #include <...>
 #include <...>
+#include <torch/csrc/autograd/profiler.h>
 #include <...>
 
+static std::vector<std::string> TORCH_XCCL_DISABLE_ALLREDUCE_PROFILER = {
+    "TORCH_XCCL_DISABLE_ALLREDUCE_PROFILER",
+    "XCCL_DISABLE_ALLREDUCE_PROFILER"};
+
 namespace c10d {
 namespace ops {
 namespace {
@@ -84,6 +89,11 @@ std::tuple<std::vector<at::Tensor>, c10::intrusive_ptr<Work>> allreduce_XPU(
     bool asyncOp,
     int64_t timeout) {
   auto tensor_vec = tensors.vec();
+  // Due to a bug in PTI, profiling must be skipped for allreduce across multiple nodes.
+  // TODO: Remove this workaround once PTI is fixed.
+  bool disableAllreduceProfiler = getCvarBool(TORCH_XCCL_DISABLE_ALLREDUCE_PROFILER) && torch::autograd::profiler::profilerEnabled();
+  if (disableAllreduceProfiler)
+    torch::autograd::profiler::toggleCollectionDynamic(false, {torch::autograd::profiler::ActivityType::XPU});
   auto work = process_group->getBackend(c10::DeviceType::XPU)
                   ->allreduce(
                       tensor_vec,
@@ -91,6 +101,8 @@ std::tuple<std::vector<at::Tensor>, c10::intrusive_ptr<Work>> allreduce_XPU(
                       *reduce_op.get(),
                       std::chrono::milliseconds(timeout),
                       asyncOp});
+  if (disableAllreduceProfiler)
+    torch::autograd::profiler::toggleCollectionDynamic(true, {torch::autograd::profiler::ActivityType::XPU});
   return std::tuple<std::vector<at::Tensor>, c10::intrusive_ptr<Work>>(
       std::move(tensor_vec), work);
 }
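
The diff pauses XPU activity collection with `toggleCollectionDynamic(false, ...)` before the allreduce and re-enables it afterwards, gated by the `TORCH_XCCL_DISABLE_ALLREDUCE_PROFILER` / `XCCL_DISABLE_ALLREDUCE_PROFILER` environment variables. One thing the paired if-statements do not cover is an exception thrown by `allreduce()`: in that case the second toggle never runs and XPU profiling stays disabled. Below is a minimal, hypothetical RAII sketch of the same pattern, assuming only the `torch::autograd::profiler::toggleCollectionDynamic` API and `ActivityType::XPU` already used in the diff; the `XpuProfilerPauseGuard` name is illustrative, not part of this PR.

```cpp
// Hypothetical RAII guard for the pause/resume pattern in this diff.
// If the guarded collective throws, the destructor still re-enables
// XPU activity collection, unlike the paired if-statements above.
#include <torch/csrc/autograd/profiler.h>

namespace {

class XpuProfilerPauseGuard {
 public:
  // `active` would typically be the same condition as
  // `disableAllreduceProfiler` in the diff above.
  explicit XpuProfilerPauseGuard(bool active) : active_(active) {
    if (active_) {
      torch::autograd::profiler::toggleCollectionDynamic(
          false, {torch::autograd::profiler::ActivityType::XPU});
    }
  }

  ~XpuProfilerPauseGuard() {
    // Runs on both normal exit and stack unwinding.
    if (active_) {
      torch::autograd::profiler::toggleCollectionDynamic(
          true, {torch::autograd::profiler::ActivityType::XPU});
    }
  }

  XpuProfilerPauseGuard(const XpuProfilerPauseGuard&) = delete;
  XpuProfilerPauseGuard& operator=(const XpuProfilerPauseGuard&) = delete;

 private:
  bool active_;
};

} // namespace
```

With such a guard, the body of `allreduce_XPU` would declare `XpuProfilerPauseGuard guard(disableAllreduceProfiler);` just before the `->allreduce(...)` call and drop both explicit toggle sites; scope exit then guarantees profiling is restored. To activate the workaround at runtime, a user would set `TORCH_XCCL_DISABLE_ALLREDUCE_PROFILER=1` (or the `XCCL_`-prefixed variant) before launching the job while the profiler is enabled.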