Skip to content
Open
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions third_party/nvfuser/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -361,6 +361,7 @@ if(BUILD_TEST)
list(APPEND JIT_TEST_SRCS ${NVFUSER_ROOT}/test/test_gpu_gather_ops.cpp)
list(APPEND JIT_TEST_SRCS ${NVFUSER_ROOT}/test/test_gpu_multidevice.cpp)
list(APPEND JIT_TEST_SRCS ${NVFUSER_ROOT}/test/test_multicluster_fusion.cpp)
list(APPEND JIT_TEST_SRCS ${NVFUSER_ROOT}/test/test_gpu_combined_inner_outer_reduction.cpp)

set(JIT_TEST_CU_SRCS)
list(APPEND JIT_TEST_CU_SRCS ${NVFUSER_ROOT}/test/test_gpu_rng.cu)
Expand Down
25 changes: 5 additions & 20 deletions third_party/nvfuser/csrc/executor_utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -967,26 +967,11 @@ c10::optional<int> getMaxRegCount(
// If the block size is known, set the maximum that at least allows
// one block to be resident on an SM
if (opt_block_size.has_value() && opt_block_size.value() > 0) {
int num_partition = 0;
int reg_allocation_granularity = 0;
const auto prop = at::cuda::getCurrentDeviceProperties();
cudaOccDeviceProp occ_prop(*prop);
cudaOccSubPartitionsPerMultiprocessor(&num_partition, &occ_prop);
cudaOccRegAllocationGranularity(&reg_allocation_granularity, &occ_prop);
int warp_size = prop->warpSize;
int num_warps = ceilDiv(opt_block_size.value(), warp_size);

// warps could be distributed unevenly across partition
int max_warps_per_sm_partition = ceilDiv(num_warps, num_partition);
// registers are evenly distributed across partitions, partition with most
// wraps determins the maximum register available per warp
int max_reg_per_warp =
prop->regsPerBlock / num_partition / max_warps_per_sm_partition;
// clamp down to register allocation granularity at warp level
int effective_max_reg_per_warp = max_reg_per_warp /
reg_allocation_granularity * reg_allocation_granularity;
max_register =
std::min(max_register_limit, effective_max_reg_per_warp / warp_size);
constexpr int block_per_sm = 1;
max_register = std::min(
max_register_limit,
(int)getRegPerThreadGivenThreadsPerSM(
opt_block_size.value() * block_per_sm));
}

// If a heuristic value is given, i.e., max_register_heuristic is
Expand Down
4 changes: 4 additions & 0 deletions third_party/nvfuser/csrc/ir_internal_nodes.h
Original file line number Diff line number Diff line change
Expand Up @@ -1486,6 +1486,10 @@ class TORCH_CUDA_CU_API IterDomain : public Val {
return getIterType() == IterType::Reduction;
}

//! Returns true if this IterDomain's iter type is IterType::Iteration,
//! i.e. a plain iteration domain (mirrors the isReduction() accessor above).
bool isIteration() const {
return getIterType() == IterType::Iteration;
}

bool isRFactorProduct() const {
return is_rfactor_domain_;
}
Expand Down
3 changes: 2 additions & 1 deletion third_party/nvfuser/csrc/kernel_cache.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -373,7 +373,8 @@ std::vector<at::Tensor> FusionKernelRuntime::runKernelWithInput(
}

auto& executor = executors_[group_id];
if (isDebugDumpEnabled(DebugDumpOption::PerfDebugVerbose)) {
if (isDebugDumpEnabled(DebugDumpOption::PerfDebugVerbose) ||
measure_kernel_time_) {
executor.setMeasureKernelTimeFlag(true);
}

Expand Down
5 changes: 5 additions & 0 deletions third_party/nvfuser/csrc/kernel_cache.h
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,10 @@ class TORCH_CUDA_CU_API FusionKernelRuntime {
profiling_ = to_profile;
}

//! Enable (default) or disable kernel-execution-time measurement for this
//! runtime; when set, executors used by runKernelWithInput have their
//! measure-kernel-time flag turned on.
void setMeasureKernelTime(bool val = true) {
measure_kernel_time_ = val;
}

//! Internal knob for profiling shape inference
void disableLaunchParamCache() {
for (auto& executor : executors_) {
Expand Down Expand Up @@ -190,6 +194,7 @@ class TORCH_CUDA_CU_API FusionKernelRuntime {

// States for profiling support
bool profiling_ = false;
bool measure_kernel_time_ = false;

std::mutex mutex_;
// TODO: remove `compiling_` mutex and rely on `mutex_` only.
Expand Down
20 changes: 20 additions & 0 deletions third_party/nvfuser/csrc/maxinfo_propagator.h
Original file line number Diff line number Diff line change
Expand Up @@ -273,4 +273,24 @@ class TORCH_CUDA_CU_API SetSelector : public MaxInfoSpanningTree::Selector {
}
};

// Simple selector to allow different parallel patterns in the fusion.
// The propagation is blocked at boundaryNodesSet.
// For P2C forward propagate, disable propagation to tensorViews in
// boundaryNodesSet. For C2P backward propagate, disable propagation from
// tensorViews in boundaryNodesSet
struct InternalBoundarySelector : public MaxInfoSpanningTree::Selector {
std::unordered_set<TensorView*> tvs_;
virtual bool allowC2P(TensorView* from, TensorView* to) override {
return tvs_.count(from) == 0;
};
virtual bool allowP2C(TensorView* from, TensorView* to) override {
return tvs_.count(to) == 0;
};
virtual bool allowSibling(TensorView* from, TensorView* to) override {
return true;
}
InternalBoundarySelector(const std::unordered_set<TensorView*>& tvs)
: tvs_(tvs) {}
};

} // namespace nvfuser
Loading