Skip to content

Commit 7b142b3

Browse files
jacobhinkle and naoyam
authored
Clip slice range expressions (#460)
This PR normalizes the inputs to `slice` in order to mimic the semantics of numpy/PyTorch slicing. For an axis with extent `ext`, if we receive a slice of `(start, stop, step)` we normalize it to `(norm_start, norm_stop, step)` where ``` norm_start = max(0, start < 0 ? start + ext : start); norm_stop = max(norm_start, min(ext, stop < 0 ? stop + ext : stop)); ``` Specific changes in this PR: - Form the above expressions in the `slice` op. - Add shmoo tests that test various scenarios with constant and input size slices. The simple Fusion in the input range test prints like this: ``` Inputs: T0_g[ iS0{9} ], float i3, nvfuser_index_t i4, nvfuser_index_t Outputs: T1_g[ ?S2{( ( ( -( fmax(0, ( where(( i3 < 0 ), ( i3 + 9 ), i3) )) ) ) + 9 ) + ( ( fmax(( fmax(0, ( where(( i3 < 0 ), ( i3 + 9 ), i3) )) ), ( fmin(9, ( where(( i4 < 0 ), ( i4 + 9 ), i4) )) )) ) - 9 ) )}rf ], float %kernel_math { b7 = i3 < 0; i5 = i3 + 9; i9 = where(b7, i5, i3); i11 = fmax(0, i9); b15 = i4 < 0; i13 = i4 + 9; i17 = where(b15, i13, i4); i19 = fmin(9, i17); i21 = fmax(i11, i19); T1_g[ ?S2{( ( ( -( fmax(0, ( where(( i3 < 0 ), ( i3 + 9 ), i3) )) ) ) + 9 ) + ( ( fmax(( fmax(0, ( where(( i3 < 0 ), ( i3 + 9 ), i3) )) ), ( fmin(9, ( where(( i4 < 0 ), ( i4 + 9 ), i4) )) )) ) - 9 ) )}rf ] = slice( T0_g[ iS0{9} ], { {i11, i21, 1} } ) } T0_g[ iS0{9} ] root domain : (iS0{9}) contiguity: f leaf domain : (iS0{9}) T1_g[ ?S2{( ( ( -( fmax(0, ( where(( i3 < 0 ), ( i3 + 9 ), i3) )) ) ) + 9 ) + ( ( fmax(( fmax(0, ( where(( i3 < 0 ), ( i3 + 9 ), i3) )) ), ( fmin(9, ( where(( i4 < 0 ), ( i4 + 9 ), i4) )) )) ) - 9 ) )}rf ] root domain : (iS1{9}rf) Resize: iS1{9}rf by ( -( fmax(0, ( where(( i3 < 0 ), ( i3 + 9 ), i3) )) ) ) and ( ( fmax(( fmax(0, ( where(( i3 < 0 ), ( i3 + 9 ), i3) )) ), ( fmin(9, ( where(( i4 < 0 ), ( i4 + 9 ), i4) )) )) ) - 9 ) -> ?S2{( ( ( -( fmax(0, ( where(( i3 < 0 ), ( i3 + 9 ), i3) )) ) ) + 9 ) + ( ( fmax(( fmax(0, ( where(( i3 < 0 ), ( i3 + 9 ), i3) )) ), ( fmin(9, ( where(( i4 < 0 ), ( i4 + 9 ), 
i4) )) )) ) - 9 ) )}rf rfactor domain : (?S2{( ( ( -( fmax(0, ( where(( i3 < 0 ), ( i3 + 9 ), i3) )) ) ) + 9 ) + ( ( fmax(( fmax(0, ( where(( i3 < 0 ), ( i3 + 9 ), i3) )) ), ( fmin(9, ( where(( i4 < 0 ), ( i4 + 9 ), i4) )) )) ) - 9 ) )}rf) contiguity: t leaf domain : (?S2{( ( ( -( fmax(0, ( where(( i3 < 0 ), ( i3 + 9 ), i3) )) ) ) + 9 ) + ( ( fmax(( fmax(0, ( where(( i3 < 0 ), ( i3 + 9 ), i3) )) ), ( fmin(9, ( where(( i4 < 0 ), ( i4 + 9 ), i4) )) )) ) - 9 ) )}rf) ``` resulting in the following CUDA kernel: ```c++ __global__ void kernel1(Tensor<float, 1, 1> T0, nvfuser_index_t i0, nvfuser_index_t i1, Tensor<float, 1, 1> T1) { nvfuser_index_t i2; i2 = i0 + 9; bool b3; b3 = i0 < 0; nvfuser_index_t i4; i4 = b3 ? i2 : i0; nvfuser_index_t i5; i5 = max(0, i4); nvfuser_index_t i6; i6 = i1 + 9; bool b7; b7 = i1 < 0; nvfuser_index_t i8; i8 = b7 ? i6 : i1; nvfuser_index_t i9; i9 = min(9, i8); nvfuser_index_t i10; i10 = max(i5, i9); nvfuser_index_t i11; i11 = (-i5) + i10; nvfuser_index_t i12; i12 = i5 * T0.alloc_stride[0]; #pragma unroll 1 for(nvfuser_index_t i13 = 0; i13 < i11; ++i13) { T1[i13] = T0[(i12 + (T0.alloc_stride[0] * i13))]; } } ``` This PR does NOT simplify these expressions for non-constant inputs. This can be done at concretization, which will be left for a follow-up PR. Stacked on #892 and #895. Fixes #439. Fixes #52. --------- Co-authored-by: Naoya Maruyama <[email protected]>
1 parent 2dcfef6 commit 7b142b3

File tree

5 files changed

+174
-26
lines changed

5 files changed

+174
-26
lines changed

csrc/dynamic_transform.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -316,7 +316,7 @@ void DynamicTransformConcretizationInfo::analyzeResizes(
316316
out_id->toString());
317317
auto extent_int = extent_val.as<int64_t>();
318318
NVF_ERROR(
319-
extent_int > 0,
319+
extent_int >= 0,
320320
"Invalid resized domain extent ",
321321
extent_int,
322322
" for domain ",

csrc/kernel_cache.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -897,8 +897,8 @@ FusionKernelRuntime::FusionKernelRuntime(
897897
fusion.get());
898898

899899
if (isDebugDumpEnabled(DebugDumpOption::FusionIrPreseg)) {
900-
std::cout << "Fusion IR after pre-segmenter optimization passes:"
901-
<< std::endl;
900+
debug() << "Fusion IR after pre-segmenter optimization passes:"
901+
<< std::endl;
902902
fusion->printMath();
903903
}
904904

csrc/ops/alias.cpp

Lines changed: 37 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -690,9 +690,6 @@ TensorView* cat(
690690
return out;
691691
}
692692

693-
// Currently there's no error check about the actual values of the
694-
// Slice parameters. For example, the start parameter of a range of a
695-
// domain is assumed to be >= 0 and < the extent of the domain.
696693
TensorView* slice(TensorView* inp, const std::vector<Slice>& ranges) {
697694
const auto inp_dom = TensorDomain::noReductions(inp->getMaybeRFactorDomain());
698695
const int ndims = static_cast<int>(inp_dom.size());
@@ -704,36 +701,58 @@ TensorView* slice(TensorView* inp, const std::vector<Slice>& ranges) {
704701
", Expected: ",
705702
ndims);
706703

707-
auto normalize_slice_range = [](Slice range, Val* extent) -> Slice {
704+
const auto normalize_slice_range = [](Slice range, Val* extent) -> Slice {
705+
auto cast_extent =
706+
SimplifyingIrBuilder::maybeCastExpr(DataType::Index, extent);
707+
708+
auto zero = FusionGuard::getCurFusion()->zeroVal(DataType::Index);
709+
710+
// norm_start = max(0, start < 0 ? start + extent : start)
708711
if (range.start == nullptr) {
709-
range.start = FusionGuard::getCurFusion()->zeroVal();
710-
}
711-
if (range.stop == nullptr) {
712-
range.stop = extent;
713-
}
714-
if (range.step == nullptr) {
715-
range.step = FusionGuard::getCurFusion()->oneVal();
716-
}
717-
if (range.start->dtype() != DataType::Index) {
712+
range.start = zero;
713+
} else if (!range.start->isZeroInt()) {
718714
range.start =
719715
SimplifyingIrBuilder::maybeCastExpr(DataType::Index, range.start);
716+
range.start = SimplifyingIrBuilder::maxExpr(
717+
zero,
718+
SimplifyingIrBuilder::whereExpr(
719+
SimplifyingIrBuilder::ltExpr(range.start, zero),
720+
SimplifyingIrBuilder::addExpr(range.start, cast_extent),
721+
range.start));
720722
}
721-
if (range.stop->dtype() != DataType::Index) {
723+
724+
// norm_stop = max(norm_start, min(extent, stop < 0 ? stop + extent : stop)
725+
if (range.stop == nullptr) {
726+
range.stop = cast_extent;
727+
} else if (!range.stop->sameAs(extent)) {
722728
range.stop =
723729
SimplifyingIrBuilder::maybeCastExpr(DataType::Index, range.stop);
730+
range.stop = SimplifyingIrBuilder::maxExpr(
731+
range.start,
732+
SimplifyingIrBuilder::minExpr(
733+
cast_extent,
734+
SimplifyingIrBuilder::whereExpr(
735+
SimplifyingIrBuilder::ltExpr(range.stop, zero),
736+
SimplifyingIrBuilder::addExpr(range.stop, cast_extent),
737+
range.stop)));
724738
}
725-
if (range.step->dtype() != DataType::Index) {
739+
740+
// Ensure step is of type Index
741+
if (range.step == nullptr) {
742+
range.step = FusionGuard::getCurFusion()->oneVal(DataType::Index);
743+
} else {
726744
range.step =
727745
SimplifyingIrBuilder::maybeCastExpr(DataType::Index, range.step);
728746
}
747+
729748
return range;
730749
};
731750

732751
for (auto& range : ranges) {
733752
// Step not supported yet
734753
NVF_CHECK(
735754
range.step == nullptr || range.step->isOneInt(),
736-
"Unsupported step: ",
755+
"Unsupported step (must be 1 or null): ",
737756
range.step->toString());
738757
}
739758

@@ -754,12 +773,13 @@ TensorView* slice(TensorView* inp, const std::vector<Slice>& ranges) {
754773
out_root_id = inp_root_id->cloneWithoutRFactor();
755774
out_rf_id = out_root_id;
756775
} else {
776+
// Clip the start and stop values to the extent of the input
757777
out_root_id =
758778
IterDomainBuilder(inp_root_id).is_rfactor_domain(true).build();
759779
out_rf_id = IterDomain::resize(
760780
out_root_id,
761781
SimplifyingIrBuilder::negExpr(range.start),
762-
sub(range.stop, inp_root_id->extent()),
782+
SimplifyingIrBuilder::subExpr(range.stop, inp_root_id->extent()),
763783
true);
764784
needs_real_slicing = true;
765785
}

csrc/ops/alias.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -91,7 +91,9 @@ TensorView* cat(
9191
std::optional<IterType> iter_type_opt = std::nullopt);
9292

9393
//! Return a tensor where each dimension is sliced as specified by the
94-
//! ranges parameter. Stepping must be one at this moment.
94+
//! ranges parameter. Stepping must be one at this moment. The semantics of
95+
//! slicing with negative values and values >= extent follow those of numpy and
96+
//! PyTorch.
9597
TensorView* slice(TensorView* inp, const std::vector<Slice>& ranges);
9698

9799
} // namespace nvfuser

test/test_resize.cpp

Lines changed: 131 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1124,6 +1124,132 @@ TEST_F(ResizeTest, FusionResizeSlice5) {
11241124
testValidate(&fusion, cg_outputs, aten_inputs, {t2, t4}, __LINE__, __FILE__);
11251125
}
11261126

1127+
std::vector<std::pair<int64_t, int64_t>> slice_cases(
1128+
{{0, 5},
1129+
{3, 9},
1130+
{3, 4},
1131+
{7, 5},
1132+
{0, 11},
1133+
{11, 13},
1134+
{-3, 8},
1135+
{-3, -1},
1136+
{-3, -5},
1137+
{13, -1},
1138+
{-11, 9},
1139+
{-11, 0},
1140+
{-13, -11}});
1141+
1142+
// Test slice with a variety of constant ranges
1143+
TEST_F(NVFuserTest, FusionResizeSliceConstantShmoo_CUDA) {
1144+
for (auto [start, stop] : slice_cases) {
1145+
Fusion fusion;
1146+
FusionGuard fg(&fusion);
1147+
1148+
std::vector<int64_t> shape({9});
1149+
1150+
// concrete shapes to avoid dynamic Fusion
1151+
auto tv0 = makeConcreteTensor(shape);
1152+
fusion.addInput(tv0);
1153+
1154+
auto tv1 = slice(
1155+
tv0, {{IrBuilder::create<Val>(start), IrBuilder::create<Val>(stop)}});
1156+
fusion.addOutput(tv1);
1157+
1158+
auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
1159+
1160+
auto t0 = at::randn(shape, options);
1161+
std::vector<c10::IValue> aten_inputs({t0});
1162+
1163+
FusionExecutor fe;
1164+
fe.compileFusion(&fusion, aten_inputs);
1165+
auto cg_outputs = fe.runFusion(aten_inputs);
1166+
1167+
testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__);
1168+
}
1169+
}
1170+
1171+
// Test slice with a variety of non-constant input ranges
1172+
TEST_F(NVFuserTest, FusionResizeSliceInputShmoo_CUDA) {
1173+
Fusion fusion;
1174+
FusionGuard fg(&fusion);
1175+
1176+
std::vector<int64_t> shape({9});
1177+
1178+
// concrete shapes to avoid dynamic Fusion
1179+
auto tv0 = makeConcreteTensor(shape);
1180+
auto s0 = IrBuilder::create<Val>(DataType::Index);
1181+
auto s1 = IrBuilder::create<Val>(DataType::Index);
1182+
fusion.addInput(tv0);
1183+
fusion.addInput(s0);
1184+
fusion.addInput(s1);
1185+
1186+
auto tv1 = slice(tv0, {{s0, s1}});
1187+
fusion.addOutput(tv1);
1188+
1189+
auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
1190+
1191+
{
1192+
// Concretize so that we set output IterType as Iteration. We should now
1193+
// have expressions that work with any input range.
1194+
ExpressionEvaluator expr_eval;
1195+
1196+
expr_eval.bind(tv0->axis(0)->extent(), 9);
1197+
expr_eval.bind(s0, 0);
1198+
expr_eval.bind(s1, 9);
1199+
1200+
auto initial_info = DynamicTransform::getInitialInfo(&fusion);
1201+
auto info = DynamicTransformConcretizationInfo(&initial_info, &expr_eval);
1202+
1203+
DynamicTransform::concretizeFusion(&fusion, &info);
1204+
NVF_CHECK(
1205+
!fusion.hasDynamicTransform(), "Expected to have no dynamic transform");
1206+
}
1207+
1208+
FusionExecutor fe;
1209+
fe.compileFusion(&fusion);
1210+
1211+
auto t0 = at::randn(shape, options);
1212+
for (auto [start, stop] : slice_cases) {
1213+
std::vector<c10::IValue> aten_inputs({t0, start, stop});
1214+
auto cg_outputs = fe.runFusion(aten_inputs);
1215+
1216+
testValidate(&fusion, cg_outputs, aten_inputs, __LINE__, __FILE__);
1217+
}
1218+
}
1219+
1220+
// Same as FusionResizeSliceInputShmoo_CUDA but use FusionExecutorCache, which
1221+
// might re-concretize when output sizes change
1222+
TEST_F(NVFuserTest, FusionResizeSliceInputShmooFusionExecutorCache_CUDA) {
1223+
auto fusion_ptr = std::make_unique<Fusion>();
1224+
auto fusion = fusion_ptr.get();
1225+
FusionGuard fg(fusion);
1226+
1227+
std::vector<int64_t> shape({9});
1228+
1229+
// concrete shapes to avoid dynamic Fusion
1230+
auto tv0 = makeConcreteTensor(shape);
1231+
auto s0 = IrBuilder::create<Val>(DataType::Index);
1232+
auto s1 = IrBuilder::create<Val>(DataType::Index);
1233+
fusion->addInput(tv0);
1234+
fusion->addInput(s0);
1235+
fusion->addInput(s1);
1236+
1237+
auto tv1 = slice(tv0, {{s0, s1}});
1238+
fusion->addOutput(tv1);
1239+
1240+
auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
1241+
1242+
FusionExecutorCache fec(std::move(fusion_ptr));
1243+
1244+
auto t0 = at::randn(shape, options);
1245+
for (auto [start, stop] : slice_cases) {
1246+
std::vector<c10::IValue> aten_inputs({t0, start, stop});
1247+
auto cg_outputs = fec.runFusionWithInputs(aten_inputs);
1248+
1249+
testValidate(fec.fusion(), cg_outputs, aten_inputs, __LINE__, __FILE__);
1250+
}
1251+
}
1252+
11271253
// Auto scheduled version of Slice1
11281254
TEST_F(ResizeTest, FusionResizeSliceScheduler1) {
11291255
auto fusion_ptr = std::make_unique<Fusion>();
@@ -2319,7 +2445,7 @@ TEST_F(ResizeTest, Slice1DVectorizeManual1) {
23192445
FusionGuard fg(fusion_ptr.get());
23202446

23212447
const int64_t slice_offset = 4;
2322-
const std::vector<int64_t> shape({1024 * 1024});
2448+
const std::vector<int64_t> shape({1024L * 1024L});
23232449

23242450
// Using a concrete tensor to avoid dynamic reshape
23252451
auto tv0 = makeContigConcreteTensor(shape);
@@ -2358,7 +2484,7 @@ TEST_F(ResizeTest, Slice1DVectorizeManual2) {
23582484
FusionGuard fg(fusion_ptr.get());
23592485

23602486
const int64_t slice_offset = 4;
2361-
const std::vector<int64_t> shape({1024 * 1024});
2487+
const std::vector<int64_t> shape({1024L * 1024L});
23622488

23632489
auto tv0 = makeContigConcreteTensor(shape);
23642490
fusion.addInput(tv0);
@@ -2414,7 +2540,7 @@ TEST_F(ResizeTest, Slice1DVectorizeManual3) {
24142540
FusionGuard fg(fusion_ptr.get());
24152541

24162542
const int64_t slice_offset = 4;
2417-
const std::vector<int64_t> shape({1024 * 1024});
2543+
const std::vector<int64_t> shape({1024L * 1024L});
24182544

24192545
auto tv0 = makeContigConcreteTensor(shape);
24202546
fusion.addInput(tv0);
@@ -2463,7 +2589,7 @@ TEST_F(ResizeTest, Slice1DVectorizeManual4) {
24632589
auto& fusion = *fusion_ptr;
24642590
FusionGuard fg(fusion_ptr.get());
24652591

2466-
const std::vector<int64_t> shape({1024 * 1024});
2592+
const std::vector<int64_t> shape({1024L * 1024L});
24672593

24682594
auto tv0 = makeContigConcreteTensor({shape[0] - 4});
24692595
fusion.addInput(tv0);
@@ -2505,7 +2631,7 @@ TEST_F(ResizeTest, Slice2DVectorizeManual1) {
25052631
// The extent of the innermost domain is just 2, and the outer
25062632
// domain is sliced. This slicing should be vectorizable by a
25072633
// factor of 4 as the two domains can be merged and vectorized.
2508-
const std::vector<int64_t> shape({1024 * 1024, 2});
2634+
const std::vector<int64_t> shape({1024L * 1024L, 2});
25092635

25102636
auto tv0 = makeContigConcreteTensor(shape);
25112637
fusion.addInput(tv0);

0 commit comments

Comments
 (0)