22 changes: 22 additions & 0 deletions third_party/nvfuser/csrc/ops/arith.cpp
@@ -880,6 +880,28 @@ NVFUSER_DEFINE_BINARY_FLOAT_OP(div, Div)
NVFUSER_DEFINE_BINARY_FLOAT_OP(atan2, Atan2)
#undef NVFUSER_DEFINE_BINARY_FLOAT_OP

// These ops require full-precision floating point types (after float type
// promotion)
#define NVFUSER_DEFINE_BINARY_FLOAT_ONLY_OP(op_name, op_type) \
Val* op_name(Val* v1, Val* v2) { \
return binaryOp( \
BinaryOpType::op_type, v1, v2, TypePromotion::float_only_op_config); \
} \
TensorView* op_name(TensorView* v1, Val* v2) { \
return binaryOp( \
BinaryOpType::op_type, v1, v2, TypePromotion::float_only_op_config); \
} \
TensorView* op_name(Val* v1, TensorView* v2) { \
return binaryOp( \
BinaryOpType::op_type, v1, v2, TypePromotion::float_only_op_config); \
} \
TensorView* op_name(TensorView* v1, TensorView* v2) { \
return binaryOp( \
BinaryOpType::op_type, v1, v2, TypePromotion::float_only_op_config); \
}
NVFUSER_DEFINE_BINARY_FLOAT_ONLY_OP(nextafter, Nextafter)
#undef NVFUSER_DEFINE_BINARY_FLOAT_ONLY_OP
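For reference, this is roughly what the macro expands to for nextafter (written out here only for illustration; the shipped code relies on the macro above):

// Expansion of NVFUSER_DEFINE_BINARY_FLOAT_ONLY_OP(nextafter, Nextafter):
Val* nextafter(Val* v1, Val* v2) {
  return binaryOp(
      BinaryOpType::Nextafter, v1, v2, TypePromotion::float_only_op_config);
}
TensorView* nextafter(TensorView* v1, Val* v2) {
  return binaryOp(
      BinaryOpType::Nextafter, v1, v2, TypePromotion::float_only_op_config);
}
TensorView* nextafter(Val* v1, TensorView* v2) {
  return binaryOp(
      BinaryOpType::Nextafter, v1, v2, TypePromotion::float_only_op_config);
}
TensorView* nextafter(TensorView* v1, TensorView* v2) {
  return binaryOp(
      BinaryOpType::Nextafter, v1, v2, TypePromotion::float_only_op_config);
}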

#define NVFUSER_DEFINE_BINARY_CAST_OP(op_name, op_type) \
Val* op_name(Val* v1, Val* v2) { \
return binaryOp( \
6 changes: 6 additions & 0 deletions third_party/nvfuser/csrc/ops/arith.h
@@ -434,6 +434,12 @@ TORCH_CUDA_CU_API Val* sub(Val* v1, Val* v2);
TORCH_CUDA_CU_API TensorView* sub(TensorView* v1, Val* v2);
TORCH_CUDA_CU_API TensorView* sub(Val* v1, TensorView* v2);
TORCH_CUDA_CU_API TensorView* sub(TensorView* v1, TensorView* v2);
// nextafter: Only single- or double-precision
// floating point types (after promotion) are supported.
TORCH_CUDA_CU_API Val* nextafter(Val* v1, Val* v2);
TORCH_CUDA_CU_API TensorView* nextafter(TensorView* v1, Val* v2);
TORCH_CUDA_CU_API TensorView* nextafter(Val* v1, TensorView* v2);
TORCH_CUDA_CU_API TensorView* nextafter(TensorView* v1, TensorView* v2);
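As a rough sketch of how these overloads might be exercised when building a fusion from C++ (the scaffolding mirrors the existing nvfuser C++ tests, where makeSymbolicTensor is a test helper; treat the setup as an assumption, not part of this change):

// Includes and test scaffolding omitted; see the existing nvfuser C++ tests.
Fusion fusion;
FusionGuard fg(&fusion);

TensorView* tv0 = makeSymbolicTensor(1, DataType::Double);
TensorView* tv1 = makeSymbolicTensor(1, DataType::Double);
fusion.addInput(tv0);
fusion.addInput(tv1);

// Resolves to the TensorView*/TensorView* overload declared above; the Val*
// overloads behave the same way and yield a scalar when both operands are
// scalars.
TensorView* tv2 = nextafter(tv0, tv1);
fusion.addOutput(tv2);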
// Integer binary ops
// mod
TORCH_CUDA_CU_API Val* mod(Val* v1, Val* v2);
@@ -571,6 +571,7 @@ void initNvFuserPythonBindings(PyObject* module) {
NVFUSER_PYTHON_BINDING_BINARY_OP("bitwise_xor", bitwise_xor)
NVFUSER_PYTHON_BINDING_BINARY_OP("bitwise_left_shift", bitwise_left_shift)
NVFUSER_PYTHON_BINDING_BINARY_OP("bitwise_right_shift", bitwise_left_shift)
NVFUSER_PYTHON_BINDING_BINARY_OP("nextafter", nextafter)
#undef NVFUSER_PYTHON_BINDING_BINARY_OP

#define NVFUSER_PYTHON_BINDING_BINARY_WITH_ALPHA_OP(op_str, op_name) \
2 changes: 2 additions & 0 deletions third_party/nvfuser/csrc/type.cpp
@@ -413,6 +413,8 @@ static const char* binary_op_type2string(BinaryOpType t) {
return "remainder";
case BinaryOpType::Sub:
return "sub";
case BinaryOpType::Nextafter:
return "nextafter";

// Integer Ops
case BinaryOpType::Mod:
1 change: 1 addition & 0 deletions third_party/nvfuser/csrc/type.h
@@ -252,6 +252,7 @@ enum class BinaryOpType {
Pow,
Remainder,
Sub,
Nextafter,
// TypeAs,

// Integer output ops. If changing modify isIntegerOp
10 changes: 10 additions & 0 deletions third_party/nvfuser/csrc/type_promotion.cpp
@@ -100,6 +100,16 @@ c10::ScalarType computeTypes(
c10::isIntegralType(common_dtype, /*includeBool=*/true)) {
common_dtype = c10::get_default_dtype_as_scalartype();
}

// Some ops, such as nextafter, are only implemented for single- and
// double-precision floating point types (after promotion)
if (config.require_full_precision_promoted) {
  TORCH_CHECK(
      common_dtype == c10::ScalarType::Float ||
          common_dtype == c10::ScalarType::Double,
      "Promoted type must be single or double precision float but found ",
      common_dtype);
}

Review comment (collaborator, PR author): I am not sure that this is the right place to perform this check.

Review comment (collaborator, PR author): Hmm. I got this wrong too. PyTorch actually supports bfloat16, just not float16 for this op. It was added with a manual implementation taken from musl: https://github.com/pytorch/pytorch/pull/61829/files#diff-ece04c31934b3504382e10ed3e9a69f03ffabd81ad1a2a890aab19b1642f53c0R120

return common_dtype;
}
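To make the musl-style approach mentioned in the review comments concrete, here is a minimal, self-contained C++ sketch of nextafter on raw bfloat16 bit patterns; the helper names and standalone form are illustrative assumptions, not the PyTorch or nvfuser implementation:

#include <cmath>
#include <cstdint>
#include <cstring>

// bfloat16 is the upper 16 bits of an IEEE-754 binary32 value, so widening a
// bit pattern to float is exact.
static float bf16_bits_to_float(uint16_t bits) {
  uint32_t u = static_cast<uint32_t>(bits) << 16;
  float f;
  std::memcpy(&f, &u, sizeof(f));
  return f;
}

// Step one representable bfloat16 value from `from` toward `to`, operating on
// raw bit patterns (hypothetical helper, for illustration only).
uint16_t nextafter_bf16_bits(uint16_t from, uint16_t to) {
  const float ffrom = bf16_bits_to_float(from);
  const float fto = bf16_bits_to_float(to);
  if (std::isnan(ffrom) || std::isnan(fto)) {
    return 0x7FC0; // canonical bfloat16 NaN
  }
  if (ffrom == fto) {
    return to; // also handles +0 vs -0: return `to` itself
  }
  if (ffrom == 0.0f) {
    // Step off zero to the smallest subnormal carrying the sign of `to`.
    return (fto > 0.0f) ? 0x0001 : 0x8001;
  }
  // For nonzero values, moving away from zero increments the bit pattern and
  // moving toward zero decrements it (this also steps the largest finite
  // value up to infinity, matching nextafter semantics).
  const bool away_from_zero = (ffrom > 0.0f) == (ffrom < fto);
  return away_from_zero ? static_cast<uint16_t>(from + 1)
                        : static_cast<uint16_t>(from - 1);
}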

4 changes: 4 additions & 0 deletions third_party/nvfuser/csrc/type_promotion.h
@@ -22,6 +22,7 @@ namespace nvfuser {
//!
struct TypePromotionConfig {
bool promote_integer_inputs_to_float = false;
bool require_full_precision_promoted = false;
TypePromotionConfig() = default;
};

Expand All @@ -31,6 +32,9 @@ static const TypePromotionConfig comparison_op_config;
static const TypePromotionConfig default_op_config;
static const TypePromotionConfig float_op_config{
/* promote_integer_inputs_to_float */ true};
static const TypePromotionConfig float_only_op_config{
/* promote_integer_inputs_to_float */ false,
/* require_full_precision_promoted */ true};

} // namespace TypePromotion

43 changes: 43 additions & 0 deletions third_party/nvfuser/python_tests/test_python_frontend.py
@@ -2,13 +2,15 @@

from copy import deepcopy
from functools import partial
import itertools
import re
from typing import List
import unittest

import torch
from torch.testing._internal.common_utils import run_tests, TEST_WITH_ROCM, TestCase
from torch.testing._internal.jit_utils import RUN_CUDA
from torch.testing import make_tensor
import torch._refs as refs
import torch._prims as prims
# Will only create the nvfuser module if CUDA is available
@@ -987,5 +989,46 @@ def fusion_func(fd: FusionDefinition):
eager_out = torch.full([2, 2], 1.0) * 5.0
self.assertEqual(eager_out, nvf_out[0])

def test_nextafter(self):
inputs = [
# torch.nextafter is only defined for float{32,64} tensor inputs
make_tensor(4, device="cuda", dtype=torch.float32),
make_tensor(4, device="cuda", dtype=torch.float64),
]

def fusion_func(fd: FusionDefinition):
t0 = fd.from_pytorch(inputs[0])
t1 = fd.from_pytorch(inputs[1])

s0 = fd.define_constant(1.0, dtype=DataType.Float)
s1 = fd.define_constant(-1.0, dtype=DataType.Double)

t2 = fd.ops.add(t0, s0) # float
t3 = fd.ops.add(t1, s1) # double

for a, b in itertools.product(
[t0, t1, s0, s1],
[t0, t1, s0, s1],
):
# always enter the fusion...
t = fd.ops.nextafter(a, b)
if a in [t0, t1] or b in [t0, t1]:
# ...but skip outputting scalars, which we don't support
fd.add_output(t)

nvf_out, _ = self.exec_nvfuser(fusion_func, inputs)

ab = [inputs[0], inputs[1], 1.0, -1.0]
i = 0
for a, b in itertools.product(ab, ab):
if not (isinstance(a, torch.Tensor) or isinstance(b, torch.Tensor)):
continue
n = nvf_out[i]
i += 1
torch_out = torch.nextafter(
torch.as_tensor(a, device='cuda'), torch.as_tensor(b, device='cuda')
)
self.assertEqual(n, torch_out)

if __name__ == '__main__':
run_tests()
8 changes: 8 additions & 0 deletions third_party/nvfuser/runtime/helpers.cu
@@ -313,6 +313,14 @@ __device__ constexpr float fmod(float a, float b) {
return ::fmod(a, b);
}

__device__ constexpr double nextafter(double a, double b) {
return ::nextafter(a, b);
}

__device__ constexpr float nextafter(float a, float b) {
return ::nextafterf(a, b);
}
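These device overloads simply forward to the CUDA math library. As a quick host-side illustration of the same semantics (plain C++ libm, not nvfuser code), nextafter moves by exactly one ULP toward its second argument:

#include <cmath>
#include <cstdio>

int main() {
  // One ULP above 1.0f: adds FLT_EPSILON = 2^-23.
  std::printf("%.9g\n", std::nextafterf(1.0f, 2.0f)); // ~1.00000012
  // One ULP below 1.0f: subtracts 2^-24.
  std::printf("%.9g\n", std::nextafterf(1.0f, 0.0f)); // ~0.99999994
  // Double precision steps by 2^-52 around 1.0.
  std::printf("%.17g\n", std::nextafter(1.0, 2.0)); // ~1.0000000000000002
  return 0;
}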

template <typename T>
__device__ T pow(T a, T b) {
if (b < 0) {