InfiniTensor
diff --git a/‎src/infiniop/devices/cuda/cuda_kernel_common.cuh‎
Lines changed: 6 additions & 4 deletions b/‎src/infiniop/devices/cuda/cuda_kernel_common.cuh‎
Lines changed: 6 additions & 4 deletions
diff --git a/‎src/infiniop/devices/maca/maca_kernel_common.h‎
Lines changed: 7 additions & 6 deletions b/‎src/infiniop/devices/maca/maca_kernel_common.h‎
Lines changed: 7 additions & 6 deletions
diff --git a/‎src/infiniop/elementwise/elementwise.h‎
Lines changed: 39 additions & 39 deletions b/‎src/infiniop/elementwise/elementwise.h‎
Lines changed: 39 additions & 39 deletions
diff --git a/‎src/infiniop/ops/add/cpu/add_cpu.h‎
Lines changed: 1 addition & 1 deletion b/‎src/infiniop/ops/add/cpu/add_cpu.h‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎src/infiniop/ops/add/cuda/add_cuda_internal.cuh‎ renamed to ‎src/infiniop/ops/add/cuda/kernel.cuh‎ b/‎src/infiniop/ops/add/cuda/add_cuda_internal.cuh‎ renamed to ‎src/infiniop/ops/add/cuda/kernel.cuh‎
diff --git a/‎src/infiniop/ops/add/cuda/add_cuda.cu‎ renamed to ‎src/infiniop/ops/add/nvidia/add_nvidia.cu‎
Lines changed: 8 additions & 8 deletions b/‎src/infiniop/ops/add/cuda/add_cuda.cu‎ renamed to ‎src/infiniop/ops/add/nvidia/add_nvidia.cu‎
Lines changed: 8 additions & 8 deletions
diff --git a/‎src/infiniop/ops/add/cuda/add_cuda.cuh‎ renamed to ‎src/infiniop/ops/add/nvidia/add_nvidia.cuh‎
Lines changed: 1 addition & 1 deletion b/‎src/infiniop/ops/add/cuda/add_cuda.cuh‎ renamed to ‎src/infiniop/ops/add/nvidia/add_nvidia.cuh‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎src/infiniop/ops/add/operator.cc‎
Lines changed: 8 additions & 8 deletions b/‎src/infiniop/ops/add/operator.cc‎
Lines changed: 8 additions & 8 deletions
diff --git a/‎src/infiniop/ops/causal_softmax/cuda/causal_softmax_cuda.cuh‎
Lines changed: 0 additions & 8 deletions b/‎src/infiniop/ops/causal_softmax/cuda/causal_softmax_cuda.cuh‎
Lines changed: 0 additions & 8 deletions
diff --git a/‎src/infiniop/ops/causal_softmax/cuda/causal_softmax_kernel.cuh‎ renamed to ‎src/infiniop/ops/causal_softmax/cuda/kernel.cuh‎
Lines changed: 7 additions & 10 deletions b/‎src/infiniop/ops/causal_softmax/cuda/causal_softmax_kernel.cuh‎ renamed to ‎src/infiniop/ops/causal_softmax/cuda/kernel.cuh‎
Lines changed: 7 additions & 10 deletions
@@ -4,6 +4,9 @@
 #define INFINIOP_CUDA_KERNEL __global__ void
 #endif
 
+#include <cuda_bf16.h>
+#include <cuda_fp16.h>
+
 // Possible maximum number of threads per block for CUDA architectures
 // Used for picking correct kernel launch configuration
 #define CUDA_BLOCK_SIZE_4096 4096
@@ -12,8 +15,10 @@
 
 #define CHECK_CUDA(API) CHECK_INTERNAL(API, cudaSuccess)
 
-namespace device::cuda {
+using cuda_bfloat16 = nv_bfloat16;
+using cuda_bfloat162 = nv_bfloat162;
 
+namespace device::cuda {
 // return the memory offset of original tensor, given the flattened index of broadcasted tensor
 __forceinline__ __device__ __host__ size_t
 indexToReducedOffset(
@@ -45,8 +50,6 @@ indexToOffset(
 }
 } // namespace device::cuda
 
-#ifdef ENABLE_NVIDIA_API
-#include <cuda_fp16.h>
 __forceinline__ __device__ float
 exp_(const float val) {
     return expf(val);
@@ -73,4 +76,3 @@ __forceinline__ __device__ __nv_bfloat16
 exp_(const __nv_bfloat16 x) {
     return hexp(x);
 }
-#endif
 
@@ -1,11 +1,15 @@
 #define INFINIOP_MACA_KERNEL __global__ void
+
 // Possible maximum number of threads per block for MACA architectures
 // Used for picking correct kernel launch configuration
 #define MACA_BLOCK_SIZE_1024 1024
 #define MACA_BLOCK_SIZE_512 512
 
 #define CHECK_MACA(API) CHECK_INTERNAL(API, hcSuccess)
 
+using cuda_bfloat16 = hpcc_bfloat16;
+using cuda_bfloat162 = hpcc_bfloat162;
+
 namespace device::maca {
 
 // return the memory offset of original tensor, given the flattened index of broadcasted tensor
@@ -39,16 +43,14 @@ indexToOffset(
 }
 } // namespace device::maca
 
-#ifdef ENABLE_MACA_API
-#include <maca_fp16.h>
 __forceinline__ __device__ float
 exp_(const float val) {
     return expf(val);
 }
 
 __forceinline__ __device__ long double
 exp_(const long double val) {
-    return expl(val);
+    return exp(val);
 }
 
 __forceinline__ __device__ double
@@ -61,8 +63,7 @@ exp_(const __half x) {
     return hexp(x);
 }
 
-__forceinline__ __device__ __hpcc_bfloat16;
-exp_(const __hpcc_bfloat16; x) {
+__forceinline__ __device__ __hpcc_bfloat16
+exp_(const __hpcc_bfloat16 x) {
     return hexp(x);
 }
-#endif
 
@@ -12,45 +12,45 @@
 #include <numeric>
 #include <vector>
 
-#define ELEMENTWISE_DESCRIPTOR(OP, NAMESPACE)                                 \
-                                                                              \
-    namespace op::OP::NAMESPACE {                                             \
-    class Descriptor final : public InfiniopDescriptor {                      \
-        infiniDtype_t _dtype;                                                 \
-        op::elementwise::ElementwiseInfo _info;                               \
-        std::unique_ptr<op::elementwise::NAMESPACE::DeviceImpl> _device_info; \
-        size_t _workspace_size;                                               \
-                                                                              \
-        Descriptor(                                                           \
-            infiniDtype_t dtype,                                              \
-            op::elementwise::ElementwiseInfo info,                            \
-            op::elementwise::NAMESPACE::DeviceImpl *device_info,              \
-            size_t workspace_size,                                            \
-            infiniDevice_t device_type,                                       \
-            int device_id)                                                    \
-            : InfiniopDescriptor{device_type, device_id},                     \
-              _dtype(dtype),                                                  \
-              _info(std::move(info)),                                         \
-              _device_info(std::move(device_info)),                           \
-              _workspace_size(workspace_size) {}                              \
-                                                                              \
-    public:                                                                   \
-        ~Descriptor();                                                        \
-                                                                              \
-        size_t workspaceSize() const { return _workspace_size; }              \
-                                                                              \
-        static infiniStatus_t create(                                         \
-            infiniopHandle_t handle,                                          \
-            Descriptor **desc_ptr,                                            \
-            infiniopTensorDescriptor_t output_desc,                           \
-            std::vector<infiniopTensorDescriptor_t> input_descs);             \
-                                                                              \
-        infiniStatus_t calculate(                                             \
-            void *workspace, size_t workspace_size,                           \
-            void *output,                                                     \
-            std::vector<const void *> inputs,                                 \
-            void *stream) const;                                              \
-    };                                                                        \
+#define ELEMENTWISE_DESCRIPTOR(OP, NAMESPACE, KERNEL_COMMON)                      \
+                                                                                  \
+    namespace op::OP::NAMESPACE {                                                 \
+    class Descriptor final : public InfiniopDescriptor {                          \
+        infiniDtype_t _dtype;                                                     \
+        op::elementwise::ElementwiseInfo _info;                                   \
+        std::unique_ptr<op::elementwise::KERNEL_COMMON::DeviceImpl> _device_info; \
+        size_t _workspace_size;                                                   \
+                                                                                  \
+        Descriptor(                                                               \
+            infiniDtype_t dtype,                                                  \
+            op::elementwise::ElementwiseInfo info,                                \
+            op::elementwise::KERNEL_COMMON::DeviceImpl *device_info,              \
+            size_t workspace_size,                                                \
+            infiniDevice_t device_type,                                           \
+            int device_id)                                                        \
+            : InfiniopDescriptor{device_type, device_id},                         \
+              _dtype(dtype),                                                      \
+              _info(std::move(info)),                                             \
+              _device_info(std::move(device_info)),                               \
+              _workspace_size(workspace_size) {}                                  \
+                                                                                  \
+    public:                                                                       \
+        ~Descriptor();                                                            \
+                                                                                  \
+        size_t workspaceSize() const { return _workspace_size; }                  \
+                                                                                  \
+        static infiniStatus_t create(                                             \
+            infiniopHandle_t handle,                                              \
+            Descriptor **desc_ptr,                                                \
+            infiniopTensorDescriptor_t output_desc,                               \
+            std::vector<infiniopTensorDescriptor_t> input_descs);                 \
+                                                                                  \
+        infiniStatus_t calculate(                                                 \
+            void *workspace, size_t workspace_size,                               \
+            void *output,                                                         \
+            std::vector<const void *> inputs,                                     \
+            void *stream) const;                                                  \
+    };                                                                            \
     }
 
 namespace op::elementwise {
 
@@ -3,7 +3,7 @@
 
 #include "../../../elementwise/cpu/elementwise_cpu.h"
 
-ELEMENTWISE_DESCRIPTOR(add, cpu)
+ELEMENTWISE_DESCRIPTOR(add, cpu, cpu)
 
 namespace op::add::cpu {
 typedef struct AddOp {
 
@@ -1,7 +1,7 @@
-#include "add_cuda.cuh"
-#include "add_cuda_internal.cuh"
+#include "../cuda/kernel.cuh"
+#include "add_nvidia.cuh"
 
-namespace op::add::cuda {
+namespace op::add::nvidia {
 
 Descriptor::~Descriptor() = default;
 
@@ -43,17 +43,17 @@ infiniStatus_t Descriptor::calculate(
 
     switch (_dtype) {
     case INFINI_DTYPE_F16:
-        return _device_info->calculate<256, AddOp, half>(_info, workspace, output, inputs, stream);
+        return _device_info->calculate<256, cuda::AddOp, half>(_info, workspace, output, inputs, stream);
     case INFINI_DTYPE_BF16:
-        return _device_info->calculate<256, AddOp, __nv_bfloat16>(_info, workspace, output, inputs, stream);
+        return _device_info->calculate<256, cuda::AddOp, __nv_bfloat16>(_info, workspace, output, inputs, stream);
     case INFINI_DTYPE_F32:
-        return _device_info->calculate<256, AddOp, float>(_info, workspace, output, inputs, stream);
+        return _device_info->calculate<256, cuda::AddOp, float>(_info, workspace, output, inputs, stream);
     case INFINI_DTYPE_F64:
-        return _device_info->calculate<256, AddOp, double>(_info, workspace, output, inputs, stream);
+        return _device_info->calculate<256, cuda::AddOp, double>(_info, workspace, output, inputs, stream);
     default:
         return INFINI_STATUS_BAD_TENSOR_DTYPE;
     }
 
     return INFINI_STATUS_SUCCESS;
 }
-} // namespace op::add::cuda
+} // namespace op::add::nvidia
@@ -3,6 +3,6 @@
 
 #include "../../../elementwise/cuda/elementwise_cuda_api.cuh"
 
-ELEMENTWISE_DESCRIPTOR(add, cuda)
+ELEMENTWISE_DESCRIPTOR(add, nvidia, cuda)
 
 #endif // __ADD_CUDA_API_H__
@@ -6,7 +6,7 @@
 #include "cpu/add_cpu.h"
 #endif
 #ifdef ENABLE_NVIDIA_API
-#include "cuda/add_cuda.cuh"
+#include "nvidia/add_nvidia.cuh"
 #endif
 
 __C infiniStatus_t infiniopCreateAddDescriptor(
@@ -31,7 +31,7 @@ __C infiniStatus_t infiniopCreateAddDescriptor(
         CREATE(INFINI_DEVICE_CPU, cpu);
 #endif
 #ifdef ENABLE_NVIDIA_API
-        CREATE(INFINI_DEVICE_NVIDIA, cuda);
+        CREATE(INFINI_DEVICE_NVIDIA, nvidia);
 #endif
 
     default:
@@ -46,14 +46,14 @@ __C infiniStatus_t infiniopGetAddWorkspaceSize(infiniopAddDescriptor_t desc, siz
 #define GET(CASE, NAMESPACE)                                                               \
     case CASE:                                                                             \
         *size = reinterpret_cast<op::add::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
-        return INFINI_STATUS_SUCCESS;
+        return INFINI_STATUS_SUCCESS
 
     switch (desc->device_type) {
 #ifdef ENABLE_CPU_API
-        GET(INFINI_DEVICE_CPU, cpu)
+        GET(INFINI_DEVICE_CPU, cpu);
 #endif
 #ifdef ENABLE_NVIDIA_API
-        GET(INFINI_DEVICE_NVIDIA, cuda)
+        GET(INFINI_DEVICE_NVIDIA, nvidia);
 #endif
     default:
         return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
@@ -83,7 +83,7 @@ __C infiniStatus_t infiniopAdd(
         CALCULATE(INFINI_DEVICE_CPU, cpu);
 #endif
 #ifdef ENABLE_NVIDIA_API
-        CALCULATE(INFINI_DEVICE_NVIDIA, cuda);
+        CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
 #endif
 
     default:
@@ -99,15 +99,15 @@ infiniopDestroyAddDescriptor(infiniopAddDescriptor_t desc) {
 #define DELETE(CASE, NAMESPACE)                                                \
     case CASE:                                                                 \
         delete reinterpret_cast<const op::add::NAMESPACE::Descriptor *>(desc); \
-        return INFINI_STATUS_SUCCESS;
+        return INFINI_STATUS_SUCCESS
 
     switch (desc->device_type) {
 
 #ifdef ENABLE_CPU_API
         DELETE(INFINI_DEVICE_CPU, cpu);
 #endif
 #ifdef ENABLE_NVIDIA_API
-        DELETE(INFINI_DEVICE_NVIDIA, cuda);
+        DELETE(INFINI_DEVICE_NVIDIA, nvidia);
 #endif
 
     default:
 
@@ -1,11 +1,8 @@
-#ifndef __CAUSAL_SOFTMAX_KERNEL_CUH__
+#ifndef __CAUSAL_SOFTMAX_KERNEL_CUH__
 #define __CAUSAL_SOFTMAX_KERNEL_CUH__
 
-#include "../../../devices/cuda/cuda_kernel_common.cuh"
-#include "../../../reduce/cuda/reduce.cuh"
-
 template <unsigned int BLOCK_SIZE, typename Tdata, typename Tcompute>
-INFINIOP_CUDA_KERNEL causalSoftmax(
+__device__ void causalSoftmaxKernel(
     Tdata *y_, const Tdata *x_,
     size_t batch, size_t height, size_t width,
     ptrdiff_t y_stride_b, ptrdiff_t y_stride_h,
@@ -32,11 +29,11 @@ INFINIOP_CUDA_KERNEL causalSoftmax(
         //          2 | * * * ... * * * |
         //  height: 3  col_id->
         if (width + blockIdx.x >= threadIdx.x + height) {
-#ifdef ENABLE_NVIDIA_API
-            y[col] = exp_(x[col] - max_);
-#else
-            y[col] = exp(x[col] - max_);
-#endif
+            if constexpr (std::is_same_v<Tdata, half> || std::is_same_v<Tdata, cuda_bfloat16>) {
+                y[col] = hexp(x[col] - max_);
+            } else {
+                y[col] = exp(x[col] - max_);
+            }
         } else {
             y[col] = Tdata(0);
         }