From f262825fa3d5fc5a3e2b8cd6c496eedc05c7b144 Mon Sep 17 00:00:00 2001 From: S-hhhhh <2320230838@mail.nankai.edu.cn> Date: Wed, 13 Aug 2025 22:46:58 +0800 Subject: [PATCH 01/11] feat: add equal_nan param --- src/infiniop-test/include/test.hpp | 71 ++++++++++---------- src/infiniop-test/src/gguf.cpp | 4 +- src/infiniop-test/src/main.cpp | 14 +++- src/infiniop-test/src/ops/add.cpp | 8 +-- src/infiniop-test/src/ops/causal_softmax.cpp | 8 +-- src/infiniop-test/src/ops/clip.cpp | 8 +-- src/infiniop-test/src/ops/gemm.cpp | 8 +-- src/infiniop-test/src/ops/mul.cpp | 8 +-- src/infiniop-test/src/ops/random_sample.cpp | 8 +-- src/infiniop-test/src/ops/rearrange.cpp | 4 +- src/infiniop-test/src/ops/rms_norm.cpp | 8 +-- src/infiniop-test/src/ops/rope.cpp | 8 +-- src/infiniop-test/src/ops/sub.cpp | 8 +-- src/infiniop-test/src/ops/swiglu.cpp | 8 +-- src/infiniop-test/src/test.cpp | 29 +++++--- 15 files changed, 113 insertions(+), 89 deletions(-) diff --git a/src/infiniop-test/include/test.hpp b/src/infiniop-test/include/test.hpp index e2dd45f9f..58b56b877 100644 --- a/src/infiniop-test/include/test.hpp +++ b/src/infiniop-test/include/test.hpp @@ -47,7 +47,7 @@ std::vector> runAllTests( const GGUFFileReader &, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations, - double rtol, double atol); + double rtol, double atol, bool equal_nan=false); // Run a single test read from a GGUF file std::shared_ptr runTest( @@ -55,10 +55,11 @@ std::shared_ptr runTest( infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations, double rtol, double atol, - size_t test_id); + size_t test_id, + bool equal_nan=false); // Check if two tensors are close within given tolerance -void allClose(std::shared_ptr actual, std::shared_ptr expected, double rtol = 1e-3, double atol = 1e-3); +void allClose(std::shared_ptr actual, std::shared_ptr expected, double rtol = 1e-3, double atol = 1e-3, bool equal_nan = false); // Check if two tensors are equal void allEqual(std::shared_ptr actual, std::shared_ptr expected); @@ -81,43 +82,45 @@ class Test { } // namespace infiniop_test::base // Quick macro for declaring a new testcase -#define DECLARE_INFINIOP_TEST(name) \ - namespace infiniop_test::name { \ - class Test : public infiniop_test::base::Test { \ - double _rtol, _atol; \ - \ - public: \ - static std::string op_name() { return #name; } \ - static std::shared_ptr build( \ - std::unordered_map> attributes, \ - std::unordered_map> tensors, \ - double, double); \ - \ - static std::vector attribute_names(); \ - static std::vector tensor_names(); \ - static std::vector output_names(); \ - \ - std::shared_ptr run( \ - infiniopHandle_t handle, infiniDevice_t device, int device_id, \ - size_t warm_ups, size_t iterations) override; \ - \ - std::string toString() const override; \ - \ - ~Test(); \ - \ - private: \ - struct Attributes; \ - Attributes *_attributes; \ - Test() = delete; \ - Test(double rtol, double atol) : _rtol(rtol), _atol(atol) {} \ - }; \ +#define DECLARE_INFINIOP_TEST(name) \ + namespace infiniop_test::name { \ + class Test : public infiniop_test::base::Test { \ + double _rtol, _atol; \ + bool _equal_nan; \ + \ + public: \ + static std::string op_name() { return #name; } \ + static std::shared_ptr build( \ + std::unordered_map> attributes, \ + std::unordered_map> tensors, \ + double, double, bool); \ + \ + static std::vector attribute_names(); \ + static std::vector tensor_names(); \ + static std::vector output_names(); \ + \ + std::shared_ptr run( \ + infiniopHandle_t handle, infiniDevice_t 
device, int device_id,                                  \
            size_t warm_ups, size_t iterations) override;                      \
                                                                               \
        std::string toString() const override;                                 \
                                                                               \
        ~Test();                                                               \
                                                                               \
    private:                                                                   \
        struct Attributes;                                                     \
        Attributes *_attributes;                                               \
        Test() = delete;                                                       \
        Test(double rtol, double atol, bool equal_nan = false)                 \
            : _rtol(rtol), _atol(atol), _equal_nan(equal_nan) {}               \
    };                                                                         \
    }

namespace infiniop_test {
using BuilderFunc = std::function(
    std::unordered_map>,
    std::unordered_map>,
-    double, double)>;
+    double, double, bool)>;

// Testcase Registry
// Each testcase should provide a formatted builder, attribute names, and tensor names
diff --git a/src/infiniop-test/src/gguf.cpp b/src/infiniop-test/src/gguf.cpp
index a4b200033..aee5b39a8 100644
--- a/src/infiniop-test/src/gguf.cpp
+++ b/src/infiniop-test/src/gguf.cpp
@@ -53,7 +53,9 @@ GGUFFileReader::GGUFFileReader(const std::string &filepath) {
     try {
         _file = std::make_shared(filepath);
     } catch (const std::exception &e) {
-        throw e;
+        // Log the failure, then rethrow: _file is unusable below, and a bare
+        // `throw;` preserves the exception's dynamic type (no slicing).
+        std::cerr << "Error: " << e.what() << std::endl;
+        throw;
     }
     _data = _file->ptr();
     _cursor = reinterpret_cast(_data);
diff --git a/src/infiniop-test/src/main.cpp b/src/infiniop-test/src/main.cpp
index 4863c8172..9436cd85a 100644
--- a/src/infiniop-test/src/main.cpp
+++ b/src/infiniop-test/src/main.cpp
@@ -2,7 +2,7 @@
 #include "test.hpp"
 #include
 #include
-
+#include
 struct ParsedArgs {
     std::string file_path;                          // Mandatory argument: test.gguf file path
     infiniDevice_t device_type = INFINI_DEVICE_CPU; // Default to CPU
@@ -11,12 +11,13 @@ struct ParsedArgs {
     int iterations = 0;     // Default to 0 if not given
     double atol = 0.001;    // Default absolute tolerance
     double rtol = 0.001;    // Default relative tolerance
+    bool equal_nan = false; // Default: NaNs do not compare equal
 };

 void printUsage() {
     std::cout << "Usage:" << std::endl
               << std::endl;
-    std::cout << "infiniop-test  [--[:id]] [--warmup ] [--run ] [--atol ] [--rtol ]" << std::endl
+    std::cout << "infiniop-test  [--[:id]] [--warmup ] [--run ] [--atol ] [--rtol ] [--equal-nan ]" << std::endl
               << std::endl;
     std::cout << " >" << std::endl;
     std::cout << "    Path to the test gguf file" << std::endl
@@ -36,6 +37,9 @@ void printUsage() {
     std::cout << " --rtol " << std::endl;
     std::cout << "    (Optional) Relative tolerance for correctness check. Default to 0.001" << std::endl
               << std::endl;
+    std::cout << " --equal-nan " << std::endl;
+    std::cout << "    (Optional) If true, two NaNs are considered equal. Default to false" << std::endl
+              << std::endl;
     exit(-1);
 }
@@ -91,6 +95,10 @@ ParsedArgs parseArgs(int argc, char *argv[]) {
     else if (arg == "--rtol" && i + 1 < argc) {
         args.rtol = std::stod(argv[++i]);
     }
+    else if (arg == "--equal-nan" && i + 1 < argc) {
+        args.equal_nan = (strcmp(argv[++i], "True") == 0 || strcmp(argv[i], "true") == 0)
+                             ? 
true : false; + } else { printUsage(); } @@ -119,7 +127,7 @@ int main(int argc, char *argv[]) { reader, (infiniDevice_t)args.device_type, args.device_id, args.warmups, args.iterations, - args.rtol, args.atol); + args.rtol, args.atol, args.equal_nan); std::cout << "=====================================" << std::endl; for (auto result : results) { diff --git a/src/infiniop-test/src/ops/add.cpp b/src/infiniop-test/src/ops/add.cpp index 27f69d687..e90290d55 100644 --- a/src/infiniop-test/src/ops/add.cpp +++ b/src/infiniop-test/src/ops/add.cpp @@ -15,8 +15,8 @@ struct Test::Attributes { std::shared_ptr Test::build( std::unordered_map> attributes, std::unordered_map> tensors, - double rtol, double atol) { - auto test = std::shared_ptr(new Test(rtol, atol)); + double rtol, double atol, bool equal_nan) { + auto test = std::shared_ptr(new Test(rtol, atol, equal_nan)); test->_attributes = new Attributes(); if (tensors.find("a") == tensors.end() || tensors.find("b") == tensors.end() @@ -58,7 +58,7 @@ std::shared_ptr Test::run( return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); try { - allClose(c, _attributes->ans, _rtol, _atol); + allClose(c, _attributes->ans, _rtol, _atol, _equal_nan); } catch (const std::exception &e) { return TEST_FAILED(RESULT_INCORRECT, e.what()); } @@ -98,7 +98,7 @@ std::string Test::toString() const { oss << "- b: " << _attributes->b->info() << std::endl; oss << "- c: " << _attributes->c->info() << std::endl; oss << std::scientific << std::setprecision(2); - oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + oss << "- rtol=" << _rtol << ", atol=" << _atol << ", equal_nan=" << _equal_nan << std::endl; return oss.str(); } diff --git a/src/infiniop-test/src/ops/causal_softmax.cpp b/src/infiniop-test/src/ops/causal_softmax.cpp index 29612960a..97c65ef8c 100644 --- a/src/infiniop-test/src/ops/causal_softmax.cpp +++ b/src/infiniop-test/src/ops/causal_softmax.cpp @@ -14,8 +14,8 @@ struct Test::Attributes { std::shared_ptr Test::build( std::unordered_map> attributes, std::unordered_map> tensors, - double rtol, double atol) { - auto test = std::shared_ptr(new Test(rtol, atol)); + double rtol, double atol, bool equal_nan) { + auto test = std::shared_ptr(new Test(rtol, atol, equal_nan)); test->_attributes = new Attributes(); if (tensors.find("x") == tensors.end() || tensors.find("y") == tensors.end() @@ -53,7 +53,7 @@ std::shared_ptr Test::run( return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); try { - allClose(y, _attributes->ans, _rtol, _atol); + allClose(y, _attributes->ans, _rtol, _atol, _equal_nan); } catch (const std::exception &e) { return TEST_FAILED(RESULT_INCORRECT, e.what()); } @@ -92,7 +92,7 @@ std::string Test::toString() const { oss << "- y: " << _attributes->y->info() << std::endl; oss << "- ans: " << _attributes->ans->info() << std::endl; oss << std::scientific << std::setprecision(2); - oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + oss << "- rtol=" << _rtol << ", atol=" << _atol << ", equal_nan=" << _equal_nan << std::endl; return oss.str(); } diff --git a/src/infiniop-test/src/ops/clip.cpp b/src/infiniop-test/src/ops/clip.cpp index 82a0e9b10..a01c18a4d 100644 --- a/src/infiniop-test/src/ops/clip.cpp +++ b/src/infiniop-test/src/ops/clip.cpp @@ -16,8 +16,8 @@ struct Test::Attributes { std::shared_ptr Test::build( std::unordered_map> attributes, std::unordered_map> tensors, - double rtol, double atol) { - auto test = std::shared_ptr(new Test(rtol, atol)); + double rtol, double atol, bool equal_nan) { + 
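// Carried through to allClose() in run(): with --equal-nan true, a NaN in the
+    // output matches a NaN in the reference instead of failing the test.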
auto test = std::shared_ptr(new Test(rtol, atol, equal_nan)); test->_attributes = new Attributes(); if (tensors.find("x") == tensors.end() || tensors.find("min_val") == tensors.end() @@ -64,7 +64,7 @@ std::shared_ptr Test::run( return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); try { - allClose(y, _attributes->ans, _rtol, _atol); + allClose(y, _attributes->ans, _rtol, _atol, _equal_nan); } catch (const std::exception &e) { return TEST_FAILED(RESULT_INCORRECT, e.what()); } @@ -109,7 +109,7 @@ std::string Test::toString() const { oss << "- max_val: " << _attributes->max_val->info() << std::endl; oss << "- y: " << _attributes->y->info() << std::endl; oss << std::scientific << std::setprecision(2); - oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + oss << "- rtol=" << _rtol << ", atol=" << _atol << ", equal_nan=" << _equal_nan << std::endl; return oss.str(); } diff --git a/src/infiniop-test/src/ops/gemm.cpp b/src/infiniop-test/src/ops/gemm.cpp index 37c8ed6fe..664288d73 100644 --- a/src/infiniop-test/src/ops/gemm.cpp +++ b/src/infiniop-test/src/ops/gemm.cpp @@ -18,8 +18,8 @@ struct Test::Attributes { std::shared_ptr Test::build( std::unordered_map> attributes, std::unordered_map> tensors, - double rtol, double atol) { - auto test = std::shared_ptr(new Test(rtol, atol)); + double rtol, double atol, bool equal_nan) { + auto test = std::shared_ptr(new Test(rtol, atol, equal_nan)); test->_attributes = new Attributes(); if (!check_names(attributes, Test::attribute_names()) || !check_names(tensors, Test::tensor_names())) { throw std::runtime_error("Invalid Test"); @@ -65,7 +65,7 @@ std::shared_ptr Test::run( return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); try { - allClose(c, _attributes->ans, _rtol, _atol); + allClose(c, _attributes->ans, _rtol, _atol, _equal_nan); } catch (const std::exception &e) { return TEST_FAILED(RESULT_INCORRECT, e.what()); } @@ -120,7 +120,7 @@ std::string Test::toString() const { oss << "- b: " << _attributes->b->info() << std::endl; oss << "- c: " << _attributes->c->info() << std::endl; oss << std::scientific << std::setprecision(2); - oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + oss << "- rtol=" << _rtol << ", atol=" << _atol << ", equal_nan=" << _equal_nan << std::endl; return oss.str(); } diff --git a/src/infiniop-test/src/ops/mul.cpp b/src/infiniop-test/src/ops/mul.cpp index 8ebfc426b..cb0b639bf 100644 --- a/src/infiniop-test/src/ops/mul.cpp +++ b/src/infiniop-test/src/ops/mul.cpp @@ -15,8 +15,8 @@ struct Test::Attributes { std::shared_ptr Test::build( std::unordered_map> attributes, std::unordered_map> tensors, - double rtol, double atol) { - auto test = std::shared_ptr(new Test(rtol, atol)); + double rtol, double atol, bool equal_nan) { + auto test = std::shared_ptr(new Test(rtol, atol, equal_nan)); test->_attributes = new Attributes(); if (tensors.find("a") == tensors.end() || tensors.find("b") == tensors.end() @@ -58,7 +58,7 @@ std::shared_ptr Test::run( return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); try { - allClose(c, _attributes->ans, _rtol, _atol); + allClose(c, _attributes->ans, _rtol, _atol, _equal_nan); } catch (const std::exception &e) { return TEST_FAILED(RESULT_INCORRECT, e.what()); } @@ -98,7 +98,7 @@ std::string Test::toString() const { oss << "- b: " << _attributes->b->info() << std::endl; oss << "- c: " << _attributes->c->info() << std::endl; oss << std::scientific << std::setprecision(2); - oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + 
oss << "- rtol=" << _rtol << ", atol=" << _atol << ", equal_nan=" << _equal_nan << std::endl; return oss.str(); } diff --git a/src/infiniop-test/src/ops/random_sample.cpp b/src/infiniop-test/src/ops/random_sample.cpp index a11e0f446..75ee07b44 100644 --- a/src/infiniop-test/src/ops/random_sample.cpp +++ b/src/infiniop-test/src/ops/random_sample.cpp @@ -20,8 +20,8 @@ struct Test::Attributes { std::shared_ptr Test::build( std::unordered_map> attributes, std::unordered_map> tensors, - double rtol, double atol) { - auto test = std::shared_ptr(new Test(rtol, atol)); + double rtol, double atol, bool equal_nan) { + auto test = std::shared_ptr(new Test(rtol, atol, equal_nan)); test->_attributes = new Attributes(); if (!check_names(attributes, Test::attribute_names()) || !check_names(tensors, Test::tensor_names())) { throw std::runtime_error("Invalid Test"); @@ -70,7 +70,7 @@ std::shared_ptr Test::run( return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); try { - allClose(result, _attributes->ans, _rtol, _atol); + allClose(result, _attributes->ans, _rtol, _atol, _equal_nan); } catch (const std::exception &e) { return TEST_FAILED(RESULT_INCORRECT, e.what()); } @@ -117,7 +117,7 @@ std::string Test::toString() const { oss << "- data: " << _attributes->data->info() << std::endl; oss << "- result: " << _attributes->result->info() << std::endl; oss << std::scientific << std::setprecision(2); - oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + oss << "- rtol=" << _rtol << ", atol=" << _atol << ", equal_nan=" << _equal_nan << std::endl; return oss.str(); } diff --git a/src/infiniop-test/src/ops/rearrange.cpp b/src/infiniop-test/src/ops/rearrange.cpp index 9fbf6f2cb..bdf162ce2 100644 --- a/src/infiniop-test/src/ops/rearrange.cpp +++ b/src/infiniop-test/src/ops/rearrange.cpp @@ -12,9 +12,9 @@ struct Test::Attributes { std::shared_ptr Test::build( std::unordered_map> attributes, std::unordered_map> tensors, - double rtol, double atol) { + double rtol, double atol, bool equal_nan) { - auto test = std::shared_ptr(new Test(rtol, atol)); + auto test = std::shared_ptr(new Test(rtol, atol, equal_nan)); test->_attributes = new Attributes(); if (!check_names(attributes, Test::attribute_names()) || !check_names(tensors, Test::tensor_names())) { throw std::runtime_error("Invalid Test"); diff --git a/src/infiniop-test/src/ops/rms_norm.cpp b/src/infiniop-test/src/ops/rms_norm.cpp index 8359a4536..786ce8470 100644 --- a/src/infiniop-test/src/ops/rms_norm.cpp +++ b/src/infiniop-test/src/ops/rms_norm.cpp @@ -16,8 +16,8 @@ struct Test::Attributes { std::shared_ptr Test::build( std::unordered_map> attributes, std::unordered_map> tensors, - double rtol, double atol) { - auto test = std::shared_ptr(new Test(rtol, atol)); + double rtol, double atol, bool equal_nan) { + auto test = std::shared_ptr(new Test(rtol, atol, equal_nan)); test->_attributes = new Attributes(); if (attributes.find("epsilon") == attributes.end() @@ -72,7 +72,7 @@ std::shared_ptr Test::run( return TEST_FAILED(OP_EXECUTION_FAILED, "RMSNorm execution failed")); try { - allClose(y, _attributes->ans, _rtol, _atol); + allClose(y, _attributes->ans, _rtol, _atol, _equal_nan); } catch (const std::exception &e) { return TEST_FAILED(RESULT_INCORRECT, e.what()); } @@ -117,7 +117,7 @@ std::string Test::toString() const { oss << "- w: " << _attributes->w->info() << std::endl; oss << "- y: " << _attributes->y->info() << std::endl; oss << std::scientific << std::setprecision(2); - oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + 
oss << "- rtol=" << _rtol << ", atol=" << _atol << ", equal_nan=" << _equal_nan << std::endl; return oss.str(); } diff --git a/src/infiniop-test/src/ops/rope.cpp b/src/infiniop-test/src/ops/rope.cpp index 636f565af..94010a122 100644 --- a/src/infiniop-test/src/ops/rope.cpp +++ b/src/infiniop-test/src/ops/rope.cpp @@ -17,8 +17,8 @@ struct Test::Attributes { std::shared_ptr Test::build( std::unordered_map> attributes, std::unordered_map> tensors, - double rtol, double atol) { - auto test = std::shared_ptr(new Test(rtol, atol)); + double rtol, double atol, bool equal_nan) { + auto test = std::shared_ptr(new Test(rtol, atol, equal_nan)); test->_attributes = new Attributes(); if (tensors.find("y") == tensors.end() @@ -77,7 +77,7 @@ std::shared_ptr Test::run( return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); try { - allClose(y, _attributes->ans, _rtol, _atol); + allClose(y, _attributes->ans, _rtol, _atol, _equal_nan); } catch (const std::exception &e) { return TEST_FAILED(RESULT_INCORRECT, e.what()); } @@ -121,7 +121,7 @@ std::string Test::toString() const { oss << "- sin_table: " << _attributes->sin_table->info() << std::endl; oss << "- cos_table: " << _attributes->cos_table->info() << std::endl; oss << std::scientific << std::setprecision(2); - oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + oss << "- rtol=" << _rtol << ", atol=" << _atol << ", equal_nan=" << _equal_nan << std::endl; return oss.str(); } diff --git a/src/infiniop-test/src/ops/sub.cpp b/src/infiniop-test/src/ops/sub.cpp index 6bb1fd1eb..bb3adc350 100644 --- a/src/infiniop-test/src/ops/sub.cpp +++ b/src/infiniop-test/src/ops/sub.cpp @@ -15,8 +15,8 @@ struct Test::Attributes { std::shared_ptr Test::build( std::unordered_map> attributes, std::unordered_map> tensors, - double rtol, double atol) { - auto test = std::shared_ptr(new Test(rtol, atol)); + double rtol, double atol, bool equal_nan) { + auto test = std::shared_ptr(new Test(rtol, atol, equal_nan)); test->_attributes = new Attributes(); if (tensors.find("a") == tensors.end() || tensors.find("b") == tensors.end() @@ -58,7 +58,7 @@ std::shared_ptr Test::run( return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); try { - allClose(c, _attributes->ans, _rtol, _atol); + allClose(c, _attributes->ans, _rtol, _atol, _equal_nan); } catch (const std::exception &e) { return TEST_FAILED(RESULT_INCORRECT, e.what()); } @@ -98,7 +98,7 @@ std::string Test::toString() const { oss << "- b: " << _attributes->b->info() << std::endl; oss << "- c: " << _attributes->c->info() << std::endl; oss << std::scientific << std::setprecision(2); - oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + oss << "- rtol=" << _rtol << ", atol=" << _atol << ", equal_nan=" << _equal_nan << std::endl; return oss.str(); } diff --git a/src/infiniop-test/src/ops/swiglu.cpp b/src/infiniop-test/src/ops/swiglu.cpp index 96b75efc5..f86dfadc6 100644 --- a/src/infiniop-test/src/ops/swiglu.cpp +++ b/src/infiniop-test/src/ops/swiglu.cpp @@ -15,8 +15,8 @@ struct Test::Attributes { std::shared_ptr Test::build( std::unordered_map> attributes, std::unordered_map> tensors, - double rtol, double atol) { - auto test = std::shared_ptr(new Test(rtol, atol)); + double rtol, double atol, bool equal_nan) { + auto test = std::shared_ptr(new Test(rtol, atol, equal_nan)); test->_attributes = new Attributes(); if (tensors.find("a") == tensors.end() @@ -54,7 +54,7 @@ std::shared_ptr Test::run( CHECK_OR(infiniopSwiGLU(op_desc, workspace, workspace_size, c->data(), a->data(), b->data(), 
nullptr), return TEST_FAILED(OP_CREATION_FAILED, "Failed during execution.")); try { - allClose(c, _attributes->ans, _rtol, _atol); + allClose(c, _attributes->ans, _rtol, _atol, _equal_nan); } catch (const std::exception &e) { return TEST_FAILED(RESULT_INCORRECT, e.what()); } @@ -93,7 +93,7 @@ std::string Test::toString() const { oss << "- b: " << _attributes->b->info() << std::endl; oss << "- c: " << _attributes->c->info() << std::endl; oss << std::scientific << std::setprecision(2); - oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + oss << "- rtol=" << _rtol << ", atol=" << _atol << ", equal_nan=" << _equal_nan << std::endl; return oss.str(); } diff --git a/src/infiniop-test/src/test.cpp b/src/infiniop-test/src/test.cpp index e312ac5f5..d4c0bab49 100644 --- a/src/infiniop-test/src/test.cpp +++ b/src/infiniop-test/src/test.cpp @@ -49,7 +49,7 @@ std::string Result::toString() const { std::vector> runAllTests(const GGUFFileReader &gguf_reader, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations, - double rtol, double atol) { + double rtol, double atol, bool equal_nan) { auto meta = gguf_reader.getAttributeMap(); auto count_meta = meta.find("test_count"); if (count_meta == meta.end()) { @@ -60,7 +60,7 @@ std::vector> runAllTests(const GGUFFileReader &gguf_read auto results = std::vector>(count); try { for (size_t i = 0; i < count; i++) { - results[i] = runTest(gguf_reader, device, device_id, warm_ups, iterations, rtol, atol, i); + results[i] = runTest(gguf_reader, device, device_id, warm_ups, iterations, rtol, atol, i, equal_nan); } } catch (const std::exception &e) { std::cerr << "Error: " << e.what() << std::endl; @@ -72,7 +72,7 @@ std::vector> runAllTests(const GGUFFileReader &gguf_read std::shared_ptr runTest(const GGUFFileReader &gguf_reader, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations, - double rtol, double atol, size_t test_id) { + double rtol, double atol, size_t test_id, bool equal_nan) { auto meta = gguf_reader.getAttributeMap(); auto tensor_info = gguf_reader.getTensorInfoMap(); auto name_meta = meta.find("test." 
+ std::to_string(test_id) + ".op_name");
@@ -107,7 +107,7 @@ std::shared_ptr runTest(const GGUFFileReader &gguf_reader,
     }
     std::shared_ptr test;
     try {
-        test = builder.build(attrs, tensors, rtol, atol);
+        test = builder.build(attrs, tensors, rtol, atol, equal_nan);
     } catch (const std::exception &e) {
         return TEST_INIT_FAILED(op_name + "\n" + e.what());
     }
@@ -141,7 +141,7 @@ void incrementOffset(ptrdiff_t &offset_1, const std::vector &strides_
     }
 }

-void allClose(std::shared_ptr actual_, std::shared_ptr expected_, double rtol, double atol) {
+void allClose(std::shared_ptr actual_, std::shared_ptr expected_, double rtol, double atol, bool equal_nan) {
     auto actual = actual_->to(INFINI_DEVICE_CPU);
     auto expected = expected_->to(INFINI_DEVICE_CPU);
     auto shape = actual->shape();
@@ -158,12 +158,23 @@ void allClose(std::shared_ptr actual_, std::shared_ptr expected_
     for (size_t i = 0; i < total; i++) {
         double a_ = getVal((char *)actual->data() + actual_offset, actual->ggml_type());
         double e_ = getVal((char *)expected->data() + expected_offset, expected->ggml_type());
-        if (std::fabs(a_ - e_) > atol && std::fabs(a_ - e_) > rtol * std::fmax(std::fabs(a_), std::fabs(e_))) {
-            if (num_failed == 0) {
-                first_failed_msg = "First failed at index " + std::to_string(i) + " with value " + std::to_string(a_) + " but should be " + std::to_string(e_) + ".";
+        if (std::isnan(a_) || std::isnan(e_)) {
+            // A NaN pair only passes when equal_nan is set and both sides are
+            // NaN; record the first failure before bumping the counter.
+            if (!equal_nan || (std::isnan(a_) != std::isnan(e_))) {
+                if (num_failed == 0) {
+                    first_failed_msg = "First failed at index " + std::to_string(i) + " with value " + std::to_string(a_) + " but should be " + std::to_string(e_) + ".";
+                }
+                num_failed++;
+            }
+        } else {
+            if (std::fabs(a_ - e_) > atol && std::fabs(a_ - e_) > rtol * std::fmax(std::fabs(a_), std::fabs(e_))) {
+                if (num_failed == 0) {
+                    first_failed_msg = "First failed at index " + std::to_string(i) + " with value " + std::to_string(a_) + " but should be " + std::to_string(e_) + ".";
+                }
+                num_failed++;
             }
-            num_failed++;
         }
+
         incrementOffset(actual_offset, actual->strides(), ggmlTypeSize(actual->ggml_type()), expected_offset, expected->strides(), ggmlTypeSize(expected->ggml_type()), counter, shape);

From 53927875e6f286c05fcbdb47d781f9016248608b Mon Sep 17 00:00:00 2001
From: S-hhhhh <2320230838@mail.nankai.edu.cn>
Date: Thu, 14 Aug 2025 22:23:20 +0800
Subject: [PATCH 02/11] feat: add reduce_mean operator
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 include/infiniop.h                            |   1 +
 include/infiniop/ops/reduce_mean.h            |  27 +++
 scripts/python_test.py                        |   1 +
 src/infiniop-test/include/ops.hpp             |   2 +
 src/infiniop-test/src/ops/reduce_mean.cpp     | 119 ++++++++++++
 .../ops/reduce_mean/cpu/reduce_mean_cpu.cc    |  67 +++++++
 .../ops/reduce_mean/cpu/reduce_mean_cpu.h     |   7 +
 src/infiniop/ops/reduce_mean/info.h           |  81 ++++++++
 .../ops/reduce_mean/metax/reduce_mean_metax.h |   8 +
 .../reduce_mean/metax/reduce_mean_metax.maca  |  94 +++++++++
 .../ops/reduce_mean/nvidia/kernel.cuh         |  26 +++
 .../reduce_mean/nvidia/reduce_mean_nvidia.cu  |  98 ++++++++++
 .../reduce_mean/nvidia/reduce_mean_nvidia.cuh |   8 +
 src/infiniop/ops/reduce_mean/operator.cc      | 182 ++++++++++++++++++
 src/infiniop/ops/reduce_mean/reduce_mean.h    |  47 +++++
 src/infiniop/reduce/cuda/reduce.cuh           |  18 ++
 src/utils/check.h                             |  18 ++
 .../test_generate/testcases/reduce_mean.py    | 111 +++++++++++
 test/infiniop/libinfiniop/op_register.py      |  32 +++
 test/infiniop/reduce_mean.py                  | 150 +++++++++++++++
 20 files changed, 1097 insertions(+)
 create 
mode 100644 include/infiniop/ops/reduce_mean.h create mode 100644 src/infiniop-test/src/ops/reduce_mean.cpp create mode 100644 src/infiniop/ops/reduce_mean/cpu/reduce_mean_cpu.cc create mode 100644 src/infiniop/ops/reduce_mean/cpu/reduce_mean_cpu.h create mode 100644 src/infiniop/ops/reduce_mean/info.h create mode 100644 src/infiniop/ops/reduce_mean/metax/reduce_mean_metax.h create mode 100644 src/infiniop/ops/reduce_mean/metax/reduce_mean_metax.maca create mode 100644 src/infiniop/ops/reduce_mean/nvidia/kernel.cuh create mode 100644 src/infiniop/ops/reduce_mean/nvidia/reduce_mean_nvidia.cu create mode 100644 src/infiniop/ops/reduce_mean/nvidia/reduce_mean_nvidia.cuh create mode 100644 src/infiniop/ops/reduce_mean/operator.cc create mode 100644 src/infiniop/ops/reduce_mean/reduce_mean.h create mode 100644 test/infiniop-test/test_generate/testcases/reduce_mean.py create mode 100644 test/infiniop/reduce_mean.py diff --git a/include/infiniop.h b/include/infiniop.h index d51b8d92e..85df0c703 100644 --- a/include/infiniop.h +++ b/include/infiniop.h @@ -17,5 +17,6 @@ #include "infiniop/ops/sub.h" #include "infiniop/ops/swiglu.h" #include "infiniop/tensor_descriptor.h" +#include "infiniop/ops/reduce_mean.h" #endif // __INFINIOP_API_H__ diff --git a/include/infiniop/ops/reduce_mean.h b/include/infiniop/ops/reduce_mean.h new file mode 100644 index 000000000..5efd8b227 --- /dev/null +++ b/include/infiniop/ops/reduce_mean.h @@ -0,0 +1,27 @@ +#ifndef __INFINIOP_REDUCE_MEAN_API_H__ +#define __INFINIOP_REDUCE_MEAN_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopReduceMeanDescriptor_t; + +__C __export infiniStatus_t infiniopCreateReduceMeanDescriptor( + infiniopHandle_t handle, + infiniopReduceMeanDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + size_t dim); + +__C __export infiniStatus_t infiniopGetReduceMeanWorkspaceSize(infiniopReduceMeanDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopReduceMean( + infiniopReduceMeanDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream); + +__C __export infiniStatus_t infiniopDestroyReduceMeanDescriptor(infiniopReduceMeanDescriptor_t desc); + +#endif diff --git a/scripts/python_test.py b/scripts/python_test.py index eb2d4319e..14588cd7d 100644 --- a/scripts/python_test.py +++ b/scripts/python_test.py @@ -24,6 +24,7 @@ def run_tests(args): "rope.py", "sub.py", "swiglu.py", + "reduce_mean.py", ]: result = subprocess.run( f"python {test} {args} --debug", text=True, encoding="utf-8", shell=True diff --git a/src/infiniop-test/include/ops.hpp b/src/infiniop-test/include/ops.hpp index 3820f7cfd..c94ebac90 100644 --- a/src/infiniop-test/include/ops.hpp +++ b/src/infiniop-test/include/ops.hpp @@ -16,6 +16,7 @@ DECLARE_INFINIOP_TEST(add) DECLARE_INFINIOP_TEST(causal_softmax) DECLARE_INFINIOP_TEST(rearrange) DECLARE_INFINIOP_TEST(sub) +DECLARE_INFINIOP_TEST(reduce_mean) #define REGISTER_INFINIOP_TEST(name) \ { \ @@ -43,6 +44,7 @@ DECLARE_INFINIOP_TEST(sub) REGISTER_INFINIOP_TEST(causal_softmax) \ REGISTER_INFINIOP_TEST(rearrange) \ REGISTER_INFINIOP_TEST(sub) \ + REGISTER_INFINIOP_TEST(reduce_mean) \ } namespace infiniop_test { diff --git a/src/infiniop-test/src/ops/reduce_mean.cpp b/src/infiniop-test/src/ops/reduce_mean.cpp new file mode 100644 index 000000000..2c3ddceaf --- /dev/null +++ b/src/infiniop-test/src/ops/reduce_mean.cpp @@ -0,0 +1,119 @@ +#include "ops.hpp" +#include "utils.hpp" +#include 
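+// GGUF layout for a reduce_mean case (see build() below): tensors "x", "y"
+// and "ans", plus a "dim" attribute naming the axis to average over.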
+#include +#include + +namespace infiniop_test::reduce_mean { +struct Test::Attributes { + std::shared_ptr x; + std::shared_ptr y; + std::shared_ptr ans; + size_t dim; +}; + +std::shared_ptr Test::build( + std::unordered_map> attributes, + std::unordered_map> tensors, + double rtol, double atol, bool equal_nan) { + auto test = std::shared_ptr(new Test(rtol, atol, equal_nan)); + test->_attributes = new Attributes(); + + if (attributes.find("dim") == attributes.end() + || tensors.find("x") == tensors.end() + || tensors.find("ans") == tensors.end() + || tensors.find("y") == tensors.end()) { + throw std::runtime_error("Invalid Test: Missing attributes or tensors"); + } + + test->_attributes->dim = size_t(*reinterpret_cast(attributes["dim"].data())); + test->_attributes->ans = tensors["ans"]; + test->_attributes->x = tensors["x"]; + test->_attributes->y = tensors["y"]; + + return test; +} + +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, + size_t warm_ups, size_t iterations) { + + infiniopReduceMeanDescriptor_t op_desc; + CHECK_OR(infiniopCreateReduceMeanDescriptor(handle, &op_desc, + _attributes->y->desc(), + _attributes->x->desc(), + _attributes->dim), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create ReduceMean descriptor")); + + auto x = _attributes->x->to(device, device_id); + auto y = _attributes->y->to(device, device_id); + + size_t workspace_size; + CHECK_OR(infiniopGetReduceMeanWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size")); + void *workspace = nullptr; + if (workspace_size > 0) { + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace")); + } + + CHECK_OR(infiniopReduceMean(op_desc, + workspace, workspace_size, + y->data(), + x->data(), + nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, "ReduceMean execution failed")); + + try { + allClose(y, _attributes->ans, _rtol, _atol, _equal_nan); + } catch (const std::exception &e) { + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + double elapsed_time = 0.; + + elapsed_time = benchmark( + [=]() { + infiniopReduceMean(op_desc, + workspace, workspace_size, + y->data(), + x->data(), + nullptr); + }, + warm_ups, iterations); + + if (workspace != nullptr) { + infinirtFree(workspace); + } + + return TEST_PASSED(elapsed_time); +} + +std::vector Test::attribute_names() { + return {"dim"}; +} + +std::vector Test::tensor_names() { + return {"x", "ans", "y"}; +} + +std::vector Test::output_names() { + return {"y"}; +} + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- x: " << _attributes->x->info() << std::endl; + oss << "- y: " << _attributes->y->info() << std::endl; + oss << "- dim=" << _attributes->dim << std::endl; + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol << ", equal_nan=" << _equal_nan << std::endl; + return oss.str(); +} + +Test::~Test() { + delete _attributes; +} + +} // namespace infiniop_test::reduce_mean diff --git a/src/infiniop/ops/reduce_mean/cpu/reduce_mean_cpu.cc b/src/infiniop/ops/reduce_mean/cpu/reduce_mean_cpu.cc new file mode 100644 index 000000000..3682d7631 --- /dev/null +++ b/src/infiniop/ops/reduce_mean/cpu/reduce_mean_cpu.cc @@ -0,0 +1,67 @@ +#include "reduce_mean_cpu.h" +#include "../../../devices/cpu/common_cpu.h" +#include "../../../reduce/cpu/reduce.h" + +namespace op::reduce_mean::cpu { + 
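// ReduceMeanInfo (info.h) folds every input into a 3-D view of
+// (batch, rows, cols) with the reduced axis moved innermost, so this
+// backend only has to average the last, possibly strided, dimension.
+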
+Descriptor::~Descriptor() {}
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t y_desc,
+    infiniopTensorDescriptor_t x_desc,
+    size_t dim) {
+    auto result = ReduceMeanInfo::create(y_desc, x_desc, dim);
+    CHECK_RESULT(result);
+    *desc_ptr = new Descriptor(nullptr, result.take(), 0, handle->device, handle->device_id);
+    return INFINI_STATUS_SUCCESS;
+}
+
+template
+infiniStatus_t reduce_mean(const ReduceMeanInfo *info, T *y, const T *x) {
+    const size_t batch_size = info->shape[0];
+    const size_t rows = info->shape[1];
+    const size_t cols = info->shape[2]; // innermost axis (the reduction dimension)
+
+    const ptrdiff_t y_batch_stride = info->y_strides[0];
+    const ptrdiff_t y_row_stride = info->y_strides[1];
+    const ptrdiff_t x_batch_stride = info->x_strides[0];
+    const ptrdiff_t x_row_stride = info->x_strides[1];
+    const ptrdiff_t x_col_stride = info->x_strides[2];
+
+    #pragma omp parallel for collapse(2)
+    for (size_t batch = 0; batch < batch_size; ++batch) {
+        for (size_t row = 0; row < rows; ++row) {
+            const T* input_start = x + batch * x_batch_stride + row * x_row_stride;
+            T* output_ptr = y + batch * y_batch_stride + row * y_row_stride;
+            float mean = op::common_cpu::reduce_op::sum(input_start, cols, x_col_stride) / cols;
+            if constexpr (std::is_same::value || std::is_same::value) {
+                *output_ptr = utils::cast(mean);
+            } else {
+                *output_ptr = mean;
+            }
+        }
+    }
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace, size_t workspace_size,
+    void *y,
+    const void *x,
+    void *stream) const {
+
+    if (_info.dtype == INFINI_DTYPE_F16) {
+        CHECK_STATUS(reduce_mean(&_info, (fp16_t *)y, (const fp16_t *)x));
+    } else if (_info.dtype == INFINI_DTYPE_BF16) {
+        CHECK_STATUS(reduce_mean(&_info, (bf16_t *)y, (const bf16_t *)x));
+    } else if (_info.dtype == INFINI_DTYPE_F32) {
+        CHECK_STATUS(reduce_mean(&_info, (float *)y, (const float *)x));
+    } else {
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    return INFINI_STATUS_SUCCESS;
+}
+} // namespace op::reduce_mean::cpu
diff --git a/src/infiniop/ops/reduce_mean/cpu/reduce_mean_cpu.h b/src/infiniop/ops/reduce_mean/cpu/reduce_mean_cpu.h
new file mode 100644
index 000000000..f67601c62
--- /dev/null
+++ b/src/infiniop/ops/reduce_mean/cpu/reduce_mean_cpu.h
@@ -0,0 +1,7 @@
+#ifndef __REDUCE_MEAN_CPU_H__
+#define __REDUCE_MEAN_CPU_H__
+#include "../reduce_mean.h"
+
+DESCRIPTOR(cpu)
+
+#endif
diff --git a/src/infiniop/ops/reduce_mean/info.h b/src/infiniop/ops/reduce_mean/info.h
new file mode 100644
index 000000000..41785a082
--- /dev/null
+++ b/src/infiniop/ops/reduce_mean/info.h
@@ -0,0 +1,81 @@
+#ifndef __REDUCE_MEAN_INFO_H__
+#define __REDUCE_MEAN_INFO_H__
+
+#include "../../../utils.h"
+#include "../../tensor.h"
+#include
+
+namespace op::reduce_mean {
+
+class ReduceMeanInfo {
+    ReduceMeanInfo() = default;
+
+public:
+    infiniDtype_t dtype;
+
+    std::vector shape;
+    std::vector y_strides;
+    std::vector x_strides;
+
+    static utils::Result create(infiniopTensorDescriptor_t y_desc, infiniopTensorDescriptor_t x_desc, size_t dim) {
+        auto dtype = y_desc->dtype();
+        if (dtype != x_desc->dtype()) {
+            return INFINI_STATUS_BAD_TENSOR_DTYPE;
+        }
+        CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_BF16, INFINI_DTYPE_F32);
+
+        size_t ndim = y_desc->ndim();
+        if (x_desc->ndim() != ndim) {
+            CHECK_STATUS(INFINI_STATUS_BAD_TENSOR_SHAPE);
+        }
+        CHECK_REDUCE_SHAPE(x_desc->shape(), dim, y_desc->shape());
+        if (ndim == 3) {
+            std::vector shape = x_desc->shape();
+            std::vector y_strides = 
y_desc->strides(); + std::vector x_strides = x_desc->strides(); + if (dim != 2){ + std::swap(shape[dim], shape[2]); + std::swap(y_strides[dim], y_strides[2]); + std::swap(x_strides[dim], x_strides[2]); + } + return utils::Result(ReduceMeanInfo{ + dtype, shape, y_strides, x_strides}); + } + else if (ndim == 2){ + std::vector shape = x_desc->shape(); + std::vector y_strides = y_desc->strides(); + std::vector x_strides = x_desc->strides(); + if (dim != 1){ + std::swap(shape[dim], shape[1]); + std::swap(y_strides[dim], y_strides[1]); + std::swap(x_strides[dim], x_strides[1]); + } + shape.insert(shape.begin(), 1); + y_strides.insert(y_strides.begin(), 0); + x_strides.insert(x_strides.begin(), 0); + return utils::Result(ReduceMeanInfo{ + dtype, shape, y_strides, x_strides}); + } + else if (ndim == 1){ + std::vector shape = {1, 1, (x_desc->shape())[0]}; + std::vector y_strides = {0, 0, (y_desc->strides())[0]}; + std::vector x_strides = {0, 0, (x_desc->strides())[0]}; + return utils::Result(ReduceMeanInfo{ + dtype, shape, y_strides, x_strides}); + } + else if (ndim == 0){ + std::vector shape = {1, 1, 1}; + std::vector y_strides = {0, 0, 0}; + std::vector x_strides = {0, 0, 0}; + return utils::Result(ReduceMeanInfo{ + dtype, shape, y_strides, x_strides}); + } + else{ + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + } +}; + +} // namespace op::reduce_mean + +#endif // __REDUCE_MEAN_INFO_H__ diff --git a/src/infiniop/ops/reduce_mean/metax/reduce_mean_metax.h b/src/infiniop/ops/reduce_mean/metax/reduce_mean_metax.h new file mode 100644 index 000000000..a105724d6 --- /dev/null +++ b/src/infiniop/ops/reduce_mean/metax/reduce_mean_metax.h @@ -0,0 +1,8 @@ +#ifndef __REDUCE_MEAN_METAX_H__ +#define __REDUCE_MEAN_METAX_H__ + +#include "../reduce_mean.h" + +DESCRIPTOR(metax) + +#endif diff --git a/src/infiniop/ops/reduce_mean/metax/reduce_mean_metax.maca b/src/infiniop/ops/reduce_mean/metax/reduce_mean_metax.maca new file mode 100644 index 000000000..9abd888f6 --- /dev/null +++ b/src/infiniop/ops/reduce_mean/metax/reduce_mean_metax.maca @@ -0,0 +1,94 @@ +#include "../../../devices/metax/metax_common.h" +#include "reduce_mean_metax.h" + +#include +#include "../../../devices/metax/metax_kernel_common.h" + +#include "../../../reduce/cuda/reduce.cuh" + +#include "../nvidia/kernel.cuh" + +template +INFINIOP_METAX_KERNEL ReduceMean( + Tdata *y_, const Tdata *x_, + size_t batch, size_t height, size_t width, + ptrdiff_t y_stride_b, ptrdiff_t y_stride_h, + ptrdiff_t x_stride_b, ptrdiff_t x_stride_h, ptrdiff_t x_stride_w) { + ReduceMeanKernel(y_, x_, batch, height, width, y_stride_b, y_stride_h, x_stride_b, x_stride_h, x_stride_w); +} + +namespace op::reduce_mean::metax { + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + size_t dim) { + auto info = ReduceMeanInfo::create(y_desc, x_desc, dim); + CHECK_RESULT(info); + *desc_ptr = new Descriptor( + new Opaque{reinterpret_cast(handle)->internal()}, + info.take(), 0, handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +template +infiniStatus_t launchKernel(void *y, const void *x, infiniDtype_t dtype, + size_t batch_size, size_t height, size_t width, + ptrdiff_t y_stride_b, ptrdiff_t y_stride_h, + ptrdiff_t x_stride_b, ptrdiff_t x_stride_h, ptrdiff_t x_stride_w, + hcStream_t stream) { + dim3 grid(uint32_t(batch_size), 
uint32_t(height), 1);
+    if (dtype == INFINI_DTYPE_F16) {
+        ReduceMean
+            <<>>((half *)y, (const half *)x,
+                 batch_size, height, width,
+                 y_stride_b, y_stride_h,
+                 x_stride_b, x_stride_h, x_stride_w);
+    } else if (dtype == INFINI_DTYPE_BF16) {
+        ReduceMean
+            <<>>((__hpcc_bfloat16 *)y, (const __hpcc_bfloat16 *)x,
+                 batch_size, height, width,
+                 y_stride_b, y_stride_h,
+                 x_stride_b, x_stride_h, x_stride_w);
+    } else if (dtype == INFINI_DTYPE_F32) {
+        ReduceMean
+            <<>>((float *)y, (const float *)x,
+                 batch_size, height, width,
+                 y_stride_b, y_stride_h,
+                 x_stride_b, x_stride_h, x_stride_w);
+    } else {
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size,
+                                     void *y,
+                                     const void *x,
+                                     void *stream_) const {
+    hcStream_t stream = (hcStream_t)stream_;
+    if (_opaque->internal->maxThreadsPerBlock() == METAX_BLOCK_SIZE_1024) {
+        CHECK_STATUS(launchKernel(
+            y, x, _info.dtype, _info.shape[0], _info.shape[1], _info.shape[2],
+            _info.y_strides[0], _info.y_strides[1], _info.x_strides[0], _info.x_strides[1], _info.x_strides[2], stream));
+    } else if (_opaque->internal->maxThreadsPerBlock() == METAX_BLOCK_SIZE_512) {
+        CHECK_STATUS(launchKernel(
+            y, x, _info.dtype, _info.shape[0], _info.shape[1], _info.shape[2],
+            _info.y_strides[0], _info.y_strides[1], _info.x_strides[0], _info.x_strides[1], _info.x_strides[2], stream));
+    } else {
+        return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED;
+    }
+    return INFINI_STATUS_SUCCESS;
+}
+
+} // namespace op::reduce_mean::metax
diff --git a/src/infiniop/ops/reduce_mean/nvidia/kernel.cuh b/src/infiniop/ops/reduce_mean/nvidia/kernel.cuh
new file mode 100644
index 000000000..c1dbc3fa8
--- /dev/null
+++ b/src/infiniop/ops/reduce_mean/nvidia/kernel.cuh
@@ -0,0 +1,26 @@
+#ifndef __REDUCE_MEAN_KERNEL_CUH__
+#define __REDUCE_MEAN_KERNEL_CUH__
+
+template
+__device__ void ReduceMeanKernel(
+    Tdata *y_, const Tdata *x_,
+    size_t batch, size_t height, size_t width,
+    ptrdiff_t y_stride_b, ptrdiff_t y_stride_h,
+    ptrdiff_t x_stride_b, ptrdiff_t x_stride_h, ptrdiff_t x_stride_w) {
+
+    Tdata *y = y_ + blockIdx.x * y_stride_b + blockIdx.y * y_stride_h;
+    const Tdata *x = x_ + blockIdx.x * x_stride_b + blockIdx.y * x_stride_h;
+
+    // [Reduce] Sum the row across the block; cub's BlockReduce returns the
+    // valid total only on thread 0.
+    Tcompute sum_0 = op::common_cuda::reduce_op::sum(x, width, x_stride_w);
+
+    // [Write] Each block owns exactly one output element, so thread 0 alone
+    // stores the mean (a write from every thread would be redundant).
+    if (threadIdx.x == 0) {
+        *y = sum_0 / width;
+    }
+}
+
+#endif // __REDUCE_MEAN_KERNEL_CUH__
diff --git a/src/infiniop/ops/reduce_mean/nvidia/reduce_mean_nvidia.cu b/src/infiniop/ops/reduce_mean/nvidia/reduce_mean_nvidia.cu
new file mode 100644
index 000000000..170c956e9
--- /dev/null
+++ b/src/infiniop/ops/reduce_mean/nvidia/reduce_mean_nvidia.cu
@@ -0,0 +1,98 @@
+#include "../../../devices/nvidia/nvidia_common.cuh"
+#include "reduce_mean_nvidia.cuh"
+
+#include "../../../devices/nvidia/nvidia_kernel_common.cuh"
+#include
+
+#include "../../../reduce/cuda/reduce.cuh"
+
+#include "kernel.cuh"
+
+template
+INFINIOP_CUDA_KERNEL ReduceMean(
+    Tdata *y_, const Tdata *x_,
+    size_t batch, size_t height, size_t width,
+    ptrdiff_t y_stride_b, ptrdiff_t y_stride_h,
+    ptrdiff_t x_stride_b, ptrdiff_t x_stride_h, ptrdiff_t x_stride_w) {
+    ReduceMeanKernel(y_, x_, batch, height, width, y_stride_b, y_stride_h, x_stride_b, x_stride_h, x_stride_w);
+}
+
+namespace 
op::reduce_mean::nvidia { + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + size_t dim) { + auto info = ReduceMeanInfo::create(y_desc, x_desc, dim); + CHECK_RESULT(info); + *desc_ptr = new Descriptor( + new Opaque{reinterpret_cast(handle)->internal()}, + info.take(), 0, handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +template +infiniStatus_t launchKernel(void *y, const void *x, infiniDtype_t dtype, + size_t batch_size, size_t height, size_t width, + ptrdiff_t y_stride_b, ptrdiff_t y_stride_h, + ptrdiff_t x_stride_b, ptrdiff_t x_stride_h, ptrdiff_t x_stride_w, + cudaStream_t stream) { + dim3 grid(uint32_t(batch_size), uint32_t(height), 1); + if (dtype == INFINI_DTYPE_F16) { + ReduceMean + <<>>((half *)y, (const half *)x, + batch_size, height, width, + y_stride_b, y_stride_h, + x_stride_b, x_stride_h, x_stride_w); + } else if (dtype == INFINI_DTYPE_BF16) { + ReduceMean + <<>>((__nv_bfloat16 *)y, (const __nv_bfloat16 *)x, + batch_size, height, width, + y_stride_b, y_stride_h, + x_stride_b, x_stride_h, x_stride_w); + } else if (dtype == INFINI_DTYPE_F32) { + ReduceMean + <<>>((float *)y, (const float *)x, + batch_size, height, width, + y_stride_b, y_stride_h, + x_stride_b, x_stride_h, x_stride_w); + } else { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, + void *y, + const void *x, + void *stream_) const { + cudaStream_t stream = (cudaStream_t)stream_; + if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_1024) { + CHECK_STATUS(launchKernel( + y, x, _info.dtype, _info.shape[0], _info.shape[1], _info.shape[2], + _info.y_strides[0], _info.y_strides[1], _info.x_strides[0], _info.x_strides[1], _info.x_strides[2], stream)); + } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_512) { + CHECK_STATUS(launchKernel( + y, x, _info.dtype, _info.shape[0], _info.shape[1], _info.shape[2], + _info.y_strides[0], _info.y_strides[1], _info.x_strides[0], _info.x_strides[1], _info.x_strides[2], stream)); + } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_4096) { + CHECK_STATUS(launchKernel( + y, x, _info.dtype, _info.shape[0], _info.shape[1], _info.shape[2], + _info.y_strides[0], _info.y_strides[1], _info.x_strides[0], _info.x_strides[1], _info.x_strides[2], stream)); + } else { + return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; + } + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::reduce_mean::nvidia diff --git a/src/infiniop/ops/reduce_mean/nvidia/reduce_mean_nvidia.cuh b/src/infiniop/ops/reduce_mean/nvidia/reduce_mean_nvidia.cuh new file mode 100644 index 000000000..be16b4491 --- /dev/null +++ b/src/infiniop/ops/reduce_mean/nvidia/reduce_mean_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __REDUCE_MEAN_NVIDIA_H__ +#define __REDUCE_MEAN_NVIDIA_H__ + +#include "../reduce_mean.h" + +DESCRIPTOR(nvidia) + +#endif diff --git a/src/infiniop/ops/reduce_mean/operator.cc b/src/infiniop/ops/reduce_mean/operator.cc new file mode 100644 index 000000000..32e4e3619 --- /dev/null +++ b/src/infiniop/ops/reduce_mean/operator.cc @@ -0,0 +1,182 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/reduce_mean.h" + +#ifdef ENABLE_CPU_API +#include "cpu/reduce_mean_cpu.h" +#endif +#if 
defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API)
+#include "nvidia/reduce_mean_nvidia.cuh"
+#endif
+#ifdef ENABLE_METAX_API
+#include "metax/reduce_mean_metax.h"
+#endif
+
+__C infiniStatus_t infiniopCreateReduceMeanDescriptor(
+    infiniopHandle_t handle,
+    infiniopReduceMeanDescriptor_t *desc_ptr,
+    infiniopTensorDescriptor_t y_desc,
+    infiniopTensorDescriptor_t x_desc,
+    size_t dim) {
+
+#define CREATE(CASE, NAMESPACE)                                 \
+    case CASE:                                                  \
+        return op::reduce_mean::NAMESPACE::Descriptor::create(  \
+            handle,                                             \
+            reinterpret_cast(desc_ptr),                         \
+            y_desc,                                             \
+            x_desc,                                             \
+            dim);
+
+    switch (handle->device) {
+#ifdef ENABLE_CPU_API
+        CREATE(INFINI_DEVICE_CPU, cpu)
+#endif
+#ifdef ENABLE_NVIDIA_API
+        CREATE(INFINI_DEVICE_NVIDIA, nvidia)
+#endif
+#ifdef ENABLE_ILUVATAR_API
+        CREATE(INFINI_DEVICE_ILUVATAR, nvidia)
+#endif
+#ifdef ENABLE_METAX_API
+        CREATE(INFINI_DEVICE_METAX, metax)
+#endif
+    }
+    return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+}
+
+__C infiniStatus_t infiniopGetReduceMeanWorkspaceSize(infiniopReduceMeanDescriptor_t desc, size_t *size) {
+
+#define GET(CASE, NAMESPACE)                              \
+    case CASE:                                            \
+        *size = reinterpret_cast(desc)->workspaceSize();  \
+        return INFINI_STATUS_SUCCESS;
+
+    switch (desc->device_type) {
+#ifdef ENABLE_CPU_API
+        GET(INFINI_DEVICE_CPU, cpu)
+#endif
+#ifdef ENABLE_NVIDIA_API
+        GET(INFINI_DEVICE_NVIDIA, nvidia)
+#endif
+#ifdef ENABLE_ILUVATAR_API
+        GET(INFINI_DEVICE_ILUVATAR, nvidia)
+#endif
+#ifdef ENABLE_METAX_API
+        GET(INFINI_DEVICE_METAX, metax)
+#endif
+    }
+    return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+}
+
+__C infiniStatus_t infiniopReduceMean(
+    infiniopReduceMeanDescriptor_t desc,
+    void *workspace, size_t workspace_size,
+    void *y,
+    const void *x,
+    void *stream) {
+
+#define CALCULATE(CASE, NAMESPACE)                 \
+    case CASE:                                     \
+        return reinterpret_cast(desc)->calculate(  \
+            workspace, workspace_size, y, x, stream);
+
+    switch (desc->device_type) {
+#ifdef ENABLE_CPU_API
+        CALCULATE(INFINI_DEVICE_CPU, cpu)
+#endif
+#ifdef ENABLE_NVIDIA_API
+        CALCULATE(INFINI_DEVICE_NVIDIA, nvidia)
+#endif
+#ifdef ENABLE_ILUVATAR_API
+        CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia)
+#endif
+#ifdef ENABLE_METAX_API
+        CALCULATE(INFINI_DEVICE_METAX, metax)
+#endif
+    }
+    return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+}
+
+__C infiniStatus_t infiniopDestroyReduceMeanDescriptor(infiniopReduceMeanDescriptor_t desc) {
+
+#define DESTROY(CASE, NAMESPACE)        \
+    case CASE:                          \
+        delete reinterpret_cast(desc);  \
+        return INFINI_STATUS_SUCCESS;
+
+    switch (desc->device_type) {
+#ifdef ENABLE_CPU_API
+        DESTROY(INFINI_DEVICE_CPU, cpu)
+#endif
+#ifdef ENABLE_NVIDIA_API
+        DESTROY(INFINI_DEVICE_NVIDIA, nvidia)
+#endif
+#ifdef ENABLE_ILUVATAR_API
+        DESTROY(INFINI_DEVICE_ILUVATAR, nvidia)
+#endif
+#ifdef ENABLE_METAX_API
+        DESTROY(INFINI_DEVICE_METAX, metax)
+#endif
+    }
+    return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+}
diff --git a/src/infiniop/ops/reduce_mean/reduce_mean.h b/src/infiniop/ops/reduce_mean/reduce_mean.h
new file mode 100644
index 000000000..58fa566ea
--- /dev/null
+++ b/src/infiniop/ops/reduce_mean/reduce_mean.h
@@ -0,0 +1,47 @@
+#ifndef REDUCE_MEAN_H
+#define REDUCE_MEAN_H
+
+#include "../../operator.h"
+#include "info.h"
+
+#define DESCRIPTOR(NAMESPACE)                                         \
+                                                                      \
+    namespace op::reduce_mean::NAMESPACE {                            \
+    class Descriptor final : public InfiniopDescriptor {              \
+        struct Opaque;                                                \
+        Opaque *_opaque;                                              \
+        ReduceMeanInfo _info;                                         \
+        size_t _workspace_size;                                       \
+                                                                      \
+        Descriptor(                                                   \
+            Opaque *opaque,                                           \
+            ReduceMeanInfo info,                                      \
+            size_t workspace_size,                                    \
+            infiniDevice_t device_type,                               \
+            int device_id)                                            \
+            : InfiniopDescriptor{device_type, device_id},             \
+              _opaque(opaque),                                        \
+              _info(info),                                            \
+              _workspace_size(workspace_size) {}                      \
+                                                                      \
+    public:                                                           \
+        ~Descriptor();                                                \
+                                                                      \
+        size_t workspaceSize() const { return _workspace_size; }      \
+                                                                      \
+        static infiniStatus_t create(                                 \
+            infiniopHandle_t handle,                                  \
+            Descriptor **desc_ptr,                                    \
+            infiniopTensorDescriptor_t y_desc,                        \
+            infiniopTensorDescriptor_t x_desc,                        \
+            size_t dim);                                              \
+                                                                      \
+        infiniStatus_t calculate(                                     \
+            void *workspace, size_t workspace_size,                   \
+            void *y,                                                  \
+            const void *x,                                            \
+            void *stream) const;                                      \
+    };                                                                \
+    }
+
+#endif // REDUCE_MEAN_H
diff --git a/src/infiniop/reduce/cuda/reduce.cuh b/src/infiniop/reduce/cuda/reduce.cuh
index a1d2c2501..0a30a3a26 100644
--- a/src/infiniop/reduce/cuda/reduce.cuh
+++ b/src/infiniop/reduce/cuda/reduce.cuh
@@ -61,6 +61,24 @@ __device__ __forceinline__ Tdata max(const Tdata *data_ptr, size_t count) {
     return BlockReduce(temp_storage).Reduce(max_, cub::Max(), BLOCK_SIZE);
 }

+// Sum(x) on non-contiguous data of length count
+template
+__device__ __forceinline__ Tcompute sum(
+    const Tdata *data_ptr,
+    size_t count,
+    ptrdiff_t stride) {
+    Tcompute s = 0;
+
+    for (size_t i = threadIdx.x; i < count; i += BLOCK_SIZE) {
+        s += Tcompute(data_ptr[i * stride]);
+    }
+
+    using BlockReduce = cub::BlockReduce;
+    __shared__ 
typename BlockReduce::TempStorage temp_storage;
+
+    return BlockReduce(temp_storage).Sum(s);
+}
 } // namespace op::common_cuda::reduce_op

 #endif
diff --git a/src/utils/check.h b/src/utils/check.h
index 7f4a2bdd9..72a0995fd 100644
--- a/src/utils/check.h
+++ b/src/utils/check.h
@@ -59,4 +59,22 @@
 #define CHECK_SAME_STRIDES(FIRST, ...) CHECK_SAME_VEC(INFINI_STATUS_BAD_TENSOR_STRIDES, FIRST, __VA_ARGS__)

+#define CHECK_REDUCE_SHAPE(INPUT_SHAPE, DIM, EXPECTED_SHAPE)  \
+    do {                                                      \
+        if (INPUT_SHAPE.empty()) {                            \
+            if (!EXPECTED_SHAPE.empty()) {                    \
+                return INFINI_STATUS_BAD_TENSOR_SHAPE;        \
+            }                                                 \
+            break;                                            \
+        }                                                     \
+        if (DIM >= INPUT_SHAPE.size()) {                      \
+            return INFINI_STATUS_BAD_PARAM;                   \
+        }                                                     \
+        std::vector reduced_shape = INPUT_SHAPE;              \
+        reduced_shape[DIM] = 1;                               \
+        if (reduced_shape != EXPECTED_SHAPE) {                \
+            return INFINI_STATUS_BAD_TENSOR_SHAPE;            \
+        }                                                     \
+    } while (0)
+
 #endif // INFINIUTILS_CHECK_H
diff --git a/test/infiniop-test/test_generate/testcases/reduce_mean.py b/test/infiniop-test/test_generate/testcases/reduce_mean.py
new file mode 100644
index 000000000..0e05d4240
--- /dev/null
+++ b/test/infiniop-test/test_generate/testcases/reduce_mean.py
@@ -0,0 +1,111 @@
+import numpy as np
+import gguf
+from typing import List
+
+from .. import InfiniopTestWriter, InfiniopTestCase, np_dtype_to_ggml, gguf_strides, contiguous_gguf_strides
+
+
+def reduce_mean(x, dim):
+    if isinstance(x, np.float64):
+        return x
+    return x.mean(axis=dim, keepdims=True)
+
+
+def random_tensor(shape: tuple, dtype: np.dtype) -> np.ndarray:
+    return np.random.uniform(-1.0, 1.0, shape).astype(dtype) * 0.001
+
+
+class ReduceMeanTestCase(InfiniopTestCase):
+    def __init__(
+        self,
+        x: np.ndarray,
+        y: np.ndarray,
+        shape_x: List[int] | None,
+        shape_y: List[int] | None,
+        stride_x: List[int] | None,
+        stride_y: List[int] | None,
+        dim: int = 0,
+    ):
+        super().__init__("reduce_mean")
+        self.x = x
+        self.y = y
+        self.shape_x = shape_x
+        self.shape_y = shape_y
+        self.stride_x = stride_x
+        self.stride_y = stride_y
+        self.dim = dim
+
+    def write_test(self, test_writer: "InfiniopTestWriter"):
+        super().write_test(test_writer)
+        if self.shape_x is not None:
+            test_writer.add_array(test_writer.gguf_key("x.shape"), self.shape_x)
+        if self.shape_y is not None:
+            test_writer.add_array(test_writer.gguf_key("y.shape"), self.shape_y)
+        if self.stride_x is not None:
+            test_writer.add_array(test_writer.gguf_key("x.strides"), gguf_strides(*self.stride_x))
+        test_writer.add_array(
+            test_writer.gguf_key("y.strides"),
+            gguf_strides(*self.stride_y if self.stride_y is not None else contiguous_gguf_strides(self.shape_y))
+        )
+        test_writer.add_uint64(test_writer.gguf_key("dim"), self.dim)
+        test_writer.add_tensor(
+            test_writer.gguf_key("x"),
+            self.x,
+            raw_dtype=np_dtype_to_ggml(self.x.dtype),
+        )
+        test_writer.add_tensor(
+            test_writer.gguf_key("y"),
+            self.y,
+            raw_dtype=np_dtype_to_ggml(self.y.dtype),
+        )
+        ans = reduce_mean(
+            self.x.astype(np.float64), self.dim
+        )
+        test_writer.add_tensor(
+            test_writer.gguf_key("ans"), ans, raw_dtype=gguf.GGMLQuantizationType.F64
+        )
+
+
+if __name__ == "__main__":
+    test_writer = InfiniopTestWriter("reduce_mean.gguf")
+    test_cases = []
+    # ==============================================================================
+    #  Configuration
+    # ==============================================================================
+    # These are not meant to be imported from other modules
+    _TEST_CASES_ 
= [ + # y_shape, x_shape, y_stride, x_stride, dim + # ((0,), (0,), (0,), (0,), 0), + ((1, ), (32, ), None, None, 0), + ((1, 4), (1, 4), None, None, 0), + ((1, 1), (1, 4), None, None, 1), + ((16, 1), (16, 2048), None, None, 1), + ((1, 16), (2048, 16), None, None, 0), + ((16, 1), (16, 2048), (4096, 1), (4096, 1), 1), + ((1, 2048), (16, 2048), (4096, 1), (4096, 1), 0), + ((4, 4, 1), (4, 4, 2048), None, None, 2), + ((1, 4, 4), (2048, 4, 4), None, None, 0), + ((4, 1, 4), (4, 2048, 4), (45056, 5632, 1), (32768, 8, 1), 1), +] + _TENSOR_DTYPES_ = [np.float16, np.float32] + + for dtype in _TENSOR_DTYPES_: + for shape_y, shape_x, stride_y, stride_x, dim in _TEST_CASES_: + x = random_tensor(shape_x, dtype) + y = np.empty(tuple(0 for _ in shape_y), dtype=dtype) + test_case = ReduceMeanTestCase( + x, + y, + shape_x, + shape_y, + stride_x, + stride_y, + dim, + ) + test_cases.append(test_case) + + test_writer.add_tests(test_cases) + test_writer.save() diff --git a/test/infiniop/libinfiniop/op_register.py b/test/infiniop/libinfiniop/op_register.py index e92e77105..7f147a8b2 100644 --- a/test/infiniop/libinfiniop/op_register.py +++ b/test/infiniop/libinfiniop/op_register.py @@ -489,3 +489,35 @@ def conv_(lib): lib.infiniopDestroyConvDescriptor.argtypes = [ infiniopOperatorDescriptor_t, ] + +@OpRegister.operator +def reduce_mean_(lib): + lib.infiniopCreateReduceMeanDescriptor.restype = c_int32 + lib.infiniopCreateReduceMeanDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + c_size_t, + ] + + lib.infiniopGetReduceMeanWorkspaceSize.restype = c_int32 + lib.infiniopGetReduceMeanWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopReduceMean.restype = c_int32 + lib.infiniopReduceMean.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyReduceMeanDescriptor.restype = c_int32 + lib.infiniopDestroyReduceMeanDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] \ No newline at end of file diff --git a/test/infiniop/reduce_mean.py b/test/infiniop/reduce_mean.py new file mode 100644 index 000000000..e56663e44 --- /dev/null +++ b/test/infiniop/reduce_mean.py @@ -0,0 +1,150 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ + # y_shape, x_shape, y_stride, x_stride, dim + ((), (), None, None, 0), + ((1, ), (32, ), None, None, 0), + ((1, 4), (1, 4), None, None, 0), + ((1, 1), (1, 4), None, None, 1), + ((16, 1), (16, 2048), None, None, 1), + ((1, 16), (2048, 16), None, None, 0), + ((16, 1), (16, 2048), (4096, 1), (4096, 1), 1), + ((1, 2048), (16, 2048), (4096, 1), (4096, 1), 0), + ((4, 4, 1), (4, 4, 2048), None, None, 2), + ((1, 4, 4), (2048, 4, 4), None, None, 0), + ((4, 1, 4), (4, 2048, 4), (45056, 5632, 1), (32768, 8, 1), 1), +] + +# x types used for testing +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.BF16, InfiniDtype.F32] + +_TEST_CASES = 
_TEST_CASES_
+
+# Tolerance map for different data types
+_TOLERANCE_MAP = {
+    InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-2},
+    InfiniDtype.BF16: {"atol": 5e-3, "rtol": 5e-2},
+    InfiniDtype.F32: {"atol": 1e-5, "rtol": 1e-5},
+}
+
+DEBUG = False
+PROFILE = False
+NUM_PRERUN = 10
+NUM_ITERATIONS = 1000
+
+
+def reduce_mean(x, dim):
+    return x.mean(dim=dim, keepdim=True)
+
+
+def test(
+    handle,
+    device,
+    y_shape,
+    x_shape,
+    y_stride,
+    x_stride,
+    dim,
+    dtype=InfiniDtype.F16,
+    sync=None,
+):
+    print(
+        f"Testing Reduce_Mean on {InfiniDeviceNames[device]} with y_shape:{y_shape} x_shape:{x_shape}"
+        f" y_stride:{y_stride} x_stride:{x_stride} dim:{dim} dtype:{InfiniDtypeNames[dtype]}"
+    )
+
+    x = TestTensor(x_shape, x_stride, dtype, device)
+    ans = reduce_mean(x.torch_tensor(), dim)
+
+    y = TestTensor(y_shape, y_stride, dtype, device)
+
+    if sync is not None:
+        sync()
+
+    descriptor = infiniopOperatorDescriptor_t()
+    check_error(
+        LIBINFINIOP.infiniopCreateReduceMeanDescriptor(
+            handle, ctypes.byref(descriptor), y.descriptor, x.descriptor, ctypes.c_size_t(dim)
+        )
+    )
+
+    # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
+    x.destroy_desc()
+    y.destroy_desc()
+
+    workspace_size = c_uint64(0)
+    check_error(
+        LIBINFINIOP.infiniopGetReduceMeanWorkspaceSize(
+            descriptor, ctypes.byref(workspace_size)
+        )
+    )
+    workspace = TestWorkspace(workspace_size.value, x.device)
+
+    def lib_reduce_mean():
+        check_error(
+            LIBINFINIOP.infiniopReduceMean(
+                descriptor,
+                workspace.data(),
+                workspace_size.value,
+                y.data(),
+                x.data(),
+                None,
+            )
+        )
+
+    lib_reduce_mean()
+
+    if sync is not None:
+        sync()
+
+    atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
+    if DEBUG:
+        debug(y.actual_tensor(), ans, atol=atol, rtol=rtol)
+    assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol)
+
+    # Profiling workflow
+    if PROFILE:
+        # fmt: off
+        profile_operation("PyTorch", lambda: reduce_mean(x.torch_tensor(), dim), device, NUM_PRERUN, NUM_ITERATIONS)
+        profile_operation("    lib", lambda: lib_reduce_mean(), device, NUM_PRERUN, NUM_ITERATIONS)
+        # fmt: on
+
+    check_error(LIBINFINIOP.infiniopDestroyReduceMeanDescriptor(descriptor))
+
+
+if __name__ == "__main__":
+    args = get_args()
+
+    # Configure testing options
+    DEBUG = args.debug
+    PROFILE = args.profile
+    NUM_PRERUN = args.num_prerun
+    NUM_ITERATIONS = args.num_iterations
+
+    # Execute tests
+    for device in get_test_devices(args):
+        test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES)
+
+    print("\033[92mTest passed!\033[0m")

From 73eee8adac37b34ede20ad098102e62c1a2e051e Mon Sep 17 00:00:00 2001
From: S-hhhhh <2320230838@mail.nankai.edu.cn>
Date: Sat, 16 Aug 2025 13:30:37 +0800
Subject: [PATCH 03/11] update reduce_mean

---
 src/infiniop-test/src/ops/reduce_mean.cpp    | 39 ++++++++++---------
 .../ops/reduce_mean/nvidia/kernel.cuh        |  7 ++--
 2 files changed, 24 insertions(+), 22 deletions(-)

diff --git a/src/infiniop-test/src/ops/reduce_mean.cpp b/src/infiniop-test/src/ops/reduce_mean.cpp
index 2c3ddceaf..114ab2981 100644
--- a/src/infiniop-test/src/ops/reduce_mean.cpp
+++ b/src/infiniop-test/src/ops/reduce_mean.cpp
@@ -6,8 +6,8 @@
 namespace infiniop_test::reduce_mean {
 struct Test::Attributes {
-    std::shared_ptr<Tensor> x;
-    std::shared_ptr<Tensor> y;
+    std::shared_ptr<Tensor> input;
+    std::shared_ptr<Tensor> output;
     std::shared_ptr<Tensor> ans;
     size_t dim;
 };
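Note: _equal_nan is threaded through to allClose() in the hunks below; with the new --equal-nan flag set, two NaNs are considered equal, which matches numpy's equal_nan option. A minimal illustration of the selected behavior (plain numpy, not part of the patch):

    import numpy as np

    a = np.array([np.nan, 1.0])
    b = np.array([np.nan, 1.0])
    assert not np.allclose(a, b)               # default: NaN != NaN
    assert np.allclose(a, b, equal_nan=True)   # what --equal-nan True enables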
@@ -20,16 +20,17 @@ std::shared_ptr<Test> Test::build(
     test->_attributes = new Attributes();
 
     if (attributes.find("dim") == attributes.end()
-        || tensors.find("x") == tensors.end()
+        || tensors.find("input") == tensors.end()
         || tensors.find("ans") == tensors.end()
-        || tensors.find("y") == tensors.end()) {
+        || tensors.find("output") == tensors.end()) {
+        std::cout << "All keys:" << std::endl;
         throw std::runtime_error("Invalid Test: Missing attributes or tensors");
     }
 
     test->_attributes->dim = size_t(*reinterpret_cast<uint64_t *>(attributes["dim"].data()));
     test->_attributes->ans = tensors["ans"];
-    test->_attributes->x = tensors["x"];
-    test->_attributes->y = tensors["y"];
+    test->_attributes->input = tensors["input"];
+    test->_attributes->output = tensors["output"];
 
     return test;
 }
@@ -40,13 +41,13 @@
     infiniopReduceMeanDescriptor_t op_desc;
     CHECK_OR(infiniopCreateReduceMeanDescriptor(handle, &op_desc,
-                                                _attributes->y->desc(),
-                                                _attributes->x->desc(),
+                                                _attributes->output->desc(),
+                                                _attributes->input->desc(),
                                                 _attributes->dim),
              return TEST_FAILED(OP_CREATION_FAILED, "Failed to create ReduceMean descriptor"));
 
-    auto x = _attributes->x->to(device, device_id);
-    auto y = _attributes->y->to(device, device_id);
+    auto input = _attributes->input->to(device, device_id);
+    auto output = _attributes->output->to(device, device_id);
 
     size_t workspace_size;
     CHECK_OR(infiniopGetReduceMeanWorkspaceSize(op_desc, &workspace_size),
@@ -59,13 +60,13 @@
     CHECK_OR(infiniopReduceMean(op_desc,
                                 workspace, workspace_size,
-                                y->data(),
-                                x->data(),
+                                output->data(),
+                                input->data(),
                                 nullptr),
              return TEST_FAILED(OP_EXECUTION_FAILED, "ReduceMean execution failed"));
 
     try {
-        allClose(y, _attributes->ans, _rtol, _atol, _equal_nan);
+        allClose(output, _attributes->ans, _rtol, _atol, _equal_nan);
     } catch (const std::exception &e) {
         return TEST_FAILED(RESULT_INCORRECT, e.what());
     }
@@ -76,8 +77,8 @@
         [=]() {
             infiniopReduceMean(op_desc,
                                workspace, workspace_size,
-                               y->data(),
-                               x->data(),
+                               output->data(),
+                               input->data(),
                                nullptr);
         },
         warm_ups, iterations);
@@ -94,18 +95,18 @@ std::vector<std::string> Test::attribute_names() {
 }
 
 std::vector<std::string> Test::tensor_names() {
-    return {"x", "ans", "y"};
+    return {"input", "ans", "output"};
 }
 
 std::vector<std::string> Test::output_names() {
-    return {"y"};
+    return {"output"};
 }
 
 std::string Test::toString() const {
     std::ostringstream oss;
     oss << op_name() << std::endl;
-    oss << "- x: " << _attributes->x->info() << std::endl;
-    oss << "- y: " << _attributes->y->info() << std::endl;
+    oss << "- input: " << _attributes->input->info() << std::endl;
+    oss << "- output: " << _attributes->output->info() << std::endl;
     oss << "- dim=" << _attributes->dim << std::endl;
     oss << std::scientific << std::setprecision(2);
     oss << "- rtol=" << _rtol << ", atol=" << _atol << ", equal_nan=" << _equal_nan << std::endl;
diff --git a/src/infiniop/ops/reduce_mean/nvidia/kernel.cuh b/src/infiniop/ops/reduce_mean/nvidia/kernel.cuh
index c1dbc3fa8..a64d6cfcb 100644
--- a/src/infiniop/ops/reduce_mean/nvidia/kernel.cuh
+++ b/src/infiniop/ops/reduce_mean/nvidia/kernel.cuh
@@ -15,12 +15,13 @@ __device__ void ReduceMeanKernel(
     __shared__ Tcompute mean_;
     Tcompute sum_0 = op::common_cuda::reduce_op::sum<BLOCK_SIZE, Tdata, Tcompute>(x, width, x_stride_w);
     if (threadIdx.x == 0) {
-        mean_ = sum_0/width;
+        // mean_ = sum_0/width;
+        *y = sum_0/width;
     }
-    __syncthreads();
+    // __syncthreads();
 
     // [Elementwise] Divide each element by the sum and store in shared memory
-    *y = mean_;
+    // *y = mean_;
 }
 
 #endif // __REDUCE_MEAN_KERNEL_CUH__
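Note: [PATCH 04/11] below generalizes ReduceMeanInfo to a canonical 4-D view: the reduced axis is swapped to the last position, then shape and strides are left-padded with size-1 dimensions (stride 0) until ndim == 4. A rough numpy analogue of that bookkeeping (a sketch with illustrative names, not the library API; the C++ code only permutes stride metadata and never copies data):

    import numpy as np

    x = np.random.rand(4, 2048, 4).astype(np.float32)
    dim = 1
    xt = np.swapaxes(x, dim, -1)            # reduced axis -> last position
    pad = (1,) * (4 - xt.ndim)              # left-pad to 4 dims
    x4 = xt.reshape(pad + xt.shape)         # (1, 4, 4, 2048)
    y4 = x4.mean(axis=-1, keepdims=True)    # reduce the last axis
    assert np.allclose(np.squeeze(y4), x.mean(axis=dim))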
From c8592e865eba82f0546d773a102c790aa3ade86a Mon Sep 17 00:00:00 2001
From: S-hhhhh <2320230838@mail.nankai.edu.cn>
Date: Sat, 16 Aug 2025 14:46:43 +0800
Subject: [PATCH 04/11] update reduce_mean

---
 .../ops/reduce_mean/cpu/reduce_mean_cpu.cc   | 39 +++++---
 src/infiniop/ops/reduce_mean/info.h          | 94 ++++++++++++-------
 .../reduce_mean/metax/reduce_mean_metax.maca | 44 ++++-----
 .../ops/reduce_mean/nvidia/kernel.cuh        | 10 +-
 .../reduce_mean/nvidia/reduce_mean_nvidia.cu | 39 ++++----
 5 files changed, 133 insertions(+), 93 deletions(-)

diff --git a/src/infiniop/ops/reduce_mean/cpu/reduce_mean_cpu.cc b/src/infiniop/ops/reduce_mean/cpu/reduce_mean_cpu.cc
index 3682d7631..b56184ef5 100644
--- a/src/infiniop/ops/reduce_mean/cpu/reduce_mean_cpu.cc
+++ b/src/infiniop/ops/reduce_mean/cpu/reduce_mean_cpu.cc
@@ -21,25 +21,36 @@ infiniStatus_t Descriptor::create(
 template <typename T>
 infiniStatus_t reduce_mean(const ReduceMeanInfo *info, T *y, const T *x) {
     const size_t batch_size = info->shape[0];
-    const size_t rows = info->shape[1];
-    const size_t cols = info->shape[2]; // last dim (the reduced dim)
+    const size_t channels = info->shape[1];
+    const size_t rows = info->shape[2];
+    const size_t cols = info->shape[3]; // the reduced dim
 
     const ptrdiff_t y_batch_stride = info->y_strides[0];
-    const ptrdiff_t y_row_stride = info->y_strides[1];
+    const ptrdiff_t y_channel_stride = info->y_strides[1];
+    const ptrdiff_t y_row_stride = info->y_strides[2];
     const ptrdiff_t x_batch_stride = info->x_strides[0];
-    const ptrdiff_t x_row_stride = info->x_strides[1];
-    const ptrdiff_t x_col_stride = info->x_strides[2];
+    const ptrdiff_t x_channel_stride = info->x_strides[1];
+    const ptrdiff_t x_row_stride = info->x_strides[2];
+    const ptrdiff_t x_col_stride = info->x_strides[3];
 
-    #pragma omp parallel for collapse(2)
+    #pragma omp parallel for collapse(3)
     for (size_t batch = 0; batch < batch_size; ++batch) {
-        for (size_t row = 0; row < rows; ++row) {
-            const T* input_start = x + batch * x_batch_stride + row * x_row_stride;
-            T* output_ptr = y + batch * y_batch_stride + row * y_row_stride;
-            float mean = op::common_cpu::reduce_op::sum(input_start, cols, x_col_stride) / cols;
-            if constexpr (std::is_same<T, fp16_t>::value || std::is_same<T, bf16_t>::value) {
-                *output_ptr = utils::cast<T>(mean);
-            } else {
-                *output_ptr = mean;
+        for (size_t channel = 0; channel < channels; ++channel) {
+            for (size_t row = 0; row < rows; ++row) {
+                const T* input_start = x + batch * x_batch_stride
+                                        + channel * x_channel_stride
+                                        + row * x_row_stride;
+                T* output_ptr = y + batch * y_batch_stride
+                                 + channel * y_channel_stride
+                                 + row * y_row_stride;
+
+                float mean = op::common_cpu::reduce_op::sum(input_start, cols, x_col_stride) / cols;
+
+                if constexpr (std::is_same<T, fp16_t>::value || std::is_same<T, bf16_t>::value) {
+                    *output_ptr = utils::cast<T>(mean);
+                } else {
+                    *output_ptr = mean;
+                }
             }
         }
     }
diff --git a/src/infiniop/ops/reduce_mean/info.h b/src/infiniop/ops/reduce_mean/info.h
index 41785a082..428675989 100644
--- a/src/infiniop/ops/reduce_mean/info.h
+++ b/src/infiniop/ops/reduce_mean/info.h
@@ -29,50 +29,76 @@ class ReduceMeanInfo {
             CHECK_STATUS(INFINI_STATUS_BAD_TENSOR_SHAPE);
         }
         CHECK_REDUCE_SHAPE(x_desc->shape(), dim, y_desc->shape());
-        if(ndim == 3){
-            std::vector<size_t> shape = x_desc->shape();
-            std::vector<ptrdiff_t> y_strides = y_desc->strides();
-            std::vector<ptrdiff_t> x_strides = x_desc->strides();
-            if (dim != 2){
-                std::swap(shape[dim], shape[2]);
-                std::swap(y_strides[dim], y_strides[2]);
-                std::swap(x_strides[dim], x_strides[2]);
-            }
+        if (ndim > 4) return INFINI_STATUS_BAD_TENSOR_SHAPE;
+        else if (ndim == 0){
+            std::vector<size_t> shape = {1, 1, 1, 1};
+            std::vector<ptrdiff_t> y_strides = {0, 0, 0, 0};
+            std::vector<ptrdiff_t> x_strides = {0, 0, 0, 0};
             return utils::Result<ReduceMeanInfo>(ReduceMeanInfo{
                 dtype, shape, y_strides,
x_strides}); } - else if (ndim == 2){ + else{ std::vector shape = x_desc->shape(); std::vector y_strides = y_desc->strides(); std::vector x_strides = x_desc->strides(); - if (dim != 1){ - std::swap(shape[dim], shape[1]); - std::swap(y_strides[dim], y_strides[1]); - std::swap(x_strides[dim], x_strides[1]); + if (dim != (shape.size() - 1)){ + std::swap(shape[dim], shape[shape.size() - 1]); + std::swap(y_strides[dim], y_strides[shape.size() - 1]); + std::swap(x_strides[dim], x_strides[shape.size() - 1]); + } + while (shape.size() < 4){ + shape.insert(shape.begin(), 1); + y_strides.insert(y_strides.begin(), 0); + x_strides.insert(x_strides.begin(), 0); } - shape.insert(shape.begin(), 1); - y_strides.insert(y_strides.begin(), 0); - x_strides.insert(x_strides.begin(), 0); - return utils::Result(ReduceMeanInfo{ - dtype, shape, y_strides, x_strides}); - } - else if (ndim == 1){ - std::vector shape = {1, 1, (x_desc->shape())[0]}; - std::vector y_strides = {0, 0, (y_desc->strides())[0]}; - std::vector x_strides = {0, 0, (x_desc->strides())[0]}; - return utils::Result(ReduceMeanInfo{ - dtype, shape, y_strides, x_strides}); - } - else if (ndim == 0){ - std::vector shape = {1, 1, 1}; - std::vector y_strides = {0, 0, 0}; - std::vector x_strides = {0, 0, 0}; return utils::Result(ReduceMeanInfo{ dtype, shape, y_strides, x_strides}); } - else{ - return INFINI_STATUS_BAD_TENSOR_SHAPE; - } + + // if(ndim == 3){ + // std::vector shape = x_desc->shape(); + // std::vector y_strides = y_desc->strides(); + // std::vector x_strides = x_desc->strides(); + // if (dim != 2){ + // std::swap(shape[dim], shape[2]); + // std::swap(y_strides[dim], y_strides[2]); + // std::swap(x_strides[dim], x_strides[2]); + // } + // return utils::Result(ReduceMeanInfo{ + // dtype, shape, y_strides, x_strides}); + // } + // else if (ndim == 2){ + // std::vector shape = x_desc->shape(); + // std::vector y_strides = y_desc->strides(); + // std::vector x_strides = x_desc->strides(); + // if (dim != 1){ + // std::swap(shape[dim], shape[1]); + // std::swap(y_strides[dim], y_strides[1]); + // std::swap(x_strides[dim], x_strides[1]); + // } + // shape.insert(shape.begin(), 1); + // y_strides.insert(y_strides.begin(), 0); + // x_strides.insert(x_strides.begin(), 0); + // return utils::Result(ReduceMeanInfo{ + // dtype, shape, y_strides, x_strides}); + // } + // else if (ndim == 1){ + // std::vector shape = {1, 1, (x_desc->shape())[0]}; + // std::vector y_strides = {0, 0, (y_desc->strides())[0]}; + // std::vector x_strides = {0, 0, (x_desc->strides())[0]}; + // return utils::Result(ReduceMeanInfo{ + // dtype, shape, y_strides, x_strides}); + // } + // else if (ndim == 0){ + // std::vector shape = {1, 1, 1}; + // std::vector y_strides = {0, 0, 0}; + // std::vector x_strides = {0, 0, 0}; + // return utils::Result(ReduceMeanInfo{ + // dtype, shape, y_strides, x_strides}); + // } + // else{ + // return INFINI_STATUS_BAD_TENSOR_SHAPE; + // } } }; diff --git a/src/infiniop/ops/reduce_mean/metax/reduce_mean_metax.maca b/src/infiniop/ops/reduce_mean/metax/reduce_mean_metax.maca index 9abd888f6..0eaf1c9cb 100644 --- a/src/infiniop/ops/reduce_mean/metax/reduce_mean_metax.maca +++ b/src/infiniop/ops/reduce_mean/metax/reduce_mean_metax.maca @@ -11,10 +11,10 @@ template INFINIOP_METAX_KERNEL ReduceMean( Tdata *y_, const Tdata *x_, - size_t batch, size_t height, size_t width, - ptrdiff_t y_stride_b, ptrdiff_t y_stride_h, - ptrdiff_t x_stride_b, ptrdiff_t x_stride_h, ptrdiff_t x_stride_w) { - ReduceMeanKernel(y_, x_, batch, height, width, y_stride_b, 
y_stride_h, x_stride_b, x_stride_h, x_stride_w); + size_t batch, size_t channels, size_t height, size_t width, + ptrdiff_t y_stride_b, ptrdiff_t y_stride_c, ptrdiff_t y_stride_h, + ptrdiff_t x_stride_b, ptrdiff_t x_stride_c, ptrdiff_t x_stride_h, ptrdiff_t x_stride_w) { + ReduceMeanKernel(y_, x_, batch, channels, height, width, y_stride_b, y_stride_c, y_stride_h, x_stride_b, x_stride_c, x_stride_h, x_stride_w); } namespace op::reduce_mean::metax { @@ -43,29 +43,29 @@ infiniStatus_t Descriptor::create( template infiniStatus_t launchKernel(void *y, const void *x, infiniDtype_t dtype, - size_t batch_size, size_t height, size_t width, - ptrdiff_t y_stride_b, ptrdiff_t y_stride_h, - ptrdiff_t x_stride_b, ptrdiff_t x_stride_h, ptrdiff_t x_stride_w, + size_t batch_size, size_t channels, size_t height, size_t width, + ptrdiff_t y_stride_b, ptrdiff_t y_stride_c, ptrdiff_t y_stride_h, + ptrdiff_t x_stride_b, ptrdiff_t x_stride_c, ptrdiff_t x_stride_h, ptrdiff_t x_stride_w, hcStream_t stream) { - dim3 grid(uint32_t(batch_size), uint32_t(height), 1); + dim3 grid=dim3(uint32_t(batch_size), uint32_t(channels), uint32_t(height)); if (dtype == INFINI_DTYPE_F16) { ReduceMean <<>>((half *)y, (const half *)x, - batch_size, height, width, - y_stride_b, y_stride_h, - x_stride_b, x_stride_h, x_stride_w); + batch_size, channels, height, width, + y_stride_b, y_stride_c, y_stride_h, + x_stride_b, x_stride_c, x_stride_h, x_stride_w); } else if (dtype == INFINI_DTYPE_BF16) { ReduceMean <<>>((__hpcc_bfloat16 *)y, (const __hpcc_bfloat16 *)x, - batch_size, height, width, - y_stride_b, y_stride_h, - x_stride_b, x_stride_h, x_stride_w); + batch_size, channels, height, width, + y_stride_b, y_stride_c, y_stride_h, + x_stride_b, x_stride_c, x_stride_h, x_stride_w); } else if (dtype == INFINI_DTYPE_F32) { ReduceMean <<>>((float *)y, (const float *)x, - batch_size, height, width, - y_stride_b, y_stride_h, - x_stride_b, x_stride_h, x_stride_w); + batch_size, channels, height, width, + y_stride_b, y_stride_c, y_stride_h, + x_stride_b, x_stride_c, x_stride_h, x_stride_w); } else { return INFINI_STATUS_BAD_TENSOR_DTYPE; } @@ -79,12 +79,14 @@ infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, hcStream_t stream = (hcStream_t)stream_; if (_opaque->internal->maxThreadsPerBlock() == METAX_BLOCK_SIZE_1024) { CHECK_STATUS(launchKernel( - y, x, _info.dtype, _info.shape[0], _info.shape[1], _info.shape[2], - _info.y_strides[0], _info.y_strides[1], _info.x_strides[0], _info.x_strides[1], _info.x_strides[2], stream)); + y, x, _info.dtype, _info.shape[0], _info.shape[1], _info.shape[2], _info.shape[3], + _info.y_strides[0], _info.y_strides[1], _info.y_strides[2], + _info.x_strides[0], _info.x_strides[1], _info.x_strides[2], _info.x_strides[3], stream)); } else if (_opaque->internal->maxThreadsPerBlock() == METAX_BLOCK_SIZE_512) { CHECK_STATUS(launchKernel( - y, x, _info.dtype, _info.shape[0], _info.shape[1], _info.shape[2], - _info.y_strides[0], _info.y_strides[1], _info.x_strides[0], _info.x_strides[1], _info.x_strides[2], stream)); + y, x, _info.dtype, _info.shape[0], _info.shape[1], _info.shape[2], _info.shape[3], + _info.y_strides[0], _info.y_strides[1], _info.y_strides[2], + _info.x_strides[0], _info.x_strides[1], _info.x_strides[2], _info.x_strides[3], stream)); } else { return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; } diff --git a/src/infiniop/ops/reduce_mean/nvidia/kernel.cuh b/src/infiniop/ops/reduce_mean/nvidia/kernel.cuh index a64d6cfcb..44fb2226e 100644 --- 
a/src/infiniop/ops/reduce_mean/nvidia/kernel.cuh +++ b/src/infiniop/ops/reduce_mean/nvidia/kernel.cuh @@ -4,12 +4,12 @@ template __device__ void ReduceMeanKernel( Tdata *y_, const Tdata *x_, - size_t batch, size_t height, size_t width, - ptrdiff_t y_stride_b, ptrdiff_t y_stride_h, - ptrdiff_t x_stride_b, ptrdiff_t x_stride_h, ptrdiff_t x_stride_w) { + size_t batch, size_t channels, size_t height, size_t width, + ptrdiff_t y_stride_b, ptrdiff_t y_stride_c, ptrdiff_t y_stride_h, + ptrdiff_t x_stride_b, ptrdiff_t x_stride_c, ptrdiff_t x_stride_h, ptrdiff_t x_stride_w) { - Tdata *y = y_ + blockIdx.x * y_stride_b + blockIdx.y * y_stride_h; - const Tdata *x = x_ + blockIdx.x * x_stride_b + blockIdx.y * x_stride_h; + Tdata *y = y_ + blockIdx.x * y_stride_b + blockIdx.y * y_stride_c + blockIdx.z * y_stride_h; + const Tdata *x = x_ + blockIdx.x * x_stride_b + blockIdx.y * x_stride_c + blockIdx.z * x_stride_h; // [Reduce] Find the sum of each updated row and store in shared memory __shared__ Tcompute mean_; diff --git a/src/infiniop/ops/reduce_mean/nvidia/reduce_mean_nvidia.cu b/src/infiniop/ops/reduce_mean/nvidia/reduce_mean_nvidia.cu index 170c956e9..1487e080f 100644 --- a/src/infiniop/ops/reduce_mean/nvidia/reduce_mean_nvidia.cu +++ b/src/infiniop/ops/reduce_mean/nvidia/reduce_mean_nvidia.cu @@ -11,10 +11,10 @@ template INFINIOP_CUDA_KERNEL ReduceMean( Tdata *y_, const Tdata *x_, - size_t batch, size_t height, size_t width, - ptrdiff_t y_stride_b, ptrdiff_t y_stride_h, - ptrdiff_t x_stride_b, ptrdiff_t x_stride_h, ptrdiff_t x_stride_w) { - ReduceMeanKernel(y_, x_, batch, height, width, y_stride_b, y_stride_h, x_stride_b, x_stride_h, x_stride_w); + size_t batch, size_t channels, size_t height, size_t width, + ptrdiff_t y_stride_b, ptrdiff_t y_stride_c, ptrdiff_t y_stride_h, + ptrdiff_t x_stride_b, ptrdiff_t x_stride_c, ptrdiff_t x_stride_h, ptrdiff_t x_stride_w) { + ReduceMeanKernel(y_, x_, batch, channels, height, width, y_stride_b, y_stride_c, y_stride_h, x_stride_b, x_stride_c, x_stride_h, x_stride_w); } namespace op::reduce_mean::nvidia { @@ -43,29 +43,29 @@ infiniStatus_t Descriptor::create( template infiniStatus_t launchKernel(void *y, const void *x, infiniDtype_t dtype, - size_t batch_size, size_t height, size_t width, - ptrdiff_t y_stride_b, ptrdiff_t y_stride_h, - ptrdiff_t x_stride_b, ptrdiff_t x_stride_h, ptrdiff_t x_stride_w, + size_t batch_size, size_t channels, size_t height, size_t width, + ptrdiff_t y_stride_b, ptrdiff_t y_stride_c, ptrdiff_t y_stride_h, + ptrdiff_t x_stride_b, ptrdiff_t x_stride_c, ptrdiff_t x_stride_h, ptrdiff_t x_stride_w, cudaStream_t stream) { - dim3 grid(uint32_t(batch_size), uint32_t(height), 1); + dim3 grid=dim3(uint32_t(batch_size), uint32_t(channels), uint32_t(height)); if (dtype == INFINI_DTYPE_F16) { ReduceMean <<>>((half *)y, (const half *)x, - batch_size, height, width, - y_stride_b, y_stride_h, - x_stride_b, x_stride_h, x_stride_w); + batch_size, channels, height, width, + y_stride_b, y_stride_c, y_stride_h, + x_stride_b, x_stride_c, x_stride_h, x_stride_w); } else if (dtype == INFINI_DTYPE_BF16) { ReduceMean <<>>((__nv_bfloat16 *)y, (const __nv_bfloat16 *)x, - batch_size, height, width, - y_stride_b, y_stride_h, - x_stride_b, x_stride_h, x_stride_w); + batch_size, channels, height, width, + y_stride_b, y_stride_c, y_stride_h, + x_stride_b, x_stride_c, x_stride_h, x_stride_w); } else if (dtype == INFINI_DTYPE_F32) { ReduceMean <<>>((float *)y, (const float *)x, - batch_size, height, width, - y_stride_b, y_stride_h, - x_stride_b, x_stride_h, 
x_stride_w); + batch_size, channels, height, width, + y_stride_b, y_stride_c, y_stride_h, + x_stride_b, x_stride_c, x_stride_h, x_stride_w); } else { return INFINI_STATUS_BAD_TENSOR_DTYPE; } @@ -79,8 +79,9 @@ infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, cudaStream_t stream = (cudaStream_t)stream_; if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_1024) { CHECK_STATUS(launchKernel( - y, x, _info.dtype, _info.shape[0], _info.shape[1], _info.shape[2], - _info.y_strides[0], _info.y_strides[1], _info.x_strides[0], _info.x_strides[1], _info.x_strides[2], stream)); + y, x, _info.dtype, _info.shape[0], _info.shape[1], _info.shape[2], _info.shape[3], + _info.y_strides[0], _info.y_strides[1], _info.y_strides[2], + _info.x_strides[0], _info.x_strides[1], _info.x_strides[2], _info.x_strides[3], stream)); } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_512) { CHECK_STATUS(launchKernel( y, x, _info.dtype, _info.shape[0], _info.shape[1], _info.shape[2], From dfcdeb2116fe760ef5589f14c82795c7b50c1aa4 Mon Sep 17 00:00:00 2001 From: S-hhhhh <2320230838@mail.nankai.edu.cn> Date: Sat, 16 Aug 2025 16:38:08 +0800 Subject: [PATCH 05/11] =?UTF-8?q?=E6=B7=BB=E5=8A=A0ReduceMean,=E6=9B=B4?= =?UTF-8?q?=E6=96=B0ReduceMax?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- include/infiniop.h | 1 + include/infiniop/ops/reduce_max.h | 27 +++ scripts/python_test.py | 1 + src/infiniop-test/include/ops.hpp | 2 + src/infiniop-test/src/ops/reduce_max.cpp | 119 ++++++++++++ src/infiniop-test/src/ops/reduce_mean.cpp | 1 - .../ops/reduce_max/cpu/reduce_max_cpu.cc | 78 ++++++++ .../ops/reduce_max/cpu/reduce_max_cpu.h | 7 + src/infiniop/ops/reduce_max/info.h | 62 ++++++ .../ops/reduce_max/metax/reduce_max_metax.h | 8 + .../reduce_max/metax/reduce_max_metax.maca | 96 ++++++++++ src/infiniop/ops/reduce_max/nvidia/kernel.cuh | 23 +++ .../reduce_max/nvidia/reduce_max_nvidia.cu | 99 ++++++++++ .../reduce_max/nvidia/reduce_max_nvidia.cuh | 8 + src/infiniop/ops/reduce_max/operator.cc | 181 ++++++++++++++++++ src/infiniop/ops/reduce_max/reduce_max.h | 47 +++++ src/infiniop/ops/reduce_mean/info.h | 45 ----- src/infiniop/reduce/cuda/reduce.cuh | 25 ++- .../test_generate/testcases/reduce_max.py | 112 +++++++++++ .../test_generate/testcases/reduce_mean.py | 13 +- test/infiniop/libinfiniop/op_register.py | 32 ++++ test/infiniop/reduce_max.py | 150 +++++++++++++++ test/infiniop/reduce_mean.py | 6 +- 23 files changed, 1084 insertions(+), 59 deletions(-) create mode 100644 include/infiniop/ops/reduce_max.h create mode 100644 src/infiniop-test/src/ops/reduce_max.cpp create mode 100644 src/infiniop/ops/reduce_max/cpu/reduce_max_cpu.cc create mode 100644 src/infiniop/ops/reduce_max/cpu/reduce_max_cpu.h create mode 100644 src/infiniop/ops/reduce_max/info.h create mode 100644 src/infiniop/ops/reduce_max/metax/reduce_max_metax.h create mode 100644 src/infiniop/ops/reduce_max/metax/reduce_max_metax.maca create mode 100644 src/infiniop/ops/reduce_max/nvidia/kernel.cuh create mode 100644 src/infiniop/ops/reduce_max/nvidia/reduce_max_nvidia.cu create mode 100644 src/infiniop/ops/reduce_max/nvidia/reduce_max_nvidia.cuh create mode 100644 src/infiniop/ops/reduce_max/operator.cc create mode 100644 src/infiniop/ops/reduce_max/reduce_max.h create mode 100644 test/infiniop-test/test_generate/testcases/reduce_max.py create mode 100644 test/infiniop/reduce_max.py diff --git a/include/infiniop.h b/include/infiniop.h index 85df0c703..f7ad9e02c 100644 --- 
a/include/infiniop.h +++ b/include/infiniop.h @@ -18,5 +18,6 @@ #include "infiniop/ops/swiglu.h" #include "infiniop/tensor_descriptor.h" #include "infiniop/ops/reduce_mean.h" +#include "infiniop/ops/reduce_max.h" #endif // __INFINIOP_API_H__ diff --git a/include/infiniop/ops/reduce_max.h b/include/infiniop/ops/reduce_max.h new file mode 100644 index 000000000..42a3dd62d --- /dev/null +++ b/include/infiniop/ops/reduce_max.h @@ -0,0 +1,27 @@ +#ifndef __INFINIOP_REDUCE_MAX_API_H__ +#define __INFINIOP_REDUCE_MAX_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopReduceMaxDescriptor_t; + +__C __export infiniStatus_t infiniopCreateReduceMaxDescriptor( + infiniopHandle_t handle, + infiniopReduceMaxDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + size_t dim); + +__C __export infiniStatus_t infiniopGetReduceMaxWorkspaceSize(infiniopReduceMaxDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopReduceMax( + infiniopReduceMaxDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream); + +__C __export infiniStatus_t infiniopDestroyReduceMaxDescriptor(infiniopReduceMaxDescriptor_t desc); + +#endif diff --git a/scripts/python_test.py b/scripts/python_test.py index 14588cd7d..89df98708 100644 --- a/scripts/python_test.py +++ b/scripts/python_test.py @@ -25,6 +25,7 @@ def run_tests(args): "sub.py", "swiglu.py", "reduce_mean.py", + "reduce_max.py", ]: result = subprocess.run( f"python {test} {args} --debug", text=True, encoding="utf-8", shell=True diff --git a/src/infiniop-test/include/ops.hpp b/src/infiniop-test/include/ops.hpp index c94ebac90..29e4e633e 100644 --- a/src/infiniop-test/include/ops.hpp +++ b/src/infiniop-test/include/ops.hpp @@ -17,6 +17,7 @@ DECLARE_INFINIOP_TEST(causal_softmax) DECLARE_INFINIOP_TEST(rearrange) DECLARE_INFINIOP_TEST(sub) DECLARE_INFINIOP_TEST(reduce_mean) +DECLARE_INFINIOP_TEST(reduce_max) #define REGISTER_INFINIOP_TEST(name) \ { \ @@ -45,6 +46,7 @@ DECLARE_INFINIOP_TEST(reduce_mean) REGISTER_INFINIOP_TEST(rearrange) \ REGISTER_INFINIOP_TEST(sub) \ REGISTER_INFINIOP_TEST(reduce_mean) \ + REGISTER_INFINIOP_TEST(reduce_max) \ } namespace infiniop_test { diff --git a/src/infiniop-test/src/ops/reduce_max.cpp b/src/infiniop-test/src/ops/reduce_max.cpp new file mode 100644 index 000000000..1b10d0e02 --- /dev/null +++ b/src/infiniop-test/src/ops/reduce_max.cpp @@ -0,0 +1,119 @@ +#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include + +namespace infiniop_test::reduce_max { +struct Test::Attributes { + std::shared_ptr input; + std::shared_ptr output; + std::shared_ptr ans; + size_t dim; +}; + +std::shared_ptr Test::build( + std::unordered_map> attributes, + std::unordered_map> tensors, + double rtol, double atol, bool equal_nan) { + auto test = std::shared_ptr(new Test(rtol, atol, equal_nan)); + test->_attributes = new Attributes(); + + if (attributes.find("dim") == attributes.end() + || tensors.find("input") == tensors.end() + || tensors.find("ans") == tensors.end() + || tensors.find("output") == tensors.end()) { + throw std::runtime_error("Invalid Test: Missing attributes or tensors"); + } + + test->_attributes->dim = size_t(*reinterpret_cast(attributes["dim"].data())); + test->_attributes->ans = tensors["ans"]; + test->_attributes->input = tensors["input"]; + test->_attributes->output = tensors["output"]; + + return test; +} + +std::shared_ptr Test::run( + infiniopHandle_t handle, 
infiniDevice_t device, int device_id,
+    size_t warm_ups, size_t iterations) {
+
+    infiniopReduceMaxDescriptor_t op_desc;
+    CHECK_OR(infiniopCreateReduceMaxDescriptor(handle, &op_desc,
+                                               _attributes->output->desc(),
+                                               _attributes->input->desc(),
+                                               _attributes->dim),
+             return TEST_FAILED(OP_CREATION_FAILED, "Failed to create ReduceMax descriptor"));
+
+    auto input = _attributes->input->to(device, device_id);
+    auto output = _attributes->output->to(device, device_id);
+
+    size_t workspace_size;
+    CHECK_OR(infiniopGetReduceMaxWorkspaceSize(op_desc, &workspace_size),
+             return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size"));
+    void *workspace = nullptr;
+    if (workspace_size > 0) {
+        CHECK_OR(infinirtMalloc(&workspace, workspace_size),
+                 return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace"));
+    }
+
+    CHECK_OR(infiniopReduceMax(op_desc,
+                               workspace, workspace_size,
+                               output->data(),
+                               input->data(),
+                               nullptr),
+             return TEST_FAILED(OP_EXECUTION_FAILED, "ReduceMax execution failed"));
+
+    try {
+        allClose(output, _attributes->ans, _rtol, _atol, _equal_nan);
+    } catch (const std::exception &e) {
+        return TEST_FAILED(RESULT_INCORRECT, e.what());
+    }
+
+    double elapsed_time = 0.;
+
+    elapsed_time = benchmark(
+        [=]() {
+            infiniopReduceMax(op_desc,
+                              workspace, workspace_size,
+                              output->data(),
+                              input->data(),
+                              nullptr);
+        },
+        warm_ups, iterations);
+
+    if (workspace != nullptr) {
+        infinirtFree(workspace);
+    }
+
+    return TEST_PASSED(elapsed_time);
+}
+
+std::vector<std::string> Test::attribute_names() {
+    return {"dim"};
+}
+
+std::vector<std::string> Test::tensor_names() {
+    return {"input", "ans", "output"};
+}
+
+std::vector<std::string> Test::output_names() {
+    return {"output"};
+}
+
+std::string Test::toString() const {
+    std::ostringstream oss;
+    oss << op_name() << std::endl;
+    oss << "- input: " << _attributes->input->info() << std::endl;
+    oss << "- output: " << _attributes->output->info() << std::endl;
+    oss << "- dim=" << _attributes->dim << std::endl;
+    oss << std::scientific << std::setprecision(2);
+    oss << "- rtol=" << _rtol << ", atol=" << _atol << ", equal_nan=" << _equal_nan << std::endl;
+    return oss.str();
+}
+
+Test::~Test() {
+    delete _attributes;
+}
+
+} // namespace infiniop_test::reduce_max
diff --git a/src/infiniop-test/src/ops/reduce_mean.cpp b/src/infiniop-test/src/ops/reduce_mean.cpp
index 114ab2981..4f1aa4bdd 100644
--- a/src/infiniop-test/src/ops/reduce_mean.cpp
+++ b/src/infiniop-test/src/ops/reduce_mean.cpp
@@ -23,7 +23,6 @@ std::shared_ptr<Test> Test::build(
         || tensors.find("input") == tensors.end()
         || tensors.find("ans") == tensors.end()
         || tensors.find("output") == tensors.end()) {
-        std::cout << "All keys:" << std::endl;
         throw std::runtime_error("Invalid Test: Missing attributes or tensors");
     }
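Note: the CPU backend below is a plain collapse(3) loop nest over the canonical 4-D view: one strided max along the last (reduced) axis per (batch, channel, row) triple. For a contiguous layout it computes the same result as this numpy one-liner (illustration only, not part of the patch):

    import numpy as np

    x = np.random.rand(2, 3, 4, 5).astype(np.float32)   # (batch, channel, row, reduced)
    y = x.max(axis=-1, keepdims=True)                   # one max per (b, c, h)
    assert y.shape == (2, 3, 4, 1)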
diff --git a/src/infiniop/ops/reduce_max/cpu/reduce_max_cpu.cc b/src/infiniop/ops/reduce_max/cpu/reduce_max_cpu.cc
new file mode 100644
index 000000000..409a4ca02
--- /dev/null
+++ b/src/infiniop/ops/reduce_max/cpu/reduce_max_cpu.cc
@@ -0,0 +1,78 @@
+#include "reduce_max_cpu.h"
+#include "../../../devices/cpu/common_cpu.h"
+#include "../../../reduce/cpu/reduce.h"
+
+namespace op::reduce_max::cpu {
+
+Descriptor::~Descriptor() {}
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t y_desc,
+    infiniopTensorDescriptor_t x_desc,
+    size_t dim) {
+    auto result = ReduceMaxInfo::create(y_desc, x_desc, dim);
+    CHECK_RESULT(result);
+    *desc_ptr = new Descriptor(nullptr, result.take(), 0, handle->device, handle->device_id);
+    return INFINI_STATUS_SUCCESS;
+}
+
+template <typename T>
+infiniStatus_t reduce_max(const ReduceMaxInfo *info, T *y, const T *x) {
+    const size_t batch_size = info->shape[0];
+    const size_t channels = info->shape[1];
+    const size_t rows = info->shape[2];
+    const size_t cols = info->shape[3]; // the reduced dim
+
+    const ptrdiff_t y_batch_stride = info->y_strides[0];
+    const ptrdiff_t y_channel_stride = info->y_strides[1];
+    const ptrdiff_t y_row_stride = info->y_strides[2];
+    const ptrdiff_t x_batch_stride = info->x_strides[0];
+    const ptrdiff_t x_channel_stride = info->x_strides[1];
+    const ptrdiff_t x_row_stride = info->x_strides[2];
+    const ptrdiff_t x_col_stride = info->x_strides[3];
+
+    #pragma omp parallel for collapse(3)
+    for (size_t batch = 0; batch < batch_size; ++batch) {
+        for (size_t channel = 0; channel < channels; ++channel) {
+            for (size_t row = 0; row < rows; ++row) {
+                const T* input_start = x + batch * x_batch_stride
+                                        + channel * x_channel_stride
+                                        + row * x_row_stride;
+                T* output_ptr = y + batch * y_batch_stride
+                                 + channel * y_channel_stride
+                                 + row * y_row_stride;
+
+                float max = op::common_cpu::reduce_op::max(input_start, cols, x_col_stride);
+
+                if constexpr (std::is_same<T, fp16_t>::value || std::is_same<T, bf16_t>::value) {
+                    *output_ptr = utils::cast<T>(max);
+                } else {
+                    *output_ptr = max;
+                }
+            }
+        }
+    }
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace, size_t workspace_size,
+    void *y,
+    const void *x,
+    void *stream) const {
+
+    if (_info.dtype == INFINI_DTYPE_F16) {
+        CHECK_STATUS(reduce_max(&_info, (fp16_t *)y, (const fp16_t *)x));
+    } else if (_info.dtype == INFINI_DTYPE_BF16) {
+        CHECK_STATUS(reduce_max(&_info, (bf16_t *)y, (const bf16_t *)x));
+    } else if (_info.dtype == INFINI_DTYPE_F32) {
+        CHECK_STATUS(reduce_max(&_info, (float *)y, (const float *)x));
+    } else {
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    return INFINI_STATUS_SUCCESS;
+}
+} // namespace op::reduce_max::cpu
diff --git a/src/infiniop/ops/reduce_max/cpu/reduce_max_cpu.h b/src/infiniop/ops/reduce_max/cpu/reduce_max_cpu.h
new file mode 100644
index 000000000..bf6be9b74
--- /dev/null
+++ b/src/infiniop/ops/reduce_max/cpu/reduce_max_cpu.h
@@ -0,0 +1,7 @@
+#ifndef __REDUCE_MAX_CPU_H__
+#define __REDUCE_MAX_CPU_H__
+#include "../reduce_max.h"
+
+DESCRIPTOR(cpu)
+
+#endif
diff --git a/src/infiniop/ops/reduce_max/info.h b/src/infiniop/ops/reduce_max/info.h
new file mode 100644
index 000000000..0babca6e6
--- /dev/null
+++ b/src/infiniop/ops/reduce_max/info.h
@@ -0,0 +1,62 @@
+#ifndef __REDUCE_MAX_INFO_H__
+#define __REDUCE_MAX_INFO_H__
+
+#include "../../../utils.h"
+#include "../../tensor.h"
+#include <vector>
+
+namespace op::reduce_max {
+
+class ReduceMaxInfo {
+    ReduceMaxInfo() = default;
+
+public:
+    infiniDtype_t dtype;
+
+    std::vector<size_t> shape;
+    std::vector<ptrdiff_t> y_strides;
+    std::vector<ptrdiff_t> x_strides;
+
+    static utils::Result<ReduceMaxInfo> create(infiniopTensorDescriptor_t y_desc, infiniopTensorDescriptor_t x_desc, size_t dim) {
+        auto dtype = y_desc->dtype();
+        if (dtype != x_desc->dtype()) {
+            return INFINI_STATUS_BAD_TENSOR_DTYPE;
+        }
+        CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_BF16, INFINI_DTYPE_F32);
+
+        size_t ndim = y_desc->ndim();
+        if (x_desc->ndim() != ndim) {
+            CHECK_STATUS(INFINI_STATUS_BAD_TENSOR_SHAPE);
+        }
+        CHECK_REDUCE_SHAPE(x_desc->shape(), dim, y_desc->shape());
+        if (ndim > 4) return INFINI_STATUS_BAD_TENSOR_SHAPE;
+        else if (ndim == 0){
+            std::vector<size_t> shape = {1, 1, 1, 1};
+            std::vector<ptrdiff_t> y_strides = {0, 0, 0, 0};
+            std::vector<ptrdiff_t> x_strides = {0, 0, 0, 0};
+            return utils::Result<ReduceMaxInfo>(ReduceMaxInfo{
+ dtype, shape, y_strides, x_strides}); + } + else{ + std::vector shape = x_desc->shape(); + std::vector y_strides = y_desc->strides(); + std::vector x_strides = x_desc->strides(); + if (dim != (shape.size() - 1)){ + std::swap(shape[dim], shape[shape.size() - 1]); + std::swap(y_strides[dim], y_strides[shape.size() - 1]); + std::swap(x_strides[dim], x_strides[shape.size() - 1]); + } + while (shape.size() < 4){ + shape.insert(shape.begin(), 1); + y_strides.insert(y_strides.begin(), 0); + x_strides.insert(x_strides.begin(), 0); + } + return utils::Result(ReduceMaxInfo{ + dtype, shape, y_strides, x_strides}); + } + } +}; + +} // namespace op::reduce_max + +#endif // __REDUCE_MAX_INFO_H__ diff --git a/src/infiniop/ops/reduce_max/metax/reduce_max_metax.h b/src/infiniop/ops/reduce_max/metax/reduce_max_metax.h new file mode 100644 index 000000000..735bc8da4 --- /dev/null +++ b/src/infiniop/ops/reduce_max/metax/reduce_max_metax.h @@ -0,0 +1,8 @@ +#ifndef __REDUCE_MAX_METAX_H__ +#define __REDUCE_MAX_METAX_H__ + +#include "../reduce_max.h" + +DESCRIPTOR(metax) + +#endif diff --git a/src/infiniop/ops/reduce_max/metax/reduce_max_metax.maca b/src/infiniop/ops/reduce_max/metax/reduce_max_metax.maca new file mode 100644 index 000000000..ab120fea5 --- /dev/null +++ b/src/infiniop/ops/reduce_max/metax/reduce_max_metax.maca @@ -0,0 +1,96 @@ +#include "../../../devices/metax/metax_common.h" +#include "reduce_max_metax.h" + +#include +#include "../../../devices/metax/metax_kernel_common.h" + +#include "../../../reduce/cuda/reduce.cuh" + +#include "../nvidia/kernel.cuh" + +template +INFINIOP_METAX_KERNEL ReduceMax( + Tdata *y_, const Tdata *x_, + size_t batch, size_t channels, size_t height, size_t width, + ptrdiff_t y_stride_b, ptrdiff_t y_stride_c, ptrdiff_t y_stride_h, + ptrdiff_t x_stride_b, ptrdiff_t x_stride_c, ptrdiff_t x_stride_h, ptrdiff_t x_stride_w) { + ReduceMaxKernel(y_, x_, batch, channels, height, width, y_stride_b, y_stride_c, y_stride_h, x_stride_b, x_stride_c, x_stride_h, x_stride_w); +} + +namespace op::reduce_max::metax { + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + size_t dim) { + auto info = ReduceMaxInfo::create(y_desc, x_desc, dim); + CHECK_RESULT(info); + *desc_ptr = new Descriptor( + new Opaque{reinterpret_cast(handle)->internal()}, + info.take(), 0, handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +template +infiniStatus_t launchKernel(void *y, const void *x, infiniDtype_t dtype, + size_t batch_size, size_t channels, size_t height, size_t width, + ptrdiff_t y_stride_b, ptrdiff_t y_stride_c, ptrdiff_t y_stride_h, + ptrdiff_t x_stride_b, ptrdiff_t x_stride_c, ptrdiff_t x_stride_h, ptrdiff_t x_stride_w, + hcStream_t stream) { + dim3 grid=dim3(uint32_t(batch_size), uint32_t(channels), uint32_t(height)); + if (dtype == INFINI_DTYPE_F16) { + ReduceMax + <<>>((half *)y, (const half *)x, + batch_size, channels, height, width, + y_stride_b, y_stride_c, y_stride_h, + x_stride_b, x_stride_c, x_stride_h, x_stride_w); + } else if (dtype == INFINI_DTYPE_BF16) { + ReduceMax + <<>>((__hpcc_bfloat16 *)y, (const __hpcc_bfloat16 *)x, + batch_size, channels, height, width, + y_stride_b, y_stride_c, y_stride_h, + x_stride_b, x_stride_c, x_stride_h, x_stride_w); + } else if (dtype == INFINI_DTYPE_F32) { + ReduceMax + <<>>((float *)y, (const float *)x, 
+ batch_size, channels, height, width, + y_stride_b, y_stride_c, y_stride_h, + x_stride_b, x_stride_c, x_stride_h, x_stride_w); + } else { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, + void *y, + const void *x, + void *stream_) const { + hcStream_t stream = (hcStream_t)stream_; + if (_opaque->internal->maxThreadsPerBlock() == METAX_BLOCK_SIZE_1024) { + CHECK_STATUS(launchKernel( + y, x, _info.dtype, _info.shape[0], _info.shape[1], _info.shape[2], _info.shape[3], + _info.y_strides[0], _info.y_strides[1], _info.y_strides[2], + _info.x_strides[0], _info.x_strides[1], _info.x_strides[2], _info.x_strides[3], stream)); + } else if (_opaque->internal->maxThreadsPerBlock() == METAX_BLOCK_SIZE_512) { + CHECK_STATUS(launchKernel( + y, x, _info.dtype, _info.shape[0], _info.shape[1], _info.shape[2], _info.shape[3], + _info.y_strides[0], _info.y_strides[1], _info.y_strides[2], + _info.x_strides[0], _info.x_strides[1], _info.x_strides[2], _info.x_strides[3], stream)); + } else { + return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; + } + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::reduce_max::metax diff --git a/src/infiniop/ops/reduce_max/nvidia/kernel.cuh b/src/infiniop/ops/reduce_max/nvidia/kernel.cuh new file mode 100644 index 000000000..51470d75f --- /dev/null +++ b/src/infiniop/ops/reduce_max/nvidia/kernel.cuh @@ -0,0 +1,23 @@ +#ifndef __REDUCE_MAX_KERNEL_CUH__ +#define __REDUCE_MAX_KERNEL_CUH__ + +template +__device__ void ReduceMaxKernel( + Tdata *y_, const Tdata *x_, + size_t batch, size_t channels, size_t height, size_t width, + ptrdiff_t y_stride_b, ptrdiff_t y_stride_c, ptrdiff_t y_stride_h, + ptrdiff_t x_stride_b, ptrdiff_t x_stride_c, ptrdiff_t x_stride_h, ptrdiff_t x_stride_w) { + + Tdata *y = y_ + blockIdx.x * y_stride_b + blockIdx.y * y_stride_c + blockIdx.z * y_stride_h; + const Tdata *x = x_ + blockIdx.x * x_stride_b + blockIdx.y * x_stride_c + blockIdx.z * x_stride_h; + + // [Reduce] Find the max of each updated row and store in shared memory + __shared__ Tcompute max_; + Tcompute max_0 = op::common_cuda::reduce_op::max(x, width, x_stride_w); + if (threadIdx.x == 0) { + *y = max_0; + } + +} + +#endif // __REDUCE_MAX_KERNEL_CUH__ diff --git a/src/infiniop/ops/reduce_max/nvidia/reduce_max_nvidia.cu b/src/infiniop/ops/reduce_max/nvidia/reduce_max_nvidia.cu new file mode 100644 index 000000000..e95ea69f8 --- /dev/null +++ b/src/infiniop/ops/reduce_max/nvidia/reduce_max_nvidia.cu @@ -0,0 +1,99 @@ +#include "../../../devices/nvidia/nvidia_common.cuh" +#include "reduce_max_nvidia.cuh" + +#include "../../../devices/nvidia/nvidia_kernel_common.cuh" +#include + +#include "../../../reduce/cuda/reduce.cuh" + +#include "kernel.cuh" + +template +INFINIOP_CUDA_KERNEL ReduceMax( + Tdata *y_, const Tdata *x_, + size_t batch, size_t channels, size_t height, size_t width, + ptrdiff_t y_stride_b, ptrdiff_t y_stride_c, ptrdiff_t y_stride_h, + ptrdiff_t x_stride_b, ptrdiff_t x_stride_c, ptrdiff_t x_stride_h, ptrdiff_t x_stride_w) { + ReduceMaxKernel(y_, x_, batch, channels, height, width, y_stride_b, y_stride_c, y_stride_h, x_stride_b, x_stride_c, x_stride_h, x_stride_w); +} + +namespace op::reduce_max::nvidia { + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t 
x_desc,
+    size_t dim) {
+    auto info = ReduceMaxInfo::create(y_desc, x_desc, dim);
+    CHECK_RESULT(info);
+    *desc_ptr = new Descriptor(
+        new Opaque{reinterpret_cast<device::nvidia::Handle *>(handle)->internal()},
+        info.take(), 0, handle->device, handle->device_id);
+    return INFINI_STATUS_SUCCESS;
+}
+
+template <unsigned int BLOCK_SIZE>
+infiniStatus_t launchKernel(void *y, const void *x, infiniDtype_t dtype,
+                            size_t batch_size, size_t channels, size_t height, size_t width,
+                            ptrdiff_t y_stride_b, ptrdiff_t y_stride_c, ptrdiff_t y_stride_h,
+                            ptrdiff_t x_stride_b, ptrdiff_t x_stride_c, ptrdiff_t x_stride_h, ptrdiff_t x_stride_w,
+                            cudaStream_t stream) {
+    dim3 grid = dim3(uint32_t(batch_size), uint32_t(channels), uint32_t(height));
+    if (dtype == INFINI_DTYPE_F16) {
+        ReduceMax<BLOCK_SIZE, half, float>
+            <<<grid, BLOCK_SIZE, 0, stream>>>((half *)y, (const half *)x,
+                                              batch_size, channels, height, width,
+                                              y_stride_b, y_stride_c, y_stride_h,
+                                              x_stride_b, x_stride_c, x_stride_h, x_stride_w);
+    } else if (dtype == INFINI_DTYPE_BF16) {
+        ReduceMax<BLOCK_SIZE, __nv_bfloat16, float>
+            <<<grid, BLOCK_SIZE, 0, stream>>>((__nv_bfloat16 *)y, (const __nv_bfloat16 *)x,
+                                              batch_size, channels, height, width,
+                                              y_stride_b, y_stride_c, y_stride_h,
+                                              x_stride_b, x_stride_c, x_stride_h, x_stride_w);
+    } else if (dtype == INFINI_DTYPE_F32) {
+        ReduceMax<BLOCK_SIZE, float, float>
+            <<<grid, BLOCK_SIZE, 0, stream>>>((float *)y, (const float *)x,
+                                              batch_size, channels, height, width,
+                                              y_stride_b, y_stride_c, y_stride_h,
+                                              x_stride_b, x_stride_c, x_stride_h, x_stride_w);
+    } else {
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size,
+                                     void *y,
+                                     const void *x,
+                                     void *stream_) const {
+    cudaStream_t stream = (cudaStream_t)stream_;
+    if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_1024) {
+        CHECK_STATUS(launchKernel<CUDA_BLOCK_SIZE_1024>(
+            y, x, _info.dtype, _info.shape[0], _info.shape[1], _info.shape[2], _info.shape[3],
+            _info.y_strides[0], _info.y_strides[1], _info.y_strides[2],
+            _info.x_strides[0], _info.x_strides[1], _info.x_strides[2], _info.x_strides[3], stream));
+    } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_512) {
+        CHECK_STATUS(launchKernel<CUDA_BLOCK_SIZE_512>(
+            y, x, _info.dtype, _info.shape[0], _info.shape[1], _info.shape[2], _info.shape[3],
+            _info.y_strides[0], _info.y_strides[1], _info.y_strides[2],
+            _info.x_strides[0], _info.x_strides[1], _info.x_strides[2], _info.x_strides[3], stream));
+    } else {
+        return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED;
+    }
+    return INFINI_STATUS_SUCCESS;
+}
+
+} // namespace op::reduce_max::nvidia
diff --git a/src/infiniop/ops/reduce_max/nvidia/reduce_max_nvidia.cuh b/src/infiniop/ops/reduce_max/nvidia/reduce_max_nvidia.cuh
new file mode 100644
index 000000000..388738c27
--- /dev/null
+++ b/src/infiniop/ops/reduce_max/nvidia/reduce_max_nvidia.cuh
@@ -0,0 +1,8 @@
+#ifndef __REDUCE_MAX_NVIDIA_H__
+#define __REDUCE_MAX_NVIDIA_H__
+
+#include "../reduce_max.h"
+
+DESCRIPTOR(nvidia)
+
+#endif
diff --git a/src/infiniop/ops/reduce_max/operator.cc b/src/infiniop/ops/reduce_max/operator.cc
new file mode 100644
index 000000000..c22ebeb1b
--- /dev/null
+++ b/src/infiniop/ops/reduce_max/operator.cc
@@ -0,0 +1,181 @@
+#include "../../operator.h"
+#include "../../handle.h"
+#include "infiniop/ops/reduce_max.h"
+
+#ifdef ENABLE_CPU_API
+#include "cpu/reduce_max_cpu.h"
+#endif
+#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API)
+#include 
"nvidia/reduce_max_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/reduce_max_metax.h" +#endif +#ifdef ENABLE_ASCEND_API +#include "ascend/reduce_max_ascend.h" +#endif + +__C infiniStatus_t infiniopCreateReduceMaxDescriptor( + infiniopHandle_t handle, + infiniopReduceMaxDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + size_t dim) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::reduce_max::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + output_desc, \ + input_desc, \ + dim); + + switch (handle->device) { +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax) +#endif +// #ifdef ENABLE_ASCEND_API +// CREATE(INFINI_DEVICE_ASCEND, ascend) +// #endif +// #ifdef ENABLE_CAMBRICON_MLU +// case DevCambriconMlu: { +// return bangCreateReduceMaxDescriptor((BangHandle_t)handle, (ReduceMaxBangDescriptor_t *)desc_ptr, output_desc, input_desc, dim); +// // return cnnlCreateReduceMaxDescriptor((BangHandle_t) handle, (ReduceMaxCnnlDescriptor_t *) desc_ptr, output_desc, input_desc, dim); +// } +// #endif +// #ifdef ENABLE_MTHREADS_GPU +// case DevMthreadsGpu: { +// return musaCreateReduceMaxDescriptor((MusaHandle_t)handle, (ReduceMaxMusaDescriptor_t *)desc_ptr, output_desc, input_desc, dim); +// } +// #endif + } + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopGetReduceMaxWorkspaceSize(infiniopReduceMaxDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax) +#endif +// #ifdef ENABLE_ASCEND_API +// GET(INFINI_DEVICE_ASCEND, ascend) +// #endif +// #ifdef ENABLE_CAMBRICON_MLU +// case DevCambriconMlu: { +// return bangGetReduceMaxWorkspaceSize((ReduceMaxBangDescriptor_t)desc, size); +// // return cnnlGetReduceMaxWorkspaceSize((ReduceMaxCnnlDescriptor_t) desc, size); +// } +// #endif +// #ifdef ENABLE_MTHREADS_GPU +// case DevMthreadsGpu: { +// return musaGetReduceMaxWorkspaceSize((ReduceMaxMusaDescriptor_t)desc, size); +// } +// #endif + } + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopReduceMax( + infiniopReduceMaxDescriptor_t desc, + void *workspace, size_t workspace_size, + void *output, + const void *input, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc)->calculate( \ + workspace, workspace_size, output, input, stream); + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax) +#endif +// #ifdef ENABLE_ASCEND_API +// CALCULATE(INFINI_DEVICE_ASCEND, ascend) +// #endif +// #ifdef ENABLE_CAMBRICON_MLU +// case DevCambriconMlu: { +// return 
bangReduceMax((ReduceMaxBangDescriptor_t)desc, workspace, workspace_size, output, input, stream); +// // return cnnlReduceMax((ReduceMaxCnnlDescriptor_t) desc, workspace, workspace_size, output, input, stream); +// } +// #endif +// #ifdef ENABLE_MTHREADS_GPU +// case DevMthreadsGpu: { +// return musaReduceMax((ReduceMaxMusaDescriptor_t)desc, workspace, workspace_size, output, input, stream); +// } +// #endif + } + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopDestroyReduceMaxDescriptor(infiniopReduceMaxDescriptor_t desc) { + +#define DESTROY(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + DESTROY(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + DESTROY(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + DESTROY(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DESTROY(INFINI_DEVICE_METAX, metax) +#endif +// #ifdef ENABLE_ASCEND_API +// DESTROY(INFINI_DEVICE_ASCEND, ascend) +// #endif +// #ifdef ENABLE_CAMBRICON_MLU +// case DevCambriconMlu: { +// return bangDestroyReduceMaxDescriptor((ReduceMaxBangDescriptor_t)desc); +// // return cnnlDestroyReduceMaxDescriptor((ReduceMaxCnnlDescriptor_t) desc); +// } +// #endif +// #ifdef ENABLE_MTHREADS_GPU +// case DevMthreadsGpu: +// return musaDestroyReduceMaxDescriptor((ReduceMaxMusaDescriptor_t)desc); +// #endif + } + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} \ No newline at end of file diff --git a/src/infiniop/ops/reduce_max/reduce_max.h b/src/infiniop/ops/reduce_max/reduce_max.h new file mode 100644 index 000000000..412f67162 --- /dev/null +++ b/src/infiniop/ops/reduce_max/reduce_max.h @@ -0,0 +1,47 @@ +#ifndef REDUCE_MAX_H +#define REDUCE_MAX_H + +#include "../../operator.h" +#include "info.h" + +#define DESCRIPTOR(NAMESPACE) \ + \ + namespace op::reduce_max::NAMESPACE { \ + class Descriptor final : public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + ReduceMaxInfo _info; \ + size_t _workspace_size; \ + \ + Descriptor( \ + Opaque *opaque, \ + ReduceMaxInfo info, \ + size_t workspace_size, \ + infiniDevice_t device_type, \ + int device_id) \ + : InfiniopDescriptor{device_type, device_id}, \ + _opaque(opaque), \ + _info(info), \ + _workspace_size(workspace_size) {} \ + \ + public: \ + ~Descriptor(); \ + \ + size_t workspaceSize() const { return _workspace_size; } \ + \ + static infiniStatus_t create( \ + infiniopHandle_t handle, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t output_desc, \ + infiniopTensorDescriptor_t input_desc, \ + size_t dim); \ + \ + infiniStatus_t calculate( \ + void *workspace, size_t workspace_size, \ + void *output, \ + const void *input, \ + void *stream) const; \ + }; \ + } + +#endif // REDUCE_MAX_H \ No newline at end of file diff --git a/src/infiniop/ops/reduce_mean/info.h b/src/infiniop/ops/reduce_mean/info.h index 428675989..a47c81133 100644 --- a/src/infiniop/ops/reduce_mean/info.h +++ b/src/infiniop/ops/reduce_mean/info.h @@ -54,51 +54,6 @@ class ReduceMeanInfo { return utils::Result(ReduceMeanInfo{ dtype, shape, y_strides, x_strides}); } - - // if(ndim == 3){ - // std::vector shape = x_desc->shape(); - // std::vector y_strides = y_desc->strides(); - // std::vector x_strides = x_desc->strides(); - // if (dim != 2){ - // std::swap(shape[dim], shape[2]); - // std::swap(y_strides[dim], y_strides[2]); - // std::swap(x_strides[dim], x_strides[2]); - // } - // return 
utils::Result(ReduceMeanInfo{ - // dtype, shape, y_strides, x_strides}); - // } - // else if (ndim == 2){ - // std::vector shape = x_desc->shape(); - // std::vector y_strides = y_desc->strides(); - // std::vector x_strides = x_desc->strides(); - // if (dim != 1){ - // std::swap(shape[dim], shape[1]); - // std::swap(y_strides[dim], y_strides[1]); - // std::swap(x_strides[dim], x_strides[1]); - // } - // shape.insert(shape.begin(), 1); - // y_strides.insert(y_strides.begin(), 0); - // x_strides.insert(x_strides.begin(), 0); - // return utils::Result(ReduceMeanInfo{ - // dtype, shape, y_strides, x_strides}); - // } - // else if (ndim == 1){ - // std::vector shape = {1, 1, (x_desc->shape())[0]}; - // std::vector y_strides = {0, 0, (y_desc->strides())[0]}; - // std::vector x_strides = {0, 0, (x_desc->strides())[0]}; - // return utils::Result(ReduceMeanInfo{ - // dtype, shape, y_strides, x_strides}); - // } - // else if (ndim == 0){ - // std::vector shape = {1, 1, 1}; - // std::vector y_strides = {0, 0, 0}; - // std::vector x_strides = {0, 0, 0}; - // return utils::Result(ReduceMeanInfo{ - // dtype, shape, y_strides, x_strides}); - // } - // else{ - // return INFINI_STATUS_BAD_TENSOR_SHAPE; - // } } }; diff --git a/src/infiniop/reduce/cuda/reduce.cuh b/src/infiniop/reduce/cuda/reduce.cuh index 0a30a3a26..b69b46163 100644 --- a/src/infiniop/reduce/cuda/reduce.cuh +++ b/src/infiniop/reduce/cuda/reduce.cuh @@ -63,10 +63,9 @@ __device__ __forceinline__ Tdata max(const Tdata *data_ptr, size_t count) { // Sum(x) on non-contiguous data of length count template -__device__ __forceinline__ Tcompute sum( - const Tdata *data_ptr, - size_t count, - ptrdiff_t stride) +__device__ __forceinline__ Tcompute sum(const Tdata *data_ptr, + size_t count, + ptrdiff_t stride) { Tcompute s = 0; @@ -79,6 +78,24 @@ __device__ __forceinline__ Tcompute sum( return BlockReduce(temp_storage).Sum(s); } + +// Max(x) on non-contiguous data of length count +template +__device__ __forceinline__ Tdata max(const Tdata *data_ptr, + size_t count, + ptrdiff_t stride) +{ + Tdata max_ = data_ptr[0]; + + for (size_t i = threadIdx.x; i < count; i += BLOCK_SIZE) { + max_ = cub::Max()(max_, data_ptr[i * stride]); + } + + using BlockReduce = cub::BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + + return BlockReduce(temp_storage).Reduce(max_, cub::Max(), BLOCK_SIZE); +} } // namespace op::common_cuda::reduce_op #endif diff --git a/test/infiniop-test/test_generate/testcases/reduce_max.py b/test/infiniop-test/test_generate/testcases/reduce_max.py new file mode 100644 index 000000000..487da489a --- /dev/null +++ b/test/infiniop-test/test_generate/testcases/reduce_max.py @@ -0,0 +1,112 @@ +from ast import List +import numpy as np +import gguf +from typing import List +from enum import Enum, auto + +from .. 
diff --git a/test/infiniop-test/test_generate/testcases/reduce_max.py b/test/infiniop-test/test_generate/testcases/reduce_max.py
new file mode 100644
index 000000000..487da489a
--- /dev/null
+++ b/test/infiniop-test/test_generate/testcases/reduce_max.py
@@ -0,0 +1,111 @@
+import numpy as np
+import gguf
+from typing import List
+from enum import Enum, auto
+
+from .. import InfiniopTestWriter, InfiniopTestCase, np_dtype_to_ggml, gguf_strides, contiguous_gguf_strides
+
+
+def reduce_max(x, dim):
+    if isinstance(x, np.float64):
+        return x
+    return x.max(axis=dim, keepdims=True)
+
+
+def random_tensor(shape: tuple, dtype: np.dtype) -> np.ndarray:
+    return np.random.uniform(-1.0, 1.0, shape).astype(dtype) * 0.001
+
+
+class ReduceMaxTestCase(InfiniopTestCase):
+    def __init__(
+        self,
+        x: np.ndarray,
+        y: np.ndarray,
+        shape_x: List[int] | None,
+        shape_y: List[int] | None,
+        stride_x: List[int] | None,
+        stride_y: List[int] | None,
+        dim: int = 0,
+    ):
+        super().__init__("reduce_max")
+        self.x = x
+        self.y = y
+        self.shape_x = shape_x
+        self.shape_y = shape_y
+        self.stride_x = stride_x
+        self.stride_y = stride_y
+        self.dim = dim
+
+    def write_test(self, test_writer: "InfiniopTestWriter"):
+        super().write_test(test_writer)
+        print(self.shape_y, self.shape_x, self.stride_y, self.stride_x, self.dim)
+        if self.shape_x is not None:
+            test_writer.add_array(test_writer.gguf_key("input.shape"), self.shape_x)
+        if self.shape_y is not None:
+            test_writer.add_array(test_writer.gguf_key("output.shape"), self.shape_y)
+        if self.stride_x is not None:
+            test_writer.add_array(test_writer.gguf_key("input.strides"), gguf_strides(*self.stride_x))
+        test_writer.add_array(
+            test_writer.gguf_key("output.strides"),
+            gguf_strides(*self.stride_y if self.stride_y is not None else contiguous_gguf_strides(self.shape_y))
+        )
+        test_writer.add_uint64(test_writer.gguf_key("dim"), self.dim)
+        test_writer.add_tensor(
+            test_writer.gguf_key("input"),
+            self.x,
+            raw_dtype=np_dtype_to_ggml(self.x.dtype),
+        )
+        test_writer.add_tensor(
+            test_writer.gguf_key("output"),
+            self.y,
+            raw_dtype=np_dtype_to_ggml(self.y.dtype),
+        )
+        ans = reduce_max(
+            self.x.astype(np.float64), self.dim
+        )
+        test_writer.add_tensor(
+            test_writer.gguf_key("ans"), ans, raw_dtype=gguf.GGMLQuantizationType.F64
+        )
+
+
+if __name__ == "__main__":
+    test_writer = InfiniopTestWriter("reduce_max.gguf")
+    test_cases = []
+    # ==============================================================================
+    #  Configuration
+    # ==============================================================================
+    # These are not meant to be imported from other modules
+    _TEST_CASES_ = [
+        # y_shape, x_shape, y_stride, x_stride, dim
+        # ((0,), (0,), (0,), (0,), 0),
+        ((1, ), (32, ), None, None, 0),
+        ((1, 4), (1, 4), None, None, 0),
+        ((1, 1), (1, 4), None, None, 1),
+        ((16, 1), (16, 2048), None, None, 1),
+        ((1, 16), (2048, 16), None, None, 0),
+        ((16, 1), (16, 2048), (4096, 1), (4096, 1), 1),
+        ((1, 2048), (16, 2048), (4096, 1), (4096, 1), 0),
+        ((4, 4, 1), (4, 4, 2048), None, None, 2),
+        ((1, 4, 4), (2048, 4, 4), None, None, 0),
+        ((4, 1, 4), (4, 2048, 4), (45056, 5632, 1), (32768, 8, 1), 1),
+        ((1, 8, 4, 8), (16, 8, 4, 8), (256, 32, 8, 1), (256, 32, 8, 1), 0),
+    ]
+    _TENSOR_DTYPES_ = [np.float16, np.float32]
+
+    for dtype in _TENSOR_DTYPES_:
+        for shape_y, shape_x, stride_y, stride_x, dim in _TEST_CASES_:
+            x = random_tensor(shape_x, dtype)
+            y = np.empty(tuple(0 for _ in shape_y), dtype=dtype)
+            test_case = ReduceMaxTestCase(
+                x,
+                y,
+                shape_x,
+                shape_y,
+                stride_x,
+                stride_y,
+                dim,
+            )
+            test_cases.append(test_case)
+
+    test_writer.add_tests(test_cases)
+    test_writer.save()
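Note how the generator stores the operator's input in the test dtype but computes the
reference answer in float64 with keepdims=True, so the reduced axis survives as a
size-1 dimension and the reference is not polluted by low-precision rounding. A quick
standalone check of those two properties (illustration only, not part of the repo):

    import numpy as np

    x = np.random.uniform(-1.0, 1.0, (16, 2048)).astype(np.float16)
    ans = x.astype(np.float64).max(axis=1, keepdims=True)
    assert ans.shape == (16, 1)     # reduced axis kept as size 1
    assert ans.dtype == np.float64  # reference stays in double precision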
diff --git a/test/infiniop-test/test_generate/testcases/reduce_mean.py b/test/infiniop-test/test_generate/testcases/reduce_mean.py
index 0e05d4240..74421d83d 100644
--- a/test/infiniop-test/test_generate/testcases/reduce_mean.py
+++ b/test/infiniop-test/test_generate/testcases/reduce_mean.py
@@ -41,23 +41,23 @@ def write_test(self, test_writer: "InfiniopTestWriter"):
         super().write_test(test_writer)
         print(self.shape_y, self.shape_x, self.stride_y, self.stride_x, self.dim)
         if self.shape_x is not None:
-            test_writer.add_array(test_writer.gguf_key("x.shape"), self.shape_x)
+            test_writer.add_array(test_writer.gguf_key("input.shape"), self.shape_x)
         if self.shape_y is not None:
-            test_writer.add_array(test_writer.gguf_key("y.shape"), self.shape_y)
+            test_writer.add_array(test_writer.gguf_key("output.shape"), self.shape_y)
         if self.stride_x is not None:
-            test_writer.add_array(test_writer.gguf_key("x.strides"), gguf_strides(*self.stride_x))
+            test_writer.add_array(test_writer.gguf_key("input.strides"), gguf_strides(*self.stride_x))
         test_writer.add_array(
-            test_writer.gguf_key("y.strides"),
+            test_writer.gguf_key("output.strides"),
             gguf_strides(*self.stride_y if self.stride_y is not None else contiguous_gguf_strides(self.shape_y))
         )
         test_writer.add_uint64(test_writer.gguf_key("dim"), self.dim)
         test_writer.add_tensor(
-            test_writer.gguf_key("x"),
+            test_writer.gguf_key("input"),
             self.x,
             raw_dtype=np_dtype_to_ggml(self.x.dtype),
         )
         test_writer.add_tensor(
-            test_writer.gguf_key("y"),
+            test_writer.gguf_key("output"),
             self.y,
             raw_dtype=np_dtype_to_ggml(self.y.dtype),
         )
@@ -89,6 +89,7 @@ def write_test(self, test_writer: "InfiniopTestWriter"):
         ((4, 4, 1), (4, 4, 2048), None, None, 2),
         ((1, 4, 4), (2048, 4, 4), None, None, 0),
         ((4, 1, 4), (4, 2048, 4), (45056, 5632, 1), (32768, 8, 1), 1),
+        ((1, 8, 4, 8), (16, 8, 4, 8), (256, 32, 8, 1), (256, 32, 8, 1), 0),
     ]
     _TENSOR_DTYPES_ = [np.float16, np.float32]
 
diff --git a/test/infiniop/libinfiniop/op_register.py b/test/infiniop/libinfiniop/op_register.py
index 7f147a8b2..4e3ad70db 100644
--- a/test/infiniop/libinfiniop/op_register.py
+++ b/test/infiniop/libinfiniop/op_register.py
@@ -520,4 +520,36 @@ def reduce_mean_(lib):
     lib.infiniopDestroyReduceMeanDescriptor.restype = c_int32
     lib.infiniopDestroyReduceMeanDescriptor.argtypes = [
         infiniopOperatorDescriptor_t,
+    ]
+
+@OpRegister.operator
+def reduce_max_(lib):
+    lib.infiniopCreateReduceMaxDescriptor.restype = c_int32
+    lib.infiniopCreateReduceMaxDescriptor.argtypes = [
+        infiniopHandle_t,
+        POINTER(infiniopOperatorDescriptor_t),
+        infiniopTensorDescriptor_t,
+        infiniopTensorDescriptor_t,
+        c_size_t,
+    ]
+
+    lib.infiniopGetReduceMaxWorkspaceSize.restype = c_int32
+    lib.infiniopGetReduceMaxWorkspaceSize.argtypes = [
+        infiniopOperatorDescriptor_t,
+        POINTER(c_size_t),
+    ]
+
+    lib.infiniopReduceMax.restype = c_int32
+    lib.infiniopReduceMax.argtypes = [
+        infiniopOperatorDescriptor_t,
+        c_void_p,
+        c_size_t,
+        c_void_p,
+        c_void_p,
+        c_void_p,
+    ]
+
+    lib.infiniopDestroyReduceMaxDescriptor.restype = c_int32
+    lib.infiniopDestroyReduceMaxDescriptor.argtypes = [
+        infiniopOperatorDescriptor_t,
     ]
\ No newline at end of file
diff --git a/test/infiniop/reduce_max.py b/test/infiniop/reduce_max.py
new file mode 100644
index 000000000..8e27ae484
--- /dev/null
+++ b/test/infiniop/reduce_max.py
@@ -0,0 +1,150 @@
+import torch
+import ctypes
+from ctypes import c_uint64
+from libinfiniop import (
+    LIBINFINIOP,
+    TestTensor,
+    get_test_devices,
+    check_error,
+    test_operator,
+    get_args,
+    debug,
+    get_tolerance,
+    profile_operation,
+    TestWorkspace,
+    InfiniDtype,
+    InfiniDtypeNames,
+    InfiniDeviceNames,
+    infiniopOperatorDescriptor_t,
+)
+
+# ==============================================================================
+#  Configuration (Internal Use Only)
+# ==============================================================================
+# These are not meant to be imported from other modules
+_TEST_CASES_ = [
+    # y_shape, x_shape, y_stride, x_stride, dim
+    ((), (), None, None, 0),
+    ((1, ), (32, ), None, None, 0),
+    ((1, 4), (1, 4), None, None, 0),
+    ((1, 1), (1, 4), None, None, 1),
+    ((16, 1), (16, 2048), None, None, 1),
+    ((1, 16), (2048, 16), None, None, 0),
+    ((16, 1), (16, 2048), (4096, 1), (4096, 1), 1),
+    ((1, 2048), (16, 2048), (4096, 1), (4096, 1), 0),
+    ((4, 4, 1), (4, 4, 2048), None, None, 2),
+    ((1, 4, 4), (2048, 4, 4), None, None, 0),
+    ((4, 1, 4), (4, 2048, 4), (45056, 5632, 1), (32768, 8, 1), 1),
+]
+
+# x types used for testing
+_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.BF16, InfiniDtype.F32]
+
+_TEST_CASES = _TEST_CASES_
+
+# Tolerance map for different data types
+_TOLERANCE_MAP = {
+    InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3},
+    InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2},
+    InfiniDtype.F32: {"atol": 1e-4, "rtol": 1e-4},
+}
+
+DEBUG = False
+PROFILE = False
+NUM_PRERUN = 10
+NUM_ITERATIONS = 1000
+
+
+def reduce_max(x, dim):
+    return x.max(dim=dim, keepdim=True)[0]
+
+
+def test(
+    handle,
+    device,
+    y_shape,
+    x_shape,
+    y_stride,
+    x_stride,
+    dim,
+    dtype=InfiniDtype.F16,
+    sync=None,
+):
+    print(
+        f"Testing Reduce_Max on {InfiniDeviceNames[device]} with y_shape:{y_shape} x_shape:{x_shape}"
+        f" y_stride:{y_stride} x_stride:{x_stride} dim:{dim} dtype:{InfiniDtypeNames[dtype]}"
+    )
+
+    x = TestTensor(x_shape, x_stride, dtype, device)
+    ans = reduce_max(x.torch_tensor(), dim)
+
+    y = TestTensor(y_shape, y_stride, dtype, device)
+
+    if sync is not None:
+        sync()
+
+    descriptor = infiniopOperatorDescriptor_t()
+    check_error(
+        LIBINFINIOP.infiniopCreateReduceMaxDescriptor(
+            handle, ctypes.byref(descriptor), y.descriptor, x.descriptor, ctypes.c_size_t(dim)
+        )
+    )
+
+    # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
+    x.destroy_desc()
+    y.destroy_desc()
+
+    workspace_size = c_uint64(0)
+    check_error(
+        LIBINFINIOP.infiniopGetReduceMaxWorkspaceSize(
+            descriptor, ctypes.byref(workspace_size)
+        )
+    )
+    workspace = TestWorkspace(workspace_size.value, x.device)
+
+    def lib_reduce_max():
+        check_error(
+            LIBINFINIOP.infiniopReduceMax(
+                descriptor,
+                workspace.data(),
+                workspace_size.value,
+                y.data(),
+                x.data(),
+                None,
+            )
+        )
+
+    lib_reduce_max()
+
+    if sync is not None:
+        sync()
+
+    atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
+    if DEBUG:
+        debug(y.actual_tensor(), ans, atol=atol, rtol=rtol)
+    assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol)
+
+    # Profiling workflow
+    if PROFILE:
+        # fmt: off
+        profile_operation("PyTorch", lambda: reduce_max(x.torch_tensor(), dim), device, NUM_PRERUN, NUM_ITERATIONS)
+        profile_operation("    lib", lambda: lib_reduce_max(), device, NUM_PRERUN, NUM_ITERATIONS)
+        # fmt: on
+
+    check_error(LIBINFINIOP.infiniopDestroyReduceMaxDescriptor(descriptor))
+
+
+if __name__ == "__main__":
+    args = get_args()
+
+    # Configure testing options
+    DEBUG = args.debug
+    PROFILE = args.profile
+    NUM_PRERUN = args.num_prerun
+    NUM_ITERATIONS = args.num_iterations
+
+    # Execute tests
+    for device in get_test_devices(args):
+        test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES)
+
+    print("\033[92mTest passed!\033[0m")
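One detail worth noting in the reference above: unlike NumPy's max, torch.Tensor.max
with a dim argument returns a (values, indices) pair, which is why reduce_max indexes
[0]. A quick standalone illustration (not part of the test suite):

    import torch

    x = torch.randn(16, 2048)
    values, indices = x.max(dim=1, keepdim=True)  # torch returns both
    assert values.shape == (16, 1) and indices.shape == (16, 1)
    assert torch.equal(values, x.max(dim=1, keepdim=True)[0])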
diff --git a/test/infiniop/reduce_mean.py b/test/infiniop/reduce_mean.py
index e56663e44..3b752b671 100644
--- a/test/infiniop/reduce_mean.py
+++ b/test/infiniop/reduce_mean.py
@@ -44,9 +44,9 @@
 # Tolerance map for different data types
 _TOLERANCE_MAP = {
-    InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-2},
-    InfiniDtype.BF16: {"atol": 5e-3, "rtol": 5e-2},
-    InfiniDtype.F32: {"atol": 1e-5, "rtol": 1e-5},
+    InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3},
+    InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2},
+    InfiniDtype.F32: {"atol": 1e-4, "rtol": 1e-4},
 }
 
 DEBUG = False

From e547f3e5e9f9dfea856879d1f1b5352828b6be25 Mon Sep 17 00:00:00 2001
From: S-hhhhh <2320230838@mail.nankai.edu.cn>
Date: Sun, 17 Aug 2025 10:43:17 +0000
Subject: [PATCH 06/11] fix bugs

---
 .../ops/reduce_max/nvidia/reduce_max_nvidia.cu   | 10 ++++++----
 .../ops/reduce_mean/nvidia/reduce_mean_nvidia.cu | 10 ++++++----
 2 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/src/infiniop/ops/reduce_max/nvidia/reduce_max_nvidia.cu b/src/infiniop/ops/reduce_max/nvidia/reduce_max_nvidia.cu
index e95ea69f8..3aa8e5ff5 100644
--- a/src/infiniop/ops/reduce_max/nvidia/reduce_max_nvidia.cu
+++ b/src/infiniop/ops/reduce_max/nvidia/reduce_max_nvidia.cu
@@ -84,12 +84,14 @@ infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size,
             _info.x_strides[0], _info.x_strides[1], _info.x_strides[2], _info.x_strides[3], stream));
     } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_512) {
         CHECK_STATUS(launchKernel<CUDA_BLOCK_SIZE_512>(
-            y, x, _info.dtype, _info.shape[0], _info.shape[1], _info.shape[2],
-            _info.y_strides[0], _info.y_strides[1], _info.x_strides[0], _info.x_strides[1], _info.x_strides[2], stream));
+            y, x, _info.dtype, _info.shape[0], _info.shape[1], _info.shape[2], _info.shape[3],
+            _info.y_strides[0], _info.y_strides[1], _info.y_strides[2],
+            _info.x_strides[0], _info.x_strides[1], _info.x_strides[2], _info.x_strides[3], stream));
     } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_4096) {
         CHECK_STATUS(launchKernel<CUDA_BLOCK_SIZE_4096>(
-            y, x, _info.dtype, _info.shape[0], _info.shape[1], _info.shape[2],
-            _info.y_strides[0], _info.y_strides[1], _info.x_strides[0], _info.x_strides[1], _info.x_strides[2], stream));
+            y, x, _info.dtype, _info.shape[0], _info.shape[1], _info.shape[2], _info.shape[3],
+            _info.y_strides[0], _info.y_strides[1], _info.y_strides[2],
+            _info.x_strides[0], _info.x_strides[1], _info.x_strides[2], _info.x_strides[3], stream));
     } else {
         return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED;
     }
diff --git a/src/infiniop/ops/reduce_mean/nvidia/reduce_mean_nvidia.cu b/src/infiniop/ops/reduce_mean/nvidia/reduce_mean_nvidia.cu
index 1487e080f..26fd4e8ee 100644
--- a/src/infiniop/ops/reduce_mean/nvidia/reduce_mean_nvidia.cu
+++ b/src/infiniop/ops/reduce_mean/nvidia/reduce_mean_nvidia.cu
@@ -84,12 +84,14 @@ infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size,
             _info.x_strides[0], _info.x_strides[1], _info.x_strides[2], _info.x_strides[3], stream));
     } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_512) {
         CHECK_STATUS(launchKernel<CUDA_BLOCK_SIZE_512>(
-            y, x, _info.dtype, _info.shape[0], _info.shape[1], _info.shape[2],
-            _info.y_strides[0], _info.y_strides[1], _info.x_strides[0], _info.x_strides[1], _info.x_strides[2], stream));
+            y, x, _info.dtype, _info.shape[0], _info.shape[1], _info.shape[2], _info.shape[3],
+            _info.y_strides[0], _info.y_strides[1], _info.y_strides[2],
+            _info.x_strides[0], _info.x_strides[1], _info.x_strides[2], _info.x_strides[3], stream));
     } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_4096) {
         CHECK_STATUS(launchKernel<CUDA_BLOCK_SIZE_4096>(
-            y, x, _info.dtype, _info.shape[0], _info.shape[1], _info.shape[2],
-            _info.y_strides[0], _info.y_strides[1], _info.x_strides[0], _info.x_strides[1], _info.x_strides[2], stream));
+            y, x, _info.dtype, _info.shape[0], _info.shape[1], _info.shape[2], _info.shape[3],
+            _info.y_strides[0], _info.y_strides[1], _info.y_strides[2],
+            _info.x_strides[0], _info.x_strides[1], _info.x_strides[2], _info.x_strides[3], stream));
     } else {
         return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED;
     }

From 9324626942a5bff8a5cec1c1e85548002581ecb7 Mon Sep 17 00:00:00 2001
From: S-hhhhh <2320230838@mail.nankai.edu.cn>
Date: Sun, 17 Aug 2025 14:05:12 +0000
Subject: [PATCH 07/11] update

---
 src/infiniop/ops/reduce_mean/nvidia/kernel.cuh | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/infiniop/ops/reduce_mean/nvidia/kernel.cuh b/src/infiniop/ops/reduce_mean/nvidia/kernel.cuh
index 44fb2226e..54914d98a 100644
--- a/src/infiniop/ops/reduce_mean/nvidia/kernel.cuh
+++ b/src/infiniop/ops/reduce_mean/nvidia/kernel.cuh
@@ -12,7 +12,6 @@ __device__ void ReduceMeanKernel(
     const Tdata *x = x_ + blockIdx.x * x_stride_b + blockIdx.y * x_stride_c + blockIdx.z * x_stride_h;
 
     // [Reduce] Find the sum of each updated row and store in shared memory
-    __shared__ Tcompute mean_;
     Tcompute sum_0 = op::common_cuda::reduce_op::sum(x, width, x_stride_w);
     if (threadIdx.x == 0) {
         // mean_ = sum_0/width;

From 095bb1873091763f8f215d5c7d353c9e064c20f0 Mon Sep 17 00:00:00 2001
From: S-hhhhh <2320230838@mail.nankai.edu.cn>
Date: Sun, 17 Aug 2025 14:05:56 +0000
Subject: [PATCH 08/11] update

---
 src/infiniop/ops/reduce_max/nvidia/kernel.cuh | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/infiniop/ops/reduce_max/nvidia/kernel.cuh b/src/infiniop/ops/reduce_max/nvidia/kernel.cuh
index 51470d75f..864db1047 100644
--- a/src/infiniop/ops/reduce_max/nvidia/kernel.cuh
+++ b/src/infiniop/ops/reduce_max/nvidia/kernel.cuh
@@ -12,7 +12,6 @@ __device__ void ReduceMaxKernel(
     const Tdata *x = x_ + blockIdx.x * x_stride_b + blockIdx.y * x_stride_c + blockIdx.z * x_stride_h;
 
     // [Reduce] Find the max of each updated row and store in shared memory
-    __shared__ Tcompute max_;
     Tcompute max_0 = op::common_cuda::reduce_op::max(x, width, x_stride_w);
     if (threadIdx.x == 0) {
         *y = max_0;

From fb6ff6bf6e92761eed6806a2e37354164d4bb13d Mon Sep 17 00:00:00 2001
From: S-hhhhh <2320230838@mail.nankai.edu.cn>
Date: Sun, 24 Aug 2025 15:31:23 +0800
Subject: [PATCH 09/11] formatted

---
 src/infiniop-test/include/ops.hpp | 31 ++--
 src/infiniop-test/include/test.hpp | 70 ++++-----
 src/infiniop-test/src/main.cpp | 7 +-
 src/infiniop-test/src/ops/reduce_max.cpp | 22 +--
 src/infiniop-test/src/ops/reduce_mean.cpp | 22 +--
 src/infiniop-test/src/test.cpp | 11 +-
 .../ops/reduce_max/cpu/reduce_max_cpu.cc | 22 +--
 src/infiniop/ops/reduce_max/info.h | 12 +-
 src/infiniop/ops/reduce_max/nvidia/kernel.cuh | 3 +-
 .../reduce_max/nvidia/reduce_max_nvidia.cu | 8 +-
 src/infiniop/ops/reduce_max/operator.cc | 136 +++++++++---------
 src/infiniop/ops/reduce_max/reduce_max.h | 2 +-
 .../ops/reduce_mean/cpu/reduce_mean_cpu.cc | 22 +--
 src/infiniop/ops/reduce_mean/info.h | 12 +-
 .../ops/reduce_mean/nvidia/kernel.cuh | 4 +-
 .../reduce_mean/nvidia/reduce_mean_nvidia.cu | 8 +-
 src/infiniop/ops/reduce_mean/operator.cc | 132 ++++++++--------
 src/infiniop/ops/reduce_mean/reduce_mean.h | 6 +-
 src/infiniop/reduce/cuda/reduce.cuh | 12 +-
 src/infinirt/infinirt_impl.h | 3 +-
 src/utils.h | 2 +-
 src/utils/check.h | 32 ++---
 test/infiniop-test/test_generate/__init__.py | 9 +-
 .../test_generate/testcases/add.py | 30 ++--
 .../test_generate/testcases/causal_softmax.py | 26 +++-
 .../test_generate/testcases/clip.py | 87
++++++----- .../test_generate/testcases/mul.py | 53 ++++--- .../test_generate/testcases/rearrange.py | 70 ++++++--- .../test_generate/testcases/reduce_max.py | 58 +++++--- .../test_generate/testcases/reduce_mean.py | 58 +++++--- .../test_generate/testcases/rms_norm.py | 42 ++++-- .../test_generate/testcases/rope.py | 44 ++++-- .../test_generate/testcases/swiglu.py | 29 +++- test/infiniop/libinfiniop/op_register.py | 5 +- test/infiniop/reduce_max.py | 12 +- test/infiniop/reduce_mean.py | 10 +- 36 files changed, 646 insertions(+), 466 deletions(-) diff --git a/src/infiniop-test/include/ops.hpp b/src/infiniop-test/include/ops.hpp index 29e4e633e..bd3a3fd38 100644 --- a/src/infiniop-test/include/ops.hpp +++ b/src/infiniop-test/include/ops.hpp @@ -32,22 +32,21 @@ DECLARE_INFINIOP_TEST(reduce_max) /* * Register all the tests here */ -#define TEST_BUILDER_MAPPINGS \ - { \ - REGISTER_INFINIOP_TEST(gemm) \ - REGISTER_INFINIOP_TEST(random_sample) \ - REGISTER_INFINIOP_TEST(add) \ - REGISTER_INFINIOP_TEST(mul) \ - REGISTER_INFINIOP_TEST(clip) \ - REGISTER_INFINIOP_TEST(swiglu) \ - REGISTER_INFINIOP_TEST(rope) \ - REGISTER_INFINIOP_TEST(rms_norm) \ - REGISTER_INFINIOP_TEST(causal_softmax) \ - REGISTER_INFINIOP_TEST(rearrange) \ - REGISTER_INFINIOP_TEST(sub) \ - REGISTER_INFINIOP_TEST(reduce_mean) \ - REGISTER_INFINIOP_TEST(reduce_max) \ - } +#define TEST_BUILDER_MAPPINGS \ + { \ + REGISTER_INFINIOP_TEST(gemm) \ + REGISTER_INFINIOP_TEST(random_sample) \ + REGISTER_INFINIOP_TEST(add) \ + REGISTER_INFINIOP_TEST(mul) \ + REGISTER_INFINIOP_TEST(clip) \ + REGISTER_INFINIOP_TEST(swiglu) \ + REGISTER_INFINIOP_TEST(rope) \ + REGISTER_INFINIOP_TEST(rms_norm) \ + REGISTER_INFINIOP_TEST(causal_softmax) \ + REGISTER_INFINIOP_TEST(rearrange) \ + REGISTER_INFINIOP_TEST(sub) \ + REGISTER_INFINIOP_TEST(reduce_mean) \ + REGISTER_INFINIOP_TEST(reduce_max)} namespace infiniop_test { diff --git a/src/infiniop-test/include/test.hpp b/src/infiniop-test/include/test.hpp index 58b56b877..277061029 100644 --- a/src/infiniop-test/include/test.hpp +++ b/src/infiniop-test/include/test.hpp @@ -47,7 +47,7 @@ std::vector> runAllTests( const GGUFFileReader &, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations, - double rtol, double atol, bool equal_nan=false); + double rtol, double atol, bool equal_nan = false); // Run a single test read from a GGUF file std::shared_ptr runTest( @@ -55,8 +55,8 @@ std::shared_ptr runTest( infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations, double rtol, double atol, - size_t test_id, - bool equal_nan=false); + size_t test_id, + bool equal_nan = false); // Check if two tensors are close within given tolerance void allClose(std::shared_ptr actual, std::shared_ptr expected, double rtol = 1e-3, double atol = 1e-3, bool equal_nan = false); @@ -82,38 +82,38 @@ class Test { } // namespace infiniop_test::base // Quick macro for declaring a new testcase -#define DECLARE_INFINIOP_TEST(name) \ - namespace infiniop_test::name { \ - class Test : public infiniop_test::base::Test { \ - double _rtol, _atol; \ - bool _equal_nan; \ - \ - public: \ - static std::string op_name() { return #name; } \ - static std::shared_ptr build( \ - std::unordered_map> attributes, \ - std::unordered_map> tensors, \ - double, double, bool); \ - \ - static std::vector attribute_names(); \ - static std::vector tensor_names(); \ - static std::vector output_names(); \ - \ - std::shared_ptr run( \ - infiniopHandle_t handle, infiniDevice_t device, int device_id, \ - size_t warm_ups, size_t 
iterations) override; \ - \ - std::string toString() const override; \ - \ - ~Test(); \ - \ - private: \ - struct Attributes; \ - Attributes *_attributes; \ - Test() = delete; \ - Test(double rtol, double atol, bool equal_nan = false) \ - : _rtol(rtol), _atol(atol), _equal_nan(equal_nan) {} \ - }; \ +#define DECLARE_INFINIOP_TEST(name) \ + namespace infiniop_test::name { \ + class Test : public infiniop_test::base::Test { \ + double _rtol, _atol; \ + bool _equal_nan; \ + \ + public: \ + static std::string op_name() { return #name; } \ + static std::shared_ptr build( \ + std::unordered_map> attributes, \ + std::unordered_map> tensors, \ + double, double, bool); \ + \ + static std::vector attribute_names(); \ + static std::vector tensor_names(); \ + static std::vector output_names(); \ + \ + std::shared_ptr run( \ + infiniopHandle_t handle, infiniDevice_t device, int device_id, \ + size_t warm_ups, size_t iterations) override; \ + \ + std::string toString() const override; \ + \ + ~Test(); \ + \ + private: \ + struct Attributes; \ + Attributes *_attributes; \ + Test() = delete; \ + Test(double rtol, double atol, bool equal_nan = false) \ + : _rtol(rtol), _atol(atol), _equal_nan(equal_nan) {} \ + }; \ } namespace infiniop_test { diff --git a/src/infiniop-test/src/main.cpp b/src/infiniop-test/src/main.cpp index 9436cd85a..6805bd7f8 100644 --- a/src/infiniop-test/src/main.cpp +++ b/src/infiniop-test/src/main.cpp @@ -1,8 +1,8 @@ #include "gguf.hpp" #include "test.hpp" +#include #include #include -#include struct ParsedArgs { std::string file_path; // Mandatory argument: test.gguf file path infiniDevice_t device_type = INFINI_DEVICE_CPU; // Default to CPU @@ -96,8 +96,9 @@ ParsedArgs parseArgs(int argc, char *argv[]) { args.rtol = std::stod(argv[++i]); } else if (arg == "--equal-nan" && i + 1 < argc) { - args.equal_nan = (strcmp(argv[++i], "True") == 0 || strcmp(argv[i], "true") == 0) - ? true : false; + args.equal_nan = (strcmp(argv[++i], "True") == 0 || strcmp(argv[i], "true") == 0) + ? 
true
+                                 : false;
         } else {
             printUsage();
diff --git a/src/infiniop-test/src/ops/reduce_max.cpp b/src/infiniop-test/src/ops/reduce_max.cpp
index 1b10d0e02..37045ec78 100644
--- a/src/infiniop-test/src/ops/reduce_max.cpp
+++ b/src/infiniop-test/src/ops/reduce_max.cpp
@@ -40,9 +40,9 @@
 
     infiniopReduceMaxDescriptor_t op_desc;
     CHECK_OR(infiniopCreateReduceMaxDescriptor(handle, &op_desc,
-                                             _attributes->output->desc(),
-                                             _attributes->input->desc(),
-                                             _attributes->dim),
+                                               _attributes->output->desc(),
+                                               _attributes->input->desc(),
+                                               _attributes->dim),
              return TEST_FAILED(OP_CREATION_FAILED, "Failed to create ReduceMax descriptor"));
 
     auto input = _attributes->input->to(device, device_id);
@@ -58,10 +58,10 @@
     }
 
     CHECK_OR(infiniopReduceMax(op_desc,
-                             workspace, workspace_size,
-                             output->data(),
-                             input->data(),
-                             nullptr),
+                               workspace, workspace_size,
+                               output->data(),
+                               input->data(),
+                               nullptr),
              return TEST_FAILED(OP_EXECUTION_FAILED, "ReduceMax execution failed"));
 
     try {
@@ -75,10 +75,10 @@
     elapsed_time = benchmark(
         [=]() {
             infiniopReduceMax(op_desc,
-                            workspace, workspace_size,
-                            output->data(),
-                            input->data(),
-                            nullptr);
+                              workspace, workspace_size,
+                              output->data(),
+                              input->data(),
+                              nullptr);
         },
         warm_ups, iterations);
diff --git a/src/infiniop-test/src/ops/reduce_mean.cpp b/src/infiniop-test/src/ops/reduce_mean.cpp
index 4f1aa4bdd..9de22d49d 100644
--- a/src/infiniop-test/src/ops/reduce_mean.cpp
+++ b/src/infiniop-test/src/ops/reduce_mean.cpp
@@ -40,9 +40,9 @@
 
     infiniopReduceMeanDescriptor_t op_desc;
     CHECK_OR(infiniopCreateReduceMeanDescriptor(handle, &op_desc,
-                                              _attributes->output->desc(),
-                                              _attributes->input->desc(),
-                                              _attributes->dim),
+                                                _attributes->output->desc(),
+                                                _attributes->input->desc(),
+                                                _attributes->dim),
              return TEST_FAILED(OP_CREATION_FAILED, "Failed to create ReduceMean descriptor"));
 
     auto input = _attributes->input->to(device, device_id);
@@ -58,10 +58,10 @@
     }
 
     CHECK_OR(infiniopReduceMean(op_desc,
-                              workspace, workspace_size,
-                              output->data(),
-                              input->data(),
-                              nullptr),
+                                workspace, workspace_size,
+                                output->data(),
+                                input->data(),
+                                nullptr),
              return TEST_FAILED(OP_EXECUTION_FAILED, "ReduceMean execution failed"));
 
     try {
@@ -75,10 +75,10 @@
     elapsed_time = benchmark(
         [=]() {
             infiniopReduceMean(op_desc,
-                             workspace, workspace_size,
-                             output->data(),
-                             input->data(),
-                             nullptr);
+                               workspace, workspace_size,
+                               output->data(),
+                               input->data(),
+                               nullptr);
         },
         warm_ups, iterations);
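The allClose hunk below gives NaNs the same treatment as numpy.allclose: by default any
NaN is a mismatch, while with equal_nan a NaN only fails when it is paired with a
non-NaN on the other side. A standalone NumPy illustration of the intended semantics
(illustration only, not part of the test suite):

    import numpy as np

    a = np.array([1.0, np.nan])
    b = np.array([1.0, np.nan])
    assert not np.allclose(a, b)              # NaN != NaN by default
    assert np.allclose(a, b, equal_nan=True)  # matching NaNs are accepted
    assert not np.allclose(a, np.array([1.0, 2.0]), equal_nan=True)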
diff --git a/src/infiniop-test/src/test.cpp b/src/infiniop-test/src/test.cpp
index d4c0bab49..0cbfe067a 100644
--- a/src/infiniop-test/src/test.cpp
+++ b/src/infiniop-test/src/test.cpp
@@ -158,15 +158,14 @@ void allClose(std::shared_ptr<Tensor> actual_, std::shared_ptr<Tensor> expected_
     for (size_t i = 0; i < total; i++) {
         double a_ = getVal((char *)actual->data() + actual_offset, actual->ggml_type());
         double e_ = getVal((char *)expected->data() + expected_offset, expected->ggml_type());
-        if (std::isnan(a_) || std::isnan(e_)){
-            if ((equal_nan && (std::isnan(a_) != std::isnan(e_))) || !equal_nan){
-                num_failed ++;
+        if (std::isnan(a_) || std::isnan(e_)) {
+            if ((equal_nan && (std::isnan(a_) != std::isnan(e_))) || !equal_nan) {
                 if (num_failed == 0) {
                     first_failed_msg = "First failed at index " + std::to_string(i) + " with value " + std::to_string(a_) + " but should be " + std::to_string(e_) + ".";
                 }
+                num_failed++;
             }
-        }
-        else{
+        } else {
             if (std::fabs(a_ - e_) > atol && std::fabs(a_ - e_) > rtol * std::fmax(std::fabs(a_), std::fabs(e_))) {
                 if (num_failed == 0) {
                     first_failed_msg = "First failed at index " + std::to_string(i) + " with value " + std::to_string(a_) + " but should be " + std::to_string(e_) + ".";
@@ -174,7 +173,7 @@ void allClose(std::shared_ptr<Tensor> actual_, std::shared_ptr<Tensor> expected_
                 num_failed++;
             }
         }
-        
+
         incrementOffset(actual_offset, actual->strides(), ggmlTypeSize(actual->ggml_type()),
                         expected_offset, expected->strides(), ggmlTypeSize(expected->ggml_type()),
                         counter, shape);
diff --git a/src/infiniop/ops/reduce_max/cpu/reduce_max_cpu.cc b/src/infiniop/ops/reduce_max/cpu/reduce_max_cpu.cc
index 409a4ca02..5ed28d75e 100644
--- a/src/infiniop/ops/reduce_max/cpu/reduce_max_cpu.cc
+++ b/src/infiniop/ops/reduce_max/cpu/reduce_max_cpu.cc
@@ -23,7 +23,7 @@ infiniStatus_t reduce_max(const ReduceMaxInfo *info, T *y, const T *x) {
     const size_t batch_size = info->shape[0];
     const size_t channels = info->shape[1];
     const size_t rows = info->shape[2];
-    const size_t cols = info->shape[3];  // reduction dimension
+    const size_t cols = info->shape[3]; // reduction dimension
 
     const ptrdiff_t y_batch_stride = info->y_strides[0];
     const ptrdiff_t y_channel_stride = info->y_strides[1];
@@ -32,20 +32,20 @@ infiniStatus_t reduce_max(const ReduceMaxInfo *info, T *y, const T *x) {
     const ptrdiff_t x_channel_stride = info->x_strides[1];
     const ptrdiff_t x_row_stride = info->x_strides[2];
     const ptrdiff_t x_col_stride = info->x_strides[3];
-    
-    #pragma omp parallel for collapse(3)
+
+#pragma omp parallel for collapse(3)
     for (size_t batch = 0; batch < batch_size; ++batch) {
         for (size_t channel = 0; channel < channels; ++channel) {
             for (size_t row = 0; row < rows; ++row) {
-                const T* input_start = x + batch * x_batch_stride
-                                      + channel * x_channel_stride
-                                      + row * x_row_stride;
-                T* output_ptr = y + batch * y_batch_stride
-                               + channel * y_channel_stride
+                const T *input_start = x + batch * x_batch_stride
+                                       + channel * x_channel_stride
+                                       + row * x_row_stride;
+                T *output_ptr = y + batch * y_batch_stride
+                                + channel * y_channel_stride
                                 + row * y_row_stride;
-                
+
                 float max = op::common_cpu::reduce_op::max(input_start, cols, x_col_stride);
-                
+
                 if constexpr (std::is_same<T, fp16_t>::value || std::is_same<T, bf16_t>::value) {
                     *output_ptr = utils::cast<T>(max);
                 } else {
@@ -54,7 +54,7 @@ infiniStatus_t reduce_max(const ReduceMaxInfo *info, T *y, const T *x) {
             }
         }
     }
-    
+
     return INFINI_STATUS_SUCCESS;
 }
 infiniStatus_t Descriptor::calculate(
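The info.h change shown next normalizes every request to one fixed layout: the reduced
axis is swapped to the last position and the shape is left-padded with size-1 axes
until it is 4-D, so both the CPU and CUDA backends only ever reduce the last axis of a
4-D view. A pure-Python sketch of that bookkeeping (normalize_to_4d is a hypothetical
helper, not code from the repo; strides shown are for a contiguous (4, 2048, 4)
tensor):

    def normalize_to_4d(shape, strides, dim):
        # Sketch of ReduceMaxInfo::create's normalization: move the reduced
        # axis to the end, then left-pad with size-1 axes (stride 0) to 4-D.
        shape, strides = list(shape), list(strides)
        last = len(shape) - 1
        if dim != last:
            shape[dim], shape[last] = shape[last], shape[dim]
            strides[dim], strides[last] = strides[last], strides[dim]
        while len(shape) < 4:
            shape.insert(0, 1)
            strides.insert(0, 0)
        return shape, strides

    # Reducing dim=1 of a (4, 2048, 4) tensor yields the view
    # ([1, 4, 4, 2048], [0, 8192, 1, 4]): last axis is the reduction.
    print(normalize_to_4d([4, 2048, 4], [8192, 4, 1], 1))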
diff --git a/src/infiniop/ops/reduce_max/info.h b/src/infiniop/ops/reduce_max/info.h
index 0babca6e6..99bb5be19 100644
--- a/src/infiniop/ops/reduce_max/info.h
+++ b/src/infiniop/ops/reduce_max/info.h
@@ -29,24 +29,24 @@ class ReduceMaxInfo {
             CHECK_STATUS(INFINI_STATUS_BAD_TENSOR_SHAPE);
         }
         CHECK_REDUCE_SHAPE(x_desc->shape(), dim, y_desc->shape());
-        if (ndim > 4) return INFINI_STATUS_BAD_TENSOR_SHAPE;
-        else if (ndim == 0){
+        if (ndim > 4) {
+            return INFINI_STATUS_BAD_TENSOR_SHAPE;
+        } else if (ndim == 0) {
             std::vector<size_t> shape = {1, 1, 1, 1};
             std::vector<ptrdiff_t> y_strides = {0, 0, 0, 0};
             std::vector<ptrdiff_t> x_strides = {0, 0, 0, 0};
             return utils::Result<ReduceMaxInfo>(ReduceMaxInfo{
                 dtype, shape, y_strides, x_strides});
-        }
-        else{
+        } else {
             std::vector<size_t> shape = x_desc->shape();
             std::vector<ptrdiff_t> y_strides = y_desc->strides();
             std::vector<ptrdiff_t> x_strides = x_desc->strides();
-            if (dim != (shape.size() - 1)){
+            if (dim != (shape.size() - 1)) {
                 std::swap(shape[dim], shape[shape.size() - 1]);
                 std::swap(y_strides[dim], y_strides[shape.size() - 1]);
                 std::swap(x_strides[dim], x_strides[shape.size() - 1]);
             }
-            while (shape.size() < 4){
+            while (shape.size() < 4) {
                 shape.insert(shape.begin(), 1);
                 y_strides.insert(y_strides.begin(), 0);
                 x_strides.insert(x_strides.begin(), 0);
diff --git a/src/infiniop/ops/reduce_max/nvidia/kernel.cuh b/src/infiniop/ops/reduce_max/nvidia/kernel.cuh
index 864db1047..fec2f9341 100644
--- a/src/infiniop/ops/reduce_max/nvidia/kernel.cuh
+++ b/src/infiniop/ops/reduce_max/nvidia/kernel.cuh
@@ -6,7 +6,7 @@ __device__ void ReduceMaxKernel(
     Tdata *y_, const Tdata *x_,
     size_t batch, size_t channels, size_t height, size_t width,
     ptrdiff_t y_stride_b, ptrdiff_t y_stride_c, ptrdiff_t y_stride_h,
-    ptrdiff_t x_stride_b, ptrdiff_t x_stride_c, ptrdiff_t x_stride_h, ptrdiff_t x_stride_w) {    
+    ptrdiff_t x_stride_b, ptrdiff_t x_stride_c, ptrdiff_t x_stride_h, ptrdiff_t x_stride_w) {
 
     Tdata *y = y_ + blockIdx.x * y_stride_b + blockIdx.y * y_stride_c + blockIdx.z * y_stride_h;
     const Tdata *x = x_ + blockIdx.x * x_stride_b + blockIdx.y * x_stride_c + blockIdx.z * x_stride_h;
@@ -16,7 +16,6 @@ __device__ void ReduceMaxKernel(
     if (threadIdx.x == 0) {
         *y = max_0;
     }
-    
 }
 
 #endif // __REDUCE_MAX_KERNEL_CUH__
diff --git a/src/infiniop/ops/reduce_max/nvidia/reduce_max_nvidia.cu b/src/infiniop/ops/reduce_max/nvidia/reduce_max_nvidia.cu
index 3aa8e5ff5..f64f596f4 100644
--- a/src/infiniop/ops/reduce_max/nvidia/reduce_max_nvidia.cu
+++ b/src/infiniop/ops/reduce_max/nvidia/reduce_max_nvidia.cu
@@ -47,7 +47,7 @@ infiniStatus_t launchKernel(void *y, const void *x, infiniDtype_t dtype,
                             ptrdiff_t y_stride_b, ptrdiff_t y_stride_c, ptrdiff_t y_stride_h,
                             ptrdiff_t x_stride_b, ptrdiff_t x_stride_c, ptrdiff_t x_stride_h, ptrdiff_t x_stride_w,
                             cudaStream_t stream) {
-    dim3 grid=dim3(uint32_t(batch_size), uint32_t(channels), uint32_t(height));
+    dim3 grid = dim3(uint32_t(batch_size), uint32_t(channels), uint32_t(height));
     if (dtype == INFINI_DTYPE_F16) {
         ReduceMax<BLOCK_SIZE>
             <<<grid, BLOCK_SIZE, 0, stream>>>((half *)y, (const half *)x,
@@ -80,17 +80,17 @@ infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size,
     if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_1024) {
         CHECK_STATUS(launchKernel<CUDA_BLOCK_SIZE_1024>(
             y, x, _info.dtype, _info.shape[0], _info.shape[1], _info.shape[2], _info.shape[3],
-            _info.y_strides[0], _info.y_strides[1], _info.y_strides[2], 
+            _info.y_strides[0], _info.y_strides[1], _info.y_strides[2],
             _info.x_strides[0], _info.x_strides[1], _info.x_strides[2], _info.x_strides[3], stream));
     } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_512) {
         CHECK_STATUS(launchKernel<CUDA_BLOCK_SIZE_512>(
             y, x, _info.dtype, _info.shape[0], _info.shape[1], _info.shape[2], _info.shape[3],
-            _info.y_strides[0], _info.y_strides[1], _info.y_strides[2], 
+            _info.y_strides[0], _info.y_strides[1], _info.y_strides[2],
             _info.x_strides[0], _info.x_strides[1], _info.x_strides[2], _info.x_strides[3], stream));
     } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_4096) {
         CHECK_STATUS(launchKernel<CUDA_BLOCK_SIZE_4096>(
             y, x, _info.dtype, _info.shape[0], _info.shape[1], _info.shape[2], _info.shape[3],
-            _info.y_strides[0], _info.y_strides[1], _info.y_strides[2], 
+            _info.y_strides[0], _info.y_strides[1], _info.y_strides[2],
             _info.x_strides[0], _info.x_strides[1], _info.x_strides[2], _info.x_strides[3], stream));
     } else {
         return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED;
diff --git a/src/infiniop/ops/reduce_max/operator.cc b/src/infiniop/ops/reduce_max/operator.cc
index c22ebeb1b..fc8a976b6 100644
--- a/src/infiniop/ops/reduce_max/operator.cc
+++ b/src/infiniop/ops/reduce_max/operator.cc
@@ -22,13 +22,13 @@ __C infiniStatus_t infiniopCreateReduceMaxDescriptor(
     infiniopTensorDescriptor_t
input_desc, size_t dim) { -#define CREATE(CASE, NAMESPACE) \ - case CASE: \ - return op::reduce_max::NAMESPACE::Descriptor::create( \ - handle, \ - reinterpret_cast(desc_ptr), \ - output_desc, \ - input_desc, \ +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::reduce_max::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + output_desc, \ + input_desc, \ dim); switch (handle->device) { @@ -44,20 +44,20 @@ __C infiniStatus_t infiniopCreateReduceMaxDescriptor( #ifdef ENABLE_METAX_API CREATE(INFINI_DEVICE_METAX, metax) #endif -// #ifdef ENABLE_ASCEND_API -// CREATE(INFINI_DEVICE_ASCEND, ascend) -// #endif -// #ifdef ENABLE_CAMBRICON_MLU -// case DevCambriconMlu: { -// return bangCreateReduceMaxDescriptor((BangHandle_t)handle, (ReduceMaxBangDescriptor_t *)desc_ptr, output_desc, input_desc, dim); -// // return cnnlCreateReduceMaxDescriptor((BangHandle_t) handle, (ReduceMaxCnnlDescriptor_t *) desc_ptr, output_desc, input_desc, dim); -// } -// #endif -// #ifdef ENABLE_MTHREADS_GPU -// case DevMthreadsGpu: { -// return musaCreateReduceMaxDescriptor((MusaHandle_t)handle, (ReduceMaxMusaDescriptor_t *)desc_ptr, output_desc, input_desc, dim); -// } -// #endif + // #ifdef ENABLE_ASCEND_API + // CREATE(INFINI_DEVICE_ASCEND, ascend) + // #endif + // #ifdef ENABLE_CAMBRICON_MLU + // case DevCambriconMlu: { + // return bangCreateReduceMaxDescriptor((BangHandle_t)handle, (ReduceMaxBangDescriptor_t *)desc_ptr, output_desc, input_desc, dim); + // // return cnnlCreateReduceMaxDescriptor((BangHandle_t) handle, (ReduceMaxCnnlDescriptor_t *) desc_ptr, output_desc, input_desc, dim); + // } + // #endif + // #ifdef ENABLE_MTHREADS_GPU + // case DevMthreadsGpu: { + // return musaCreateReduceMaxDescriptor((MusaHandle_t)handle, (ReduceMaxMusaDescriptor_t *)desc_ptr, output_desc, input_desc, dim); + // } + // #endif } return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; } @@ -82,20 +82,20 @@ __C infiniStatus_t infiniopGetReduceMaxWorkspaceSize(infiniopReduceMaxDescriptor #ifdef ENABLE_METAX_API GET(INFINI_DEVICE_METAX, metax) #endif -// #ifdef ENABLE_ASCEND_API -// GET(INFINI_DEVICE_ASCEND, ascend) -// #endif -// #ifdef ENABLE_CAMBRICON_MLU -// case DevCambriconMlu: { -// return bangGetReduceMaxWorkspaceSize((ReduceMaxBangDescriptor_t)desc, size); -// // return cnnlGetReduceMaxWorkspaceSize((ReduceMaxCnnlDescriptor_t) desc, size); -// } -// #endif -// #ifdef ENABLE_MTHREADS_GPU -// case DevMthreadsGpu: { -// return musaGetReduceMaxWorkspaceSize((ReduceMaxMusaDescriptor_t)desc, size); -// } -// #endif + // #ifdef ENABLE_ASCEND_API + // GET(INFINI_DEVICE_ASCEND, ascend) + // #endif + // #ifdef ENABLE_CAMBRICON_MLU + // case DevCambriconMlu: { + // return bangGetReduceMaxWorkspaceSize((ReduceMaxBangDescriptor_t)desc, size); + // // return cnnlGetReduceMaxWorkspaceSize((ReduceMaxCnnlDescriptor_t) desc, size); + // } + // #endif + // #ifdef ENABLE_MTHREADS_GPU + // case DevMthreadsGpu: { + // return musaGetReduceMaxWorkspaceSize((ReduceMaxMusaDescriptor_t)desc, size); + // } + // #endif } return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; } @@ -107,9 +107,9 @@ __C infiniStatus_t infiniopReduceMax( const void *input, void *stream) { -#define CALCULATE(CASE, NAMESPACE) \ - case CASE: \ - return reinterpret_cast(desc)->calculate( \ +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc)->calculate( \ workspace, workspace_size, output, input, stream); switch (desc->device_type) { @@ -125,29 +125,29 @@ __C infiniStatus_t infiniopReduceMax( #ifdef ENABLE_METAX_API 
CALCULATE(INFINI_DEVICE_METAX, metax) #endif -// #ifdef ENABLE_ASCEND_API -// CALCULATE(INFINI_DEVICE_ASCEND, ascend) -// #endif -// #ifdef ENABLE_CAMBRICON_MLU -// case DevCambriconMlu: { -// return bangReduceMax((ReduceMaxBangDescriptor_t)desc, workspace, workspace_size, output, input, stream); -// // return cnnlReduceMax((ReduceMaxCnnlDescriptor_t) desc, workspace, workspace_size, output, input, stream); -// } -// #endif -// #ifdef ENABLE_MTHREADS_GPU -// case DevMthreadsGpu: { -// return musaReduceMax((ReduceMaxMusaDescriptor_t)desc, workspace, workspace_size, output, input, stream); -// } -// #endif + // #ifdef ENABLE_ASCEND_API + // CALCULATE(INFINI_DEVICE_ASCEND, ascend) + // #endif + // #ifdef ENABLE_CAMBRICON_MLU + // case DevCambriconMlu: { + // return bangReduceMax((ReduceMaxBangDescriptor_t)desc, workspace, workspace_size, output, input, stream); + // // return cnnlReduceMax((ReduceMaxCnnlDescriptor_t) desc, workspace, workspace_size, output, input, stream); + // } + // #endif + // #ifdef ENABLE_MTHREADS_GPU + // case DevMthreadsGpu: { + // return musaReduceMax((ReduceMaxMusaDescriptor_t)desc, workspace, workspace_size, output, input, stream); + // } + // #endif } return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; } __C infiniStatus_t infiniopDestroyReduceMaxDescriptor(infiniopReduceMaxDescriptor_t desc) { -#define DESTROY(CASE, NAMESPACE) \ - case CASE: \ - delete reinterpret_cast(desc); \ +#define DESTROY(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ return INFINI_STATUS_SUCCESS; switch (desc->device_type) { @@ -163,19 +163,19 @@ __C infiniStatus_t infiniopDestroyReduceMaxDescriptor(infiniopReduceMaxDescripto #ifdef ENABLE_METAX_API DESTROY(INFINI_DEVICE_METAX, metax) #endif -// #ifdef ENABLE_ASCEND_API -// DESTROY(INFINI_DEVICE_ASCEND, ascend) -// #endif -// #ifdef ENABLE_CAMBRICON_MLU -// case DevCambriconMlu: { -// return bangDestroyReduceMaxDescriptor((ReduceMaxBangDescriptor_t)desc); -// // return cnnlDestroyReduceMaxDescriptor((ReduceMaxCnnlDescriptor_t) desc); -// } -// #endif -// #ifdef ENABLE_MTHREADS_GPU -// case DevMthreadsGpu: -// return musaDestroyReduceMaxDescriptor((ReduceMaxMusaDescriptor_t)desc); -// #endif + // #ifdef ENABLE_ASCEND_API + // DESTROY(INFINI_DEVICE_ASCEND, ascend) + // #endif + // #ifdef ENABLE_CAMBRICON_MLU + // case DevCambriconMlu: { + // return bangDestroyReduceMaxDescriptor((ReduceMaxBangDescriptor_t)desc); + // // return cnnlDestroyReduceMaxDescriptor((ReduceMaxCnnlDescriptor_t) desc); + // } + // #endif + // #ifdef ENABLE_MTHREADS_GPU + // case DevMthreadsGpu: + // return musaDestroyReduceMaxDescriptor((ReduceMaxMusaDescriptor_t)desc); + // #endif } return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; } \ No newline at end of file diff --git a/src/infiniop/ops/reduce_max/reduce_max.h b/src/infiniop/ops/reduce_max/reduce_max.h index 412f67162..6ef8630f2 100644 --- a/src/infiniop/ops/reduce_max/reduce_max.h +++ b/src/infiniop/ops/reduce_max/reduce_max.h @@ -7,7 +7,7 @@ #define DESCRIPTOR(NAMESPACE) \ \ namespace op::reduce_max::NAMESPACE { \ - class Descriptor final : public InfiniopDescriptor { \ + class Descriptor final : public InfiniopDescriptor { \ struct Opaque; \ Opaque *_opaque; \ ReduceMaxInfo _info; \ diff --git a/src/infiniop/ops/reduce_mean/cpu/reduce_mean_cpu.cc b/src/infiniop/ops/reduce_mean/cpu/reduce_mean_cpu.cc index b56184ef5..5eb2b5419 100644 --- a/src/infiniop/ops/reduce_mean/cpu/reduce_mean_cpu.cc +++ b/src/infiniop/ops/reduce_mean/cpu/reduce_mean_cpu.cc @@ -23,7 +23,7 @@ infiniStatus_t reduce_mean(const 
ReduceMeanInfo *info, T *y, const T *x) {
     const size_t batch_size = info->shape[0];
     const size_t channels = info->shape[1];
     const size_t rows = info->shape[2];
-    const size_t cols = info->shape[3];  // reduction dimension
+    const size_t cols = info->shape[3]; // reduction dimension
 
     const ptrdiff_t y_batch_stride = info->y_strides[0];
     const ptrdiff_t y_channel_stride = info->y_strides[1];
@@ -32,20 +32,20 @@ infiniStatus_t reduce_mean(const ReduceMeanInfo *info, T *y, const T *x) {
     const ptrdiff_t x_channel_stride = info->x_strides[1];
     const ptrdiff_t x_row_stride = info->x_strides[2];
     const ptrdiff_t x_col_stride = info->x_strides[3];
-    
-    #pragma omp parallel for collapse(3)
+
+#pragma omp parallel for collapse(3)
     for (size_t batch = 0; batch < batch_size; ++batch) {
         for (size_t channel = 0; channel < channels; ++channel) {
             for (size_t row = 0; row < rows; ++row) {
-                const T* input_start = x + batch * x_batch_stride
-                                      + channel * x_channel_stride
-                                      + row * x_row_stride;
-                T* output_ptr = y + batch * y_batch_stride
-                               + channel * y_channel_stride
+                const T *input_start = x + batch * x_batch_stride
+                                       + channel * x_channel_stride
+                                       + row * x_row_stride;
+                T *output_ptr = y + batch * y_batch_stride
+                                + channel * y_channel_stride
                                 + row * y_row_stride;
-                
+
                 float mean = op::common_cpu::reduce_op::sum(input_start, cols, x_col_stride) / cols;
-                
+
                 if constexpr (std::is_same<T, fp16_t>::value || std::is_same<T, bf16_t>::value) {
                     *output_ptr = utils::cast<T>(mean);
                 } else {
@@ -54,7 +54,7 @@ infiniStatus_t reduce_mean(const ReduceMeanInfo *info, T *y, const T *x) {
             }
         }
     }
-    
+
     return INFINI_STATUS_SUCCESS;
 }
 infiniStatus_t Descriptor::calculate(
diff --git a/src/infiniop/ops/reduce_mean/info.h b/src/infiniop/ops/reduce_mean/info.h
index a47c81133..6c11e07d3 100644
--- a/src/infiniop/ops/reduce_mean/info.h
+++ b/src/infiniop/ops/reduce_mean/info.h
@@ -29,24 +29,24 @@ class ReduceMeanInfo {
             CHECK_STATUS(INFINI_STATUS_BAD_TENSOR_SHAPE);
         }
         CHECK_REDUCE_SHAPE(x_desc->shape(), dim, y_desc->shape());
-        if (ndim > 4) return INFINI_STATUS_BAD_TENSOR_SHAPE;
-        else if (ndim == 0){
+        if (ndim > 4) {
+            return INFINI_STATUS_BAD_TENSOR_SHAPE;
+        } else if (ndim == 0) {
            std::vector<size_t> shape = {1, 1, 1, 1};
            std::vector<ptrdiff_t> y_strides = {0, 0, 0, 0};
            std::vector<ptrdiff_t> x_strides = {0, 0, 0, 0};
            return utils::Result<ReduceMeanInfo>(ReduceMeanInfo{
                dtype, shape, y_strides, x_strides});
-        }
-        else{
+        } else {
             std::vector<size_t> shape = x_desc->shape();
             std::vector<ptrdiff_t> y_strides = y_desc->strides();
             std::vector<ptrdiff_t> x_strides = x_desc->strides();
-            if (dim != (shape.size() - 1)){
+            if (dim != (shape.size() - 1)) {
                 std::swap(shape[dim], shape[shape.size() - 1]);
                 std::swap(y_strides[dim], y_strides[shape.size() - 1]);
                 std::swap(x_strides[dim], x_strides[shape.size() - 1]);
             }
-            while (shape.size() < 4){
+            while (shape.size() < 4) {
                 shape.insert(shape.begin(), 1);
                 y_strides.insert(y_strides.begin(), 0);
                 x_strides.insert(x_strides.begin(), 0);
diff --git a/src/infiniop/ops/reduce_mean/nvidia/kernel.cuh b/src/infiniop/ops/reduce_mean/nvidia/kernel.cuh
index 54914d98a..e70748605 100644
--- a/src/infiniop/ops/reduce_mean/nvidia/kernel.cuh
+++ b/src/infiniop/ops/reduce_mean/nvidia/kernel.cuh
@@ -6,7 +6,7 @@ __device__ void ReduceMeanKernel(
     Tdata *y_, const Tdata *x_,
     size_t batch, size_t channels, size_t height, size_t width,
     ptrdiff_t y_stride_b, ptrdiff_t y_stride_c, ptrdiff_t y_stride_h,
-    ptrdiff_t x_stride_b, ptrdiff_t x_stride_c, ptrdiff_t x_stride_h, ptrdiff_t x_stride_w) {    
+    ptrdiff_t x_stride_b, ptrdiff_t x_stride_c, ptrdiff_t x_stride_h, ptrdiff_t x_stride_w) {
 
     Tdata *y = y_ + blockIdx.x * y_stride_b + blockIdx.y * y_stride_c
+ blockIdx.z * y_stride_h; const Tdata *x = x_ + blockIdx.x * x_stride_b + blockIdx.y * x_stride_c + blockIdx.z * x_stride_h; @@ -15,7 +15,7 @@ __device__ void ReduceMeanKernel( Tcompute sum_0 = op::common_cuda::reduce_op::sum(x, width, x_stride_w); if (threadIdx.x == 0) { // mean_ = sum_0/width; - *y = sum_0/width; + *y = sum_0 / width; } // __syncthreads(); diff --git a/src/infiniop/ops/reduce_mean/nvidia/reduce_mean_nvidia.cu b/src/infiniop/ops/reduce_mean/nvidia/reduce_mean_nvidia.cu index 26fd4e8ee..bfc26e23d 100644 --- a/src/infiniop/ops/reduce_mean/nvidia/reduce_mean_nvidia.cu +++ b/src/infiniop/ops/reduce_mean/nvidia/reduce_mean_nvidia.cu @@ -47,7 +47,7 @@ infiniStatus_t launchKernel(void *y, const void *x, infiniDtype_t dtype, ptrdiff_t y_stride_b, ptrdiff_t y_stride_c, ptrdiff_t y_stride_h, ptrdiff_t x_stride_b, ptrdiff_t x_stride_c, ptrdiff_t x_stride_h, ptrdiff_t x_stride_w, cudaStream_t stream) { - dim3 grid=dim3(uint32_t(batch_size), uint32_t(channels), uint32_t(height)); + dim3 grid = dim3(uint32_t(batch_size), uint32_t(channels), uint32_t(height)); if (dtype == INFINI_DTYPE_F16) { ReduceMean <<>>((half *)y, (const half *)x, @@ -80,17 +80,17 @@ infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_1024) { CHECK_STATUS(launchKernel( y, x, _info.dtype, _info.shape[0], _info.shape[1], _info.shape[2], _info.shape[3], - _info.y_strides[0], _info.y_strides[1], _info.y_strides[2], + _info.y_strides[0], _info.y_strides[1], _info.y_strides[2], _info.x_strides[0], _info.x_strides[1], _info.x_strides[2], _info.x_strides[3], stream)); } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_512) { CHECK_STATUS(launchKernel( y, x, _info.dtype, _info.shape[0], _info.shape[1], _info.shape[2], _info.shape[3], - _info.y_strides[0], _info.y_strides[1], _info.y_strides[2], + _info.y_strides[0], _info.y_strides[1], _info.y_strides[2], _info.x_strides[0], _info.x_strides[1], _info.x_strides[2], _info.x_strides[3], stream)); } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_4096) { CHECK_STATUS(launchKernel( y, x, _info.dtype, _info.shape[0], _info.shape[1], _info.shape[2], _info.shape[3], - _info.y_strides[0], _info.y_strides[1], _info.y_strides[2], + _info.y_strides[0], _info.y_strides[1], _info.y_strides[2], _info.x_strides[0], _info.x_strides[1], _info.x_strides[2], _info.x_strides[3], stream)); } else { return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; diff --git a/src/infiniop/ops/reduce_mean/operator.cc b/src/infiniop/ops/reduce_mean/operator.cc index 32e4e3619..3696f95d2 100644 --- a/src/infiniop/ops/reduce_mean/operator.cc +++ b/src/infiniop/ops/reduce_mean/operator.cc @@ -22,13 +22,13 @@ __C infiniStatus_t infiniopCreateReduceMeanDescriptor( infiniopTensorDescriptor_t x_desc, size_t dim) { -#define CREATE(CASE, NAMESPACE) \ - case CASE: \ +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ return op::reduce_mean::NAMESPACE::Descriptor::create( \ - handle, \ + handle, \ reinterpret_cast(desc_ptr), \ - y_desc, \ - x_desc, \ + y_desc, \ + x_desc, \ dim); switch (handle->device) { @@ -44,28 +44,28 @@ __C infiniStatus_t infiniopCreateReduceMeanDescriptor( #ifdef ENABLE_METAX_API CREATE(INFINI_DEVICE_METAX, metax) #endif -// #ifdef ENABLE_ASCEND_API -// CREATE(INFINI_DEVICE_ASCEND, ascend) -// #endif -// #ifdef ENABLE_CAMBRICON_MLU -// case DevCambriconMlu: { -// return bangCreateCausalSoftmaxDescriptor((BangHandle_t)handle, (CausalSoftmaxBangDescriptor_t *)desc_ptr, 
y_desc); -// // return cnnlCreateCausalSoftmaxDescriptor((BangHandle_t) handle, (CausalSoftmaxCnnlDescriptor_t *) desc_ptr, y_desc); -// } -// #endif -// #ifdef ENABLE_MTHREADS_GPU -// case DevMthreadsGpu: { -// return musaCreateCausalSoftmaxDescriptor((MusaHandle_t)handle, (CausalSoftmaxMusaDescriptor_t *)desc_ptr, y_desc); -// } -// #endif + // #ifdef ENABLE_ASCEND_API + // CREATE(INFINI_DEVICE_ASCEND, ascend) + // #endif + // #ifdef ENABLE_CAMBRICON_MLU + // case DevCambriconMlu: { + // return bangCreateCausalSoftmaxDescriptor((BangHandle_t)handle, (CausalSoftmaxBangDescriptor_t *)desc_ptr, y_desc); + // // return cnnlCreateCausalSoftmaxDescriptor((BangHandle_t) handle, (CausalSoftmaxCnnlDescriptor_t *) desc_ptr, y_desc); + // } + // #endif + // #ifdef ENABLE_MTHREADS_GPU + // case DevMthreadsGpu: { + // return musaCreateCausalSoftmaxDescriptor((MusaHandle_t)handle, (CausalSoftmaxMusaDescriptor_t *)desc_ptr, y_desc); + // } + // #endif } return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; } __C infiniStatus_t infiniopGetReduceMeanWorkspaceSize(infiniopReduceMeanDescriptor_t desc, size_t *size) { -#define GET(CASE, NAMESPACE) \ - case CASE: \ +#define GET(CASE, NAMESPACE) \ + case CASE: \ *size = reinterpret_cast(desc)->workspaceSize(); \ return INFINI_STATUS_SUCCESS; @@ -82,21 +82,21 @@ __C infiniStatus_t infiniopGetReduceMeanWorkspaceSize(infiniopReduceMeanDescript #ifdef ENABLE_METAX_API GET(INFINI_DEVICE_METAX, metax) #endif -// #ifdef ENABLE_ASCEND_API -// GET(INFINI_DEVICE_ASCEND, ascend) -// #endif -// #ifdef ENABLE_CAMBRICON_MLU -// case DevCambriconMlu: { -// return bangGetCausalSoftmaxWorkspaceSize((CausalSoftmaxBangDescriptor_t)desc, size); -// // return cnnlGetCausalSoftmaxWorkspaceSize((CausalSoftmaxCnnlDescriptor_t) desc, size); -// } + // #ifdef ENABLE_ASCEND_API + // GET(INFINI_DEVICE_ASCEND, ascend) + // #endif + // #ifdef ENABLE_CAMBRICON_MLU + // case DevCambriconMlu: { + // return bangGetCausalSoftmaxWorkspaceSize((CausalSoftmaxBangDescriptor_t)desc, size); + // // return cnnlGetCausalSoftmaxWorkspaceSize((CausalSoftmaxCnnlDescriptor_t) desc, size); + // } -// #endif -// #ifdef ENABLE_MTHREADS_GPU -// case DevMthreadsGpu: { -// return musaGetCausalSoftmaxWorkspaceSize((CausalSoftmaxMusaDescriptor_t)desc, size); -// } -// #endif + // #endif + // #ifdef ENABLE_MTHREADS_GPU + // case DevMthreadsGpu: { + // return musaGetCausalSoftmaxWorkspaceSize((CausalSoftmaxMusaDescriptor_t)desc, size); + // } + // #endif } return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; } @@ -108,8 +108,8 @@ __C infiniStatus_t infiniopReduceMean( const void *x, void *stream) { -#define CALCULATE(CASE, NAMESPACE) \ - case CASE: \ +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ return reinterpret_cast(desc)->calculate( \ workspace, workspace_size, y, x, stream); @@ -126,28 +126,28 @@ __C infiniStatus_t infiniopReduceMean( #ifdef ENABLE_METAX_API CALCULATE(INFINI_DEVICE_METAX, metax) #endif -// #ifdef ENABLE_ASCEND_API -// CALCULATE(INFINI_DEVICE_ASCEND, ascend) -// #endif -// #ifdef ENABLE_CAMBRICON_MLU -// case DevCambriconMlu: { -// return bangCausalSoftmax((CausalSoftmaxBangDescriptor_t)desc, workspace, workspace_size, data, stream); -// // return cnnlCausalSoftmax((CausalSoftmaxCnnlDescriptor_t) desc, workspace, workspace_size, data, stream); -// } -// #endif -// #ifdef ENABLE_MTHREADS_GPU -// case DevMthreadsGpu: { -// return musaCausalSoftmax((CausalSoftmaxMusaDescriptor_t)desc, workspace, workspace_size, data, stream); -// } -// #endif + // #ifdef ENABLE_ASCEND_API + // 
CALCULATE(INFINI_DEVICE_ASCEND, ascend) + // #endif + // #ifdef ENABLE_CAMBRICON_MLU + // case DevCambriconMlu: { + // return bangCausalSoftmax((CausalSoftmaxBangDescriptor_t)desc, workspace, workspace_size, data, stream); + // // return cnnlCausalSoftmax((CausalSoftmaxCnnlDescriptor_t) desc, workspace, workspace_size, data, stream); + // } + // #endif + // #ifdef ENABLE_MTHREADS_GPU + // case DevMthreadsGpu: { + // return musaCausalSoftmax((CausalSoftmaxMusaDescriptor_t)desc, workspace, workspace_size, data, stream); + // } + // #endif } return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; } __C infiniStatus_t infiniopDestroyReduceMeanDescriptor(infiniopReduceMeanDescriptor_t desc) { -#define DESTROY(CASE, NAMESPACE) \ - case CASE: \ +#define DESTROY(CASE, NAMESPACE) \ + case CASE: \ delete reinterpret_cast(desc); \ return INFINI_STATUS_SUCCESS; @@ -164,19 +164,19 @@ __C infiniStatus_t infiniopDestroyReduceMeanDescriptor(infiniopReduceMeanDescrip #ifdef ENABLE_METAX_API DESTROY(INFINI_DEVICE_METAX, metax) #endif -// #ifdef ENABLE_ASCEND_API -// DESTROY(INFINI_DEVICE_ASCEND, ascend) -// #endif -// #ifdef ENABLE_CAMBRICON_MLU -// case DevCambriconMlu: { -// return bangDestroyCausalSoftmaxDescriptor((CausalSoftmaxBangDescriptor_t)desc); -// // return cnnlDestroyCausalSoftmaxDescriptor((CausalSoftmaxCnnlDescriptor_t) desc); -// } -// #endif -// #ifdef ENABLE_MTHREADS_GPU -// case DevMthreadsGpu: -// return musaDestroyCausalSoftmaxDescriptor((CausalSoftmaxMusaDescriptor_t)desc); -// #endif + // #ifdef ENABLE_ASCEND_API + // DESTROY(INFINI_DEVICE_ASCEND, ascend) + // #endif + // #ifdef ENABLE_CAMBRICON_MLU + // case DevCambriconMlu: { + // return bangDestroyCausalSoftmaxDescriptor((CausalSoftmaxBangDescriptor_t)desc); + // // return cnnlDestroyCausalSoftmaxDescriptor((CausalSoftmaxCnnlDescriptor_t) desc); + // } + // #endif + // #ifdef ENABLE_MTHREADS_GPU + // case DevMthreadsGpu: + // return musaDestroyCausalSoftmaxDescriptor((CausalSoftmaxMusaDescriptor_t)desc); + // #endif } return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; } diff --git a/src/infiniop/ops/reduce_mean/reduce_mean.h b/src/infiniop/ops/reduce_mean/reduce_mean.h index 58fa566ea..bf2e2dda0 100644 --- a/src/infiniop/ops/reduce_mean/reduce_mean.h +++ b/src/infiniop/ops/reduce_mean/reduce_mean.h @@ -6,16 +6,16 @@ #define DESCRIPTOR(NAMESPACE) \ \ - namespace op::reduce_mean::NAMESPACE { \ + namespace op::reduce_mean::NAMESPACE { \ class Descriptor final : public InfiniopDescriptor { \ struct Opaque; \ Opaque *_opaque; \ - ReduceMeanInfo _info; \ + ReduceMeanInfo _info; \ size_t _workspace_size; \ \ Descriptor( \ Opaque *opaque, \ - ReduceMeanInfo info, \ + ReduceMeanInfo info, \ size_t workspace_size, \ infiniDevice_t device_type, \ int device_id) \ diff --git a/src/infiniop/reduce/cuda/reduce.cuh b/src/infiniop/reduce/cuda/reduce.cuh index b69b46163..cd98d0936 100644 --- a/src/infiniop/reduce/cuda/reduce.cuh +++ b/src/infiniop/reduce/cuda/reduce.cuh @@ -63,10 +63,9 @@ __device__ __forceinline__ Tdata max(const Tdata *data_ptr, size_t count) { // Sum(x) on non-contiguous data of length count template -__device__ __forceinline__ Tcompute sum(const Tdata *data_ptr, - size_t count, - ptrdiff_t stride) -{ +__device__ __forceinline__ Tcompute sum(const Tdata *data_ptr, + size_t count, + ptrdiff_t stride) { Tcompute s = 0; for (size_t i = threadIdx.x; i < count; i += BLOCK_SIZE) { @@ -81,10 +80,9 @@ __device__ __forceinline__ Tcompute sum(const Tdata *data_ptr, // Max(x) on non-contiguous data of length count template -__device__ __forceinline__ 
Tdata max(const Tdata *data_ptr, +__device__ __forceinline__ Tdata max(const Tdata *data_ptr, size_t count, - ptrdiff_t stride) -{ + ptrdiff_t stride) { Tdata max_ = data_ptr[0]; for (size_t i = threadIdx.x; i < count; i += BLOCK_SIZE) { diff --git a/src/infinirt/infinirt_impl.h b/src/infinirt/infinirt_impl.h index 0d6f8cf05..4c41a1198 100644 --- a/src/infinirt/infinirt_impl.h +++ b/src/infinirt/infinirt_impl.h @@ -30,7 +30,6 @@ infiniStatus_t freeAsync(void *ptr, infinirtStream_t stream) IMPL; #define INFINIRT_DEVICE_API_IMPL INFINIRT_DEVICE_API(, ) -#define INFINIRT_DEVICE_API_NOOP INFINIRT_DEVICE_API({ return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; }, \ - {*count = 0; return INFINI_STATUS_SUCCESS; }) +#define INFINIRT_DEVICE_API_NOOP INFINIRT_DEVICE_API({ return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; }, {*count = 0; return INFINI_STATUS_SUCCESS; }) #endif // __INFINIRT_IMPL_H__ diff --git a/src/utils.h b/src/utils.h index f4e63be25..af222baca 100644 --- a/src/utils.h +++ b/src/utils.h @@ -98,7 +98,7 @@ inline std::string infiniDtypeToString(infiniDtype_t dtype) { } } -#define CEIL_DIV(x, y) (((x) + (y)-1) / (y)) +#define CEIL_DIV(x, y) (((x) + (y) - 1) / (y)) namespace utils { diff --git a/src/utils/check.h b/src/utils/check.h index 72a0995fd..76823aa40 100644 --- a/src/utils/check.h +++ b/src/utils/check.h @@ -59,22 +59,22 @@ #define CHECK_SAME_STRIDES(FIRST, ...) CHECK_SAME_VEC(INFINI_STATUS_BAD_TENSOR_STRIDES, FIRST, __VA_ARGS__) -#define CHECK_REDUCE_SHAPE(INPUT_SHAPE, DIM, EXPECTED_SHAPE) \ - do { \ - if (INPUT_SHAPE.empty()) { \ - if (!EXPECTED_SHAPE.empty()) { \ - return INFINI_STATUS_BAD_TENSOR_SHAPE; \ - } \ - break; \ - } \ - if (DIM >= INPUT_SHAPE.size()) { \ - return INFINI_STATUS_BAD_PARAM; \ - } \ - std::vector reduced_shape = INPUT_SHAPE; \ - reduced_shape[DIM] = 1; \ - if (reduced_shape != EXPECTED_SHAPE) { \ - return INFINI_STATUS_BAD_TENSOR_SHAPE; \ - } \ +#define CHECK_REDUCE_SHAPE(INPUT_SHAPE, DIM, EXPECTED_SHAPE) \ + do { \ + if (INPUT_SHAPE.empty()) { \ + if (!EXPECTED_SHAPE.empty()) { \ + return INFINI_STATUS_BAD_TENSOR_SHAPE; \ + } \ + break; \ + } \ + if (DIM >= INPUT_SHAPE.size()) { \ + return INFINI_STATUS_BAD_PARAM; \ + } \ + std::vector reduced_shape = INPUT_SHAPE; \ + reduced_shape[DIM] = 1; \ + if (reduced_shape != EXPECTED_SHAPE) { \ + return INFINI_STATUS_BAD_TENSOR_SHAPE; \ + } \ } while (0) #endif // INFINIUTILS_CHECK_H diff --git a/test/infiniop-test/test_generate/__init__.py b/test/infiniop-test/test_generate/__init__.py index a61f63f7c..8db1e6755 100644 --- a/test/infiniop-test/test_generate/__init__.py +++ b/test/infiniop-test/test_generate/__init__.py @@ -1 +1,8 @@ -from .infiniop_test import InfiniopTestCase, InfiniopTestWriter, np_dtype_to_ggml, gguf_strides, contiguous_gguf_strides, process_zero_stride_tensor +from .infiniop_test import ( + InfiniopTestCase, + InfiniopTestWriter, + np_dtype_to_ggml, + gguf_strides, + contiguous_gguf_strides, + process_zero_stride_tensor, +) diff --git a/test/infiniop-test/test_generate/testcases/add.py b/test/infiniop-test/test_generate/testcases/add.py index b04ba2042..052ef18a7 100644 --- a/test/infiniop-test/test_generate/testcases/add.py +++ b/test/infiniop-test/test_generate/testcases/add.py @@ -4,7 +4,14 @@ from typing import List from numpy.lib.stride_tricks import as_strided -from .. import InfiniopTestWriter, InfiniopTestCase, np_dtype_to_ggml, gguf_strides, contiguous_gguf_strides, process_zero_stride_tensor +from .. 
import ( + InfiniopTestWriter, + InfiniopTestCase, + np_dtype_to_ggml, + gguf_strides, + contiguous_gguf_strides, + process_zero_stride_tensor, +) def add( @@ -26,7 +33,6 @@ def __init__( c: np.ndarray, shape_c: List[int] | None, stride_c: List[int] | None, - ): super().__init__("add") self.a = a @@ -39,7 +45,6 @@ def __init__( self.shape_c = shape_c self.stride_c = stride_c - def write_test(self, test_writer: "InfiniopTestWriter"): super().write_test(test_writer) if self.shape_a is not None: @@ -49,12 +54,22 @@ def write_test(self, test_writer: "InfiniopTestWriter"): if self.shape_c is not None: test_writer.add_array(test_writer.gguf_key("c.shape"), self.shape_c) if self.stride_a is not None: - test_writer.add_array(test_writer.gguf_key("a.strides"), gguf_strides(*self.stride_a)) + test_writer.add_array( + test_writer.gguf_key("a.strides"), gguf_strides(*self.stride_a) + ) if self.stride_b is not None: - test_writer.add_array(test_writer.gguf_key("b.strides"), gguf_strides(*self.stride_b)) + test_writer.add_array( + test_writer.gguf_key("b.strides"), gguf_strides(*self.stride_b) + ) test_writer.add_array( test_writer.gguf_key("c.strides"), - gguf_strides(*self.stride_c if self.stride_c is not None else contiguous_gguf_strides(self.shape_c)) + gguf_strides( + *( + self.stride_c + if self.stride_c is not None + else contiguous_gguf_strides(self.shape_c) + ) + ), ) test_writer.add_tensor( test_writer.gguf_key("a"), self.a, raw_dtype=np_dtype_to_ggml(self.a.dtype) @@ -114,7 +129,6 @@ def write_test(self, test_writer: "InfiniopTestWriter"): stride_c=stride_c, ) test_cases.append(test_case) - + test_writer.add_tests(test_cases) test_writer.save() - \ No newline at end of file diff --git a/test/infiniop-test/test_generate/testcases/causal_softmax.py b/test/infiniop-test/test_generate/testcases/causal_softmax.py index 74c3efcf0..037701865 100644 --- a/test/infiniop-test/test_generate/testcases/causal_softmax.py +++ b/test/infiniop-test/test_generate/testcases/causal_softmax.py @@ -4,7 +4,13 @@ from typing import List from enum import Enum, auto -from .. import InfiniopTestWriter, InfiniopTestCase, np_dtype_to_ggml, gguf_strides, contiguous_gguf_strides +from .. 
import ( + InfiniopTestWriter, + InfiniopTestCase, + np_dtype_to_ggml, + gguf_strides, + contiguous_gguf_strides, +) def causal_softmax(x): @@ -37,8 +43,8 @@ def __init__( super().__init__("causal_softmax") self.x = x self.y = y - self.shape_x=shape_x - self.shape_y=shape_y + self.shape_x = shape_x + self.shape_y = shape_y self.stride_x = stride_x self.stride_y = stride_y @@ -49,10 +55,18 @@ def write_test(self, test_writer: "InfiniopTestWriter"): if self.shape_y is not None: test_writer.add_array(test_writer.gguf_key("y.shape"), self.shape_y) if self.stride_x is not None: - test_writer.add_array(test_writer.gguf_key("x.strides"), gguf_strides(*self.stride_x)) + test_writer.add_array( + test_writer.gguf_key("x.strides"), gguf_strides(*self.stride_x) + ) test_writer.add_array( test_writer.gguf_key("y.strides"), - gguf_strides(*self.stride_y if self.stride_y is not None else contiguous_gguf_strides(self.shape_y)) + gguf_strides( + *( + self.stride_y + if self.stride_y is not None + else contiguous_gguf_strides(self.shape_y) + ) + ), ) test_writer.add_tensor( test_writer.gguf_key("x"), @@ -102,6 +116,6 @@ def write_test(self, test_writer: "InfiniopTestWriter"): stride_y, ) test_cases.append(test_case) - + test_writer.add_tests(test_cases) test_writer.save() diff --git a/test/infiniop-test/test_generate/testcases/clip.py b/test/infiniop-test/test_generate/testcases/clip.py index f08a59929..786153197 100644 --- a/test/infiniop-test/test_generate/testcases/clip.py +++ b/test/infiniop-test/test_generate/testcases/clip.py @@ -2,7 +2,13 @@ import gguf from typing import List, Optional, Tuple -from .. import InfiniopTestWriter, InfiniopTestCase, np_dtype_to_ggml, gguf_strides, contiguous_gguf_strides +from .. import ( + InfiniopTestWriter, + InfiniopTestCase, + np_dtype_to_ggml, + gguf_strides, + contiguous_gguf_strides, +) def clip( @@ -35,7 +41,7 @@ def random_tensor(shape, dtype): Returns: Random tensor with the specified shape and dtype """ - return (np.random.rand(*shape).astype(dtype) * 4.0 - 2.0) + return np.random.rand(*shape).astype(dtype) * 4.0 - 2.0 class ClipTestCase(InfiniopTestCase): @@ -52,7 +58,7 @@ def __init__( max_val: np.ndarray, max_stride: Optional[List[int]], y: np.ndarray, - y_shape: Optional[List[int]], + y_shape: Optional[List[int]], y_stride: Optional[List[int]], ): super().__init__("clip") @@ -63,7 +69,7 @@ def __init__( self.max_val = max_val self.max_stride = max_stride self.y = y - self.y_shape=y_shape + self.y_shape = y_shape self.y_stride = y_stride def write_test(self, test_writer: "InfiniopTestWriter"): @@ -71,57 +77,64 @@ def write_test(self, test_writer: "InfiniopTestWriter"): # Add strides as arrays if they exist if self.x_stride is not None: - test_writer.add_array(test_writer.gguf_key("x.strides"), gguf_strides(*self.x_stride)) + test_writer.add_array( + test_writer.gguf_key("x.strides"), gguf_strides(*self.x_stride) + ) if self.min_stride is not None: - test_writer.add_array(test_writer.gguf_key("min_val.strides"), gguf_strides(*self.min_stride)) + test_writer.add_array( + test_writer.gguf_key("min_val.strides"), gguf_strides(*self.min_stride) + ) if self.max_stride is not None: - test_writer.add_array(test_writer.gguf_key("max_val.strides"), gguf_strides(*self.max_stride)) + test_writer.add_array( + test_writer.gguf_key("max_val.strides"), gguf_strides(*self.max_stride) + ) if self.y_shape is not None: test_writer.add_array(test_writer.gguf_key("y.shape"), self.y_shape) test_writer.add_array( test_writer.gguf_key("y.strides"), - gguf_strides(*self.y_stride 
if self.y_stride is not None else contiguous_gguf_strides(self.y_shape)) + gguf_strides( + *( + self.y_stride + if self.y_stride is not None + else contiguous_gguf_strides(self.y_shape) + ) + ), ) # Add tensors to the test test_writer.add_tensor( - test_writer.gguf_key("x"), - self.x, - raw_dtype=np_dtype_to_ggml(self.x.dtype) + test_writer.gguf_key("x"), self.x, raw_dtype=np_dtype_to_ggml(self.x.dtype) ) test_writer.add_tensor( test_writer.gguf_key("min_val"), self.min_val, - raw_dtype=np_dtype_to_ggml(self.min_val.dtype) + raw_dtype=np_dtype_to_ggml(self.min_val.dtype), ) test_writer.add_tensor( test_writer.gguf_key("max_val"), self.max_val, - raw_dtype=np_dtype_to_ggml(self.max_val.dtype) + raw_dtype=np_dtype_to_ggml(self.max_val.dtype), ) test_writer.add_tensor( - test_writer.gguf_key("y"), - self.y, - raw_dtype=np_dtype_to_ggml(self.y.dtype) + test_writer.gguf_key("y"), self.y, raw_dtype=np_dtype_to_ggml(self.y.dtype) ) # Calculate the expected result ans = clip( self.x.astype(np.float64), self.min_val.astype(np.float64), - self.max_val.astype(np.float64) + self.max_val.astype(np.float64), ) # Add the expected result to the test test_writer.add_tensor( - test_writer.gguf_key("ans"), - ans, - raw_dtype=gguf.GGMLQuantizationType.F64 + test_writer.gguf_key("ans"), ans, raw_dtype=gguf.GGMLQuantizationType.F64 ) + if __name__ == "__main__": test_writer = InfiniopTestWriter("clip.gguf") @@ -130,23 +143,23 @@ def write_test(self, test_writer: "InfiniopTestWriter"): # Test case shapes shapes = [ - (10,), # 1D tensor - (5, 10), # 2D tensor - (2, 3, 4), # 3D tensor - (7, 13), # Prime dimensions - (1, 1), # Minimum shape - (100, 100), # Large shape - (16, 16, 16), # Large 3D + (10,), # 1D tensor + (5, 10), # 2D tensor + (2, 3, 4), # 3D tensor + (7, 13), # Prime dimensions + (1, 1), # Minimum shape + (100, 100), # Large shape + (16, 16, 16), # Large 3D ] # Test case min/max values min_max_values = [ - (-1.0, 1.0), # Standard range - (0.0, 2.0), # Positive range - (-2.0, 0.0), # Negative range - (-1000.0, 1000.0), # Large range - (-0.001, 0.001), # Small range - (0.0, 0.0), # min=max + (-1.0, 1.0), # Standard range + (0.0, 2.0), # Positive range + (-2.0, 0.0), # Negative range + (-1000.0, 1000.0), # Large range + (-0.001, 0.001), # Small range + (0.0, 0.0), # min=max ] # Data types to test @@ -171,7 +184,7 @@ def write_test(self, test_writer: "InfiniopTestWriter"): max_stride=None, y=y, y_shape=shape, - y_stride=None + y_stride=None, ) ) @@ -199,7 +212,7 @@ def write_test(self, test_writer: "InfiniopTestWriter"): max_stride=row_stride, y=y, y_shape=shape, - y_stride=row_stride + y_stride=row_stride, ) ) @@ -219,7 +232,7 @@ def write_test(self, test_writer: "InfiniopTestWriter"): max_stride=col_stride, y=y, y_shape=shape, - y_stride=col_stride + y_stride=col_stride, ) ) @@ -239,7 +252,7 @@ def write_test(self, test_writer: "InfiniopTestWriter"): max_stride=row_stride, y=y, y_shape=shape, - y_stride=col_stride + y_stride=col_stride, ) ) diff --git a/test/infiniop-test/test_generate/testcases/mul.py b/test/infiniop-test/test_generate/testcases/mul.py index 00c427bcb..ad4f6b806 100644 --- a/test/infiniop-test/test_generate/testcases/mul.py +++ b/test/infiniop-test/test_generate/testcases/mul.py @@ -2,30 +2,36 @@ import gguf from typing import List -from .. import InfiniopTestWriter, InfiniopTestCase, np_dtype_to_ggml, gguf_strides, contiguous_gguf_strides +from .. 
import (
+    InfiniopTestWriter,
+    InfiniopTestCase,
+    np_dtype_to_ggml,
+    gguf_strides,
+    contiguous_gguf_strides,
+)
 
-def mul(
-    a: np.ndarray, 
-    b: np.ndarray
-):
+
+def mul(a: np.ndarray, b: np.ndarray):
     return np.multiply(a, b)
 
+
 def random_tensor(shape, dtype):
     rate = 1e-3
     var = 0.5 * rate  # values fall in [-5e-4, 5e-4]
     return rate * np.random.rand(*shape).astype(dtype) - var
 
+
 class MulTestCase(InfiniopTestCase):
     def __init__(
         self,
         a: np.ndarray,
-        shape_a: List[int] | None, 
+        shape_a: List[int] | None,
         stride_a: List[int] | None,
         b: np.ndarray,
-        shape_b: List[int] | None, 
+        shape_b: List[int] | None,
         stride_b: List[int] | None,
         c: np.ndarray,
-        shape_c: List[int] | None, 
+        shape_c: List[int] | None,
         stride_c: List[int] | None,
     ):
         super().__init__("mul")
@@ -39,7 +45,6 @@ def __init__(
         self.shape_c = shape_c
         self.stride_c = stride_c
 
-
     def write_test(self, test_writer: "InfiniopTestWriter"):
         super().write_test(test_writer)
         if self.shape_a is not None:
@@ -49,12 +54,22 @@ def write_test(self, test_writer: "InfiniopTestWriter"):
         if self.shape_c is not None:
             test_writer.add_array(test_writer.gguf_key("c.shape"), self.shape_c)
         if self.stride_a is not None:
-            test_writer.add_array(test_writer.gguf_key("a.strides"), gguf_strides(*self.stride_a))
+            test_writer.add_array(
+                test_writer.gguf_key("a.strides"), gguf_strides(*self.stride_a)
+            )
         if self.stride_b is not None:
-            test_writer.add_array(test_writer.gguf_key("b.strides"), gguf_strides(*self.stride_b))
+            test_writer.add_array(
+                test_writer.gguf_key("b.strides"), gguf_strides(*self.stride_b)
+            )
         test_writer.add_array(
             test_writer.gguf_key("c.strides"),
-            gguf_strides(*self.stride_c if self.stride_c is not None else contiguous_gguf_strides(self.shape_c))
+            gguf_strides(
+                *(
+                    self.stride_c
+                    if self.stride_c is not None
+                    else contiguous_gguf_strides(self.shape_c)
+                )
+            ),
         )
 
         test_writer.add_tensor(
@@ -68,7 +83,7 @@ def write_test(self, test_writer: "InfiniopTestWriter"):
         )
         a_fp64 = self.a.astype(np.float64)
         b_fp64 = self.b.astype(np.float64)
-        
+
         ans_fp64 = np.multiply(a_fp64, b_fp64)
         ans = mul(self.a, self.b)
         test_writer.add_tensor(
@@ -80,7 +95,8 @@ def write_test(self, test_writer: "InfiniopTestWriter"):
             raw_dtype=np_dtype_to_ggml(ans_fp64.dtype),
         )
 
-if __name__ == '__main__':
+
+if __name__ == "__main__":
     test_writer = InfiniopTestWriter("mul.gguf")
     test_cases = []
 
@@ -96,16 +112,15 @@ def write_test(self, test_writer: "InfiniopTestWriter"):
         ((2048, 2560), (2560, 1), (1, 2048), (2560, 1)),
         ((4, 48, 64), (64 * 48, 64, 1), (1, 4, 192), None),
         ((4, 48, 64), None, (1, 4, 192), (48 * 64, 64, 1)),
-    ] 
+    ]
     _TENSOR_DTYPES_ = [np.float32, np.float16]
-    
+
     for dtype in _TENSOR_DTYPES_:
         for shape, stride_a, stride_b, stride_c in _TEST_CASES_:
             a = random_tensor(shape, dtype)
             b = random_tensor(shape, dtype)
             c = np.empty(tuple(0 for _ in shape), dtype=dtype)
-
             test_cases.append(
                 MulTestCase(
                     a=a,
@@ -118,7 +133,7 @@ def write_test(self, test_writer: "InfiniopTestWriter"):
                     shape_c=shape,
                     stride_c=stride_c,
                 )
-            )   
-    
+            )
+
     test_writer.add_tests(test_cases)
     test_writer.save()
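Every generator in this series repeats one idiom for output strides: write the explicit strides when the case supplies them, otherwise fall back to contiguous strides derived from the shape. A minimal sketch of that idiom, assuming contiguous_gguf_strides yields row-major strides in elements (the helper names below are local stand-ins, not the real imports):

    # Stand-ins illustrating the "explicit strides or contiguous fallback"
    # pattern used by each write_test() above; strides are in elements.
    def contiguous_strides(shape):
        # Row-major layout: the last dimension is densest.
        strides, acc = [], 1
        for dim in reversed(shape):
            strides.insert(0, acc)
            acc *= dim
        return tuple(strides)

    def strides_or_contiguous(stride, shape):
        # Mirrors what write_test() passes to gguf_strides for the output tensor.
        return tuple(stride) if stride is not None else contiguous_strides(shape)

    assert contiguous_strides((4, 48, 64)) == (48 * 64, 64, 1)
    assert strides_or_contiguous(None, (2048, 2560)) == (2560, 1)
    assert strides_or_contiguous((1, 2048), (2048, 2560)) == (1, 2048)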
diff --git a/test/infiniop-test/test_generate/testcases/rearrange.py b/test/infiniop-test/test_generate/testcases/rearrange.py
index 9617a1fc0..3d3a0e73b 100644
--- a/test/infiniop-test/test_generate/testcases/rearrange.py
+++ b/test/infiniop-test/test_generate/testcases/rearrange.py
@@ -1,14 +1,21 @@
 import torch
 from typing import List
 
-from .. import InfiniopTestWriter, InfiniopTestCase, np_dtype_to_ggml, gguf_strides, contiguous_gguf_strides
+from .. import (
+    InfiniopTestWriter,
+    InfiniopTestCase,
+    np_dtype_to_ggml,
+    gguf_strides,
+    contiguous_gguf_strides,
+)
+
 
 def row_major_strides(shape):
     """Generate the row-major strides of a tensor.
-    
+
     Args:
         shape: tensor shape
-    
+
     Returns:
         list of row-major strides
     """
     strides.insert(0, stride)
     return strides
 
+
 def column_major_strides(shape):
     """Generate the column-major strides of a tensor.
-    
+
     Args:
         shape: tensor shape
-    
+
     Returns:
         list of column-major strides
     """
     strides.append(stride)
     return strides
 
+
 def rearrange_using_torch(src: torch.Tensor, dst_strides: List[int]) -> torch.Tensor:
     """
     Compute the rearrange reference result using torch
@@ -66,27 +75,35 @@ def __init__(
         self.shape = shape
         self.src_strides = src_strides
         self.dst_strides = dst_strides
-    
+
     def write_test(self, test_writer: "InfiniopTestWriter"):
         super().write_test(test_writer)
-        
+
         # write the shape info
         if self.shape is not None:
             test_writer.add_array(test_writer.gguf_key("src.shape"), self.shape)
             test_writer.add_array(test_writer.gguf_key("dst.shape"), self.shape)
-        
+
         # write the strides info
         if self.src_strides is not None:
-            test_writer.add_array(test_writer.gguf_key("src.strides"), gguf_strides(*self.src_strides))
+            test_writer.add_array(
+                test_writer.gguf_key("src.strides"), gguf_strides(*self.src_strides)
+            )
         test_writer.add_array(
             test_writer.gguf_key("dst.strides"),
-            gguf_strides(*self.dst_strides if self.dst_strides is not None else contiguous_gguf_strides(self.shape))
+            gguf_strides(
+                *(
+                    self.dst_strides
+                    if self.dst_strides is not None
+                    else contiguous_gguf_strides(self.shape)
+                )
+            ),
         )
-        
+
         # convert the torch tensors to numpy for writing to file
         src_numpy = self.src.detach().cpu().numpy()
         dst_numpy = self.dst.detach().cpu().numpy()
-        
+
         # write the tensor data
         test_writer.add_tensor(
             test_writer.gguf_key("src"),
@@ -98,9 +115,13 @@ def write_test(self, test_writer: "InfiniopTestWriter"):
             dst_numpy,
             raw_dtype=np_dtype_to_ggml(dst_numpy.dtype),
         )
-        
+
         # compute and write the expected answer
-        dst_strides_for_ans = self.dst_strides if self.dst_strides is not None else list(contiguous_gguf_strides(self.shape))
+        dst_strides_for_ans = (
+            self.dst_strides
+            if self.dst_strides is not None
+            else list(contiguous_gguf_strides(self.shape))
+        )
         ans_torch = rearrange_using_torch(self.src, dst_strides_for_ans)
         ans_numpy = ans_torch.detach().cpu().numpy()
         test_writer.add_tensor(
@@ -109,6 +130,7 @@ def write_test(self, test_writer: "InfiniopTestWriter"):
             raw_dtype=np_dtype_to_ggml(src_numpy.dtype),
         )
 
+
 if __name__ == "__main__":
     test_writer = InfiniopTestWriter("rearrange.gguf")
     test_cases = []
@@ -117,12 +139,20 @@ def write_test(self, test_writer: "InfiniopTestWriter"):
         # (shape, src_stride, dst_stride)
         ((100, 100), (1, 100), (100, 1)),
         ((4, 4), (1, 4), (4, 1)),
-        ((4, 6, 64), (64, 4*64, 1), (6*64, 64, 1)),
+        ((4, 6, 64), (64, 4 * 64, 1), (6 * 64, 64, 1)),
         ((2000, 2000), (1, 2000), (2000, 1)),
         ((2001, 2001), (1, 2001), (2001, 1)),
         ((2, 2, 2, 4), (16, 8, 4, 1), (16, 8, 1, 2)),
-        ((3, 4, 7, 53, 9), row_major_strides((3, 4, 7, 53, 9)), column_major_strides((3, 4, 7, 53, 9))),
-        ((3, 4, 50, 50, 5, 7), row_major_strides((3, 4, 50, 50, 5, 7)), column_major_strides((3, 4, 50, 50, 5, 7))),
+        (
+            (3, 4, 7, 53, 9),
+            row_major_strides((3, 4, 7, 53, 9)),
+            column_major_strides((3, 4, 7, 53, 9)),
+        ),
+        (
+            (3, 4, 50, 50, 5, 7),
+            row_major_strides((3, 4, 50, 50, 5, 7)),
+            column_major_strides((3, 4, 50, 50, 5, 7)),
+        ),
     ]
 
     _TENSOR_DTYPES_ = [torch.float32, torch.float16]
@@ -132,7 +162,7 @@ def write_test(self, test_writer: "InfiniopTestWriter"):
             src = torch.rand(*shape, dtype=dtype)
             # create the destination tensor with the correct shape
             dst = torch.empty(shape, dtype=dtype)
-            
+
             test_case = RearrangeTestCase(
                 src=src,
                 dst=dst,
@@ -140,7 +170,7 @@ def write_test(self, test_writer: "InfiniopTestWriter"):
                 src_strides=src_strides,
                 dst_strides=dst_strides,
             )
-            test_cases.append(test_case)    
+            test_cases.append(test_case)
 
     test_writer.add_tests(test_cases)
-    test_writer.save() 
+    test_writer.save()
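The two stride helpers above drive the mixed-layout cases in this file. A compact, behavior-equivalent sketch with a worked example (strides in elements, matching what the generator hands to gguf_strides):

    def row_major(shape):
        strides, acc = [], 1
        for dim in reversed(shape):
            strides.insert(0, acc)  # innermost (last) dimension is densest
            acc *= dim
        return strides

    def column_major(shape):
        strides, acc = [], 1
        for dim in shape:
            strides.append(acc)  # outermost (first) dimension is densest
            acc *= dim
        return strides

    assert row_major((2, 3, 4)) == [12, 4, 1]
    assert column_major((2, 3, 4)) == [1, 2, 6]

The 5-D and 6-D cases pair these two layouts so the rearrange kernel has to perform a genuine strided transpose-copy rather than a plain memcpy.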
diff --git a/test/infiniop-test/test_generate/testcases/reduce_max.py b/test/infiniop-test/test_generate/testcases/reduce_max.py
index 487da489a..868b8b3c4 100644
--- a/test/infiniop-test/test_generate/testcases/reduce_max.py
+++ b/test/infiniop-test/test_generate/testcases/reduce_max.py
@@ -4,7 +4,13 @@ from typing import List
 from enum import Enum, auto
 
-from .. import InfiniopTestWriter, InfiniopTestCase, np_dtype_to_ggml, gguf_strides, contiguous_gguf_strides
+from .. import (
+    InfiniopTestWriter,
+    InfiniopTestCase,
+    np_dtype_to_ggml,
+    gguf_strides,
+    contiguous_gguf_strides,
+)
 
 
 def reduce_max(x, dim):
@@ -31,8 +37,8 @@ def __init__(
         super().__init__("reduce_max")
         self.x = x
         self.y = y
-        self.shape_x=shape_x
-        self.shape_y=shape_y
+        self.shape_x = shape_x
+        self.shape_y = shape_y
         self.stride_x = stride_x
         self.stride_y = stride_y
         self.dim = dim
@@ -45,10 +51,18 @@ def write_test(self, test_writer: "InfiniopTestWriter"):
         if self.shape_y is not None:
             test_writer.add_array(test_writer.gguf_key("output.shape"), self.shape_y)
         if self.stride_x is not None:
-            test_writer.add_array(test_writer.gguf_key("input.strides"), gguf_strides(*self.stride_x))
+            test_writer.add_array(
+                test_writer.gguf_key("input.strides"), gguf_strides(*self.stride_x)
+            )
         test_writer.add_array(
             test_writer.gguf_key("output.strides"),
-            gguf_strides(*self.stride_y if self.stride_y is not None else contiguous_gguf_strides(self.shape_y))
+            gguf_strides(
+                *(
+                    self.stride_y
+                    if self.stride_y is not None
+                    else contiguous_gguf_strides(self.shape_y)
+                )
+            ),
         )
         test_writer.add_uint64(test_writer.gguf_key("dim"), self.dim)
         test_writer.add_tensor(
@@ -61,9 +75,7 @@ def write_test(self, test_writer: "InfiniopTestWriter"):
             self.y,
             raw_dtype=np_dtype_to_ggml(self.y.dtype),
         )
-        ans = reduce_max(
-            self.x.astype(np.float64), self.dim
-        )
+        ans = reduce_max(self.x.astype(np.float64), self.dim)
         test_writer.add_tensor(
             test_writer.gguf_key("ans"), ans, raw_dtype=gguf.GGMLQuantizationType.F64
         )
@@ -77,20 +89,20 @@ def write_test(self, test_writer: "InfiniopTestWriter"):
 # ==============================================================================
 # These are not meant to be imported from other modules
 _TEST_CASES_ = [
-    # y_shape, x_shape, y_stride, x_stride, dim
-    # ((0,), (0,), (0,), (0,), 0),
-    ((1, ), (32, ), None, None, 0),
-    ((1, 4), (1, 4), None, None, 0),
-    ((1, 1), (1, 4), None, None, 1),
-    ((16, 1), (16, 2048), None, None, 1),
-    ((1, 16), (2048, 16), None, None, 0),
-    ((16, 1), (16, 2048), (4096, 1), (4096, 1), 1),
-    ((1, 2048), (16, 2048), (4096, 1), (4096, 1), 0),
-    ((4, 4, 1), (4, 4, 2048), None, None, 2),
-    ((1, 4, 4), (2048, 4, 4), None, None, 0),
-    ((4, 1, 4), (4, 2048, 4), (45056, 5632, 1), (32768, 8, 1), 1),
-    ((1, 8, 4, 8), (16, 8, 4, 8), (256, 32, 8, 1), (256, 32, 8, 1), 0),
-]
+    # y_shape, x_shape, y_stride, x_stride, dim
+    # ((0,), (0,), (0,), (0,), 0),
+    ((1,), (32,), None, None, 0),
+    ((1, 4), (1, 4), None, None, 0),
+    ((1, 1), (1, 4), None, None, 1),
+    ((16, 1), (16, 2048), None, None, 1),
+    ((1, 16), (2048, 16), None, None, 0),
+    ((16, 1), (16, 2048), (4096, 1), (4096, 1), 1),
+    ((1, 2048), (16, 2048), (4096, 1), (4096, 1), 0),
+    ((4, 4, 1), (4, 4, 2048), None, None, 2),
+    ((1, 4, 4), (2048, 4, 
4), None, None, 0), + ((4, 1, 4), (4, 2048, 4), (45056, 5632, 1), (32768, 8, 1), 1), + ((1, 8, 4, 8), (16, 8, 4, 8), (256, 32, 8, 1), (256, 32, 8, 1), 0), + ] _TENSOR_DTYPES_ = [np.float16, np.float32] for dtype in _TENSOR_DTYPES_: @@ -107,6 +119,6 @@ def write_test(self, test_writer: "InfiniopTestWriter"): dim, ) test_cases.append(test_case) - + test_writer.add_tests(test_cases) test_writer.save() diff --git a/test/infiniop-test/test_generate/testcases/reduce_mean.py b/test/infiniop-test/test_generate/testcases/reduce_mean.py index 74421d83d..a452a8a89 100644 --- a/test/infiniop-test/test_generate/testcases/reduce_mean.py +++ b/test/infiniop-test/test_generate/testcases/reduce_mean.py @@ -4,7 +4,13 @@ from typing import List from enum import Enum, auto -from .. import InfiniopTestWriter, InfiniopTestCase, np_dtype_to_ggml, gguf_strides, contiguous_gguf_strides +from .. import ( + InfiniopTestWriter, + InfiniopTestCase, + np_dtype_to_ggml, + gguf_strides, + contiguous_gguf_strides, +) def reduce_mean(x, dim): @@ -31,8 +37,8 @@ def __init__( super().__init__("reduce_mean") self.x = x self.y = y - self.shape_x=shape_x - self.shape_y=shape_y + self.shape_x = shape_x + self.shape_y = shape_y self.stride_x = stride_x self.stride_y = stride_y self.dim = dim @@ -45,10 +51,18 @@ def write_test(self, test_writer: "InfiniopTestWriter"): if self.shape_y is not None: test_writer.add_array(test_writer.gguf_key("output.shape"), self.shape_y) if self.stride_x is not None: - test_writer.add_array(test_writer.gguf_key("input.strides"), gguf_strides(*self.stride_x)) + test_writer.add_array( + test_writer.gguf_key("input.strides"), gguf_strides(*self.stride_x) + ) test_writer.add_array( test_writer.gguf_key("output.strides"), - gguf_strides(*self.stride_y if self.stride_y is not None else contiguous_gguf_strides(self.shape_y)) + gguf_strides( + *( + self.stride_y + if self.stride_y is not None + else contiguous_gguf_strides(self.shape_y) + ) + ), ) test_writer.add_uint64(test_writer.gguf_key("dim"), self.dim) test_writer.add_tensor( @@ -61,9 +75,7 @@ def write_test(self, test_writer: "InfiniopTestWriter"): self.y, raw_dtype=np_dtype_to_ggml(self.y.dtype), ) - ans = reduce_mean( - self.x.astype(np.float64), self.dim - ) + ans = reduce_mean(self.x.astype(np.float64), self.dim) test_writer.add_tensor( test_writer.gguf_key("ans"), ans, raw_dtype=gguf.GGMLQuantizationType.F64 ) @@ -77,20 +89,20 @@ def write_test(self, test_writer: "InfiniopTestWriter"): # ============================================================================== # These are not meant to be imported from other modules _TEST_CASES_ = [ - # y_shape, x_shape, y_stride, x_stride, dim - # ((0,), (0,), (0,), (0,), 0), - ((1, ), (32, ), None, None, 0), - ((1, 4), (1, 4), None, None, 0), - ((1, 1), (1, 4), None, None, 1), - ((16, 1), (16, 2048), None, None, 1), - ((1, 16), (2048, 16), None, None, 0), - ((16, 1), (16, 2048), (4096, 1), (4096, 1), 1), - ((1, 2048), (16, 2048), (4096, 1), (4096, 1), 0), - ((4, 4, 1), (4, 4, 2048), None, None, 2), - ((1, 4, 4), (2048, 4, 4), None, None, 0), - ((4, 1, 4), (4, 2048, 4), (45056, 5632, 1), (32768, 8, 1), 1), - ((1, 8, 4, 8), (16, 8, 4, 8), (256, 32, 8, 1), (256, 32, 8, 1), 0), -] + # y_shape, x_shape, y_stride, x_stride, dim + # ((0,), (0,), (0,), (0,), 0), + ((1,), (32,), None, None, 0), + ((1, 4), (1, 4), None, None, 0), + ((1, 1), (1, 4), None, None, 1), + ((16, 1), (16, 2048), None, None, 1), + ((1, 16), (2048, 16), None, None, 0), + ((16, 1), (16, 2048), (4096, 1), (4096, 1), 1), + ((1, 2048), (16, 
2048), (4096, 1), (4096, 1), 0),
+    ((4, 4, 1), (4, 4, 2048), None, None, 2),
+    ((1, 4, 4), (2048, 4, 4), None, None, 0),
+    ((4, 1, 4), (4, 2048, 4), (45056, 5632, 1), (32768, 8, 1), 1),
+    ((1, 8, 4, 8), (16, 8, 4, 8), (256, 32, 8, 1), (256, 32, 8, 1), 0),
+    ]
 _TENSOR_DTYPES_ = [np.float16, np.float32]
 
     for dtype in _TENSOR_DTYPES_:
@@ -107,6 +119,6 @@ def write_test(self, test_writer: "InfiniopTestWriter"):
                 dim,
             )
             test_cases.append(test_case)
-    
+
     test_writer.add_tests(test_cases)
     test_writer.save()
diff --git a/test/infiniop-test/test_generate/testcases/rms_norm.py b/test/infiniop-test/test_generate/testcases/rms_norm.py
index 681ebafc4..9332c090a 100644
--- a/test/infiniop-test/test_generate/testcases/rms_norm.py
+++ b/test/infiniop-test/test_generate/testcases/rms_norm.py
@@ -1,11 +1,19 @@
 import numpy as np
 from typing import List
 
-from .. import InfiniopTestWriter, InfiniopTestCase, np_dtype_to_ggml, gguf_strides, contiguous_gguf_strides
+from .. import (
+    InfiniopTestWriter,
+    InfiniopTestCase,
+    np_dtype_to_ggml,
+    gguf_strides,
+    contiguous_gguf_strides,
+)
+
 
 def random_tensor(shape: tuple, dtype: np.dtype) -> np.ndarray:
     return np.random.uniform(-1.0, 1.0, shape).astype(dtype) * 0.001
 
+
 def rms_norm(x: np.ndarray, w: np.ndarray, epsilon: float) -> np.ndarray:
     """
     Compute the rms_norm result using numpy
@@ -16,13 +24,14 @@ def rms_norm(x: np.ndarray, w: np.ndarray, epsilon: float) -> np.ndarray:
     Returns:
         the output tensor, same shape as input
     """
-    squared = x ** 2
+    squared = x**2
     mean = np.mean(squared, axis=-1, keepdims=True)
     rms = np.sqrt(mean + epsilon)
-    
+
     normalized = x / rms
     return normalized * w
 
+
 class RMSNormTestCase(InfiniopTestCase):
     def __init__(
         self,
@@ -40,9 +49,9 @@ def __init__(
         self.y = y
         self.shape = shape
         self.epsilon = epsilon
-        self.x_strides=x_strides
-        self.y_strides=y_strides
-        
+        self.x_strides = x_strides
+        self.y_strides = y_strides
+
     def write_test(self, test_writer: "InfiniopTestWriter"):
         super().write_test(test_writer)
         test_writer.add_float32(test_writer.gguf_key("epsilon"), self.epsilon)
@@ -50,10 +59,18 @@ def write_test(self, test_writer: "InfiniopTestWriter"):
         test_writer.add_array(test_writer.gguf_key("x.shape"), self.shape)
         test_writer.add_array(test_writer.gguf_key("y.shape"), self.shape)
         if self.x_strides is not None:
-            test_writer.add_array(test_writer.gguf_key("x.strides"), gguf_strides(*self.x_strides))
+            test_writer.add_array(
+                test_writer.gguf_key("x.strides"), gguf_strides(*self.x_strides)
+            )
         test_writer.add_array(
             test_writer.gguf_key("y.strides"),
-            gguf_strides(*self.y_strides if self.y_strides is not None else contiguous_gguf_strides(self.shape))
+            gguf_strides(
+                *(
+                    self.y_strides
+                    if self.y_strides is not None
+                    else contiguous_gguf_strides(self.shape)
+                )
+            ),
         )
         test_writer.add_tensor(
             test_writer.gguf_key("x"),
@@ -70,13 +87,16 @@ def write_test(self, test_writer: "InfiniopTestWriter"):
             self.y,
             raw_dtype=np_dtype_to_ggml(self.y.dtype),
         )
-        ans = rms_norm(self.x.astype(np.float64), self.w.astype(np.float64), self.epsilon)
+        ans = rms_norm(
+            self.x.astype(np.float64), self.w.astype(np.float64), self.epsilon
+        )
         test_writer.add_tensor(
             test_writer.gguf_key("ans"),
             ans,
             raw_dtype=np_dtype_to_ggml(np.float64),
         )
 
+
 if __name__ == "__main__":
     test_writer = InfiniopTestWriter("rms_norm.gguf")
     test_cases = []
@@ -112,9 +132,9 @@ def write_test(self, test_writer: "InfiniopTestWriter"):
                 shape=shape,
                 x_strides=x_strides,
                 y_strides=y_strides,
-                epsilon=epsilon
+                epsilon=epsilon,
             )
-            test_cases.append(test_case)    
+            test_cases.append(test_case)
 
     test_writer.add_tests(test_cases)
test_writer.save() diff --git a/test/infiniop-test/test_generate/testcases/rope.py b/test/infiniop-test/test_generate/testcases/rope.py index 85d9685dd..27f5a06db 100644 --- a/test/infiniop-test/test_generate/testcases/rope.py +++ b/test/infiniop-test/test_generate/testcases/rope.py @@ -4,11 +4,17 @@ from typing import List -from .. import InfiniopTestWriter, InfiniopTestCase, np_dtype_to_ggml, gguf_strides, contiguous_gguf_strides +from .. import ( + InfiniopTestWriter, + InfiniopTestCase, + np_dtype_to_ggml, + gguf_strides, + contiguous_gguf_strides, +) def rotary_embedding(t, sin, cos): - dh = t.shape[2] + dh = t.shape[2] assert dh % 2 == 0, "Embedding dimension must be even." t_even = t[..., 0::2] # [seq_len, n_head, dh // 2] @@ -30,7 +36,9 @@ def rotary_embedding(t, sin, cos): def sin_cos_table(pos, dim, theta, dtype): assert dim % 2 == 0, "Embedding dimension must be even." - freqs = 1.0 / (theta ** (np.arange(0, dim, 2)[: (dim // 2)].astype(np.float32) / dim)) + freqs = 1.0 / ( + theta ** (np.arange(0, dim, 2)[: (dim // 2)].astype(np.float32) / dim) + ) angles = np.outer(pos, freqs) @@ -79,19 +87,33 @@ def write_test(self, test_writer: "InfiniopTestWriter"): test_writer.add_array(test_writer.gguf_key("x.shape"), self.shape_x) test_writer.add_array( test_writer.gguf_key("y.strides"), - gguf_strides(*self.stride_y if self.stride_y is not None else contiguous_gguf_strides(self.shape_y)) + gguf_strides( + *( + self.stride_y + if self.stride_y is not None + else contiguous_gguf_strides(self.shape_y) + ) + ), ) if self.stride_x is not None: - test_writer.add_array(test_writer.gguf_key("x.strides"), gguf_strides(*self.stride_x)) + test_writer.add_array( + test_writer.gguf_key("x.strides"), gguf_strides(*self.stride_x) + ) test_writer.add_tensor( - test_writer.gguf_key("pos_ids"), self.pos_ids, raw_dtype=np_dtype_to_ggml(self.pos_ids.dtype) + test_writer.gguf_key("pos_ids"), + self.pos_ids, + raw_dtype=np_dtype_to_ggml(self.pos_ids.dtype), ) test_writer.add_tensor( - test_writer.gguf_key("sin_table"), self.sin_table, raw_dtype=np_dtype_to_ggml(self.sin_table.dtype) + test_writer.gguf_key("sin_table"), + self.sin_table, + raw_dtype=np_dtype_to_ggml(self.sin_table.dtype), ) test_writer.add_tensor( - test_writer.gguf_key("cos_table"), self.cos_table, raw_dtype=np_dtype_to_ggml(self.cos_table.dtype) + test_writer.gguf_key("cos_table"), + self.cos_table, + raw_dtype=np_dtype_to_ggml(self.cos_table.dtype), ) ans = rotary_embedding( self.x.astype(np.float64), @@ -103,8 +125,6 @@ def write_test(self, test_writer: "InfiniopTestWriter"): ) - - if __name__ == "__main__": # ============================================================================== # Configuration (Internal Use Only) @@ -130,7 +150,9 @@ def write_test(self, test_writer: "InfiniopTestWriter"): x = np.random.rand(*shape).astype(dtype) y = np.empty(tuple(0 for _ in shape), dtype=dtype) pos_ids = np.arange(0, x.shape[0], dtype=np.int32) - sin_table, cos_table = sin_cos_table(pos_ids, x.shape[2], theta=1e5, dtype=dtype) + sin_table, cos_table = sin_cos_table( + pos_ids, x.shape[2], theta=1e5, dtype=dtype + ) test_case = RoPETestCase( y=y, x=x, diff --git a/test/infiniop-test/test_generate/testcases/swiglu.py b/test/infiniop-test/test_generate/testcases/swiglu.py index cb692b613..aa3450fed 100644 --- a/test/infiniop-test/test_generate/testcases/swiglu.py +++ b/test/infiniop-test/test_generate/testcases/swiglu.py @@ -2,7 +2,14 @@ import gguf from typing import List -from .. 
import InfiniopTestWriter, InfiniopTestCase, np_dtype_to_ggml, gguf_strides, contiguous_gguf_strides, process_zero_stride_tensor +from .. import ( + InfiniopTestWriter, + InfiniopTestCase, + np_dtype_to_ggml, + gguf_strides, + contiguous_gguf_strides, + process_zero_stride_tensor, +) def swiglu( @@ -26,7 +33,6 @@ def __init__( c: np.ndarray, shape_c: List[int] | None, stride_c: List[int] | None, - ): super().__init__("swiglu") self.a = a @@ -39,7 +45,6 @@ def __init__( self.shape_c = shape_c self.stride_c = stride_c - def write_test(self, test_writer: "InfiniopTestWriter"): super().write_test(test_writer) if self.shape_a is not None: @@ -47,14 +52,24 @@ def write_test(self, test_writer: "InfiniopTestWriter"): if self.shape_b is not None: test_writer.add_array(test_writer.gguf_key("b.shape"), self.shape_b) if self.shape_c is not None: - test_writer.add_array(test_writer.gguf_key("c.shape"), self.shape_c) + test_writer.add_array(test_writer.gguf_key("c.shape"), self.shape_c) if self.stride_a is not None: - test_writer.add_array(test_writer.gguf_key("a.strides"), gguf_strides(*self.stride_a)) + test_writer.add_array( + test_writer.gguf_key("a.strides"), gguf_strides(*self.stride_a) + ) if self.stride_b is not None: - test_writer.add_array(test_writer.gguf_key("b.strides"), gguf_strides(*self.stride_b)) + test_writer.add_array( + test_writer.gguf_key("b.strides"), gguf_strides(*self.stride_b) + ) test_writer.add_array( test_writer.gguf_key("c.strides"), - gguf_strides(*self.stride_c if self.stride_c is not None else contiguous_gguf_strides(self.shape_c)) + gguf_strides( + *( + self.stride_c + if self.stride_c is not None + else contiguous_gguf_strides(self.shape_c) + ) + ), ) test_writer.add_tensor( test_writer.gguf_key("a"), self.a, raw_dtype=np_dtype_to_ggml(self.a.dtype) diff --git a/test/infiniop/libinfiniop/op_register.py b/test/infiniop/libinfiniop/op_register.py index 4e3ad70db..72bf31f48 100644 --- a/test/infiniop/libinfiniop/op_register.py +++ b/test/infiniop/libinfiniop/op_register.py @@ -454,6 +454,7 @@ def swiglu_(lib): infiniopOperatorDescriptor_t, ] + @OpRegister.operator def conv_(lib): lib.infiniopCreateConvDescriptor.restype = c_int32 @@ -490,6 +491,7 @@ def conv_(lib): infiniopOperatorDescriptor_t, ] + @OpRegister.operator def reduce_mean_(lib): lib.infiniopCreateReduceMeanDescriptor.restype = c_int32 @@ -522,6 +524,7 @@ def reduce_mean_(lib): infiniopOperatorDescriptor_t, ] + @OpRegister.operator def reduce_max_(lib): lib.infiniopCreateReduceMaxDescriptor.restype = c_int32 @@ -552,4 +555,4 @@ def reduce_max_(lib): lib.infiniopDestroyReduceMaxDescriptor.restype = c_int32 lib.infiniopDestroyReduceMaxDescriptor.argtypes = [ infiniopOperatorDescriptor_t, - ] \ No newline at end of file + ] diff --git a/test/infiniop/reduce_max.py b/test/infiniop/reduce_max.py index 8e27ae484..00c86f76a 100644 --- a/test/infiniop/reduce_max.py +++ b/test/infiniop/reduce_max.py @@ -25,7 +25,7 @@ _TEST_CASES_ = [ # y_shape, x_shape, y_stride, x_stride, dim ((), (), None, None, 0), - ((1, ), (32, ), None, None, 0), + ((1,), (32,), None, None, 0), ((1, 4), (1, 4), None, None, 0), ((1, 1), (1, 4), None, None, 1), ((16, 1), (16, 2048), None, None, 1), @@ -56,7 +56,7 @@ def reduce_max(x, dim): - return x.max(dim=dim,keepdim=True)[0] + return x.max(dim=dim, keepdim=True)[0] def test( @@ -77,7 +77,7 @@ def test( x = TestTensor(x_shape, x_stride, dtype, device) ans = reduce_max(x.torch_tensor(), dim) - + y = TestTensor(y_shape, y_stride, dtype, device) if sync is not None: @@ -86,7 +86,11 @@ def test( 
     descriptor = infiniopOperatorDescriptor_t()
     check_error(
         LIBINFINIOP.infiniopCreateReduceMaxDescriptor(
-            handle, ctypes.byref(descriptor), y.descriptor, x.descriptor, ctypes.c_size_t(dim)
+            handle,
+            ctypes.byref(descriptor),
+            y.descriptor,
+            x.descriptor,
+            ctypes.c_size_t(dim),
         )
     )
 
diff --git a/test/infiniop/reduce_mean.py b/test/infiniop/reduce_mean.py
index 3b752b671..c097cd296 100644
--- a/test/infiniop/reduce_mean.py
+++ b/test/infiniop/reduce_mean.py
@@ -25,7 +25,7 @@
 _TEST_CASES_ = [
     # y_shape, x_shape, y_stride, x_stride, dim
     ((), (), None, None, 0),
-    ((1, ), (32, ), None, None, 0),
+    ((1,), (32,), None, None, 0),
     ((1, 4), (1, 4), None, None, 0),
     ((1, 1), (1, 4), None, None, 1),
     ((16, 1), (16, 2048), None, None, 1),
@@ -56,7 +56,7 @@
 
 def reduce_mean(x, dim):
-    return x.mean(dim=dim,keepdim=True)
+    return x.mean(dim=dim, keepdim=True)
 
 
 def test(
@@ -86,7 +86,11 @@ def test(
     descriptor = infiniopOperatorDescriptor_t()
     check_error(
         LIBINFINIOP.infiniopCreateReduceMeanDescriptor(
-            handle, ctypes.byref(descriptor), y.descriptor, x.descriptor, ctypes.c_size_t(dim)
+            handle,
+            ctypes.byref(descriptor),
+            y.descriptor,
+            x.descriptor,
+            ctypes.c_size_t(dim),
         )
     )
 
From 8cd37e3fd6563209f78a75da953ad6144712b2a6 Mon Sep 17 00:00:00 2001
From: S-hhhhh <2320230838@mail.nankai.edu.cn>
Date: Sun, 24 Aug 2025 15:56:20 +0800
Subject: [PATCH 10/11] formatted

---
 src/infiniop-test/include/ops.hpp | 33 ++++++++++++++++---------------
 src/utils.h                       |  4 ++--
 2 files changed, 19 insertions(+), 18 deletions(-)

diff --git a/src/infiniop-test/include/ops.hpp b/src/infiniop-test/include/ops.hpp
index bd3a3fd38..aee602aac 100644
--- a/src/infiniop-test/include/ops.hpp
+++ b/src/infiniop-test/include/ops.hpp
@@ -32,21 +32,22 @@ DECLARE_INFINIOP_TEST(reduce_max)
 /*
  * Register all the tests here
  */
-#define TEST_BUILDER_MAPPINGS                  \
-    {                                          \
-        REGISTER_INFINIOP_TEST(gemm)           \
-        REGISTER_INFINIOP_TEST(random_sample)  \
-        REGISTER_INFINIOP_TEST(add)            \
-        REGISTER_INFINIOP_TEST(mul)            \
-        REGISTER_INFINIOP_TEST(clip)           \
-        REGISTER_INFINIOP_TEST(swiglu)         \
-        REGISTER_INFINIOP_TEST(rope)           \
-        REGISTER_INFINIOP_TEST(rms_norm)       \
-        REGISTER_INFINIOP_TEST(causal_softmax) \
-        REGISTER_INFINIOP_TEST(rearrange)      \
-        REGISTER_INFINIOP_TEST(sub)            \
-        REGISTER_INFINIOP_TEST(reduce_mean)    \
-        REGISTER_INFINIOP_TEST(reduce_max)}
+#define TEST_BUILDER_MAPPINGS                      \
+    {                                              \
+        REGISTER_INFINIOP_TEST(gemm)               \
+        REGISTER_INFINIOP_TEST(random_sample)      \
+        REGISTER_INFINIOP_TEST(add)                \
+        REGISTER_INFINIOP_TEST(mul)                \
+        REGISTER_INFINIOP_TEST(clip)               \
+        REGISTER_INFINIOP_TEST(swiglu)             \
+        REGISTER_INFINIOP_TEST(rope)               \
+        REGISTER_INFINIOP_TEST(rms_norm)           \
+        REGISTER_INFINIOP_TEST(causal_softmax)     \
+        REGISTER_INFINIOP_TEST(rearrange)          \
+        REGISTER_INFINIOP_TEST(sub)                \
+        REGISTER_INFINIOP_TEST(reduce_mean)        \
+        REGISTER_INFINIOP_TEST(reduce_max)         \
+    }
 
 namespace infiniop_test {
@@ -67,4 +68,4 @@ bool check_names(
 } // namespace infiniop_test
 
-#endif
+#endif
\ No newline at end of file
diff --git a/src/utils.h b/src/utils.h
index af222baca..e721f05a6 100644
--- a/src/utils.h
+++ b/src/utils.h
@@ -98,7 +98,7 @@ inline std::string infiniDtypeToString(infiniDtype_t dtype) {
     }
 }
 
-#define CEIL_DIV(x, y) (((x) + (y) - 1) / (y))
+#define CEIL_DIV(x, y) (((x) + (y)-1) / (y))
 
 namespace utils {
 
@@ -108,4 +108,4 @@ inline size_t align(size_t size, size_t alignment) {
 
 } // namespace utils
 
-#endif
+#endif
\ No newline at end of file
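A side effect of this formatting pass worth flagging: the utils.h hunk reverts the CEIL_DIV spacing introduced in the first patch, most likely because clang-format parses the macro parameter (y) as a C-style cast and glues it to the minus sign. Both spellings expand identically; the ceiling-division identity the macro relies on can be spot-checked quickly (a throwaway check, not part of the patch):

    # CEIL_DIV(x, y) = ((x) + (y) - 1) / (y) equals ceil(x / y) for positive ints.
    import math

    for x in range(1, 200):
        for y in range(1, 50):
            assert (x + y - 1) // y == math.ceil(x / y)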
From 4d68243d644c9b564d67383c7edd54f97109bf84 Mon Sep 17 00:00:00 2001
From: S-hhhhh <2320230838@mail.nankai.edu.cn>
Date: Sun, 24 Aug 2025 22:46:52 +0800
Subject: [PATCH 11/11] fix: windows build

---
 .../ops/reduce_max/cpu/reduce_max_cpu.cc      | 34 ++++++++++++++++---
 .../ops/reduce_mean/cpu/reduce_mean_cpu.cc    | 34 ++++++++++++++++---
 2 files changed, 58 insertions(+), 10 deletions(-)

diff --git a/src/infiniop/ops/reduce_max/cpu/reduce_max_cpu.cc b/src/infiniop/ops/reduce_max/cpu/reduce_max_cpu.cc
index 5ed28d75e..55a340226 100644
--- a/src/infiniop/ops/reduce_max/cpu/reduce_max_cpu.cc
+++ b/src/infiniop/ops/reduce_max/cpu/reduce_max_cpu.cc
@@ -20,11 +20,7 @@ infiniStatus_t Descriptor::create(
 template <typename T>
 infiniStatus_t reduce_max(const ReduceMaxInfo *info, T *y, const T *x) {
-    const size_t batch_size = info->shape[0];
-    const size_t channels = info->shape[1];
-    const size_t rows = info->shape[2];
     const size_t cols = info->shape[3]; // the reduction dimension
-
     const ptrdiff_t y_batch_stride = info->y_strides[0];
     const ptrdiff_t y_channel_stride = info->y_strides[1];
     const ptrdiff_t y_row_stride = info->y_strides[2];
@@ -32,7 +28,35 @@ infiniStatus_t reduce_max(const ReduceMaxInfo *info, T *y, const T *x) {
     const ptrdiff_t x_channel_stride = info->x_strides[1];
     const ptrdiff_t x_row_stride = info->x_strides[2];
     const ptrdiff_t x_col_stride = info->x_strides[3];
+#ifdef _WIN32
+    const ptrdiff_t batch_size = static_cast<ptrdiff_t>(info->shape[0]);
+    const ptrdiff_t channels = static_cast<ptrdiff_t>(info->shape[1]);
+    const ptrdiff_t rows = static_cast<ptrdiff_t>(info->shape[2]);
+#pragma omp parallel for
+    for (ptrdiff_t batch = 0; batch < batch_size; ++batch) {
+        for (ptrdiff_t channel = 0; channel < channels; ++channel) {
+            for (ptrdiff_t row = 0; row < rows; ++row) {
+                const T *input_start = x + batch * x_batch_stride
+                                     + channel * x_channel_stride
+                                     + row * x_row_stride;
+                T *output_ptr = y + batch * y_batch_stride
+                              + channel * y_channel_stride
+                              + row * y_row_stride;
+
+                float max = op::common_cpu::reduce_op::max(input_start, cols, x_col_stride);
+                if constexpr (std::is_same<T, fp16_t>::value || std::is_same<T, bf16_t>::value) {
+                    *output_ptr = utils::cast<T>(max);
+                } else {
+                    *output_ptr = max;
+                }
+            }
+        }
+    }
+#else
+    const size_t batch_size = info->shape[0];
+    const size_t channels = info->shape[1];
+    const size_t rows = info->shape[2];
 #pragma omp parallel for collapse(3)
     for (size_t batch = 0; batch < batch_size; ++batch) {
         for (size_t channel = 0; channel < channels; ++channel) {
@@ -54,7 +78,7 @@ infiniStatus_t reduce_max(const ReduceMaxInfo *info, T *y, const T *x) {
             }
         }
     }
-
+#endif //_WIN32
     return INFINI_STATUS_SUCCESS;
 }
 infiniStatus_t Descriptor::calculate(
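The _WIN32 branch above exists because MSVC implements only OpenMP 2.0, which requires signed loop counters and has no collapse clause; hence the size_t to ptrdiff_t switch and the single parallel for over the outer loop. Both branches walk the same strided layout. A reviewer-side numpy sketch of what they compute, useful for cross-checking (the function name and flat-buffer framing here are ours, not project code):

    import numpy as np

    def reduce_max_ref(flat, shape, strides):
        # shape/strides are (batch, channel, row, col) in elements; the
        # reduction runs over the col axis, mirroring the loops above.
        b, c, r, k = shape
        sb, sc, sr, sk = strides
        out = np.empty((b, c, r), dtype=flat.dtype)
        for i in range(b):
            for j in range(c):
                for m in range(r):
                    base = i * sb + j * sc + m * sr
                    out[i, j, m] = max(flat[base + n * sk] for n in range(k))
        return out

    # Contiguous check: shape (2, 3, 4, 5) has row-major strides (60, 20, 5, 1).
    x = np.arange(2 * 3 * 4 * 5, dtype=np.float32)
    assert np.array_equal(
        reduce_max_ref(x, (2, 3, 4, 5), (60, 20, 5, 1)),
        x.reshape(2, 3, 4, 5).max(axis=-1),
    )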
diff --git a/src/infiniop/ops/reduce_mean/cpu/reduce_mean_cpu.cc b/src/infiniop/ops/reduce_mean/cpu/reduce_mean_cpu.cc
index 5eb2b5419..7853afdbb 100644
--- a/src/infiniop/ops/reduce_mean/cpu/reduce_mean_cpu.cc
+++ b/src/infiniop/ops/reduce_mean/cpu/reduce_mean_cpu.cc
@@ -20,11 +20,7 @@ infiniStatus_t Descriptor::create(
 template <typename T>
 infiniStatus_t reduce_mean(const ReduceMeanInfo *info, T *y, const T *x) {
-    const size_t batch_size = info->shape[0];
-    const size_t channels = info->shape[1];
-    const size_t rows = info->shape[2];
     const size_t cols = info->shape[3]; // the reduction dimension
-
     const ptrdiff_t y_batch_stride = info->y_strides[0];
     const ptrdiff_t y_channel_stride = info->y_strides[1];
     const ptrdiff_t y_row_stride = info->y_strides[2];
@@ -32,7 +28,35 @@ infiniStatus_t reduce_mean(const ReduceMeanInfo *info, T *y, const T *x) {
     const ptrdiff_t x_channel_stride = info->x_strides[1];
     const ptrdiff_t x_row_stride = info->x_strides[2];
     const ptrdiff_t x_col_stride = info->x_strides[3];
+#ifdef _WIN32
+    const ptrdiff_t batch_size = static_cast<ptrdiff_t>(info->shape[0]);
+    const ptrdiff_t channels = static_cast<ptrdiff_t>(info->shape[1]);
+    const ptrdiff_t rows = static_cast<ptrdiff_t>(info->shape[2]);
+#pragma omp parallel for
+    for (ptrdiff_t batch = 0; batch < batch_size; ++batch) {
+        for (ptrdiff_t channel = 0; channel < channels; ++channel) {
+            for (ptrdiff_t row = 0; row < rows; ++row) {
+                const T *input_start = x + batch * x_batch_stride
+                                     + channel * x_channel_stride
+                                     + row * x_row_stride;
+                T *output_ptr = y + batch * y_batch_stride
+                              + channel * y_channel_stride
+                              + row * y_row_stride;
+
+                float mean = op::common_cpu::reduce_op::sum(input_start, cols, x_col_stride) / cols;
+                if constexpr (std::is_same<T, fp16_t>::value || std::is_same<T, bf16_t>::value) {
+                    *output_ptr = utils::cast<T>(mean);
+                } else {
+                    *output_ptr = mean;
+                }
+            }
+        }
+    }
+#else
+    const size_t batch_size = info->shape[0];
+    const size_t channels = info->shape[1];
+    const size_t rows = info->shape[2];
 #pragma omp parallel for collapse(3)
     for (size_t batch = 0; batch < batch_size; ++batch) {
         for (size_t channel = 0; channel < channels; ++channel) {
@@ -54,7 +78,7 @@ infiniStatus_t reduce_mean(const ReduceMeanInfo *info, T *y, const T *x) {
             }
         }
     }
-
+#endif //_WIN32
    return INFINI_STATUS_SUCCESS;
 }
 infiniStatus_t Descriptor::calculate(