From f262825fa3d5fc5a3e2b8cd6c496eedc05c7b144 Mon Sep 17 00:00:00 2001 From: S-hhhhh <2320230838@mail.nankai.edu.cn> Date: Wed, 13 Aug 2025 22:46:58 +0800 Subject: [PATCH 01/11] feat: add equal_nan param --- src/infiniop-test/include/test.hpp | 71 ++++++++++---------- src/infiniop-test/src/gguf.cpp | 4 +- src/infiniop-test/src/main.cpp | 14 +++- src/infiniop-test/src/ops/add.cpp | 8 +-- src/infiniop-test/src/ops/causal_softmax.cpp | 8 +-- src/infiniop-test/src/ops/clip.cpp | 8 +-- src/infiniop-test/src/ops/gemm.cpp | 8 +-- src/infiniop-test/src/ops/mul.cpp | 8 +-- src/infiniop-test/src/ops/random_sample.cpp | 8 +-- src/infiniop-test/src/ops/rearrange.cpp | 4 +- src/infiniop-test/src/ops/rms_norm.cpp | 8 +-- src/infiniop-test/src/ops/rope.cpp | 8 +-- src/infiniop-test/src/ops/sub.cpp | 8 +-- src/infiniop-test/src/ops/swiglu.cpp | 8 +-- src/infiniop-test/src/test.cpp | 29 +++++--- 15 files changed, 113 insertions(+), 89 deletions(-) diff --git a/src/infiniop-test/include/test.hpp b/src/infiniop-test/include/test.hpp index e2dd45f9f..58b56b877 100644 --- a/src/infiniop-test/include/test.hpp +++ b/src/infiniop-test/include/test.hpp @@ -47,7 +47,7 @@ std::vector> runAllTests( const GGUFFileReader &, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations, - double rtol, double atol); + double rtol, double atol, bool equal_nan=false); // Run a single test read from a GGUF file std::shared_ptr runTest( @@ -55,10 +55,11 @@ std::shared_ptr runTest( infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations, double rtol, double atol, - size_t test_id); + size_t test_id, + bool equal_nan=false); // Check if two tensors are close within given tolerance -void allClose(std::shared_ptr actual, std::shared_ptr expected, double rtol = 1e-3, double atol = 1e-3); +void allClose(std::shared_ptr actual, std::shared_ptr expected, double rtol = 1e-3, double atol = 1e-3, bool equal_nan = false); // Check if two tensors are equal void allEqual(std::shared_ptr actual, std::shared_ptr expected); @@ -81,43 +82,45 @@ class Test { } // namespace infiniop_test::base // Quick macro for declaring a new testcase -#define DECLARE_INFINIOP_TEST(name) \ - namespace infiniop_test::name { \ - class Test : public infiniop_test::base::Test { \ - double _rtol, _atol; \ - \ - public: \ - static std::string op_name() { return #name; } \ - static std::shared_ptr build( \ - std::unordered_map> attributes, \ - std::unordered_map> tensors, \ - double, double); \ - \ - static std::vector attribute_names(); \ - static std::vector tensor_names(); \ - static std::vector output_names(); \ - \ - std::shared_ptr run( \ - infiniopHandle_t handle, infiniDevice_t device, int device_id, \ - size_t warm_ups, size_t iterations) override; \ - \ - std::string toString() const override; \ - \ - ~Test(); \ - \ - private: \ - struct Attributes; \ - Attributes *_attributes; \ - Test() = delete; \ - Test(double rtol, double atol) : _rtol(rtol), _atol(atol) {} \ - }; \ +#define DECLARE_INFINIOP_TEST(name) \ + namespace infiniop_test::name { \ + class Test : public infiniop_test::base::Test { \ + double _rtol, _atol; \ + bool _equal_nan; \ + \ + public: \ + static std::string op_name() { return #name; } \ + static std::shared_ptr build( \ + std::unordered_map> attributes, \ + std::unordered_map> tensors, \ + double, double, bool); \ + \ + static std::vector attribute_names(); \ + static std::vector tensor_names(); \ + static std::vector output_names(); \ + \ + std::shared_ptr run( \ + infiniopHandle_t handle, infiniDevice_t 
device, int device_id,                                  \
            size_t warm_ups, size_t iterations) override;                      \
                                                                               \
        std::string toString() const override;                                 \
                                                                               \
        ~Test();                                                               \
                                                                               \
    private:                                                                   \
        struct Attributes;                                                     \
        Attributes *_attributes;                                               \
        Test() = delete;                                                       \
        Test(double rtol, double atol, bool equal_nan = false)                 \
            : _rtol(rtol), _atol(atol), _equal_nan(equal_nan) {}               \
    };                                                                         \
    }

namespace infiniop_test {
using BuilderFunc = std::function(
    std::unordered_map>,
    std::unordered_map>,
-    double, double)>;
+    double, double, bool)>;

// Testcase Registry
// Each testcase should provide a formatted builder, attribute names, and tensor names
diff --git a/src/infiniop-test/src/gguf.cpp b/src/infiniop-test/src/gguf.cpp
index a4b200033..aee5b39a8 100644
--- a/src/infiniop-test/src/gguf.cpp
+++ b/src/infiniop-test/src/gguf.cpp
@@ -53,7 +53,9 @@ GGUFFileReader::GGUFFileReader(const std::string &filepath) {
     try {
         _file = std::make_shared(filepath);
     } catch (const std::exception &e) {
-        throw e;
+        // Log the failure, then rethrow: _file is unusable below, and a bare
+        // `throw;` preserves the exception's dynamic type (no slicing).
+        std::cerr << "Error: " << e.what() << std::endl;
+        throw;
     }
     _data = _file->ptr();
     _cursor = reinterpret_cast(_data);
diff --git a/src/infiniop-test/src/main.cpp b/src/infiniop-test/src/main.cpp
index 4863c8172..9436cd85a 100644
--- a/src/infiniop-test/src/main.cpp
+++ b/src/infiniop-test/src/main.cpp
@@ -2,7 +2,7 @@
 #include "test.hpp"
 #include
 #include
-
+#include
 struct ParsedArgs {
     std::string file_path;                          // Mandatory argument: test.gguf file path
     infiniDevice_t device_type = INFINI_DEVICE_CPU; // Default to CPU
@@ -11,12 +11,13 @@ struct ParsedArgs {
     int iterations = 0;     // Default to 0 if not given
     double atol = 0.001;    // Default absolute tolerance
     double rtol = 0.001;    // Default relative tolerance
+    bool equal_nan = false; // Default: NaNs do not compare equal
 };

 void printUsage() {
     std::cout << "Usage:" << std::endl
               << std::endl;
-    std::cout << "infiniop-test  [--[:id]] [--warmup ] [--run ] [--atol ] [--rtol ]" << std::endl
+    std::cout << "infiniop-test  [--[:id]] [--warmup ] [--run ] [--atol ] [--rtol ] [--equal-nan ]" << std::endl
               << std::endl;
     std::cout << " >" << std::endl;
     std::cout << "    Path to the test gguf file" << std::endl
@@ -36,6 +37,9 @@ void printUsage() {
     std::cout << " --rtol " << std::endl;
     std::cout << "    (Optional) Relative tolerance for correctness check. Default to 0.001" << std::endl
               << std::endl;
+    std::cout << " --equal-nan " << std::endl;
+    std::cout << "    (Optional) If true, two NaNs are considered equal. Default to false" << std::endl
+              << std::endl;
     exit(-1);
 }
@@ -91,6 +95,10 @@ ParsedArgs parseArgs(int argc, char *argv[]) {
     else if (arg == "--rtol" && i + 1 < argc) {
         args.rtol = std::stod(argv[++i]);
     }
+    else if (arg == "--equal-nan" && i + 1 < argc) {
+        args.equal_nan = (strcmp(argv[++i], "True") == 0 || strcmp(argv[i], "true") == 0)
+                             ? 
true : false; + } else { printUsage(); } @@ -119,7 +127,7 @@ int main(int argc, char *argv[]) { reader, (infiniDevice_t)args.device_type, args.device_id, args.warmups, args.iterations, - args.rtol, args.atol); + args.rtol, args.atol, args.equal_nan); std::cout << "=====================================" << std::endl; for (auto result : results) { diff --git a/src/infiniop-test/src/ops/add.cpp b/src/infiniop-test/src/ops/add.cpp index 27f69d687..e90290d55 100644 --- a/src/infiniop-test/src/ops/add.cpp +++ b/src/infiniop-test/src/ops/add.cpp @@ -15,8 +15,8 @@ struct Test::Attributes { std::shared_ptr Test::build( std::unordered_map> attributes, std::unordered_map> tensors, - double rtol, double atol) { - auto test = std::shared_ptr(new Test(rtol, atol)); + double rtol, double atol, bool equal_nan) { + auto test = std::shared_ptr(new Test(rtol, atol, equal_nan)); test->_attributes = new Attributes(); if (tensors.find("a") == tensors.end() || tensors.find("b") == tensors.end() @@ -58,7 +58,7 @@ std::shared_ptr Test::run( return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); try { - allClose(c, _attributes->ans, _rtol, _atol); + allClose(c, _attributes->ans, _rtol, _atol, _equal_nan); } catch (const std::exception &e) { return TEST_FAILED(RESULT_INCORRECT, e.what()); } @@ -98,7 +98,7 @@ std::string Test::toString() const { oss << "- b: " << _attributes->b->info() << std::endl; oss << "- c: " << _attributes->c->info() << std::endl; oss << std::scientific << std::setprecision(2); - oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + oss << "- rtol=" << _rtol << ", atol=" << _atol << ", equal_nan=" << _equal_nan << std::endl; return oss.str(); } diff --git a/src/infiniop-test/src/ops/causal_softmax.cpp b/src/infiniop-test/src/ops/causal_softmax.cpp index 29612960a..97c65ef8c 100644 --- a/src/infiniop-test/src/ops/causal_softmax.cpp +++ b/src/infiniop-test/src/ops/causal_softmax.cpp @@ -14,8 +14,8 @@ struct Test::Attributes { std::shared_ptr Test::build( std::unordered_map> attributes, std::unordered_map> tensors, - double rtol, double atol) { - auto test = std::shared_ptr(new Test(rtol, atol)); + double rtol, double atol, bool equal_nan) { + auto test = std::shared_ptr(new Test(rtol, atol, equal_nan)); test->_attributes = new Attributes(); if (tensors.find("x") == tensors.end() || tensors.find("y") == tensors.end() @@ -53,7 +53,7 @@ std::shared_ptr Test::run( return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); try { - allClose(y, _attributes->ans, _rtol, _atol); + allClose(y, _attributes->ans, _rtol, _atol, _equal_nan); } catch (const std::exception &e) { return TEST_FAILED(RESULT_INCORRECT, e.what()); } @@ -92,7 +92,7 @@ std::string Test::toString() const { oss << "- y: " << _attributes->y->info() << std::endl; oss << "- ans: " << _attributes->ans->info() << std::endl; oss << std::scientific << std::setprecision(2); - oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + oss << "- rtol=" << _rtol << ", atol=" << _atol << ", equal_nan=" << _equal_nan << std::endl; return oss.str(); } diff --git a/src/infiniop-test/src/ops/clip.cpp b/src/infiniop-test/src/ops/clip.cpp index 82a0e9b10..a01c18a4d 100644 --- a/src/infiniop-test/src/ops/clip.cpp +++ b/src/infiniop-test/src/ops/clip.cpp @@ -16,8 +16,8 @@ struct Test::Attributes { std::shared_ptr Test::build( std::unordered_map> attributes, std::unordered_map> tensors, - double rtol, double atol) { - auto test = std::shared_ptr(new Test(rtol, atol)); + double rtol, double atol, bool equal_nan) { + 
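// Carried through to allClose() in run(): with --equal-nan true, a NaN in the
+    // output matches a NaN in the reference instead of failing the test.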
auto test = std::shared_ptr(new Test(rtol, atol, equal_nan)); test->_attributes = new Attributes(); if (tensors.find("x") == tensors.end() || tensors.find("min_val") == tensors.end() @@ -64,7 +64,7 @@ std::shared_ptr Test::run( return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); try { - allClose(y, _attributes->ans, _rtol, _atol); + allClose(y, _attributes->ans, _rtol, _atol, _equal_nan); } catch (const std::exception &e) { return TEST_FAILED(RESULT_INCORRECT, e.what()); } @@ -109,7 +109,7 @@ std::string Test::toString() const { oss << "- max_val: " << _attributes->max_val->info() << std::endl; oss << "- y: " << _attributes->y->info() << std::endl; oss << std::scientific << std::setprecision(2); - oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + oss << "- rtol=" << _rtol << ", atol=" << _atol << ", equal_nan=" << _equal_nan << std::endl; return oss.str(); } diff --git a/src/infiniop-test/src/ops/gemm.cpp b/src/infiniop-test/src/ops/gemm.cpp index 37c8ed6fe..664288d73 100644 --- a/src/infiniop-test/src/ops/gemm.cpp +++ b/src/infiniop-test/src/ops/gemm.cpp @@ -18,8 +18,8 @@ struct Test::Attributes { std::shared_ptr Test::build( std::unordered_map> attributes, std::unordered_map> tensors, - double rtol, double atol) { - auto test = std::shared_ptr(new Test(rtol, atol)); + double rtol, double atol, bool equal_nan) { + auto test = std::shared_ptr(new Test(rtol, atol, equal_nan)); test->_attributes = new Attributes(); if (!check_names(attributes, Test::attribute_names()) || !check_names(tensors, Test::tensor_names())) { throw std::runtime_error("Invalid Test"); @@ -65,7 +65,7 @@ std::shared_ptr Test::run( return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); try { - allClose(c, _attributes->ans, _rtol, _atol); + allClose(c, _attributes->ans, _rtol, _atol, _equal_nan); } catch (const std::exception &e) { return TEST_FAILED(RESULT_INCORRECT, e.what()); } @@ -120,7 +120,7 @@ std::string Test::toString() const { oss << "- b: " << _attributes->b->info() << std::endl; oss << "- c: " << _attributes->c->info() << std::endl; oss << std::scientific << std::setprecision(2); - oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + oss << "- rtol=" << _rtol << ", atol=" << _atol << ", equal_nan=" << _equal_nan << std::endl; return oss.str(); } diff --git a/src/infiniop-test/src/ops/mul.cpp b/src/infiniop-test/src/ops/mul.cpp index 8ebfc426b..cb0b639bf 100644 --- a/src/infiniop-test/src/ops/mul.cpp +++ b/src/infiniop-test/src/ops/mul.cpp @@ -15,8 +15,8 @@ struct Test::Attributes { std::shared_ptr Test::build( std::unordered_map> attributes, std::unordered_map> tensors, - double rtol, double atol) { - auto test = std::shared_ptr(new Test(rtol, atol)); + double rtol, double atol, bool equal_nan) { + auto test = std::shared_ptr(new Test(rtol, atol, equal_nan)); test->_attributes = new Attributes(); if (tensors.find("a") == tensors.end() || tensors.find("b") == tensors.end() @@ -58,7 +58,7 @@ std::shared_ptr Test::run( return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); try { - allClose(c, _attributes->ans, _rtol, _atol); + allClose(c, _attributes->ans, _rtol, _atol, _equal_nan); } catch (const std::exception &e) { return TEST_FAILED(RESULT_INCORRECT, e.what()); } @@ -98,7 +98,7 @@ std::string Test::toString() const { oss << "- b: " << _attributes->b->info() << std::endl; oss << "- c: " << _attributes->c->info() << std::endl; oss << std::scientific << std::setprecision(2); - oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + 
oss << "- rtol=" << _rtol << ", atol=" << _atol << ", equal_nan=" << _equal_nan << std::endl; return oss.str(); } diff --git a/src/infiniop-test/src/ops/random_sample.cpp b/src/infiniop-test/src/ops/random_sample.cpp index a11e0f446..75ee07b44 100644 --- a/src/infiniop-test/src/ops/random_sample.cpp +++ b/src/infiniop-test/src/ops/random_sample.cpp @@ -20,8 +20,8 @@ struct Test::Attributes { std::shared_ptr Test::build( std::unordered_map> attributes, std::unordered_map> tensors, - double rtol, double atol) { - auto test = std::shared_ptr(new Test(rtol, atol)); + double rtol, double atol, bool equal_nan) { + auto test = std::shared_ptr(new Test(rtol, atol, equal_nan)); test->_attributes = new Attributes(); if (!check_names(attributes, Test::attribute_names()) || !check_names(tensors, Test::tensor_names())) { throw std::runtime_error("Invalid Test"); @@ -70,7 +70,7 @@ std::shared_ptr Test::run( return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); try { - allClose(result, _attributes->ans, _rtol, _atol); + allClose(result, _attributes->ans, _rtol, _atol, _equal_nan); } catch (const std::exception &e) { return TEST_FAILED(RESULT_INCORRECT, e.what()); } @@ -117,7 +117,7 @@ std::string Test::toString() const { oss << "- data: " << _attributes->data->info() << std::endl; oss << "- result: " << _attributes->result->info() << std::endl; oss << std::scientific << std::setprecision(2); - oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + oss << "- rtol=" << _rtol << ", atol=" << _atol << ", equal_nan=" << _equal_nan << std::endl; return oss.str(); } diff --git a/src/infiniop-test/src/ops/rearrange.cpp b/src/infiniop-test/src/ops/rearrange.cpp index 9fbf6f2cb..bdf162ce2 100644 --- a/src/infiniop-test/src/ops/rearrange.cpp +++ b/src/infiniop-test/src/ops/rearrange.cpp @@ -12,9 +12,9 @@ struct Test::Attributes { std::shared_ptr Test::build( std::unordered_map> attributes, std::unordered_map> tensors, - double rtol, double atol) { + double rtol, double atol, bool equal_nan) { - auto test = std::shared_ptr(new Test(rtol, atol)); + auto test = std::shared_ptr(new Test(rtol, atol, equal_nan)); test->_attributes = new Attributes(); if (!check_names(attributes, Test::attribute_names()) || !check_names(tensors, Test::tensor_names())) { throw std::runtime_error("Invalid Test"); diff --git a/src/infiniop-test/src/ops/rms_norm.cpp b/src/infiniop-test/src/ops/rms_norm.cpp index 8359a4536..786ce8470 100644 --- a/src/infiniop-test/src/ops/rms_norm.cpp +++ b/src/infiniop-test/src/ops/rms_norm.cpp @@ -16,8 +16,8 @@ struct Test::Attributes { std::shared_ptr Test::build( std::unordered_map> attributes, std::unordered_map> tensors, - double rtol, double atol) { - auto test = std::shared_ptr(new Test(rtol, atol)); + double rtol, double atol, bool equal_nan) { + auto test = std::shared_ptr(new Test(rtol, atol, equal_nan)); test->_attributes = new Attributes(); if (attributes.find("epsilon") == attributes.end() @@ -72,7 +72,7 @@ std::shared_ptr Test::run( return TEST_FAILED(OP_EXECUTION_FAILED, "RMSNorm execution failed")); try { - allClose(y, _attributes->ans, _rtol, _atol); + allClose(y, _attributes->ans, _rtol, _atol, _equal_nan); } catch (const std::exception &e) { return TEST_FAILED(RESULT_INCORRECT, e.what()); } @@ -117,7 +117,7 @@ std::string Test::toString() const { oss << "- w: " << _attributes->w->info() << std::endl; oss << "- y: " << _attributes->y->info() << std::endl; oss << std::scientific << std::setprecision(2); - oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + 
oss << "- rtol=" << _rtol << ", atol=" << _atol << ", equal_nan=" << _equal_nan << std::endl; return oss.str(); } diff --git a/src/infiniop-test/src/ops/rope.cpp b/src/infiniop-test/src/ops/rope.cpp index 636f565af..94010a122 100644 --- a/src/infiniop-test/src/ops/rope.cpp +++ b/src/infiniop-test/src/ops/rope.cpp @@ -17,8 +17,8 @@ struct Test::Attributes { std::shared_ptr Test::build( std::unordered_map> attributes, std::unordered_map> tensors, - double rtol, double atol) { - auto test = std::shared_ptr(new Test(rtol, atol)); + double rtol, double atol, bool equal_nan) { + auto test = std::shared_ptr(new Test(rtol, atol, equal_nan)); test->_attributes = new Attributes(); if (tensors.find("y") == tensors.end() @@ -77,7 +77,7 @@ std::shared_ptr Test::run( return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); try { - allClose(y, _attributes->ans, _rtol, _atol); + allClose(y, _attributes->ans, _rtol, _atol, _equal_nan); } catch (const std::exception &e) { return TEST_FAILED(RESULT_INCORRECT, e.what()); } @@ -121,7 +121,7 @@ std::string Test::toString() const { oss << "- sin_table: " << _attributes->sin_table->info() << std::endl; oss << "- cos_table: " << _attributes->cos_table->info() << std::endl; oss << std::scientific << std::setprecision(2); - oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + oss << "- rtol=" << _rtol << ", atol=" << _atol << ", equal_nan=" << _equal_nan << std::endl; return oss.str(); } diff --git a/src/infiniop-test/src/ops/sub.cpp b/src/infiniop-test/src/ops/sub.cpp index 6bb1fd1eb..bb3adc350 100644 --- a/src/infiniop-test/src/ops/sub.cpp +++ b/src/infiniop-test/src/ops/sub.cpp @@ -15,8 +15,8 @@ struct Test::Attributes { std::shared_ptr Test::build( std::unordered_map> attributes, std::unordered_map> tensors, - double rtol, double atol) { - auto test = std::shared_ptr(new Test(rtol, atol)); + double rtol, double atol, bool equal_nan) { + auto test = std::shared_ptr(new Test(rtol, atol, equal_nan)); test->_attributes = new Attributes(); if (tensors.find("a") == tensors.end() || tensors.find("b") == tensors.end() @@ -58,7 +58,7 @@ std::shared_ptr Test::run( return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); try { - allClose(c, _attributes->ans, _rtol, _atol); + allClose(c, _attributes->ans, _rtol, _atol, _equal_nan); } catch (const std::exception &e) { return TEST_FAILED(RESULT_INCORRECT, e.what()); } @@ -98,7 +98,7 @@ std::string Test::toString() const { oss << "- b: " << _attributes->b->info() << std::endl; oss << "- c: " << _attributes->c->info() << std::endl; oss << std::scientific << std::setprecision(2); - oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + oss << "- rtol=" << _rtol << ", atol=" << _atol << ", equal_nan=" << _equal_nan << std::endl; return oss.str(); } diff --git a/src/infiniop-test/src/ops/swiglu.cpp b/src/infiniop-test/src/ops/swiglu.cpp index 96b75efc5..f86dfadc6 100644 --- a/src/infiniop-test/src/ops/swiglu.cpp +++ b/src/infiniop-test/src/ops/swiglu.cpp @@ -15,8 +15,8 @@ struct Test::Attributes { std::shared_ptr Test::build( std::unordered_map> attributes, std::unordered_map> tensors, - double rtol, double atol) { - auto test = std::shared_ptr(new Test(rtol, atol)); + double rtol, double atol, bool equal_nan) { + auto test = std::shared_ptr(new Test(rtol, atol, equal_nan)); test->_attributes = new Attributes(); if (tensors.find("a") == tensors.end() @@ -54,7 +54,7 @@ std::shared_ptr Test::run( CHECK_OR(infiniopSwiGLU(op_desc, workspace, workspace_size, c->data(), a->data(), b->data(), 
nullptr), return TEST_FAILED(OP_CREATION_FAILED, "Failed during execution.")); try { - allClose(c, _attributes->ans, _rtol, _atol); + allClose(c, _attributes->ans, _rtol, _atol, _equal_nan); } catch (const std::exception &e) { return TEST_FAILED(RESULT_INCORRECT, e.what()); } @@ -93,7 +93,7 @@ std::string Test::toString() const { oss << "- b: " << _attributes->b->info() << std::endl; oss << "- c: " << _attributes->c->info() << std::endl; oss << std::scientific << std::setprecision(2); - oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + oss << "- rtol=" << _rtol << ", atol=" << _atol << ", equal_nan=" << _equal_nan << std::endl; return oss.str(); } diff --git a/src/infiniop-test/src/test.cpp b/src/infiniop-test/src/test.cpp index e312ac5f5..d4c0bab49 100644 --- a/src/infiniop-test/src/test.cpp +++ b/src/infiniop-test/src/test.cpp @@ -49,7 +49,7 @@ std::string Result::toString() const { std::vector> runAllTests(const GGUFFileReader &gguf_reader, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations, - double rtol, double atol) { + double rtol, double atol, bool equal_nan) { auto meta = gguf_reader.getAttributeMap(); auto count_meta = meta.find("test_count"); if (count_meta == meta.end()) { @@ -60,7 +60,7 @@ std::vector> runAllTests(const GGUFFileReader &gguf_read auto results = std::vector>(count); try { for (size_t i = 0; i < count; i++) { - results[i] = runTest(gguf_reader, device, device_id, warm_ups, iterations, rtol, atol, i); + results[i] = runTest(gguf_reader, device, device_id, warm_ups, iterations, rtol, atol, i, equal_nan); } } catch (const std::exception &e) { std::cerr << "Error: " << e.what() << std::endl; @@ -72,7 +72,7 @@ std::vector> runAllTests(const GGUFFileReader &gguf_read std::shared_ptr runTest(const GGUFFileReader &gguf_reader, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations, - double rtol, double atol, size_t test_id) { + double rtol, double atol, size_t test_id, bool equal_nan) { auto meta = gguf_reader.getAttributeMap(); auto tensor_info = gguf_reader.getTensorInfoMap(); auto name_meta = meta.find("test." 
+ std::to_string(test_id) + ".op_name");
@@ -107,7 +107,7 @@ std::shared_ptr runTest(const GGUFFileReader &gguf_reader,
     }
     std::shared_ptr test;
     try {
-        test = builder.build(attrs, tensors, rtol, atol);
+        test = builder.build(attrs, tensors, rtol, atol, equal_nan);
     } catch (const std::exception &e) {
         return TEST_INIT_FAILED(op_name + "\n" + e.what());
     }
@@ -141,7 +141,7 @@ void incrementOffset(ptrdiff_t &offset_1, const std::vector &strides_
     }
 }

-void allClose(std::shared_ptr actual_, std::shared_ptr expected_, double rtol, double atol) {
+void allClose(std::shared_ptr actual_, std::shared_ptr expected_, double rtol, double atol, bool equal_nan) {
     auto actual = actual_->to(INFINI_DEVICE_CPU);
     auto expected = expected_->to(INFINI_DEVICE_CPU);
     auto shape = actual->shape();
@@ -158,12 +158,23 @@ void allClose(std::shared_ptr actual_, std::shared_ptr expected_
     for (size_t i = 0; i < total; i++) {
         double a_ = getVal((char *)actual->data() + actual_offset, actual->ggml_type());
         double e_ = getVal((char *)expected->data() + expected_offset, expected->ggml_type());
-        if (std::fabs(a_ - e_) > atol && std::fabs(a_ - e_) > rtol * std::fmax(std::fabs(a_), std::fabs(e_))) {
-            if (num_failed == 0) {
-                first_failed_msg = "First failed at index " + std::to_string(i) + " with value " + std::to_string(a_) + " but should be " + std::to_string(e_) + ".";
+        if (std::isnan(a_) || std::isnan(e_)) {
+            // A NaN pair only passes when equal_nan is set and both sides are
+            // NaN; record the first failure before bumping the counter.
+            if (!equal_nan || (std::isnan(a_) != std::isnan(e_))) {
+                if (num_failed == 0) {
+                    first_failed_msg = "First failed at index " + std::to_string(i) + " with value " + std::to_string(a_) + " but should be " + std::to_string(e_) + ".";
+                }
+                num_failed++;
+            }
+        } else {
+            if (std::fabs(a_ - e_) > atol && std::fabs(a_ - e_) > rtol * std::fmax(std::fabs(a_), std::fabs(e_))) {
+                if (num_failed == 0) {
+                    first_failed_msg = "First failed at index " + std::to_string(i) + " with value " + std::to_string(a_) + " but should be " + std::to_string(e_) + ".";
+                }
+                num_failed++;
             }
-            num_failed++;
         }
+
         incrementOffset(actual_offset, actual->strides(), ggmlTypeSize(actual->ggml_type()), expected_offset, expected->strides(), ggmlTypeSize(expected->ggml_type()), counter, shape);

From 53927875e6f286c05fcbdb47d781f9016248608b Mon Sep 17 00:00:00 2001
From: S-hhhhh <2320230838@mail.nankai.edu.cn>
Date: Thu, 14 Aug 2025 22:23:20 +0800
Subject: [PATCH 02/11] feat: add reduce_mean operator
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 include/infiniop.h                            |   1 +
 include/infiniop/ops/reduce_mean.h            |  27 +++
 scripts/python_test.py                        |   1 +
 src/infiniop-test/include/ops.hpp             |   2 +
 src/infiniop-test/src/ops/reduce_mean.cpp     | 119 ++++++++++++
 .../ops/reduce_mean/cpu/reduce_mean_cpu.cc    |  67 +++++++
 .../ops/reduce_mean/cpu/reduce_mean_cpu.h     |   7 +
 src/infiniop/ops/reduce_mean/info.h           |  81 ++++++++
 .../ops/reduce_mean/metax/reduce_mean_metax.h |   8 +
 .../reduce_mean/metax/reduce_mean_metax.maca  |  94 +++++++++
 .../ops/reduce_mean/nvidia/kernel.cuh         |  26 +++
 .../reduce_mean/nvidia/reduce_mean_nvidia.cu  |  98 ++++++++++
 .../reduce_mean/nvidia/reduce_mean_nvidia.cuh |   8 +
 src/infiniop/ops/reduce_mean/operator.cc      | 182 ++++++++++++++++++
 src/infiniop/ops/reduce_mean/reduce_mean.h    |  47 +++++
 src/infiniop/reduce/cuda/reduce.cuh           |  18 ++
 src/utils/check.h                             |  18 ++
 .../test_generate/testcases/reduce_mean.py    | 111 +++++++++++
 test/infiniop/libinfiniop/op_register.py      |  32 +++
 test/infiniop/reduce_mean.py                  | 150 +++++++++++++++
 20 files changed, 1097 insertions(+)
 create 
mode 100644 include/infiniop/ops/reduce_mean.h create mode 100644 src/infiniop-test/src/ops/reduce_mean.cpp create mode 100644 src/infiniop/ops/reduce_mean/cpu/reduce_mean_cpu.cc create mode 100644 src/infiniop/ops/reduce_mean/cpu/reduce_mean_cpu.h create mode 100644 src/infiniop/ops/reduce_mean/info.h create mode 100644 src/infiniop/ops/reduce_mean/metax/reduce_mean_metax.h create mode 100644 src/infiniop/ops/reduce_mean/metax/reduce_mean_metax.maca create mode 100644 src/infiniop/ops/reduce_mean/nvidia/kernel.cuh create mode 100644 src/infiniop/ops/reduce_mean/nvidia/reduce_mean_nvidia.cu create mode 100644 src/infiniop/ops/reduce_mean/nvidia/reduce_mean_nvidia.cuh create mode 100644 src/infiniop/ops/reduce_mean/operator.cc create mode 100644 src/infiniop/ops/reduce_mean/reduce_mean.h create mode 100644 test/infiniop-test/test_generate/testcases/reduce_mean.py create mode 100644 test/infiniop/reduce_mean.py diff --git a/include/infiniop.h b/include/infiniop.h index d51b8d92e..85df0c703 100644 --- a/include/infiniop.h +++ b/include/infiniop.h @@ -17,5 +17,6 @@ #include "infiniop/ops/sub.h" #include "infiniop/ops/swiglu.h" #include "infiniop/tensor_descriptor.h" +#include "infiniop/ops/reduce_mean.h" #endif // __INFINIOP_API_H__ diff --git a/include/infiniop/ops/reduce_mean.h b/include/infiniop/ops/reduce_mean.h new file mode 100644 index 000000000..5efd8b227 --- /dev/null +++ b/include/infiniop/ops/reduce_mean.h @@ -0,0 +1,27 @@ +#ifndef __INFINIOP_REDUCE_MEAN_API_H__ +#define __INFINIOP_REDUCE_MEAN_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopReduceMeanDescriptor_t; + +__C __export infiniStatus_t infiniopCreateReduceMeanDescriptor( + infiniopHandle_t handle, + infiniopReduceMeanDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + size_t dim); + +__C __export infiniStatus_t infiniopGetReduceMeanWorkspaceSize(infiniopReduceMeanDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopReduceMean( + infiniopReduceMeanDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream); + +__C __export infiniStatus_t infiniopDestroyReduceMeanDescriptor(infiniopReduceMeanDescriptor_t desc); + +#endif diff --git a/scripts/python_test.py b/scripts/python_test.py index eb2d4319e..14588cd7d 100644 --- a/scripts/python_test.py +++ b/scripts/python_test.py @@ -24,6 +24,7 @@ def run_tests(args): "rope.py", "sub.py", "swiglu.py", + "reduce_mean.py", ]: result = subprocess.run( f"python {test} {args} --debug", text=True, encoding="utf-8", shell=True diff --git a/src/infiniop-test/include/ops.hpp b/src/infiniop-test/include/ops.hpp index 3820f7cfd..c94ebac90 100644 --- a/src/infiniop-test/include/ops.hpp +++ b/src/infiniop-test/include/ops.hpp @@ -16,6 +16,7 @@ DECLARE_INFINIOP_TEST(add) DECLARE_INFINIOP_TEST(causal_softmax) DECLARE_INFINIOP_TEST(rearrange) DECLARE_INFINIOP_TEST(sub) +DECLARE_INFINIOP_TEST(reduce_mean) #define REGISTER_INFINIOP_TEST(name) \ { \ @@ -43,6 +44,7 @@ DECLARE_INFINIOP_TEST(sub) REGISTER_INFINIOP_TEST(causal_softmax) \ REGISTER_INFINIOP_TEST(rearrange) \ REGISTER_INFINIOP_TEST(sub) \ + REGISTER_INFINIOP_TEST(reduce_mean) \ } namespace infiniop_test { diff --git a/src/infiniop-test/src/ops/reduce_mean.cpp b/src/infiniop-test/src/ops/reduce_mean.cpp new file mode 100644 index 000000000..2c3ddceaf --- /dev/null +++ b/src/infiniop-test/src/ops/reduce_mean.cpp @@ -0,0 +1,119 @@ +#include "ops.hpp" +#include "utils.hpp" +#include 
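+// GGUF layout for a reduce_mean case (see build() below): tensors "x", "y"
+// and "ans", plus a "dim" attribute naming the axis to average over.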
+#include +#include + +namespace infiniop_test::reduce_mean { +struct Test::Attributes { + std::shared_ptr x; + std::shared_ptr y; + std::shared_ptr ans; + size_t dim; +}; + +std::shared_ptr Test::build( + std::unordered_map> attributes, + std::unordered_map> tensors, + double rtol, double atol, bool equal_nan) { + auto test = std::shared_ptr(new Test(rtol, atol, equal_nan)); + test->_attributes = new Attributes(); + + if (attributes.find("dim") == attributes.end() + || tensors.find("x") == tensors.end() + || tensors.find("ans") == tensors.end() + || tensors.find("y") == tensors.end()) { + throw std::runtime_error("Invalid Test: Missing attributes or tensors"); + } + + test->_attributes->dim = size_t(*reinterpret_cast(attributes["dim"].data())); + test->_attributes->ans = tensors["ans"]; + test->_attributes->x = tensors["x"]; + test->_attributes->y = tensors["y"]; + + return test; +} + +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, + size_t warm_ups, size_t iterations) { + + infiniopReduceMeanDescriptor_t op_desc; + CHECK_OR(infiniopCreateReduceMeanDescriptor(handle, &op_desc, + _attributes->y->desc(), + _attributes->x->desc(), + _attributes->dim), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create ReduceMean descriptor")); + + auto x = _attributes->x->to(device, device_id); + auto y = _attributes->y->to(device, device_id); + + size_t workspace_size; + CHECK_OR(infiniopGetReduceMeanWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size")); + void *workspace = nullptr; + if (workspace_size > 0) { + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace")); + } + + CHECK_OR(infiniopReduceMean(op_desc, + workspace, workspace_size, + y->data(), + x->data(), + nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, "ReduceMean execution failed")); + + try { + allClose(y, _attributes->ans, _rtol, _atol, _equal_nan); + } catch (const std::exception &e) { + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + double elapsed_time = 0.; + + elapsed_time = benchmark( + [=]() { + infiniopReduceMean(op_desc, + workspace, workspace_size, + y->data(), + x->data(), + nullptr); + }, + warm_ups, iterations); + + if (workspace != nullptr) { + infinirtFree(workspace); + } + + return TEST_PASSED(elapsed_time); +} + +std::vector Test::attribute_names() { + return {"dim"}; +} + +std::vector Test::tensor_names() { + return {"x", "ans", "y"}; +} + +std::vector Test::output_names() { + return {"y"}; +} + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- x: " << _attributes->x->info() << std::endl; + oss << "- y: " << _attributes->y->info() << std::endl; + oss << "- dim=" << _attributes->dim << std::endl; + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol << ", equal_nan=" << _equal_nan << std::endl; + return oss.str(); +} + +Test::~Test() { + delete _attributes; +} + +} // namespace infiniop_test::reduce_mean diff --git a/src/infiniop/ops/reduce_mean/cpu/reduce_mean_cpu.cc b/src/infiniop/ops/reduce_mean/cpu/reduce_mean_cpu.cc new file mode 100644 index 000000000..3682d7631 --- /dev/null +++ b/src/infiniop/ops/reduce_mean/cpu/reduce_mean_cpu.cc @@ -0,0 +1,67 @@ +#include "reduce_mean_cpu.h" +#include "../../../devices/cpu/common_cpu.h" +#include "../../../reduce/cpu/reduce.h" + +namespace op::reduce_mean::cpu { + 
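// ReduceMeanInfo (info.h) folds every input into a 3-D view of
+// (batch, rows, cols) with the reduced axis moved innermost, so this
+// backend only has to average the last, possibly strided, dimension.
+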
+Descriptor::~Descriptor() {}
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t y_desc,
+    infiniopTensorDescriptor_t x_desc,
+    size_t dim) {
+    auto result = ReduceMeanInfo::create(y_desc, x_desc, dim);
+    CHECK_RESULT(result);
+    *desc_ptr = new Descriptor(nullptr, result.take(), 0, handle->device, handle->device_id);
+    return INFINI_STATUS_SUCCESS;
+}
+
+template
+infiniStatus_t reduce_mean(const ReduceMeanInfo *info, T *y, const T *x) {
+    const size_t batch_size = info->shape[0];
+    const size_t rows = info->shape[1];
+    const size_t cols = info->shape[2]; // innermost axis (the reduction dimension)
+
+    const ptrdiff_t y_batch_stride = info->y_strides[0];
+    const ptrdiff_t y_row_stride = info->y_strides[1];
+    const ptrdiff_t x_batch_stride = info->x_strides[0];
+    const ptrdiff_t x_row_stride = info->x_strides[1];
+    const ptrdiff_t x_col_stride = info->x_strides[2];
+
+    #pragma omp parallel for collapse(2)
+    for (size_t batch = 0; batch < batch_size; ++batch) {
+        for (size_t row = 0; row < rows; ++row) {
+            const T* input_start = x + batch * x_batch_stride + row * x_row_stride;
+            T* output_ptr = y + batch * y_batch_stride + row * y_row_stride;
+            float mean = op::common_cpu::reduce_op::sum(input_start, cols, x_col_stride) / cols;
+            if constexpr (std::is_same::value || std::is_same::value) {
+                *output_ptr = utils::cast(mean);
+            } else {
+                *output_ptr = mean;
+            }
+        }
+    }
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace, size_t workspace_size,
+    void *y,
+    const void *x,
+    void *stream) const {
+
+    if (_info.dtype == INFINI_DTYPE_F16) {
+        CHECK_STATUS(reduce_mean(&_info, (fp16_t *)y, (const fp16_t *)x));
+    } else if (_info.dtype == INFINI_DTYPE_BF16) {
+        CHECK_STATUS(reduce_mean(&_info, (bf16_t *)y, (const bf16_t *)x));
+    } else if (_info.dtype == INFINI_DTYPE_F32) {
+        CHECK_STATUS(reduce_mean(&_info, (float *)y, (const float *)x));
+    } else {
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    return INFINI_STATUS_SUCCESS;
+}
+} // namespace op::reduce_mean::cpu
diff --git a/src/infiniop/ops/reduce_mean/cpu/reduce_mean_cpu.h b/src/infiniop/ops/reduce_mean/cpu/reduce_mean_cpu.h
new file mode 100644
index 000000000..f67601c62
--- /dev/null
+++ b/src/infiniop/ops/reduce_mean/cpu/reduce_mean_cpu.h
@@ -0,0 +1,7 @@
+#ifndef __REDUCE_MEAN_CPU_H__
+#define __REDUCE_MEAN_CPU_H__
+#include "../reduce_mean.h"
+
+DESCRIPTOR(cpu)
+
+#endif
diff --git a/src/infiniop/ops/reduce_mean/info.h b/src/infiniop/ops/reduce_mean/info.h
new file mode 100644
index 000000000..41785a082
--- /dev/null
+++ b/src/infiniop/ops/reduce_mean/info.h
@@ -0,0 +1,81 @@
+#ifndef __REDUCE_MEAN_INFO_H__
+#define __REDUCE_MEAN_INFO_H__
+
+#include "../../../utils.h"
+#include "../../tensor.h"
+#include
+
+namespace op::reduce_mean {
+
+class ReduceMeanInfo {
+    ReduceMeanInfo() = default;
+
+public:
+    infiniDtype_t dtype;
+
+    std::vector shape;
+    std::vector y_strides;
+    std::vector x_strides;
+
+    static utils::Result create(infiniopTensorDescriptor_t y_desc, infiniopTensorDescriptor_t x_desc, size_t dim) {
+        auto dtype = y_desc->dtype();
+        if (dtype != x_desc->dtype()) {
+            return INFINI_STATUS_BAD_TENSOR_DTYPE;
+        }
+        CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_BF16, INFINI_DTYPE_F32);
+
+        size_t ndim = y_desc->ndim();
+        if (x_desc->ndim() != ndim) {
+            CHECK_STATUS(INFINI_STATUS_BAD_TENSOR_SHAPE);
+        }
+        CHECK_REDUCE_SHAPE(x_desc->shape(), dim, y_desc->shape());
+        if (ndim == 3) {
+            std::vector shape = x_desc->shape();
+            std::vector y_strides = 
y_desc->strides(); + std::vector x_strides = x_desc->strides(); + if (dim != 2){ + std::swap(shape[dim], shape[2]); + std::swap(y_strides[dim], y_strides[2]); + std::swap(x_strides[dim], x_strides[2]); + } + return utils::Result(ReduceMeanInfo{ + dtype, shape, y_strides, x_strides}); + } + else if (ndim == 2){ + std::vector shape = x_desc->shape(); + std::vector y_strides = y_desc->strides(); + std::vector x_strides = x_desc->strides(); + if (dim != 1){ + std::swap(shape[dim], shape[1]); + std::swap(y_strides[dim], y_strides[1]); + std::swap(x_strides[dim], x_strides[1]); + } + shape.insert(shape.begin(), 1); + y_strides.insert(y_strides.begin(), 0); + x_strides.insert(x_strides.begin(), 0); + return utils::Result(ReduceMeanInfo{ + dtype, shape, y_strides, x_strides}); + } + else if (ndim == 1){ + std::vector shape = {1, 1, (x_desc->shape())[0]}; + std::vector y_strides = {0, 0, (y_desc->strides())[0]}; + std::vector x_strides = {0, 0, (x_desc->strides())[0]}; + return utils::Result(ReduceMeanInfo{ + dtype, shape, y_strides, x_strides}); + } + else if (ndim == 0){ + std::vector shape = {1, 1, 1}; + std::vector y_strides = {0, 0, 0}; + std::vector x_strides = {0, 0, 0}; + return utils::Result(ReduceMeanInfo{ + dtype, shape, y_strides, x_strides}); + } + else{ + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + } +}; + +} // namespace op::reduce_mean + +#endif // __REDUCE_MEAN_INFO_H__ diff --git a/src/infiniop/ops/reduce_mean/metax/reduce_mean_metax.h b/src/infiniop/ops/reduce_mean/metax/reduce_mean_metax.h new file mode 100644 index 000000000..a105724d6 --- /dev/null +++ b/src/infiniop/ops/reduce_mean/metax/reduce_mean_metax.h @@ -0,0 +1,8 @@ +#ifndef __REDUCE_MEAN_METAX_H__ +#define __REDUCE_MEAN_METAX_H__ + +#include "../reduce_mean.h" + +DESCRIPTOR(metax) + +#endif diff --git a/src/infiniop/ops/reduce_mean/metax/reduce_mean_metax.maca b/src/infiniop/ops/reduce_mean/metax/reduce_mean_metax.maca new file mode 100644 index 000000000..9abd888f6 --- /dev/null +++ b/src/infiniop/ops/reduce_mean/metax/reduce_mean_metax.maca @@ -0,0 +1,94 @@ +#include "../../../devices/metax/metax_common.h" +#include "reduce_mean_metax.h" + +#include +#include "../../../devices/metax/metax_kernel_common.h" + +#include "../../../reduce/cuda/reduce.cuh" + +#include "../nvidia/kernel.cuh" + +template +INFINIOP_METAX_KERNEL ReduceMean( + Tdata *y_, const Tdata *x_, + size_t batch, size_t height, size_t width, + ptrdiff_t y_stride_b, ptrdiff_t y_stride_h, + ptrdiff_t x_stride_b, ptrdiff_t x_stride_h, ptrdiff_t x_stride_w) { + ReduceMeanKernel(y_, x_, batch, height, width, y_stride_b, y_stride_h, x_stride_b, x_stride_h, x_stride_w); +} + +namespace op::reduce_mean::metax { + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + size_t dim) { + auto info = ReduceMeanInfo::create(y_desc, x_desc, dim); + CHECK_RESULT(info); + *desc_ptr = new Descriptor( + new Opaque{reinterpret_cast(handle)->internal()}, + info.take(), 0, handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +template +infiniStatus_t launchKernel(void *y, const void *x, infiniDtype_t dtype, + size_t batch_size, size_t height, size_t width, + ptrdiff_t y_stride_b, ptrdiff_t y_stride_h, + ptrdiff_t x_stride_b, ptrdiff_t x_stride_h, ptrdiff_t x_stride_w, + hcStream_t stream) { + dim3 grid(uint32_t(batch_size), 
uint32_t(height), 1);
+    if (dtype == INFINI_DTYPE_F16) {
+        ReduceMean
+            <<>>((half *)y, (const half *)x,
+                 batch_size, height, width,
+                 y_stride_b, y_stride_h,
+                 x_stride_b, x_stride_h, x_stride_w);
+    } else if (dtype == INFINI_DTYPE_BF16) {
+        ReduceMean
+            <<>>((__hpcc_bfloat16 *)y, (const __hpcc_bfloat16 *)x,
+                 batch_size, height, width,
+                 y_stride_b, y_stride_h,
+                 x_stride_b, x_stride_h, x_stride_w);
+    } else if (dtype == INFINI_DTYPE_F32) {
+        ReduceMean
+            <<>>((float *)y, (const float *)x,
+                 batch_size, height, width,
+                 y_stride_b, y_stride_h,
+                 x_stride_b, x_stride_h, x_stride_w);
+    } else {
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size,
+                                     void *y,
+                                     const void *x,
+                                     void *stream_) const {
+    hcStream_t stream = (hcStream_t)stream_;
+    if (_opaque->internal->maxThreadsPerBlock() == METAX_BLOCK_SIZE_1024) {
+        CHECK_STATUS(launchKernel(
+            y, x, _info.dtype, _info.shape[0], _info.shape[1], _info.shape[2],
+            _info.y_strides[0], _info.y_strides[1], _info.x_strides[0], _info.x_strides[1], _info.x_strides[2], stream));
+    } else if (_opaque->internal->maxThreadsPerBlock() == METAX_BLOCK_SIZE_512) {
+        CHECK_STATUS(launchKernel(
+            y, x, _info.dtype, _info.shape[0], _info.shape[1], _info.shape[2],
+            _info.y_strides[0], _info.y_strides[1], _info.x_strides[0], _info.x_strides[1], _info.x_strides[2], stream));
+    } else {
+        return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED;
+    }
+    return INFINI_STATUS_SUCCESS;
+}
+
+} // namespace op::reduce_mean::metax
diff --git a/src/infiniop/ops/reduce_mean/nvidia/kernel.cuh b/src/infiniop/ops/reduce_mean/nvidia/kernel.cuh
new file mode 100644
index 000000000..c1dbc3fa8
--- /dev/null
+++ b/src/infiniop/ops/reduce_mean/nvidia/kernel.cuh
@@ -0,0 +1,26 @@
+#ifndef __REDUCE_MEAN_KERNEL_CUH__
+#define __REDUCE_MEAN_KERNEL_CUH__
+
+template
+__device__ void ReduceMeanKernel(
+    Tdata *y_, const Tdata *x_,
+    size_t batch, size_t height, size_t width,
+    ptrdiff_t y_stride_b, ptrdiff_t y_stride_h,
+    ptrdiff_t x_stride_b, ptrdiff_t x_stride_h, ptrdiff_t x_stride_w) {
+
+    Tdata *y = y_ + blockIdx.x * y_stride_b + blockIdx.y * y_stride_h;
+    const Tdata *x = x_ + blockIdx.x * x_stride_b + blockIdx.y * x_stride_h;
+
+    // [Reduce] Sum the row across the block; cub's BlockReduce returns the
+    // valid total only on thread 0.
+    Tcompute sum_0 = op::common_cuda::reduce_op::sum(x, width, x_stride_w);
+
+    // [Write] Each block owns exactly one output element, so thread 0 alone
+    // stores the mean (a write from every thread would be redundant).
+    if (threadIdx.x == 0) {
+        *y = sum_0 / width;
+    }
+}
+
+#endif // __REDUCE_MEAN_KERNEL_CUH__
diff --git a/src/infiniop/ops/reduce_mean/nvidia/reduce_mean_nvidia.cu b/src/infiniop/ops/reduce_mean/nvidia/reduce_mean_nvidia.cu
new file mode 100644
index 000000000..170c956e9
--- /dev/null
+++ b/src/infiniop/ops/reduce_mean/nvidia/reduce_mean_nvidia.cu
@@ -0,0 +1,98 @@
+#include "../../../devices/nvidia/nvidia_common.cuh"
+#include "reduce_mean_nvidia.cuh"
+
+#include "../../../devices/nvidia/nvidia_kernel_common.cuh"
+#include
+
+#include "../../../reduce/cuda/reduce.cuh"
+
+#include "kernel.cuh"
+
+template
+INFINIOP_CUDA_KERNEL ReduceMean(
+    Tdata *y_, const Tdata *x_,
+    size_t batch, size_t height, size_t width,
+    ptrdiff_t y_stride_b, ptrdiff_t y_stride_h,
+    ptrdiff_t x_stride_b, ptrdiff_t x_stride_h, ptrdiff_t x_stride_w) {
+    ReduceMeanKernel(y_, x_, batch, height, width, y_stride_b, y_stride_h, x_stride_b, x_stride_h, x_stride_w);
+}
+
+namespace 
op::reduce_mean::nvidia { + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + size_t dim) { + auto info = ReduceMeanInfo::create(y_desc, x_desc, dim); + CHECK_RESULT(info); + *desc_ptr = new Descriptor( + new Opaque{reinterpret_cast(handle)->internal()}, + info.take(), 0, handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +template +infiniStatus_t launchKernel(void *y, const void *x, infiniDtype_t dtype, + size_t batch_size, size_t height, size_t width, + ptrdiff_t y_stride_b, ptrdiff_t y_stride_h, + ptrdiff_t x_stride_b, ptrdiff_t x_stride_h, ptrdiff_t x_stride_w, + cudaStream_t stream) { + dim3 grid(uint32_t(batch_size), uint32_t(height), 1); + if (dtype == INFINI_DTYPE_F16) { + ReduceMean + <<>>((half *)y, (const half *)x, + batch_size, height, width, + y_stride_b, y_stride_h, + x_stride_b, x_stride_h, x_stride_w); + } else if (dtype == INFINI_DTYPE_BF16) { + ReduceMean + <<>>((__nv_bfloat16 *)y, (const __nv_bfloat16 *)x, + batch_size, height, width, + y_stride_b, y_stride_h, + x_stride_b, x_stride_h, x_stride_w); + } else if (dtype == INFINI_DTYPE_F32) { + ReduceMean + <<>>((float *)y, (const float *)x, + batch_size, height, width, + y_stride_b, y_stride_h, + x_stride_b, x_stride_h, x_stride_w); + } else { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, + void *y, + const void *x, + void *stream_) const { + cudaStream_t stream = (cudaStream_t)stream_; + if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_1024) { + CHECK_STATUS(launchKernel( + y, x, _info.dtype, _info.shape[0], _info.shape[1], _info.shape[2], + _info.y_strides[0], _info.y_strides[1], _info.x_strides[0], _info.x_strides[1], _info.x_strides[2], stream)); + } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_512) { + CHECK_STATUS(launchKernel( + y, x, _info.dtype, _info.shape[0], _info.shape[1], _info.shape[2], + _info.y_strides[0], _info.y_strides[1], _info.x_strides[0], _info.x_strides[1], _info.x_strides[2], stream)); + } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_4096) { + CHECK_STATUS(launchKernel( + y, x, _info.dtype, _info.shape[0], _info.shape[1], _info.shape[2], + _info.y_strides[0], _info.y_strides[1], _info.x_strides[0], _info.x_strides[1], _info.x_strides[2], stream)); + } else { + return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; + } + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::reduce_mean::nvidia diff --git a/src/infiniop/ops/reduce_mean/nvidia/reduce_mean_nvidia.cuh b/src/infiniop/ops/reduce_mean/nvidia/reduce_mean_nvidia.cuh new file mode 100644 index 000000000..be16b4491 --- /dev/null +++ b/src/infiniop/ops/reduce_mean/nvidia/reduce_mean_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __REDUCE_MEAN_NVIDIA_H__ +#define __REDUCE_MEAN_NVIDIA_H__ + +#include "../reduce_mean.h" + +DESCRIPTOR(nvidia) + +#endif diff --git a/src/infiniop/ops/reduce_mean/operator.cc b/src/infiniop/ops/reduce_mean/operator.cc new file mode 100644 index 000000000..32e4e3619 --- /dev/null +++ b/src/infiniop/ops/reduce_mean/operator.cc @@ -0,0 +1,182 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/reduce_mean.h" + +#ifdef ENABLE_CPU_API +#include "cpu/reduce_mean_cpu.h" +#endif +#if 
defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API)
+#include "nvidia/reduce_mean_nvidia.cuh"
+#endif
+#ifdef ENABLE_METAX_API
+#include "metax/reduce_mean_metax.h"
+#endif
+
+__C infiniStatus_t infiniopCreateReduceMeanDescriptor(
+    infiniopHandle_t handle,
+    infiniopReduceMeanDescriptor_t *desc_ptr,
+    infiniopTensorDescriptor_t y_desc,
+    infiniopTensorDescriptor_t x_desc,
+    size_t dim) {
+
+#define CREATE(CASE, NAMESPACE)                                 \
+    case CASE:                                                  \
+        return op::reduce_mean::NAMESPACE::Descriptor::create(  \
+            handle,                                             \
+            reinterpret_cast(desc_ptr),                         \
+            y_desc,                                             \
+            x_desc,                                             \
+            dim);
+
+    switch (handle->device) {
+#ifdef ENABLE_CPU_API
+        CREATE(INFINI_DEVICE_CPU, cpu)
+#endif
+#ifdef ENABLE_NVIDIA_API
+        CREATE(INFINI_DEVICE_NVIDIA, nvidia)
+#endif
+#ifdef ENABLE_ILUVATAR_API
+        CREATE(INFINI_DEVICE_ILUVATAR, nvidia)
+#endif
+#ifdef ENABLE_METAX_API
+        CREATE(INFINI_DEVICE_METAX, metax)
+#endif
+    }
+    return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+}
+
+__C infiniStatus_t infiniopGetReduceMeanWorkspaceSize(infiniopReduceMeanDescriptor_t desc, size_t *size) {
+
+#define GET(CASE, NAMESPACE)                              \
+    case CASE:                                            \
+        *size = reinterpret_cast(desc)->workspaceSize();  \
+        return INFINI_STATUS_SUCCESS;
+
+    switch (desc->device_type) {
+#ifdef ENABLE_CPU_API
+        GET(INFINI_DEVICE_CPU, cpu)
+#endif
+#ifdef ENABLE_NVIDIA_API
+        GET(INFINI_DEVICE_NVIDIA, nvidia)
+#endif
+#ifdef ENABLE_ILUVATAR_API
+        GET(INFINI_DEVICE_ILUVATAR, nvidia)
+#endif
+#ifdef ENABLE_METAX_API
+        GET(INFINI_DEVICE_METAX, metax)
+#endif
+    }
+    return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+}
+
+__C infiniStatus_t infiniopReduceMean(
+    infiniopReduceMeanDescriptor_t desc,
+    void *workspace, size_t workspace_size,
+    void *y,
+    const void *x,
+    void *stream) {
+
+#define CALCULATE(CASE, NAMESPACE)                 \
+    case CASE:                                     \
+        return reinterpret_cast(desc)->calculate(  \
+            workspace, workspace_size, y, x, stream);
+
+    switch (desc->device_type) {
+#ifdef ENABLE_CPU_API
+        CALCULATE(INFINI_DEVICE_CPU, cpu)
+#endif
+#ifdef ENABLE_NVIDIA_API
+        CALCULATE(INFINI_DEVICE_NVIDIA, nvidia)
+#endif
+#ifdef ENABLE_ILUVATAR_API
+        CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia)
+#endif
+#ifdef ENABLE_METAX_API
+        CALCULATE(INFINI_DEVICE_METAX, metax)
+#endif
+    }
+    return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+}
+
+__C infiniStatus_t infiniopDestroyReduceMeanDescriptor(infiniopReduceMeanDescriptor_t desc) {
+
+#define DESTROY(CASE, NAMESPACE)        \
+    case CASE:                          \
+        delete reinterpret_cast(desc);  \
+        return INFINI_STATUS_SUCCESS;
+
+    switch (desc->device_type) {
+#ifdef ENABLE_CPU_API
+        DESTROY(INFINI_DEVICE_CPU, cpu)
+#endif
+#ifdef ENABLE_NVIDIA_API
+        DESTROY(INFINI_DEVICE_NVIDIA, nvidia)
+#endif
+#ifdef ENABLE_ILUVATAR_API
+        DESTROY(INFINI_DEVICE_ILUVATAR, nvidia)
+#endif
+#ifdef ENABLE_METAX_API
+        DESTROY(INFINI_DEVICE_METAX, metax)
+#endif
+    }
+    return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+}
diff --git a/src/infiniop/ops/reduce_mean/reduce_mean.h b/src/infiniop/ops/reduce_mean/reduce_mean.h
new file mode 100644
index 000000000..58fa566ea
--- /dev/null
+++ b/src/infiniop/ops/reduce_mean/reduce_mean.h
@@ -0,0 +1,47 @@
+#ifndef REDUCE_MEAN_H
+#define REDUCE_MEAN_H
+
+#include "../../operator.h"
+#include "info.h"
+
+#define DESCRIPTOR(NAMESPACE)                                         \
+                                                                      \
+    namespace op::reduce_mean::NAMESPACE {                            \
+    class Descriptor final : public InfiniopDescriptor {              \
+        struct Opaque;                                                \
+        Opaque *_opaque;                                              \
+        ReduceMeanInfo _info;                                         \
+        size_t _workspace_size;                                       \
+                                                                      \
+        Descriptor(                                                   \
+            Opaque *opaque,                                           \
+            ReduceMeanInfo info,                                      \
+            size_t workspace_size,                                    \
+            infiniDevice_t device_type,                               \
+            int device_id)                                            \
+            : InfiniopDescriptor{device_type, device_id},             \
+              _opaque(opaque),                                        \
+              _info(info),                                            \
+              _workspace_size(workspace_size) {}                      \
+                                                                      \
+    public:                                                           \
+        ~Descriptor();                                                \
+                                                                      \
+        size_t workspaceSize() const { return _workspace_size; }      \
+                                                                      \
+        static infiniStatus_t create(                                 \
+            infiniopHandle_t handle,                                  \
+            Descriptor **desc_ptr,                                    \
+            infiniopTensorDescriptor_t y_desc,                        \
+            infiniopTensorDescriptor_t x_desc,                        \
+            size_t dim);                                              \
+                                                                      \
+        infiniStatus_t calculate(                                     \
+            void *workspace, size_t workspace_size,                   \
+            void *y,                                                  \
+            const void *x,                                            \
+            void *stream) const;                                      \
+    };                                                                \
+    }
+
+#endif // REDUCE_MEAN_H
diff --git a/src/infiniop/reduce/cuda/reduce.cuh b/src/infiniop/reduce/cuda/reduce.cuh
index a1d2c2501..0a30a3a26 100644
--- a/src/infiniop/reduce/cuda/reduce.cuh
+++ b/src/infiniop/reduce/cuda/reduce.cuh
@@ -61,6 +61,24 @@ __device__ __forceinline__ Tdata max(const Tdata *data_ptr, size_t count) {
     return BlockReduce(temp_storage).Reduce(max_, cub::Max(), BLOCK_SIZE);
 }

+// Sum(x) on non-contiguous data of length count
+template
+__device__ __forceinline__ Tcompute sum(
+    const Tdata *data_ptr,
+    size_t count,
+    ptrdiff_t stride) {
+    Tcompute s = 0;
+
+    for (size_t i = threadIdx.x; i < count; i += BLOCK_SIZE) {
+        s += Tcompute(data_ptr[i * stride]);
+    }
+
+    using BlockReduce = cub::BlockReduce;
+    __shared__ 
typename BlockReduce::TempStorage temp_storage;
+
+    return BlockReduce(temp_storage).Sum(s);
+}
 } // namespace op::common_cuda::reduce_op

 #endif
diff --git a/src/utils/check.h b/src/utils/check.h
index 7f4a2bdd9..72a0995fd 100644
--- a/src/utils/check.h
+++ b/src/utils/check.h
@@ -59,4 +59,22 @@
 #define CHECK_SAME_STRIDES(FIRST, ...) CHECK_SAME_VEC(INFINI_STATUS_BAD_TENSOR_STRIDES, FIRST, __VA_ARGS__)

+#define CHECK_REDUCE_SHAPE(INPUT_SHAPE, DIM, EXPECTED_SHAPE)  \
+    do {                                                      \
+        if (INPUT_SHAPE.empty()) {                            \
+            if (!EXPECTED_SHAPE.empty()) {                    \
+                return INFINI_STATUS_BAD_TENSOR_SHAPE;        \
+            }                                                 \
+            break;                                            \
+        }                                                     \
+        if (DIM >= INPUT_SHAPE.size()) {                      \
+            return INFINI_STATUS_BAD_PARAM;                   \
+        }                                                     \
+        std::vector reduced_shape = INPUT_SHAPE;              \
+        reduced_shape[DIM] = 1;                               \
+        if (reduced_shape != EXPECTED_SHAPE) {                \
+            return INFINI_STATUS_BAD_TENSOR_SHAPE;            \
+        }                                                     \
+    } while (0)
+
 #endif // INFINIUTILS_CHECK_H
diff --git a/test/infiniop-test/test_generate/testcases/reduce_mean.py b/test/infiniop-test/test_generate/testcases/reduce_mean.py
new file mode 100644
index 000000000..0e05d4240
--- /dev/null
+++ b/test/infiniop-test/test_generate/testcases/reduce_mean.py
@@ -0,0 +1,111 @@
+import numpy as np
+import gguf
+from typing import List
+
+from .. import InfiniopTestWriter, InfiniopTestCase, np_dtype_to_ggml, gguf_strides, contiguous_gguf_strides
+
+
+def reduce_mean(x, dim):
+    if isinstance(x, np.float64):
+        return x
+    return x.mean(axis=dim, keepdims=True)
+
+
+def random_tensor(shape: tuple, dtype: np.dtype) -> np.ndarray:
+    return np.random.uniform(-1.0, 1.0, shape).astype(dtype) * 0.001
+
+
+class ReduceMeanTestCase(InfiniopTestCase):
+    def __init__(
+        self,
+        x: np.ndarray,
+        y: np.ndarray,
+        shape_x: List[int] | None,
+        shape_y: List[int] | None,
+        stride_x: List[int] | None,
+        stride_y: List[int] | None,
+        dim: int = 0,
+    ):
+        super().__init__("reduce_mean")
+        self.x = x
+        self.y = y
+        self.shape_x = shape_x
+        self.shape_y = shape_y
+        self.stride_x = stride_x
+        self.stride_y = stride_y
+        self.dim = dim
+
+    def write_test(self, test_writer: "InfiniopTestWriter"):
+        super().write_test(test_writer)
+        if self.shape_x is not None:
+            test_writer.add_array(test_writer.gguf_key("x.shape"), self.shape_x)
+        if self.shape_y is not None:
+            test_writer.add_array(test_writer.gguf_key("y.shape"), self.shape_y)
+        if self.stride_x is not None:
+            test_writer.add_array(test_writer.gguf_key("x.strides"), gguf_strides(*self.stride_x))
+        test_writer.add_array(
+            test_writer.gguf_key("y.strides"),
+            gguf_strides(*self.stride_y if self.stride_y is not None else contiguous_gguf_strides(self.shape_y))
+        )
+        test_writer.add_uint64(test_writer.gguf_key("dim"), self.dim)
+        test_writer.add_tensor(
+            test_writer.gguf_key("x"),
+            self.x,
+            raw_dtype=np_dtype_to_ggml(self.x.dtype),
+        )
+        test_writer.add_tensor(
+            test_writer.gguf_key("y"),
+            self.y,
+            raw_dtype=np_dtype_to_ggml(self.y.dtype),
+        )
+        ans = reduce_mean(
+            self.x.astype(np.float64), self.dim
+        )
+        test_writer.add_tensor(
+            test_writer.gguf_key("ans"), ans, raw_dtype=gguf.GGMLQuantizationType.F64
+        )
+
+
+if __name__ == "__main__":
+    test_writer = InfiniopTestWriter("reduce_mean.gguf")
+    test_cases = []
+    # ==============================================================================
+    #  Configuration
+    # ==============================================================================
+    # These are not meant to be imported from other modules
+    _TEST_CASES_ 
= [ + # y_shape, x_shape, y_stride, x_stride, dim + # ((0,), (0,), (0,), (0,), 0), + ((1, ), (32, ), None, None, 0), + ((1, 4), (1, 4), None, None, 0), + ((1, 1), (1, 4), None, None, 1), + ((16, 1), (16, 2048), None, None, 1), + ((1, 16), (2048, 16), None, None, 0), + ((16, 1), (16, 2048), (4096, 1), (4096, 1), 1), + ((1, 2048), (16, 2048), (4096, 1), (4096, 1), 0), + ((4, 4, 1), (4, 4, 2048), None, None, 2), + ((1, 4, 4), (2048, 4, 4), None, None, 0), + ((4, 1, 4), (4, 2048, 4), (45056, 5632, 1), (32768, 8, 1), 1), +] + _TENSOR_DTYPES_ = [np.float16, np.float32] + + for dtype in _TENSOR_DTYPES_: + for shape_y, shape_x, stride_y, stride_x, dim in _TEST_CASES_: + x = random_tensor(shape_x, dtype) + y = np.empty(tuple(0 for _ in shape_y), dtype=dtype) + test_case = ReduceMeanTestCase( + x, + y, + shape_x, + shape_y, + stride_x, + stride_y, + dim, + ) + test_cases.append(test_case) + + test_writer.add_tests(test_cases) + test_writer.save() diff --git a/test/infiniop/libinfiniop/op_register.py b/test/infiniop/libinfiniop/op_register.py index e92e77105..7f147a8b2 100644 --- a/test/infiniop/libinfiniop/op_register.py +++ b/test/infiniop/libinfiniop/op_register.py @@ -489,3 +489,35 @@ def conv_(lib): lib.infiniopDestroyConvDescriptor.argtypes = [ infiniopOperatorDescriptor_t, ] + +@OpRegister.operator +def reduce_mean_(lib): + lib.infiniopCreateReduceMeanDescriptor.restype = c_int32 + lib.infiniopCreateReduceMeanDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, + infiniopTensorDescriptor_t, + c_size_t, + ] + + lib.infiniopGetReduceMeanWorkspaceSize.restype = c_int32 + lib.infiniopGetReduceMeanWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopReduceMean.restype = c_int32 + lib.infiniopReduceMean.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, + c_size_t, + c_void_p, + c_void_p, + c_void_p, + ] + + lib.infiniopDestroyReduceMeanDescriptor.restype = c_int32 + lib.infiniopDestroyReduceMeanDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] \ No newline at end of file diff --git a/test/infiniop/reduce_mean.py b/test/infiniop/reduce_mean.py new file mode 100644 index 000000000..e56663e44 --- /dev/null +++ b/test/infiniop/reduce_mean.py @@ -0,0 +1,150 @@ +import torch +import ctypes +from ctypes import c_uint64 +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) + +# ============================================================================== +# Configuration (Internal Use Only) +# ============================================================================== +# These are not meant to be imported from other modules +_TEST_CASES_ = [ + # y_shape, x_shape, y_stride, x_stride, dim + ((), (), None, None, 0), + ((1, ), (32, ), None, None, 0), + ((1, 4), (1, 4), None, None, 0), + ((1, 1), (1, 4), None, None, 1), + ((16, 1), (16, 2048), None, None, 1), + ((1, 16), (2048, 16), None, None, 0), + ((16, 1), (16, 2048), (4096, 1), (4096, 1), 1), + ((1, 2048), (16, 2048), (4096, 1), (4096, 1), 0), + ((4, 4, 1), (4, 4, 2048), None, None, 2), + ((1, 4, 4), (2048, 4, 4), None, None, 0), + ((4, 1, 4), (4, 2048, 4), (45056, 5632, 1), (32768, 8, 1), 1), +] + +# x types used for testing +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.BF16, InfiniDtype.F32] + +_TEST_CASES = 
_TEST_CASES_
+
+# Tolerance map for different data types
+_TOLERANCE_MAP = {
+    InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-2},
+    InfiniDtype.BF16: {"atol": 5e-3, "rtol": 5e-2},
+    InfiniDtype.F32: {"atol": 1e-5, "rtol": 1e-5},
+}
+
+DEBUG = False
+PROFILE = False
+NUM_PRERUN = 10
+NUM_ITERATIONS = 1000
+
+
+def reduce_mean(x, dim):
+    return x.mean(dim=dim, keepdim=True)
+
+
+def test(
+    handle,
+    device,
+    y_shape,
+    x_shape,
+    y_stride,
+    x_stride,
+    dim,
+    dtype=InfiniDtype.F16,
+    sync=None,
+):
+    print(
+        f"Testing Reduce_Mean on {InfiniDeviceNames[device]} with y_shape:{y_shape} x_shape:{x_shape}"
+        f" y_stride:{y_stride} x_stride:{x_stride} dim:{dim} dtype:{InfiniDtypeNames[dtype]}"
+    )
+
+    x = TestTensor(x_shape, x_stride, dtype, device)
+    ans = reduce_mean(x.torch_tensor(), dim)
+
+    y = TestTensor(y_shape, y_stride, dtype, device)
+
+    if sync is not None:
+        sync()
+
+    descriptor = infiniopOperatorDescriptor_t()
+    check_error(
+        LIBINFINIOP.infiniopCreateReduceMeanDescriptor(
+            handle, ctypes.byref(descriptor), y.descriptor, x.descriptor, ctypes.c_size_t(dim)
+        )
+    )
+
+    # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
+    x.destroy_desc()
+    y.destroy_desc()
+
+    workspace_size = c_uint64(0)
+    check_error(
+        LIBINFINIOP.infiniopGetReduceMeanWorkspaceSize(
+            descriptor, ctypes.byref(workspace_size)
+        )
+    )
+    workspace = TestWorkspace(workspace_size.value, x.device)
+
+    def lib_reduce_mean():
+        check_error(
+            LIBINFINIOP.infiniopReduceMean(
+                descriptor,
+                workspace.data(),
+                workspace_size.value,
+                y.data(),
+                x.data(),
+                None,
+            )
+        )
+
+    lib_reduce_mean()
+
+    if sync is not None:
+        sync()
+
+    atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
+    if DEBUG:
+        debug(y.actual_tensor(), ans, atol=atol, rtol=rtol)
+    assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol)
+
+    # Profiling workflow
+    if PROFILE:
+        # fmt: off
+        profile_operation("PyTorch", lambda: reduce_mean(x.torch_tensor(), dim), device, NUM_PRERUN, NUM_ITERATIONS)
+        profile_operation("    lib", lambda: lib_reduce_mean(), device, NUM_PRERUN, NUM_ITERATIONS)
+        # fmt: on
+
+    check_error(LIBINFINIOP.infiniopDestroyReduceMeanDescriptor(descriptor))
+
+
+if __name__ == "__main__":
+    args = get_args()
+
+    # Configure testing options
+    DEBUG = args.debug
+    PROFILE = args.profile
+    NUM_PRERUN = args.num_prerun
+    NUM_ITERATIONS = args.num_iterations
+
+    # Execute tests
+    for device in get_test_devices(args):
+        test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES)
+
+    print("\033[92mTest passed!\033[0m")

From 73eee8adac37b34ede20ad098102e62c1a2e051e Mon Sep 17 00:00:00 2001
From: S-hhhhh <2320230838@mail.nankai.edu.cn>
Date: Sat, 16 Aug 2025 13:30:37 +0800
Subject: [PATCH 03/11] update reduce_mean

---
 src/infiniop-test/src/ops/reduce_mean.cpp    | 39 ++++++++++---------
 .../ops/reduce_mean/nvidia/kernel.cuh        |  7 ++--
 2 files changed, 24 insertions(+), 22 deletions(-)

diff --git a/src/infiniop-test/src/ops/reduce_mean.cpp b/src/infiniop-test/src/ops/reduce_mean.cpp
index 2c3ddceaf..114ab2981 100644
--- a/src/infiniop-test/src/ops/reduce_mean.cpp
+++ b/src/infiniop-test/src/ops/reduce_mean.cpp
@@ -6,8 +6,8 @@
 namespace infiniop_test::reduce_mean {
 struct Test::Attributes {
-    std::shared_ptr<Tensor> x;
-    std::shared_ptr<Tensor> y;
+    std::shared_ptr<Tensor> input;
+    std::shared_ptr<Tensor> output;
     std::shared_ptr<Tensor> ans;
     size_t dim;
 };
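Note: _equal_nan is threaded through to allClose() in the hunks below; with the new --equal-nan flag set, two NaNs are considered equal, which matches numpy's equal_nan option. A minimal illustration of the selected behavior (plain numpy, not part of the patch):

    import numpy as np

    a = np.array([np.nan, 1.0])
    b = np.array([np.nan, 1.0])
    assert not np.allclose(a, b)               # default: NaN != NaN
    assert np.allclose(a, b, equal_nan=True)   # what --equal-nan True enables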
@@ -20,16 +20,17 @@ std::shared_ptr<Test> Test::build(
     test->_attributes = new Attributes();
 
     if (attributes.find("dim") == attributes.end()
-        || tensors.find("x") == tensors.end()
+        || tensors.find("input") == tensors.end()
         || tensors.find("ans") == tensors.end()
-        || tensors.find("y") == tensors.end()) {
+        || tensors.find("output") == tensors.end()) {
+        std::cout << "All keys:" << std::endl;
         throw std::runtime_error("Invalid Test: Missing attributes or tensors");
     }
 
     test->_attributes->dim = size_t(*reinterpret_cast<uint64_t *>(attributes["dim"].data()));
     test->_attributes->ans = tensors["ans"];
-    test->_attributes->x = tensors["x"];
-    test->_attributes->y = tensors["y"];
+    test->_attributes->input = tensors["input"];
+    test->_attributes->output = tensors["output"];
 
     return test;
 }
@@ -40,13 +41,13 @@
     infiniopReduceMeanDescriptor_t op_desc;
     CHECK_OR(infiniopCreateReduceMeanDescriptor(handle, &op_desc,
-                                                _attributes->y->desc(),
-                                                _attributes->x->desc(),
+                                                _attributes->output->desc(),
+                                                _attributes->input->desc(),
                                                 _attributes->dim),
              return TEST_FAILED(OP_CREATION_FAILED, "Failed to create ReduceMean descriptor"));
 
-    auto x = _attributes->x->to(device, device_id);
-    auto y = _attributes->y->to(device, device_id);
+    auto input = _attributes->input->to(device, device_id);
+    auto output = _attributes->output->to(device, device_id);
 
     size_t workspace_size;
     CHECK_OR(infiniopGetReduceMeanWorkspaceSize(op_desc, &workspace_size),
@@ -59,13 +60,13 @@
     CHECK_OR(infiniopReduceMean(op_desc,
                                 workspace, workspace_size,
-                                y->data(),
-                                x->data(),
+                                output->data(),
+                                input->data(),
                                 nullptr),
              return TEST_FAILED(OP_EXECUTION_FAILED, "ReduceMean execution failed"));
 
     try {
-        allClose(y, _attributes->ans, _rtol, _atol, _equal_nan);
+        allClose(output, _attributes->ans, _rtol, _atol, _equal_nan);
     } catch (const std::exception &e) {
         return TEST_FAILED(RESULT_INCORRECT, e.what());
     }
@@ -76,8 +77,8 @@
         [=]() {
             infiniopReduceMean(op_desc,
                                workspace, workspace_size,
-                               y->data(),
-                               x->data(),
+                               output->data(),
+                               input->data(),
                                nullptr);
         },
         warm_ups, iterations);
@@ -94,18 +95,18 @@ std::vector<std::string> Test::attribute_names() {
 }
 
 std::vector<std::string> Test::tensor_names() {
-    return {"x", "ans", "y"};
+    return {"input", "ans", "output"};
 }
 
 std::vector<std::string> Test::output_names() {
-    return {"y"};
+    return {"output"};
 }
 
 std::string Test::toString() const {
     std::ostringstream oss;
     oss << op_name() << std::endl;
-    oss << "- x: " << _attributes->x->info() << std::endl;
-    oss << "- y: " << _attributes->y->info() << std::endl;
+    oss << "- input: " << _attributes->input->info() << std::endl;
+    oss << "- output: " << _attributes->output->info() << std::endl;
     oss << "- dim=" << _attributes->dim << std::endl;
     oss << std::scientific << std::setprecision(2);
     oss << "- rtol=" << _rtol << ", atol=" << _atol << ", equal_nan=" << _equal_nan << std::endl;
diff --git a/src/infiniop/ops/reduce_mean/nvidia/kernel.cuh b/src/infiniop/ops/reduce_mean/nvidia/kernel.cuh
index c1dbc3fa8..a64d6cfcb 100644
--- a/src/infiniop/ops/reduce_mean/nvidia/kernel.cuh
+++ b/src/infiniop/ops/reduce_mean/nvidia/kernel.cuh
@@ -15,12 +15,13 @@ __device__ void ReduceMeanKernel(
     __shared__ Tcompute mean_;
     Tcompute sum_0 = op::common_cuda::reduce_op::sum<BLOCK_SIZE, Tdata, Tcompute>(x, width, x_stride_w);
     if (threadIdx.x == 0) {
-        mean_ = sum_0/width;
+        // mean_ = sum_0/width;
+        *y = sum_0/width;
     }
-    __syncthreads();
+    // __syncthreads();
 
     // [Elementwise] Divide each element by the sum and store in shared memory
-    *y = mean_;
+    // *y = mean_;
 }
 
 #endif // __REDUCE_MEAN_KERNEL_CUH__
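Note: [PATCH 04/11] below generalizes ReduceMeanInfo to a canonical 4-D view: the reduced axis is swapped to the last position, then shape and strides are left-padded with size-1 dimensions (stride 0) until ndim == 4. A rough numpy analogue of that bookkeeping (a sketch with illustrative names, not the library API; the C++ code only permutes stride metadata and never copies data):

    import numpy as np

    x = np.random.rand(4, 2048, 4).astype(np.float32)
    dim = 1
    xt = np.swapaxes(x, dim, -1)            # reduced axis -> last position
    pad = (1,) * (4 - xt.ndim)              # left-pad to 4 dims
    x4 = xt.reshape(pad + xt.shape)         # (1, 4, 4, 2048)
    y4 = x4.mean(axis=-1, keepdims=True)    # reduce the last axis
    assert np.allclose(np.squeeze(y4), x.mean(axis=dim))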
From c8592e865eba82f0546d773a102c790aa3ade86a Mon Sep 17 00:00:00 2001
From: S-hhhhh <2320230838@mail.nankai.edu.cn>
Date: Sat, 16 Aug 2025 14:46:43 +0800
Subject: [PATCH 04/11] update reduce_mean

---
 .../ops/reduce_mean/cpu/reduce_mean_cpu.cc   | 39 +++++---
 src/infiniop/ops/reduce_mean/info.h          | 94 ++++++++++++-------
 .../reduce_mean/metax/reduce_mean_metax.maca | 44 ++++-----
 .../ops/reduce_mean/nvidia/kernel.cuh        | 10 +-
 .../reduce_mean/nvidia/reduce_mean_nvidia.cu | 39 ++++----
 5 files changed, 133 insertions(+), 93 deletions(-)

diff --git a/src/infiniop/ops/reduce_mean/cpu/reduce_mean_cpu.cc b/src/infiniop/ops/reduce_mean/cpu/reduce_mean_cpu.cc
index 3682d7631..b56184ef5 100644
--- a/src/infiniop/ops/reduce_mean/cpu/reduce_mean_cpu.cc
+++ b/src/infiniop/ops/reduce_mean/cpu/reduce_mean_cpu.cc
@@ -21,25 +21,36 @@ infiniStatus_t Descriptor::create(
 template <typename T>
 infiniStatus_t reduce_mean(const ReduceMeanInfo *info, T *y, const T *x) {
     const size_t batch_size = info->shape[0];
-    const size_t rows = info->shape[1];
-    const size_t cols = info->shape[2]; // last dim (the reduced dim)
+    const size_t channels = info->shape[1];
+    const size_t rows = info->shape[2];
+    const size_t cols = info->shape[3]; // the reduced dim
 
     const ptrdiff_t y_batch_stride = info->y_strides[0];
-    const ptrdiff_t y_row_stride = info->y_strides[1];
+    const ptrdiff_t y_channel_stride = info->y_strides[1];
+    const ptrdiff_t y_row_stride = info->y_strides[2];
     const ptrdiff_t x_batch_stride = info->x_strides[0];
-    const ptrdiff_t x_row_stride = info->x_strides[1];
-    const ptrdiff_t x_col_stride = info->x_strides[2];
+    const ptrdiff_t x_channel_stride = info->x_strides[1];
+    const ptrdiff_t x_row_stride = info->x_strides[2];
+    const ptrdiff_t x_col_stride = info->x_strides[3];
 
-    #pragma omp parallel for collapse(2)
+    #pragma omp parallel for collapse(3)
     for (size_t batch = 0; batch < batch_size; ++batch) {
-        for (size_t row = 0; row < rows; ++row) {
-            const T* input_start = x + batch * x_batch_stride + row * x_row_stride;
-            T* output_ptr = y + batch * y_batch_stride + row * y_row_stride;
-            float mean = op::common_cpu::reduce_op::sum(input_start, cols, x_col_stride) / cols;
-            if constexpr (std::is_same<T, fp16_t>::value || std::is_same<T, bf16_t>::value) {
-                *output_ptr = utils::cast<T>(mean);
-            } else {
-                *output_ptr = mean;
+        for (size_t channel = 0; channel < channels; ++channel) {
+            for (size_t row = 0; row < rows; ++row) {
+                const T* input_start = x + batch * x_batch_stride
+                                        + channel * x_channel_stride
+                                        + row * x_row_stride;
+                T* output_ptr = y + batch * y_batch_stride
+                                 + channel * y_channel_stride
+                                 + row * y_row_stride;
+
+                float mean = op::common_cpu::reduce_op::sum(input_start, cols, x_col_stride) / cols;
+
+                if constexpr (std::is_same<T, fp16_t>::value || std::is_same<T, bf16_t>::value) {
+                    *output_ptr = utils::cast<T>(mean);
+                } else {
+                    *output_ptr = mean;
+                }
             }
         }
     }
diff --git a/src/infiniop/ops/reduce_mean/info.h b/src/infiniop/ops/reduce_mean/info.h
index 41785a082..428675989 100644
--- a/src/infiniop/ops/reduce_mean/info.h
+++ b/src/infiniop/ops/reduce_mean/info.h
@@ -29,50 +29,76 @@ class ReduceMeanInfo {
             CHECK_STATUS(INFINI_STATUS_BAD_TENSOR_SHAPE);
         }
         CHECK_REDUCE_SHAPE(x_desc->shape(), dim, y_desc->shape());
-        if(ndim == 3){
-            std::vector<size_t> shape = x_desc->shape();
-            std::vector<ptrdiff_t> y_strides = y_desc->strides();
-            std::vector<ptrdiff_t> x_strides = x_desc->strides();
-            if (dim != 2){
-                std::swap(shape[dim], shape[2]);
-                std::swap(y_strides[dim], y_strides[2]);
-                std::swap(x_strides[dim], x_strides[2]);
-            }
+        if (ndim > 4) return INFINI_STATUS_BAD_TENSOR_SHAPE;
+        else if (ndim == 0){
+            std::vector<size_t> shape = {1, 1, 1, 1};
+            std::vector<ptrdiff_t> y_strides = {0, 0, 0, 0};
+            std::vector<ptrdiff_t> x_strides = {0, 0, 0, 0};
             return utils::Result<ReduceMeanInfo>(ReduceMeanInfo{
                 dtype, shape, y_strides,
x_strides}); } - else if (ndim == 2){ + else{ std::vector shape = x_desc->shape(); std::vector y_strides = y_desc->strides(); std::vector x_strides = x_desc->strides(); - if (dim != 1){ - std::swap(shape[dim], shape[1]); - std::swap(y_strides[dim], y_strides[1]); - std::swap(x_strides[dim], x_strides[1]); + if (dim != (shape.size() - 1)){ + std::swap(shape[dim], shape[shape.size() - 1]); + std::swap(y_strides[dim], y_strides[shape.size() - 1]); + std::swap(x_strides[dim], x_strides[shape.size() - 1]); + } + while (shape.size() < 4){ + shape.insert(shape.begin(), 1); + y_strides.insert(y_strides.begin(), 0); + x_strides.insert(x_strides.begin(), 0); } - shape.insert(shape.begin(), 1); - y_strides.insert(y_strides.begin(), 0); - x_strides.insert(x_strides.begin(), 0); - return utils::Result(ReduceMeanInfo{ - dtype, shape, y_strides, x_strides}); - } - else if (ndim == 1){ - std::vector shape = {1, 1, (x_desc->shape())[0]}; - std::vector y_strides = {0, 0, (y_desc->strides())[0]}; - std::vector x_strides = {0, 0, (x_desc->strides())[0]}; - return utils::Result(ReduceMeanInfo{ - dtype, shape, y_strides, x_strides}); - } - else if (ndim == 0){ - std::vector shape = {1, 1, 1}; - std::vector y_strides = {0, 0, 0}; - std::vector x_strides = {0, 0, 0}; return utils::Result(ReduceMeanInfo{ dtype, shape, y_strides, x_strides}); } - else{ - return INFINI_STATUS_BAD_TENSOR_SHAPE; - } + + // if(ndim == 3){ + // std::vector shape = x_desc->shape(); + // std::vector y_strides = y_desc->strides(); + // std::vector x_strides = x_desc->strides(); + // if (dim != 2){ + // std::swap(shape[dim], shape[2]); + // std::swap(y_strides[dim], y_strides[2]); + // std::swap(x_strides[dim], x_strides[2]); + // } + // return utils::Result(ReduceMeanInfo{ + // dtype, shape, y_strides, x_strides}); + // } + // else if (ndim == 2){ + // std::vector shape = x_desc->shape(); + // std::vector y_strides = y_desc->strides(); + // std::vector x_strides = x_desc->strides(); + // if (dim != 1){ + // std::swap(shape[dim], shape[1]); + // std::swap(y_strides[dim], y_strides[1]); + // std::swap(x_strides[dim], x_strides[1]); + // } + // shape.insert(shape.begin(), 1); + // y_strides.insert(y_strides.begin(), 0); + // x_strides.insert(x_strides.begin(), 0); + // return utils::Result(ReduceMeanInfo{ + // dtype, shape, y_strides, x_strides}); + // } + // else if (ndim == 1){ + // std::vector shape = {1, 1, (x_desc->shape())[0]}; + // std::vector y_strides = {0, 0, (y_desc->strides())[0]}; + // std::vector x_strides = {0, 0, (x_desc->strides())[0]}; + // return utils::Result(ReduceMeanInfo{ + // dtype, shape, y_strides, x_strides}); + // } + // else if (ndim == 0){ + // std::vector shape = {1, 1, 1}; + // std::vector y_strides = {0, 0, 0}; + // std::vector x_strides = {0, 0, 0}; + // return utils::Result(ReduceMeanInfo{ + // dtype, shape, y_strides, x_strides}); + // } + // else{ + // return INFINI_STATUS_BAD_TENSOR_SHAPE; + // } } }; diff --git a/src/infiniop/ops/reduce_mean/metax/reduce_mean_metax.maca b/src/infiniop/ops/reduce_mean/metax/reduce_mean_metax.maca index 9abd888f6..0eaf1c9cb 100644 --- a/src/infiniop/ops/reduce_mean/metax/reduce_mean_metax.maca +++ b/src/infiniop/ops/reduce_mean/metax/reduce_mean_metax.maca @@ -11,10 +11,10 @@ template INFINIOP_METAX_KERNEL ReduceMean( Tdata *y_, const Tdata *x_, - size_t batch, size_t height, size_t width, - ptrdiff_t y_stride_b, ptrdiff_t y_stride_h, - ptrdiff_t x_stride_b, ptrdiff_t x_stride_h, ptrdiff_t x_stride_w) { - ReduceMeanKernel(y_, x_, batch, height, width, y_stride_b, 
y_stride_h, x_stride_b, x_stride_h, x_stride_w); + size_t batch, size_t channels, size_t height, size_t width, + ptrdiff_t y_stride_b, ptrdiff_t y_stride_c, ptrdiff_t y_stride_h, + ptrdiff_t x_stride_b, ptrdiff_t x_stride_c, ptrdiff_t x_stride_h, ptrdiff_t x_stride_w) { + ReduceMeanKernel(y_, x_, batch, channels, height, width, y_stride_b, y_stride_c, y_stride_h, x_stride_b, x_stride_c, x_stride_h, x_stride_w); } namespace op::reduce_mean::metax { @@ -43,29 +43,29 @@ infiniStatus_t Descriptor::create( template infiniStatus_t launchKernel(void *y, const void *x, infiniDtype_t dtype, - size_t batch_size, size_t height, size_t width, - ptrdiff_t y_stride_b, ptrdiff_t y_stride_h, - ptrdiff_t x_stride_b, ptrdiff_t x_stride_h, ptrdiff_t x_stride_w, + size_t batch_size, size_t channels, size_t height, size_t width, + ptrdiff_t y_stride_b, ptrdiff_t y_stride_c, ptrdiff_t y_stride_h, + ptrdiff_t x_stride_b, ptrdiff_t x_stride_c, ptrdiff_t x_stride_h, ptrdiff_t x_stride_w, hcStream_t stream) { - dim3 grid(uint32_t(batch_size), uint32_t(height), 1); + dim3 grid=dim3(uint32_t(batch_size), uint32_t(channels), uint32_t(height)); if (dtype == INFINI_DTYPE_F16) { ReduceMean <<>>((half *)y, (const half *)x, - batch_size, height, width, - y_stride_b, y_stride_h, - x_stride_b, x_stride_h, x_stride_w); + batch_size, channels, height, width, + y_stride_b, y_stride_c, y_stride_h, + x_stride_b, x_stride_c, x_stride_h, x_stride_w); } else if (dtype == INFINI_DTYPE_BF16) { ReduceMean <<>>((__hpcc_bfloat16 *)y, (const __hpcc_bfloat16 *)x, - batch_size, height, width, - y_stride_b, y_stride_h, - x_stride_b, x_stride_h, x_stride_w); + batch_size, channels, height, width, + y_stride_b, y_stride_c, y_stride_h, + x_stride_b, x_stride_c, x_stride_h, x_stride_w); } else if (dtype == INFINI_DTYPE_F32) { ReduceMean <<>>((float *)y, (const float *)x, - batch_size, height, width, - y_stride_b, y_stride_h, - x_stride_b, x_stride_h, x_stride_w); + batch_size, channels, height, width, + y_stride_b, y_stride_c, y_stride_h, + x_stride_b, x_stride_c, x_stride_h, x_stride_w); } else { return INFINI_STATUS_BAD_TENSOR_DTYPE; } @@ -79,12 +79,14 @@ infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, hcStream_t stream = (hcStream_t)stream_; if (_opaque->internal->maxThreadsPerBlock() == METAX_BLOCK_SIZE_1024) { CHECK_STATUS(launchKernel( - y, x, _info.dtype, _info.shape[0], _info.shape[1], _info.shape[2], - _info.y_strides[0], _info.y_strides[1], _info.x_strides[0], _info.x_strides[1], _info.x_strides[2], stream)); + y, x, _info.dtype, _info.shape[0], _info.shape[1], _info.shape[2], _info.shape[3], + _info.y_strides[0], _info.y_strides[1], _info.y_strides[2], + _info.x_strides[0], _info.x_strides[1], _info.x_strides[2], _info.x_strides[3], stream)); } else if (_opaque->internal->maxThreadsPerBlock() == METAX_BLOCK_SIZE_512) { CHECK_STATUS(launchKernel( - y, x, _info.dtype, _info.shape[0], _info.shape[1], _info.shape[2], - _info.y_strides[0], _info.y_strides[1], _info.x_strides[0], _info.x_strides[1], _info.x_strides[2], stream)); + y, x, _info.dtype, _info.shape[0], _info.shape[1], _info.shape[2], _info.shape[3], + _info.y_strides[0], _info.y_strides[1], _info.y_strides[2], + _info.x_strides[0], _info.x_strides[1], _info.x_strides[2], _info.x_strides[3], stream)); } else { return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; } diff --git a/src/infiniop/ops/reduce_mean/nvidia/kernel.cuh b/src/infiniop/ops/reduce_mean/nvidia/kernel.cuh index a64d6cfcb..44fb2226e 100644 --- 
a/src/infiniop/ops/reduce_mean/nvidia/kernel.cuh +++ b/src/infiniop/ops/reduce_mean/nvidia/kernel.cuh @@ -4,12 +4,12 @@ template __device__ void ReduceMeanKernel( Tdata *y_, const Tdata *x_, - size_t batch, size_t height, size_t width, - ptrdiff_t y_stride_b, ptrdiff_t y_stride_h, - ptrdiff_t x_stride_b, ptrdiff_t x_stride_h, ptrdiff_t x_stride_w) { + size_t batch, size_t channels, size_t height, size_t width, + ptrdiff_t y_stride_b, ptrdiff_t y_stride_c, ptrdiff_t y_stride_h, + ptrdiff_t x_stride_b, ptrdiff_t x_stride_c, ptrdiff_t x_stride_h, ptrdiff_t x_stride_w) { - Tdata *y = y_ + blockIdx.x * y_stride_b + blockIdx.y * y_stride_h; - const Tdata *x = x_ + blockIdx.x * x_stride_b + blockIdx.y * x_stride_h; + Tdata *y = y_ + blockIdx.x * y_stride_b + blockIdx.y * y_stride_c + blockIdx.z * y_stride_h; + const Tdata *x = x_ + blockIdx.x * x_stride_b + blockIdx.y * x_stride_c + blockIdx.z * x_stride_h; // [Reduce] Find the sum of each updated row and store in shared memory __shared__ Tcompute mean_; diff --git a/src/infiniop/ops/reduce_mean/nvidia/reduce_mean_nvidia.cu b/src/infiniop/ops/reduce_mean/nvidia/reduce_mean_nvidia.cu index 170c956e9..1487e080f 100644 --- a/src/infiniop/ops/reduce_mean/nvidia/reduce_mean_nvidia.cu +++ b/src/infiniop/ops/reduce_mean/nvidia/reduce_mean_nvidia.cu @@ -11,10 +11,10 @@ template INFINIOP_CUDA_KERNEL ReduceMean( Tdata *y_, const Tdata *x_, - size_t batch, size_t height, size_t width, - ptrdiff_t y_stride_b, ptrdiff_t y_stride_h, - ptrdiff_t x_stride_b, ptrdiff_t x_stride_h, ptrdiff_t x_stride_w) { - ReduceMeanKernel(y_, x_, batch, height, width, y_stride_b, y_stride_h, x_stride_b, x_stride_h, x_stride_w); + size_t batch, size_t channels, size_t height, size_t width, + ptrdiff_t y_stride_b, ptrdiff_t y_stride_c, ptrdiff_t y_stride_h, + ptrdiff_t x_stride_b, ptrdiff_t x_stride_c, ptrdiff_t x_stride_h, ptrdiff_t x_stride_w) { + ReduceMeanKernel(y_, x_, batch, channels, height, width, y_stride_b, y_stride_c, y_stride_h, x_stride_b, x_stride_c, x_stride_h, x_stride_w); } namespace op::reduce_mean::nvidia { @@ -43,29 +43,29 @@ infiniStatus_t Descriptor::create( template infiniStatus_t launchKernel(void *y, const void *x, infiniDtype_t dtype, - size_t batch_size, size_t height, size_t width, - ptrdiff_t y_stride_b, ptrdiff_t y_stride_h, - ptrdiff_t x_stride_b, ptrdiff_t x_stride_h, ptrdiff_t x_stride_w, + size_t batch_size, size_t channels, size_t height, size_t width, + ptrdiff_t y_stride_b, ptrdiff_t y_stride_c, ptrdiff_t y_stride_h, + ptrdiff_t x_stride_b, ptrdiff_t x_stride_c, ptrdiff_t x_stride_h, ptrdiff_t x_stride_w, cudaStream_t stream) { - dim3 grid(uint32_t(batch_size), uint32_t(height), 1); + dim3 grid=dim3(uint32_t(batch_size), uint32_t(channels), uint32_t(height)); if (dtype == INFINI_DTYPE_F16) { ReduceMean <<>>((half *)y, (const half *)x, - batch_size, height, width, - y_stride_b, y_stride_h, - x_stride_b, x_stride_h, x_stride_w); + batch_size, channels, height, width, + y_stride_b, y_stride_c, y_stride_h, + x_stride_b, x_stride_c, x_stride_h, x_stride_w); } else if (dtype == INFINI_DTYPE_BF16) { ReduceMean <<>>((__nv_bfloat16 *)y, (const __nv_bfloat16 *)x, - batch_size, height, width, - y_stride_b, y_stride_h, - x_stride_b, x_stride_h, x_stride_w); + batch_size, channels, height, width, + y_stride_b, y_stride_c, y_stride_h, + x_stride_b, x_stride_c, x_stride_h, x_stride_w); } else if (dtype == INFINI_DTYPE_F32) { ReduceMean <<>>((float *)y, (const float *)x, - batch_size, height, width, - y_stride_b, y_stride_h, - x_stride_b, x_stride_h, 
x_stride_w); + batch_size, channels, height, width, + y_stride_b, y_stride_c, y_stride_h, + x_stride_b, x_stride_c, x_stride_h, x_stride_w); } else { return INFINI_STATUS_BAD_TENSOR_DTYPE; } @@ -79,8 +79,9 @@ infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, cudaStream_t stream = (cudaStream_t)stream_; if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_1024) { CHECK_STATUS(launchKernel( - y, x, _info.dtype, _info.shape[0], _info.shape[1], _info.shape[2], - _info.y_strides[0], _info.y_strides[1], _info.x_strides[0], _info.x_strides[1], _info.x_strides[2], stream)); + y, x, _info.dtype, _info.shape[0], _info.shape[1], _info.shape[2], _info.shape[3], + _info.y_strides[0], _info.y_strides[1], _info.y_strides[2], + _info.x_strides[0], _info.x_strides[1], _info.x_strides[2], _info.x_strides[3], stream)); } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_512) { CHECK_STATUS(launchKernel( y, x, _info.dtype, _info.shape[0], _info.shape[1], _info.shape[2], From dfcdeb2116fe760ef5589f14c82795c7b50c1aa4 Mon Sep 17 00:00:00 2001 From: S-hhhhh <2320230838@mail.nankai.edu.cn> Date: Sat, 16 Aug 2025 16:38:08 +0800 Subject: [PATCH 05/11] =?UTF-8?q?=E6=B7=BB=E5=8A=A0ReduceMean,=E6=9B=B4?= =?UTF-8?q?=E6=96=B0ReduceMax?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- include/infiniop.h | 1 + include/infiniop/ops/reduce_max.h | 27 +++ scripts/python_test.py | 1 + src/infiniop-test/include/ops.hpp | 2 + src/infiniop-test/src/ops/reduce_max.cpp | 119 ++++++++++++ src/infiniop-test/src/ops/reduce_mean.cpp | 1 - .../ops/reduce_max/cpu/reduce_max_cpu.cc | 78 ++++++++ .../ops/reduce_max/cpu/reduce_max_cpu.h | 7 + src/infiniop/ops/reduce_max/info.h | 62 ++++++ .../ops/reduce_max/metax/reduce_max_metax.h | 8 + .../reduce_max/metax/reduce_max_metax.maca | 96 ++++++++++ src/infiniop/ops/reduce_max/nvidia/kernel.cuh | 23 +++ .../reduce_max/nvidia/reduce_max_nvidia.cu | 99 ++++++++++ .../reduce_max/nvidia/reduce_max_nvidia.cuh | 8 + src/infiniop/ops/reduce_max/operator.cc | 181 ++++++++++++++++++ src/infiniop/ops/reduce_max/reduce_max.h | 47 +++++ src/infiniop/ops/reduce_mean/info.h | 45 ----- src/infiniop/reduce/cuda/reduce.cuh | 25 ++- .../test_generate/testcases/reduce_max.py | 112 +++++++++++ .../test_generate/testcases/reduce_mean.py | 13 +- test/infiniop/libinfiniop/op_register.py | 32 ++++ test/infiniop/reduce_max.py | 150 +++++++++++++++ test/infiniop/reduce_mean.py | 6 +- 23 files changed, 1084 insertions(+), 59 deletions(-) create mode 100644 include/infiniop/ops/reduce_max.h create mode 100644 src/infiniop-test/src/ops/reduce_max.cpp create mode 100644 src/infiniop/ops/reduce_max/cpu/reduce_max_cpu.cc create mode 100644 src/infiniop/ops/reduce_max/cpu/reduce_max_cpu.h create mode 100644 src/infiniop/ops/reduce_max/info.h create mode 100644 src/infiniop/ops/reduce_max/metax/reduce_max_metax.h create mode 100644 src/infiniop/ops/reduce_max/metax/reduce_max_metax.maca create mode 100644 src/infiniop/ops/reduce_max/nvidia/kernel.cuh create mode 100644 src/infiniop/ops/reduce_max/nvidia/reduce_max_nvidia.cu create mode 100644 src/infiniop/ops/reduce_max/nvidia/reduce_max_nvidia.cuh create mode 100644 src/infiniop/ops/reduce_max/operator.cc create mode 100644 src/infiniop/ops/reduce_max/reduce_max.h create mode 100644 test/infiniop-test/test_generate/testcases/reduce_max.py create mode 100644 test/infiniop/reduce_max.py diff --git a/include/infiniop.h b/include/infiniop.h index 85df0c703..f7ad9e02c 100644 --- 
a/include/infiniop.h +++ b/include/infiniop.h @@ -18,5 +18,6 @@ #include "infiniop/ops/swiglu.h" #include "infiniop/tensor_descriptor.h" #include "infiniop/ops/reduce_mean.h" +#include "infiniop/ops/reduce_max.h" #endif // __INFINIOP_API_H__ diff --git a/include/infiniop/ops/reduce_max.h b/include/infiniop/ops/reduce_max.h new file mode 100644 index 000000000..42a3dd62d --- /dev/null +++ b/include/infiniop/ops/reduce_max.h @@ -0,0 +1,27 @@ +#ifndef __INFINIOP_REDUCE_MAX_API_H__ +#define __INFINIOP_REDUCE_MAX_API_H__ + +#include "../operator_descriptor.h" + +typedef struct InfiniopDescriptor *infiniopReduceMaxDescriptor_t; + +__C __export infiniStatus_t infiniopCreateReduceMaxDescriptor( + infiniopHandle_t handle, + infiniopReduceMaxDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + size_t dim); + +__C __export infiniStatus_t infiniopGetReduceMaxWorkspaceSize(infiniopReduceMaxDescriptor_t desc, size_t *size); + +__C __export infiniStatus_t infiniopReduceMax( + infiniopReduceMaxDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream); + +__C __export infiniStatus_t infiniopDestroyReduceMaxDescriptor(infiniopReduceMaxDescriptor_t desc); + +#endif diff --git a/scripts/python_test.py b/scripts/python_test.py index 14588cd7d..89df98708 100644 --- a/scripts/python_test.py +++ b/scripts/python_test.py @@ -25,6 +25,7 @@ def run_tests(args): "sub.py", "swiglu.py", "reduce_mean.py", + "reduce_max.py", ]: result = subprocess.run( f"python {test} {args} --debug", text=True, encoding="utf-8", shell=True diff --git a/src/infiniop-test/include/ops.hpp b/src/infiniop-test/include/ops.hpp index c94ebac90..29e4e633e 100644 --- a/src/infiniop-test/include/ops.hpp +++ b/src/infiniop-test/include/ops.hpp @@ -17,6 +17,7 @@ DECLARE_INFINIOP_TEST(causal_softmax) DECLARE_INFINIOP_TEST(rearrange) DECLARE_INFINIOP_TEST(sub) DECLARE_INFINIOP_TEST(reduce_mean) +DECLARE_INFINIOP_TEST(reduce_max) #define REGISTER_INFINIOP_TEST(name) \ { \ @@ -45,6 +46,7 @@ DECLARE_INFINIOP_TEST(reduce_mean) REGISTER_INFINIOP_TEST(rearrange) \ REGISTER_INFINIOP_TEST(sub) \ REGISTER_INFINIOP_TEST(reduce_mean) \ + REGISTER_INFINIOP_TEST(reduce_max) \ } namespace infiniop_test { diff --git a/src/infiniop-test/src/ops/reduce_max.cpp b/src/infiniop-test/src/ops/reduce_max.cpp new file mode 100644 index 000000000..1b10d0e02 --- /dev/null +++ b/src/infiniop-test/src/ops/reduce_max.cpp @@ -0,0 +1,119 @@ +#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include + +namespace infiniop_test::reduce_max { +struct Test::Attributes { + std::shared_ptr input; + std::shared_ptr output; + std::shared_ptr ans; + size_t dim; +}; + +std::shared_ptr Test::build( + std::unordered_map> attributes, + std::unordered_map> tensors, + double rtol, double atol, bool equal_nan) { + auto test = std::shared_ptr(new Test(rtol, atol, equal_nan)); + test->_attributes = new Attributes(); + + if (attributes.find("dim") == attributes.end() + || tensors.find("input") == tensors.end() + || tensors.find("ans") == tensors.end() + || tensors.find("output") == tensors.end()) { + throw std::runtime_error("Invalid Test: Missing attributes or tensors"); + } + + test->_attributes->dim = size_t(*reinterpret_cast(attributes["dim"].data())); + test->_attributes->ans = tensors["ans"]; + test->_attributes->input = tensors["input"]; + test->_attributes->output = tensors["output"]; + + return test; +} + +std::shared_ptr Test::run( + infiniopHandle_t handle, 
infiniDevice_t device, int device_id,
+    size_t warm_ups, size_t iterations) {
+
+    infiniopReduceMaxDescriptor_t op_desc;
+    CHECK_OR(infiniopCreateReduceMaxDescriptor(handle, &op_desc,
+                                               _attributes->output->desc(),
+                                               _attributes->input->desc(),
+                                               _attributes->dim),
+             return TEST_FAILED(OP_CREATION_FAILED, "Failed to create ReduceMax descriptor"));
+
+    auto input = _attributes->input->to(device, device_id);
+    auto output = _attributes->output->to(device, device_id);
+
+    size_t workspace_size;
+    CHECK_OR(infiniopGetReduceMaxWorkspaceSize(op_desc, &workspace_size),
+             return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size"));
+    void *workspace = nullptr;
+    if (workspace_size > 0) {
+        CHECK_OR(infinirtMalloc(&workspace, workspace_size),
+                 return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace"));
+    }
+
+    CHECK_OR(infiniopReduceMax(op_desc,
+                               workspace, workspace_size,
+                               output->data(),
+                               input->data(),
+                               nullptr),
+             return TEST_FAILED(OP_EXECUTION_FAILED, "ReduceMax execution failed"));
+
+    try {
+        allClose(output, _attributes->ans, _rtol, _atol, _equal_nan);
+    } catch (const std::exception &e) {
+        return TEST_FAILED(RESULT_INCORRECT, e.what());
+    }
+
+    double elapsed_time = 0.;
+
+    elapsed_time = benchmark(
+        [=]() {
+            infiniopReduceMax(op_desc,
+                              workspace, workspace_size,
+                              output->data(),
+                              input->data(),
+                              nullptr);
+        },
+        warm_ups, iterations);
+
+    if (workspace != nullptr) {
+        infinirtFree(workspace);
+    }
+
+    return TEST_PASSED(elapsed_time);
+}
+
+std::vector<std::string> Test::attribute_names() {
+    return {"dim"};
+}
+
+std::vector<std::string> Test::tensor_names() {
+    return {"input", "ans", "output"};
+}
+
+std::vector<std::string> Test::output_names() {
+    return {"output"};
+}
+
+std::string Test::toString() const {
+    std::ostringstream oss;
+    oss << op_name() << std::endl;
+    oss << "- input: " << _attributes->input->info() << std::endl;
+    oss << "- output: " << _attributes->output->info() << std::endl;
+    oss << "- dim=" << _attributes->dim << std::endl;
+    oss << std::scientific << std::setprecision(2);
+    oss << "- rtol=" << _rtol << ", atol=" << _atol << ", equal_nan=" << _equal_nan << std::endl;
+    return oss.str();
+}
+
+Test::~Test() {
+    delete _attributes;
+}
+
+} // namespace infiniop_test::reduce_max
diff --git a/src/infiniop-test/src/ops/reduce_mean.cpp b/src/infiniop-test/src/ops/reduce_mean.cpp
index 114ab2981..4f1aa4bdd 100644
--- a/src/infiniop-test/src/ops/reduce_mean.cpp
+++ b/src/infiniop-test/src/ops/reduce_mean.cpp
@@ -23,7 +23,6 @@ std::shared_ptr<Test> Test::build(
         || tensors.find("input") == tensors.end()
         || tensors.find("ans") == tensors.end()
         || tensors.find("output") == tensors.end()) {
-        std::cout << "All keys:" << std::endl;
         throw std::runtime_error("Invalid Test: Missing attributes or tensors");
     }
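Note: the CPU backend below is a plain collapse(3) loop nest over the canonical 4-D view: one strided max along the last (reduced) axis per (batch, channel, row) triple. For a contiguous layout it computes the same result as this numpy one-liner (illustration only, not part of the patch):

    import numpy as np

    x = np.random.rand(2, 3, 4, 5).astype(np.float32)   # (batch, channel, row, reduced)
    y = x.max(axis=-1, keepdims=True)                   # one max per (b, c, h)
    assert y.shape == (2, 3, 4, 1)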
diff --git a/src/infiniop/ops/reduce_max/cpu/reduce_max_cpu.cc b/src/infiniop/ops/reduce_max/cpu/reduce_max_cpu.cc
new file mode 100644
index 000000000..409a4ca02
--- /dev/null
+++ b/src/infiniop/ops/reduce_max/cpu/reduce_max_cpu.cc
@@ -0,0 +1,78 @@
+#include "reduce_max_cpu.h"
+#include "../../../devices/cpu/common_cpu.h"
+#include "../../../reduce/cpu/reduce.h"
+
+namespace op::reduce_max::cpu {
+
+Descriptor::~Descriptor() {}
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t y_desc,
+    infiniopTensorDescriptor_t x_desc,
+    size_t dim) {
+    auto result = ReduceMaxInfo::create(y_desc, x_desc, dim);
+    CHECK_RESULT(result);
+    *desc_ptr = new Descriptor(nullptr, result.take(), 0, handle->device, handle->device_id);
+    return INFINI_STATUS_SUCCESS;
+}
+
+template <typename T>
+infiniStatus_t reduce_max(const ReduceMaxInfo *info, T *y, const T *x) {
+    const size_t batch_size = info->shape[0];
+    const size_t channels = info->shape[1];
+    const size_t rows = info->shape[2];
+    const size_t cols = info->shape[3]; // the reduced dim
+
+    const ptrdiff_t y_batch_stride = info->y_strides[0];
+    const ptrdiff_t y_channel_stride = info->y_strides[1];
+    const ptrdiff_t y_row_stride = info->y_strides[2];
+    const ptrdiff_t x_batch_stride = info->x_strides[0];
+    const ptrdiff_t x_channel_stride = info->x_strides[1];
+    const ptrdiff_t x_row_stride = info->x_strides[2];
+    const ptrdiff_t x_col_stride = info->x_strides[3];
+
+    #pragma omp parallel for collapse(3)
+    for (size_t batch = 0; batch < batch_size; ++batch) {
+        for (size_t channel = 0; channel < channels; ++channel) {
+            for (size_t row = 0; row < rows; ++row) {
+                const T* input_start = x + batch * x_batch_stride
+                                        + channel * x_channel_stride
+                                        + row * x_row_stride;
+                T* output_ptr = y + batch * y_batch_stride
+                                 + channel * y_channel_stride
+                                 + row * y_row_stride;
+
+                float max = op::common_cpu::reduce_op::max(input_start, cols, x_col_stride);
+
+                if constexpr (std::is_same<T, fp16_t>::value || std::is_same<T, bf16_t>::value) {
+                    *output_ptr = utils::cast<T>(max);
+                } else {
+                    *output_ptr = max;
+                }
+            }
+        }
+    }
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace, size_t workspace_size,
+    void *y,
+    const void *x,
+    void *stream) const {
+
+    if (_info.dtype == INFINI_DTYPE_F16) {
+        CHECK_STATUS(reduce_max(&_info, (fp16_t *)y, (const fp16_t *)x));
+    } else if (_info.dtype == INFINI_DTYPE_BF16) {
+        CHECK_STATUS(reduce_max(&_info, (bf16_t *)y, (const bf16_t *)x));
+    } else if (_info.dtype == INFINI_DTYPE_F32) {
+        CHECK_STATUS(reduce_max(&_info, (float *)y, (const float *)x));
+    } else {
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    return INFINI_STATUS_SUCCESS;
+}
+} // namespace op::reduce_max::cpu
diff --git a/src/infiniop/ops/reduce_max/cpu/reduce_max_cpu.h b/src/infiniop/ops/reduce_max/cpu/reduce_max_cpu.h
new file mode 100644
index 000000000..bf6be9b74
--- /dev/null
+++ b/src/infiniop/ops/reduce_max/cpu/reduce_max_cpu.h
@@ -0,0 +1,7 @@
+#ifndef __REDUCE_MAX_CPU_H__
+#define __REDUCE_MAX_CPU_H__
+#include "../reduce_max.h"
+
+DESCRIPTOR(cpu)
+
+#endif
diff --git a/src/infiniop/ops/reduce_max/info.h b/src/infiniop/ops/reduce_max/info.h
new file mode 100644
index 000000000..0babca6e6
--- /dev/null
+++ b/src/infiniop/ops/reduce_max/info.h
@@ -0,0 +1,62 @@
+#ifndef __REDUCE_MAX_INFO_H__
+#define __REDUCE_MAX_INFO_H__
+
+#include "../../../utils.h"
+#include "../../tensor.h"
+#include <vector>
+
+namespace op::reduce_max {
+
+class ReduceMaxInfo {
+    ReduceMaxInfo() = default;
+
+public:
+    infiniDtype_t dtype;
+
+    std::vector<size_t> shape;
+    std::vector<ptrdiff_t> y_strides;
+    std::vector<ptrdiff_t> x_strides;
+
+    static utils::Result<ReduceMaxInfo> create(infiniopTensorDescriptor_t y_desc, infiniopTensorDescriptor_t x_desc, size_t dim) {
+        auto dtype = y_desc->dtype();
+        if (dtype != x_desc->dtype()) {
+            return INFINI_STATUS_BAD_TENSOR_DTYPE;
+        }
+        CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_BF16, INFINI_DTYPE_F32);
+
+        size_t ndim = y_desc->ndim();
+        if (x_desc->ndim() != ndim) {
+            CHECK_STATUS(INFINI_STATUS_BAD_TENSOR_SHAPE);
+        }
+        CHECK_REDUCE_SHAPE(x_desc->shape(), dim, y_desc->shape());
+        if (ndim > 4) return INFINI_STATUS_BAD_TENSOR_SHAPE;
+        else if (ndim == 0){
+            std::vector<size_t> shape = {1, 1, 1, 1};
+            std::vector<ptrdiff_t> y_strides = {0, 0, 0, 0};
+            std::vector<ptrdiff_t> x_strides = {0, 0, 0, 0};
+            return utils::Result<ReduceMaxInfo>(ReduceMaxInfo{
+ dtype, shape, y_strides, x_strides}); + } + else{ + std::vector shape = x_desc->shape(); + std::vector y_strides = y_desc->strides(); + std::vector x_strides = x_desc->strides(); + if (dim != (shape.size() - 1)){ + std::swap(shape[dim], shape[shape.size() - 1]); + std::swap(y_strides[dim], y_strides[shape.size() - 1]); + std::swap(x_strides[dim], x_strides[shape.size() - 1]); + } + while (shape.size() < 4){ + shape.insert(shape.begin(), 1); + y_strides.insert(y_strides.begin(), 0); + x_strides.insert(x_strides.begin(), 0); + } + return utils::Result(ReduceMaxInfo{ + dtype, shape, y_strides, x_strides}); + } + } +}; + +} // namespace op::reduce_max + +#endif // __REDUCE_MAX_INFO_H__ diff --git a/src/infiniop/ops/reduce_max/metax/reduce_max_metax.h b/src/infiniop/ops/reduce_max/metax/reduce_max_metax.h new file mode 100644 index 000000000..735bc8da4 --- /dev/null +++ b/src/infiniop/ops/reduce_max/metax/reduce_max_metax.h @@ -0,0 +1,8 @@ +#ifndef __REDUCE_MAX_METAX_H__ +#define __REDUCE_MAX_METAX_H__ + +#include "../reduce_max.h" + +DESCRIPTOR(metax) + +#endif diff --git a/src/infiniop/ops/reduce_max/metax/reduce_max_metax.maca b/src/infiniop/ops/reduce_max/metax/reduce_max_metax.maca new file mode 100644 index 000000000..ab120fea5 --- /dev/null +++ b/src/infiniop/ops/reduce_max/metax/reduce_max_metax.maca @@ -0,0 +1,96 @@ +#include "../../../devices/metax/metax_common.h" +#include "reduce_max_metax.h" + +#include +#include "../../../devices/metax/metax_kernel_common.h" + +#include "../../../reduce/cuda/reduce.cuh" + +#include "../nvidia/kernel.cuh" + +template +INFINIOP_METAX_KERNEL ReduceMax( + Tdata *y_, const Tdata *x_, + size_t batch, size_t channels, size_t height, size_t width, + ptrdiff_t y_stride_b, ptrdiff_t y_stride_c, ptrdiff_t y_stride_h, + ptrdiff_t x_stride_b, ptrdiff_t x_stride_c, ptrdiff_t x_stride_h, ptrdiff_t x_stride_w) { + ReduceMaxKernel(y_, x_, batch, channels, height, width, y_stride_b, y_stride_c, y_stride_h, x_stride_b, x_stride_c, x_stride_h, x_stride_w); +} + +namespace op::reduce_max::metax { + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + size_t dim) { + auto info = ReduceMaxInfo::create(y_desc, x_desc, dim); + CHECK_RESULT(info); + *desc_ptr = new Descriptor( + new Opaque{reinterpret_cast(handle)->internal()}, + info.take(), 0, handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +template +infiniStatus_t launchKernel(void *y, const void *x, infiniDtype_t dtype, + size_t batch_size, size_t channels, size_t height, size_t width, + ptrdiff_t y_stride_b, ptrdiff_t y_stride_c, ptrdiff_t y_stride_h, + ptrdiff_t x_stride_b, ptrdiff_t x_stride_c, ptrdiff_t x_stride_h, ptrdiff_t x_stride_w, + hcStream_t stream) { + dim3 grid=dim3(uint32_t(batch_size), uint32_t(channels), uint32_t(height)); + if (dtype == INFINI_DTYPE_F16) { + ReduceMax + <<>>((half *)y, (const half *)x, + batch_size, channels, height, width, + y_stride_b, y_stride_c, y_stride_h, + x_stride_b, x_stride_c, x_stride_h, x_stride_w); + } else if (dtype == INFINI_DTYPE_BF16) { + ReduceMax + <<>>((__hpcc_bfloat16 *)y, (const __hpcc_bfloat16 *)x, + batch_size, channels, height, width, + y_stride_b, y_stride_c, y_stride_h, + x_stride_b, x_stride_c, x_stride_h, x_stride_w); + } else if (dtype == INFINI_DTYPE_F32) { + ReduceMax + <<>>((float *)y, (const float *)x, 
+ batch_size, channels, height, width, + y_stride_b, y_stride_c, y_stride_h, + x_stride_b, x_stride_c, x_stride_h, x_stride_w); + } else { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, + void *y, + const void *x, + void *stream_) const { + hcStream_t stream = (hcStream_t)stream_; + if (_opaque->internal->maxThreadsPerBlock() == METAX_BLOCK_SIZE_1024) { + CHECK_STATUS(launchKernel( + y, x, _info.dtype, _info.shape[0], _info.shape[1], _info.shape[2], _info.shape[3], + _info.y_strides[0], _info.y_strides[1], _info.y_strides[2], + _info.x_strides[0], _info.x_strides[1], _info.x_strides[2], _info.x_strides[3], stream)); + } else if (_opaque->internal->maxThreadsPerBlock() == METAX_BLOCK_SIZE_512) { + CHECK_STATUS(launchKernel( + y, x, _info.dtype, _info.shape[0], _info.shape[1], _info.shape[2], _info.shape[3], + _info.y_strides[0], _info.y_strides[1], _info.y_strides[2], + _info.x_strides[0], _info.x_strides[1], _info.x_strides[2], _info.x_strides[3], stream)); + } else { + return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; + } + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::reduce_max::metax diff --git a/src/infiniop/ops/reduce_max/nvidia/kernel.cuh b/src/infiniop/ops/reduce_max/nvidia/kernel.cuh new file mode 100644 index 000000000..51470d75f --- /dev/null +++ b/src/infiniop/ops/reduce_max/nvidia/kernel.cuh @@ -0,0 +1,23 @@ +#ifndef __REDUCE_MAX_KERNEL_CUH__ +#define __REDUCE_MAX_KERNEL_CUH__ + +template +__device__ void ReduceMaxKernel( + Tdata *y_, const Tdata *x_, + size_t batch, size_t channels, size_t height, size_t width, + ptrdiff_t y_stride_b, ptrdiff_t y_stride_c, ptrdiff_t y_stride_h, + ptrdiff_t x_stride_b, ptrdiff_t x_stride_c, ptrdiff_t x_stride_h, ptrdiff_t x_stride_w) { + + Tdata *y = y_ + blockIdx.x * y_stride_b + blockIdx.y * y_stride_c + blockIdx.z * y_stride_h; + const Tdata *x = x_ + blockIdx.x * x_stride_b + blockIdx.y * x_stride_c + blockIdx.z * x_stride_h; + + // [Reduce] Find the max of each updated row and store in shared memory + __shared__ Tcompute max_; + Tcompute max_0 = op::common_cuda::reduce_op::max(x, width, x_stride_w); + if (threadIdx.x == 0) { + *y = max_0; + } + +} + +#endif // __REDUCE_MAX_KERNEL_CUH__ diff --git a/src/infiniop/ops/reduce_max/nvidia/reduce_max_nvidia.cu b/src/infiniop/ops/reduce_max/nvidia/reduce_max_nvidia.cu new file mode 100644 index 000000000..e95ea69f8 --- /dev/null +++ b/src/infiniop/ops/reduce_max/nvidia/reduce_max_nvidia.cu @@ -0,0 +1,99 @@ +#include "../../../devices/nvidia/nvidia_common.cuh" +#include "reduce_max_nvidia.cuh" + +#include "../../../devices/nvidia/nvidia_kernel_common.cuh" +#include + +#include "../../../reduce/cuda/reduce.cuh" + +#include "kernel.cuh" + +template +INFINIOP_CUDA_KERNEL ReduceMax( + Tdata *y_, const Tdata *x_, + size_t batch, size_t channels, size_t height, size_t width, + ptrdiff_t y_stride_b, ptrdiff_t y_stride_c, ptrdiff_t y_stride_h, + ptrdiff_t x_stride_b, ptrdiff_t x_stride_c, ptrdiff_t x_stride_h, ptrdiff_t x_stride_w) { + ReduceMaxKernel(y_, x_, batch, channels, height, width, y_stride_b, y_stride_c, y_stride_h, x_stride_b, x_stride_c, x_stride_h, x_stride_w); +} + +namespace op::reduce_max::nvidia { + +struct Descriptor::Opaque { + std::shared_ptr internal; +}; + +Descriptor::~Descriptor() { + delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t 
x_desc,
+    size_t dim) {
+    auto info = ReduceMaxInfo::create(y_desc, x_desc, dim);
+    CHECK_RESULT(info);
+    *desc_ptr = new Descriptor(
+        new Opaque{reinterpret_cast<device::nvidia::Handle *>(handle)->internal()},
+        info.take(), 0, handle->device, handle->device_id);
+    return INFINI_STATUS_SUCCESS;
+}
+
+template <unsigned int BLOCK_SIZE>
+infiniStatus_t launchKernel(void *y, const void *x, infiniDtype_t dtype,
+                            size_t batch_size, size_t channels, size_t height, size_t width,
+                            ptrdiff_t y_stride_b, ptrdiff_t y_stride_c, ptrdiff_t y_stride_h,
+                            ptrdiff_t x_stride_b, ptrdiff_t x_stride_c, ptrdiff_t x_stride_h, ptrdiff_t x_stride_w,
+                            cudaStream_t stream) {
+    dim3 grid = dim3(uint32_t(batch_size), uint32_t(channels), uint32_t(height));
+    if (dtype == INFINI_DTYPE_F16) {
+        ReduceMax<BLOCK_SIZE, half, float>
+            <<<grid, BLOCK_SIZE, 0, stream>>>((half *)y, (const half *)x,
+                                              batch_size, channels, height, width,
+                                              y_stride_b, y_stride_c, y_stride_h,
+                                              x_stride_b, x_stride_c, x_stride_h, x_stride_w);
+    } else if (dtype == INFINI_DTYPE_BF16) {
+        ReduceMax<BLOCK_SIZE, __nv_bfloat16, float>
+            <<<grid, BLOCK_SIZE, 0, stream>>>((__nv_bfloat16 *)y, (const __nv_bfloat16 *)x,
+                                              batch_size, channels, height, width,
+                                              y_stride_b, y_stride_c, y_stride_h,
+                                              x_stride_b, x_stride_c, x_stride_h, x_stride_w);
+    } else if (dtype == INFINI_DTYPE_F32) {
+        ReduceMax<BLOCK_SIZE, float, float>
+            <<<grid, BLOCK_SIZE, 0, stream>>>((float *)y, (const float *)x,
+                                              batch_size, channels, height, width,
+                                              y_stride_b, y_stride_c, y_stride_h,
+                                              x_stride_b, x_stride_c, x_stride_h, x_stride_w);
+    } else {
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size,
+                                     void *y,
+                                     const void *x,
+                                     void *stream_) const {
+    cudaStream_t stream = (cudaStream_t)stream_;
+    if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_1024) {
+        CHECK_STATUS(launchKernel<CUDA_BLOCK_SIZE_1024>(
+            y, x, _info.dtype, _info.shape[0], _info.shape[1], _info.shape[2], _info.shape[3],
+            _info.y_strides[0], _info.y_strides[1], _info.y_strides[2],
+            _info.x_strides[0], _info.x_strides[1], _info.x_strides[2], _info.x_strides[3], stream));
+    } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_512) {
+        CHECK_STATUS(launchKernel<CUDA_BLOCK_SIZE_512>(
+            y, x, _info.dtype, _info.shape[0], _info.shape[1], _info.shape[2], _info.shape[3],
+            _info.y_strides[0], _info.y_strides[1], _info.y_strides[2],
+            _info.x_strides[0], _info.x_strides[1], _info.x_strides[2], _info.x_strides[3], stream));
+    } else {
+        return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED;
+    }
+    return INFINI_STATUS_SUCCESS;
+}
+
+} // namespace op::reduce_max::nvidia
diff --git a/src/infiniop/ops/reduce_max/nvidia/reduce_max_nvidia.cuh b/src/infiniop/ops/reduce_max/nvidia/reduce_max_nvidia.cuh
new file mode 100644
index 000000000..388738c27
--- /dev/null
+++ b/src/infiniop/ops/reduce_max/nvidia/reduce_max_nvidia.cuh
@@ -0,0 +1,8 @@
+#ifndef __REDUCE_MAX_NVIDIA_H__
+#define __REDUCE_MAX_NVIDIA_H__
+
+#include "../reduce_max.h"
+
+DESCRIPTOR(nvidia)
+
+#endif
diff --git a/src/infiniop/ops/reduce_max/operator.cc b/src/infiniop/ops/reduce_max/operator.cc
new file mode 100644
index 000000000..c22ebeb1b
--- /dev/null
+++ b/src/infiniop/ops/reduce_max/operator.cc
@@ -0,0 +1,181 @@
+#include "../../operator.h"
+#include "../../handle.h"
+#include "infiniop/ops/reduce_max.h"
+
+#ifdef ENABLE_CPU_API
+#include "cpu/reduce_max_cpu.h"
+#endif
+#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API)
+#include 
"nvidia/reduce_max_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/reduce_max_metax.h" +#endif +#ifdef ENABLE_ASCEND_API +#include "ascend/reduce_max_ascend.h" +#endif + +__C infiniStatus_t infiniopCreateReduceMaxDescriptor( + infiniopHandle_t handle, + infiniopReduceMaxDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + size_t dim) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::reduce_max::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + output_desc, \ + input_desc, \ + dim); + + switch (handle->device) { +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax) +#endif +// #ifdef ENABLE_ASCEND_API +// CREATE(INFINI_DEVICE_ASCEND, ascend) +// #endif +// #ifdef ENABLE_CAMBRICON_MLU +// case DevCambriconMlu: { +// return bangCreateReduceMaxDescriptor((BangHandle_t)handle, (ReduceMaxBangDescriptor_t *)desc_ptr, output_desc, input_desc, dim); +// // return cnnlCreateReduceMaxDescriptor((BangHandle_t) handle, (ReduceMaxCnnlDescriptor_t *) desc_ptr, output_desc, input_desc, dim); +// } +// #endif +// #ifdef ENABLE_MTHREADS_GPU +// case DevMthreadsGpu: { +// return musaCreateReduceMaxDescriptor((MusaHandle_t)handle, (ReduceMaxMusaDescriptor_t *)desc_ptr, output_desc, input_desc, dim); +// } +// #endif + } + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopGetReduceMaxWorkspaceSize(infiniopReduceMaxDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax) +#endif +// #ifdef ENABLE_ASCEND_API +// GET(INFINI_DEVICE_ASCEND, ascend) +// #endif +// #ifdef ENABLE_CAMBRICON_MLU +// case DevCambriconMlu: { +// return bangGetReduceMaxWorkspaceSize((ReduceMaxBangDescriptor_t)desc, size); +// // return cnnlGetReduceMaxWorkspaceSize((ReduceMaxCnnlDescriptor_t) desc, size); +// } +// #endif +// #ifdef ENABLE_MTHREADS_GPU +// case DevMthreadsGpu: { +// return musaGetReduceMaxWorkspaceSize((ReduceMaxMusaDescriptor_t)desc, size); +// } +// #endif + } + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopReduceMax( + infiniopReduceMaxDescriptor_t desc, + void *workspace, size_t workspace_size, + void *output, + const void *input, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc)->calculate( \ + workspace, workspace_size, output, input, stream); + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax) +#endif +// #ifdef ENABLE_ASCEND_API +// CALCULATE(INFINI_DEVICE_ASCEND, ascend) +// #endif +// #ifdef ENABLE_CAMBRICON_MLU +// case DevCambriconMlu: { +// return 
bangReduceMax((ReduceMaxBangDescriptor_t)desc, workspace, workspace_size, output, input, stream); +// // return cnnlReduceMax((ReduceMaxCnnlDescriptor_t) desc, workspace, workspace_size, output, input, stream); +// } +// #endif +// #ifdef ENABLE_MTHREADS_GPU +// case DevMthreadsGpu: { +// return musaReduceMax((ReduceMaxMusaDescriptor_t)desc, workspace, workspace_size, output, input, stream); +// } +// #endif + } + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} + +__C infiniStatus_t infiniopDestroyReduceMaxDescriptor(infiniopReduceMaxDescriptor_t desc) { + +#define DESTROY(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + DESTROY(INFINI_DEVICE_CPU, cpu) +#endif +#ifdef ENABLE_NVIDIA_API + DESTROY(INFINI_DEVICE_NVIDIA, nvidia) +#endif +#ifdef ENABLE_ILUVATAR_API + DESTROY(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DESTROY(INFINI_DEVICE_METAX, metax) +#endif +// #ifdef ENABLE_ASCEND_API +// DESTROY(INFINI_DEVICE_ASCEND, ascend) +// #endif +// #ifdef ENABLE_CAMBRICON_MLU +// case DevCambriconMlu: { +// return bangDestroyReduceMaxDescriptor((ReduceMaxBangDescriptor_t)desc); +// // return cnnlDestroyReduceMaxDescriptor((ReduceMaxCnnlDescriptor_t) desc); +// } +// #endif +// #ifdef ENABLE_MTHREADS_GPU +// case DevMthreadsGpu: +// return musaDestroyReduceMaxDescriptor((ReduceMaxMusaDescriptor_t)desc); +// #endif + } + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; +} \ No newline at end of file diff --git a/src/infiniop/ops/reduce_max/reduce_max.h b/src/infiniop/ops/reduce_max/reduce_max.h new file mode 100644 index 000000000..412f67162 --- /dev/null +++ b/src/infiniop/ops/reduce_max/reduce_max.h @@ -0,0 +1,47 @@ +#ifndef REDUCE_MAX_H +#define REDUCE_MAX_H + +#include "../../operator.h" +#include "info.h" + +#define DESCRIPTOR(NAMESPACE) \ + \ + namespace op::reduce_max::NAMESPACE { \ + class Descriptor final : public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + ReduceMaxInfo _info; \ + size_t _workspace_size; \ + \ + Descriptor( \ + Opaque *opaque, \ + ReduceMaxInfo info, \ + size_t workspace_size, \ + infiniDevice_t device_type, \ + int device_id) \ + : InfiniopDescriptor{device_type, device_id}, \ + _opaque(opaque), \ + _info(info), \ + _workspace_size(workspace_size) {} \ + \ + public: \ + ~Descriptor(); \ + \ + size_t workspaceSize() const { return _workspace_size; } \ + \ + static infiniStatus_t create( \ + infiniopHandle_t handle, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t output_desc, \ + infiniopTensorDescriptor_t input_desc, \ + size_t dim); \ + \ + infiniStatus_t calculate( \ + void *workspace, size_t workspace_size, \ + void *output, \ + const void *input, \ + void *stream) const; \ + }; \ + } + +#endif // REDUCE_MAX_H \ No newline at end of file diff --git a/src/infiniop/ops/reduce_mean/info.h b/src/infiniop/ops/reduce_mean/info.h index 428675989..a47c81133 100644 --- a/src/infiniop/ops/reduce_mean/info.h +++ b/src/infiniop/ops/reduce_mean/info.h @@ -54,51 +54,6 @@ class ReduceMeanInfo { return utils::Result(ReduceMeanInfo{ dtype, shape, y_strides, x_strides}); } - - // if(ndim == 3){ - // std::vector shape = x_desc->shape(); - // std::vector y_strides = y_desc->strides(); - // std::vector x_strides = x_desc->strides(); - // if (dim != 2){ - // std::swap(shape[dim], shape[2]); - // std::swap(y_strides[dim], y_strides[2]); - // std::swap(x_strides[dim], x_strides[2]); - // } - // return 
utils::Result(ReduceMeanInfo{ - // dtype, shape, y_strides, x_strides}); - // } - // else if (ndim == 2){ - // std::vector shape = x_desc->shape(); - // std::vector y_strides = y_desc->strides(); - // std::vector x_strides = x_desc->strides(); - // if (dim != 1){ - // std::swap(shape[dim], shape[1]); - // std::swap(y_strides[dim], y_strides[1]); - // std::swap(x_strides[dim], x_strides[1]); - // } - // shape.insert(shape.begin(), 1); - // y_strides.insert(y_strides.begin(), 0); - // x_strides.insert(x_strides.begin(), 0); - // return utils::Result(ReduceMeanInfo{ - // dtype, shape, y_strides, x_strides}); - // } - // else if (ndim == 1){ - // std::vector shape = {1, 1, (x_desc->shape())[0]}; - // std::vector y_strides = {0, 0, (y_desc->strides())[0]}; - // std::vector x_strides = {0, 0, (x_desc->strides())[0]}; - // return utils::Result(ReduceMeanInfo{ - // dtype, shape, y_strides, x_strides}); - // } - // else if (ndim == 0){ - // std::vector shape = {1, 1, 1}; - // std::vector y_strides = {0, 0, 0}; - // std::vector x_strides = {0, 0, 0}; - // return utils::Result(ReduceMeanInfo{ - // dtype, shape, y_strides, x_strides}); - // } - // else{ - // return INFINI_STATUS_BAD_TENSOR_SHAPE; - // } } }; diff --git a/src/infiniop/reduce/cuda/reduce.cuh b/src/infiniop/reduce/cuda/reduce.cuh index 0a30a3a26..b69b46163 100644 --- a/src/infiniop/reduce/cuda/reduce.cuh +++ b/src/infiniop/reduce/cuda/reduce.cuh @@ -63,10 +63,9 @@ __device__ __forceinline__ Tdata max(const Tdata *data_ptr, size_t count) { // Sum(x) on non-contiguous data of length count template -__device__ __forceinline__ Tcompute sum( - const Tdata *data_ptr, - size_t count, - ptrdiff_t stride) +__device__ __forceinline__ Tcompute sum(const Tdata *data_ptr, + size_t count, + ptrdiff_t stride) { Tcompute s = 0; @@ -79,6 +78,24 @@ __device__ __forceinline__ Tcompute sum( return BlockReduce(temp_storage).Sum(s); } + +// Max(x) on non-contiguous data of length count +template +__device__ __forceinline__ Tdata max(const Tdata *data_ptr, + size_t count, + ptrdiff_t stride) +{ + Tdata max_ = data_ptr[0]; + + for (size_t i = threadIdx.x; i < count; i += BLOCK_SIZE) { + max_ = cub::Max()(max_, data_ptr[i * stride]); + } + + using BlockReduce = cub::BlockReduce; + __shared__ typename BlockReduce::TempStorage temp_storage; + + return BlockReduce(temp_storage).Reduce(max_, cub::Max(), BLOCK_SIZE); +} } // namespace op::common_cuda::reduce_op #endif diff --git a/test/infiniop-test/test_generate/testcases/reduce_max.py b/test/infiniop-test/test_generate/testcases/reduce_max.py new file mode 100644 index 000000000..487da489a --- /dev/null +++ b/test/infiniop-test/test_generate/testcases/reduce_max.py @@ -0,0 +1,112 @@ +from ast import List +import numpy as np +import gguf +from typing import List +from enum import Enum, auto + +from .. 
diff --git a/test/infiniop-test/test_generate/testcases/reduce_max.py b/test/infiniop-test/test_generate/testcases/reduce_max.py
new file mode 100644
index 000000000..487da489a
--- /dev/null
+++ b/test/infiniop-test/test_generate/testcases/reduce_max.py
@@ -0,0 +1,111 @@
+import numpy as np
+import gguf
+from typing import List
+from enum import Enum, auto
+
+from .. import InfiniopTestWriter, InfiniopTestCase, np_dtype_to_ggml, gguf_strides, contiguous_gguf_strides
+
+
+def reduce_max(x, dim):
+    if isinstance(x, np.float64):
+        return x
+    return x.max(axis=dim, keepdims=True)
+
+
+def random_tensor(shape: tuple, dtype: np.dtype) -> np.ndarray:
+    return np.random.uniform(-1.0, 1.0, shape).astype(dtype) * 0.001
+
+
+class ReduceMaxTestCase(InfiniopTestCase):
+    def __init__(
+        self,
+        x: np.ndarray,
+        y: np.ndarray,
+        shape_x: List[int] | None,
+        shape_y: List[int] | None,
+        stride_x: List[int] | None,
+        stride_y: List[int] | None,
+        dim: int = 0,
+    ):
+        super().__init__("reduce_max")
+        self.x = x
+        self.y = y
+        self.shape_x = shape_x
+        self.shape_y = shape_y
+        self.stride_x = stride_x
+        self.stride_y = stride_y
+        self.dim = dim
+
+    def write_test(self, test_writer: "InfiniopTestWriter"):
+        super().write_test(test_writer)
+        print(self.shape_y, self.shape_x, self.stride_y, self.stride_x, self.dim)
+        if self.shape_x is not None:
+            test_writer.add_array(test_writer.gguf_key("input.shape"), self.shape_x)
+        if self.shape_y is not None:
+            test_writer.add_array(test_writer.gguf_key("output.shape"), self.shape_y)
+        if self.stride_x is not None:
+            test_writer.add_array(test_writer.gguf_key("input.strides"), gguf_strides(*self.stride_x))
+        test_writer.add_array(
+            test_writer.gguf_key("output.strides"),
+            gguf_strides(*self.stride_y if self.stride_y is not None else contiguous_gguf_strides(self.shape_y))
+        )
+        test_writer.add_uint64(test_writer.gguf_key("dim"), self.dim)
+        test_writer.add_tensor(
+            test_writer.gguf_key("input"),
+            self.x,
+            raw_dtype=np_dtype_to_ggml(self.x.dtype),
+        )
+        test_writer.add_tensor(
+            test_writer.gguf_key("output"),
+            self.y,
+            raw_dtype=np_dtype_to_ggml(self.y.dtype),
+        )
+        ans = reduce_max(
+            self.x.astype(np.float64), self.dim
+        )
+        test_writer.add_tensor(
+            test_writer.gguf_key("ans"), ans, raw_dtype=gguf.GGMLQuantizationType.F64
+        )
+
+
+if __name__ == "__main__":
+    test_writer = InfiniopTestWriter("reduce_max.gguf")
+    test_cases = []
+    # ==============================================================================
+    #  Configuration
+    # ==============================================================================
+    # These are not meant to be imported from other modules
+    _TEST_CASES_ = [
+        # y_shape, x_shape, y_stride, x_stride, dim
+        # ((0,), (0,), (0,), (0,), 0),
+        ((1, ), (32, ), None, None, 0),
+        ((1, 4), (1, 4), None, None, 0),
+        ((1, 1), (1, 4), None, None, 1),
+        ((16, 1), (16, 2048), None, None, 1),
+        ((1, 16), (2048, 16), None, None, 0),
+        ((16, 1), (16, 2048), (4096, 1), (4096, 1), 1),
+        ((1, 2048), (16, 2048), (4096, 1), (4096, 1), 0),
+        ((4, 4, 1), (4, 4, 2048), None, None, 2),
+        ((1, 4, 4), (2048, 4, 4), None, None, 0),
+        ((4, 1, 4), (4, 2048, 4), (45056, 5632, 1), (32768, 8, 1), 1),
+        ((1, 8, 4, 8), (16, 8, 4, 8), (256, 32, 8, 1), (256, 32, 8, 1), 0),
+    ]
+    _TENSOR_DTYPES_ = [np.float16, np.float32]
+
+    for dtype in _TENSOR_DTYPES_:
+        for shape_y, shape_x, stride_y, stride_x, dim in _TEST_CASES_:
+            x = random_tensor(shape_x, dtype)
+            y = np.empty(tuple(0 for _ in shape_y), dtype=dtype)
+            test_case = ReduceMaxTestCase(
+                x,
+                y,
+                shape_x,
+                shape_y,
+                stride_x,
+                stride_y,
+                dim,
+            )
+            test_cases.append(test_case)
+
+    test_writer.add_tests(test_cases)
+    test_writer.save()
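Note how the generator stores the operator's input in the test dtype but computes the
reference answer in float64 with keepdims=True, so the reduced axis survives as a
size-1 dimension and the reference is not polluted by low-precision rounding. A quick
standalone check of those two properties (illustration only, not part of the repo):

    import numpy as np

    x = np.random.uniform(-1.0, 1.0, (16, 2048)).astype(np.float16)
    ans = x.astype(np.float64).max(axis=1, keepdims=True)
    assert ans.shape == (16, 1)     # reduced axis kept as size 1
    assert ans.dtype == np.float64  # reference stays in double precision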
diff --git a/test/infiniop-test/test_generate/testcases/reduce_mean.py b/test/infiniop-test/test_generate/testcases/reduce_mean.py
index 0e05d4240..74421d83d 100644
--- a/test/infiniop-test/test_generate/testcases/reduce_mean.py
+++ b/test/infiniop-test/test_generate/testcases/reduce_mean.py
@@ -41,23 +41,23 @@ def write_test(self, test_writer: "InfiniopTestWriter"):
         super().write_test(test_writer)
         print(self.shape_y, self.shape_x, self.stride_y, self.stride_x, self.dim)
         if self.shape_x is not None:
-            test_writer.add_array(test_writer.gguf_key("x.shape"), self.shape_x)
+            test_writer.add_array(test_writer.gguf_key("input.shape"), self.shape_x)
         if self.shape_y is not None:
-            test_writer.add_array(test_writer.gguf_key("y.shape"), self.shape_y)
+            test_writer.add_array(test_writer.gguf_key("output.shape"), self.shape_y)
         if self.stride_x is not None:
-            test_writer.add_array(test_writer.gguf_key("x.strides"), gguf_strides(*self.stride_x))
+            test_writer.add_array(test_writer.gguf_key("input.strides"), gguf_strides(*self.stride_x))
         test_writer.add_array(
-            test_writer.gguf_key("y.strides"),
+            test_writer.gguf_key("output.strides"),
             gguf_strides(*self.stride_y if self.stride_y is not None else contiguous_gguf_strides(self.shape_y))
         )
         test_writer.add_uint64(test_writer.gguf_key("dim"), self.dim)
         test_writer.add_tensor(
-            test_writer.gguf_key("x"),
+            test_writer.gguf_key("input"),
             self.x,
             raw_dtype=np_dtype_to_ggml(self.x.dtype),
         )
         test_writer.add_tensor(
-            test_writer.gguf_key("y"),
+            test_writer.gguf_key("output"),
             self.y,
             raw_dtype=np_dtype_to_ggml(self.y.dtype),
         )
@@ -89,6 +89,7 @@ def write_test(self, test_writer: "InfiniopTestWriter"):
         ((4, 4, 1), (4, 4, 2048), None, None, 2),
         ((1, 4, 4), (2048, 4, 4), None, None, 0),
         ((4, 1, 4), (4, 2048, 4), (45056, 5632, 1), (32768, 8, 1), 1),
+        ((1, 8, 4, 8), (16, 8, 4, 8), (256, 32, 8, 1), (256, 32, 8, 1), 0),
     ]
     _TENSOR_DTYPES_ = [np.float16, np.float32]
 
diff --git a/test/infiniop/libinfiniop/op_register.py b/test/infiniop/libinfiniop/op_register.py
index 7f147a8b2..4e3ad70db 100644
--- a/test/infiniop/libinfiniop/op_register.py
+++ b/test/infiniop/libinfiniop/op_register.py
@@ -520,4 +520,36 @@ def reduce_mean_(lib):
     lib.infiniopDestroyReduceMeanDescriptor.restype = c_int32
     lib.infiniopDestroyReduceMeanDescriptor.argtypes = [
         infiniopOperatorDescriptor_t,
+    ]
+
+@OpRegister.operator
+def reduce_max_(lib):
+    lib.infiniopCreateReduceMaxDescriptor.restype = c_int32
+    lib.infiniopCreateReduceMaxDescriptor.argtypes = [
+        infiniopHandle_t,
+        POINTER(infiniopOperatorDescriptor_t),
+        infiniopTensorDescriptor_t,
+        infiniopTensorDescriptor_t,
+        c_size_t,
+    ]
+
+    lib.infiniopGetReduceMaxWorkspaceSize.restype = c_int32
+    lib.infiniopGetReduceMaxWorkspaceSize.argtypes = [
+        infiniopOperatorDescriptor_t,
+        POINTER(c_size_t),
+    ]
+
+    lib.infiniopReduceMax.restype = c_int32
+    lib.infiniopReduceMax.argtypes = [
+        infiniopOperatorDescriptor_t,
+        c_void_p,
+        c_size_t,
+        c_void_p,
+        c_void_p,
+        c_void_p,
+    ]
+
+    lib.infiniopDestroyReduceMaxDescriptor.restype = c_int32
+    lib.infiniopDestroyReduceMaxDescriptor.argtypes = [
+        infiniopOperatorDescriptor_t,
     ]
\ No newline at end of file
diff --git a/test/infiniop/reduce_max.py b/test/infiniop/reduce_max.py
new file mode 100644
index 000000000..8e27ae484
--- /dev/null
+++ b/test/infiniop/reduce_max.py
@@ -0,0 +1,150 @@
+import torch
+import ctypes
+from ctypes import c_uint64
+from libinfiniop import (
+    LIBINFINIOP,
+    TestTensor,
+    get_test_devices,
+    check_error,
+    test_operator,
+    get_args,
+    debug,
+    get_tolerance,
+    profile_operation,
+    TestWorkspace,
+    InfiniDtype,
+    InfiniDtypeNames,
+    InfiniDeviceNames,
+    infiniopOperatorDescriptor_t,
+)
+
+# ==============================================================================
+#  Configuration (Internal Use Only)
+# ==============================================================================
+# These are not meant to be imported from other modules
+_TEST_CASES_ = [
+    # y_shape, x_shape, y_stride, x_stride, dim
+    ((), (), None, None, 0),
+    ((1, ), (32, ), None, None, 0),
+    ((1, 4), (1, 4), None, None, 0),
+    ((1, 1), (1, 4), None, None, 1),
+    ((16, 1), (16, 2048), None, None, 1),
+    ((1, 16), (2048, 16), None, None, 0),
+    ((16, 1), (16, 2048), (4096, 1), (4096, 1), 1),
+    ((1, 2048), (16, 2048), (4096, 1), (4096, 1), 0),
+    ((4, 4, 1), (4, 4, 2048), None, None, 2),
+    ((1, 4, 4), (2048, 4, 4), None, None, 0),
+    ((4, 1, 4), (4, 2048, 4), (45056, 5632, 1), (32768, 8, 1), 1),
+]
+
+# x types used for testing
+_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.BF16, InfiniDtype.F32]
+
+_TEST_CASES = _TEST_CASES_
+
+# Tolerance map for different data types
+_TOLERANCE_MAP = {
+    InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3},
+    InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2},
+    InfiniDtype.F32: {"atol": 1e-4, "rtol": 1e-4},
+}
+
+DEBUG = False
+PROFILE = False
+NUM_PRERUN = 10
+NUM_ITERATIONS = 1000
+
+
+def reduce_max(x, dim):
+    return x.max(dim=dim, keepdim=True)[0]
+
+
+def test(
+    handle,
+    device,
+    y_shape,
+    x_shape,
+    y_stride,
+    x_stride,
+    dim,
+    dtype=InfiniDtype.F16,
+    sync=None,
+):
+    print(
+        f"Testing Reduce_Max on {InfiniDeviceNames[device]} with y_shape:{y_shape} x_shape:{x_shape}"
+        f" y_stride:{y_stride} x_stride:{x_stride} dim:{dim} dtype:{InfiniDtypeNames[dtype]}"
+    )
+
+    x = TestTensor(x_shape, x_stride, dtype, device)
+    ans = reduce_max(x.torch_tensor(), dim)
+
+    y = TestTensor(y_shape, y_stride, dtype, device)
+
+    if sync is not None:
+        sync()
+
+    descriptor = infiniopOperatorDescriptor_t()
+    check_error(
+        LIBINFINIOP.infiniopCreateReduceMaxDescriptor(
+            handle, ctypes.byref(descriptor), y.descriptor, x.descriptor, ctypes.c_size_t(dim)
+        )
+    )
+
+    # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
+    x.destroy_desc()
+    y.destroy_desc()
+
+    workspace_size = c_uint64(0)
+    check_error(
+        LIBINFINIOP.infiniopGetReduceMaxWorkspaceSize(
+            descriptor, ctypes.byref(workspace_size)
+        )
+    )
+    workspace = TestWorkspace(workspace_size.value, x.device)
+
+    def lib_reduce_max():
+        check_error(
+            LIBINFINIOP.infiniopReduceMax(
+                descriptor,
+                workspace.data(),
+                workspace_size.value,
+                y.data(),
+                x.data(),
+                None,
+            )
+        )
+
+    lib_reduce_max()
+
+    if sync is not None:
+        sync()
+
+    atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
+    if DEBUG:
+        debug(y.actual_tensor(), ans, atol=atol, rtol=rtol)
+    assert torch.allclose(y.actual_tensor(), ans, atol=atol, rtol=rtol)
+
+    # Profiling workflow
+    if PROFILE:
+        # fmt: off
+        profile_operation("PyTorch", lambda: reduce_max(x.torch_tensor(), dim), device, NUM_PRERUN, NUM_ITERATIONS)
+        profile_operation("    lib", lambda: lib_reduce_max(), device, NUM_PRERUN, NUM_ITERATIONS)
+        # fmt: on
+
+    check_error(LIBINFINIOP.infiniopDestroyReduceMaxDescriptor(descriptor))
+
+
+if __name__ == "__main__":
+    args = get_args()
+
+    # Configure testing options
+    DEBUG = args.debug
+    PROFILE = args.profile
+    NUM_PRERUN = args.num_prerun
+    NUM_ITERATIONS = args.num_iterations
+
+    # Execute tests
+    for device in get_test_devices(args):
+        test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES)
+
+    print("\033[92mTest passed!\033[0m")
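One detail worth noting in the reference above: unlike NumPy's max, torch.Tensor.max
with a dim argument returns a (values, indices) pair, which is why reduce_max indexes
[0]. A quick standalone illustration (not part of the test suite):

    import torch

    x = torch.randn(16, 2048)
    values, indices = x.max(dim=1, keepdim=True)  # torch returns both
    assert values.shape == (16, 1) and indices.shape == (16, 1)
    assert torch.equal(values, x.max(dim=1, keepdim=True)[0])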
diff --git a/test/infiniop/reduce_mean.py b/test/infiniop/reduce_mean.py
index e56663e44..3b752b671 100644
--- a/test/infiniop/reduce_mean.py
+++ b/test/infiniop/reduce_mean.py
@@ -44,9 +44,9 @@
 # Tolerance map for different data types
 _TOLERANCE_MAP = {
-    InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-2},
-    InfiniDtype.BF16: {"atol": 5e-3, "rtol": 5e-2},
-    InfiniDtype.F32: {"atol": 1e-5, "rtol": 1e-5},
+    InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3},
+    InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2},
+    InfiniDtype.F32: {"atol": 1e-4, "rtol": 1e-4},
 }
 
 DEBUG = False

From e547f3e5e9f9dfea856879d1f1b5352828b6be25 Mon Sep 17 00:00:00 2001
From: S-hhhhh <2320230838@mail.nankai.edu.cn>
Date: Sun, 17 Aug 2025 10:43:17 +0000
Subject: [PATCH 06/11] fix bugs

---
 .../ops/reduce_max/nvidia/reduce_max_nvidia.cu   | 10 ++++++----
 .../ops/reduce_mean/nvidia/reduce_mean_nvidia.cu | 10 ++++++----
 2 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/src/infiniop/ops/reduce_max/nvidia/reduce_max_nvidia.cu b/src/infiniop/ops/reduce_max/nvidia/reduce_max_nvidia.cu
index e95ea69f8..3aa8e5ff5 100644
--- a/src/infiniop/ops/reduce_max/nvidia/reduce_max_nvidia.cu
+++ b/src/infiniop/ops/reduce_max/nvidia/reduce_max_nvidia.cu
@@ -84,12 +84,14 @@ infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size,
             _info.x_strides[0], _info.x_strides[1], _info.x_strides[2], _info.x_strides[3], stream));
     } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_512) {
         CHECK_STATUS(launchKernel<CUDA_BLOCK_SIZE_512>(
-            y, x, _info.dtype, _info.shape[0], _info.shape[1], _info.shape[2],
-            _info.y_strides[0], _info.y_strides[1], _info.x_strides[0], _info.x_strides[1], _info.x_strides[2], stream));
+            y, x, _info.dtype, _info.shape[0], _info.shape[1], _info.shape[2], _info.shape[3],
+            _info.y_strides[0], _info.y_strides[1], _info.y_strides[2],
+            _info.x_strides[0], _info.x_strides[1], _info.x_strides[2], _info.x_strides[3], stream));
     } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_4096) {
         CHECK_STATUS(launchKernel<CUDA_BLOCK_SIZE_4096>(
-            y, x, _info.dtype, _info.shape[0], _info.shape[1], _info.shape[2],
-            _info.y_strides[0], _info.y_strides[1], _info.x_strides[0], _info.x_strides[1], _info.x_strides[2], stream));
+            y, x, _info.dtype, _info.shape[0], _info.shape[1], _info.shape[2], _info.shape[3],
+            _info.y_strides[0], _info.y_strides[1], _info.y_strides[2],
+            _info.x_strides[0], _info.x_strides[1], _info.x_strides[2], _info.x_strides[3], stream));
     } else {
         return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED;
     }
diff --git a/src/infiniop/ops/reduce_mean/nvidia/reduce_mean_nvidia.cu b/src/infiniop/ops/reduce_mean/nvidia/reduce_mean_nvidia.cu
index 1487e080f..26fd4e8ee 100644
--- a/src/infiniop/ops/reduce_mean/nvidia/reduce_mean_nvidia.cu
+++ b/src/infiniop/ops/reduce_mean/nvidia/reduce_mean_nvidia.cu
@@ -84,12 +84,14 @@ infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size,
             _info.x_strides[0], _info.x_strides[1], _info.x_strides[2], _info.x_strides[3], stream));
     } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_512) {
         CHECK_STATUS(launchKernel<CUDA_BLOCK_SIZE_512>(
-            y, x, _info.dtype, _info.shape[0], _info.shape[1], _info.shape[2],
-            _info.y_strides[0], _info.y_strides[1], _info.x_strides[0], _info.x_strides[1], _info.x_strides[2], stream));
+            y, x, _info.dtype, _info.shape[0], _info.shape[1], _info.shape[2], _info.shape[3],
+            _info.y_strides[0], _info.y_strides[1], _info.y_strides[2],
+            _info.x_strides[0], _info.x_strides[1], _info.x_strides[2], _info.x_strides[3], stream));
     } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_4096) {
         CHECK_STATUS(launchKernel<CUDA_BLOCK_SIZE_4096>(
-            y, x, _info.dtype, _info.shape[0], _info.shape[1], _info.shape[2],
-            _info.y_strides[0], _info.y_strides[1], _info.x_strides[0], _info.x_strides[1], _info.x_strides[2], stream));
+            y, x, _info.dtype, _info.shape[0], _info.shape[1], _info.shape[2], _info.shape[3],
+            _info.y_strides[0], _info.y_strides[1], _info.y_strides[2],
+            _info.x_strides[0], _info.x_strides[1], _info.x_strides[2], _info.x_strides[3], stream));
     } else {
         return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED;
     }

From 9324626942a5bff8a5cec1c1e85548002581ecb7 Mon Sep 17 00:00:00 2001
From: S-hhhhh <2320230838@mail.nankai.edu.cn>
Date: Sun, 17 Aug 2025 14:05:12 +0000
Subject: [PATCH 07/11] update

---
 src/infiniop/ops/reduce_mean/nvidia/kernel.cuh | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/infiniop/ops/reduce_mean/nvidia/kernel.cuh b/src/infiniop/ops/reduce_mean/nvidia/kernel.cuh
index 44fb2226e..54914d98a 100644
--- a/src/infiniop/ops/reduce_mean/nvidia/kernel.cuh
+++ b/src/infiniop/ops/reduce_mean/nvidia/kernel.cuh
@@ -12,7 +12,6 @@ __device__ void ReduceMeanKernel(
     const Tdata *x = x_ + blockIdx.x * x_stride_b + blockIdx.y * x_stride_c + blockIdx.z * x_stride_h;
 
     // [Reduce] Find the sum of each updated row and store in shared memory
-    __shared__ Tcompute mean_;
     Tcompute sum_0 = op::common_cuda::reduce_op::sum(x, width, x_stride_w);
     if (threadIdx.x == 0) {
         // mean_ = sum_0/width;

From 095bb1873091763f8f215d5c7d353c9e064c20f0 Mon Sep 17 00:00:00 2001
From: S-hhhhh <2320230838@mail.nankai.edu.cn>
Date: Sun, 17 Aug 2025 14:05:56 +0000
Subject: [PATCH 08/11] update

---
 src/infiniop/ops/reduce_max/nvidia/kernel.cuh | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/infiniop/ops/reduce_max/nvidia/kernel.cuh b/src/infiniop/ops/reduce_max/nvidia/kernel.cuh
index 51470d75f..864db1047 100644
--- a/src/infiniop/ops/reduce_max/nvidia/kernel.cuh
+++ b/src/infiniop/ops/reduce_max/nvidia/kernel.cuh
@@ -12,7 +12,6 @@ __device__ void ReduceMaxKernel(
     const Tdata *x = x_ + blockIdx.x * x_stride_b + blockIdx.y * x_stride_c + blockIdx.z * x_stride_h;
 
     // [Reduce] Find the max of each updated row and store in shared memory
-    __shared__ Tcompute max_;
     Tcompute max_0 = op::common_cuda::reduce_op::max(x, width, x_stride_w);
     if (threadIdx.x == 0) {
         *y = max_0;

From fb6ff6bf6e92761eed6806a2e37354164d4bb13d Mon Sep 17 00:00:00 2001
From: S-hhhhh <2320230838@mail.nankai.edu.cn>
Date: Sun, 24 Aug 2025 15:31:23 +0800
Subject: [PATCH 09/11] formatted

---
 src/infiniop-test/include/ops.hpp | 31 ++--
 src/infiniop-test/include/test.hpp | 70 ++++-----
 src/infiniop-test/src/main.cpp | 7 +-
 src/infiniop-test/src/ops/reduce_max.cpp | 22 +--
 src/infiniop-test/src/ops/reduce_mean.cpp | 22 +--
 src/infiniop-test/src/test.cpp | 11 +-
 .../ops/reduce_max/cpu/reduce_max_cpu.cc | 22 +--
 src/infiniop/ops/reduce_max/info.h | 12 +-
 src/infiniop/ops/reduce_max/nvidia/kernel.cuh | 3 +-
 .../reduce_max/nvidia/reduce_max_nvidia.cu | 8 +-
 src/infiniop/ops/reduce_max/operator.cc | 136 +++++++++---------
 src/infiniop/ops/reduce_max/reduce_max.h | 2 +-
 .../ops/reduce_mean/cpu/reduce_mean_cpu.cc | 22 +--
 src/infiniop/ops/reduce_mean/info.h | 12 +-
 .../ops/reduce_mean/nvidia/kernel.cuh | 4 +-
 .../reduce_mean/nvidia/reduce_mean_nvidia.cu | 8 +-
 src/infiniop/ops/reduce_mean/operator.cc | 132 ++++++++--------
 src/infiniop/ops/reduce_mean/reduce_mean.h | 6 +-
 src/infiniop/reduce/cuda/reduce.cuh | 12 +-
 src/infinirt/infinirt_impl.h | 3 +-
 src/utils.h | 2 +-
 src/utils/check.h | 32 ++---
 test/infiniop-test/test_generate/__init__.py | 9 +-
 .../test_generate/testcases/add.py | 30 ++--
 .../test_generate/testcases/causal_softmax.py | 26 +++-
 .../test_generate/testcases/clip.py | 87
++++++----- .../test_generate/testcases/mul.py | 53 ++++--- .../test_generate/testcases/rearrange.py | 70 ++++++--- .../test_generate/testcases/reduce_max.py | 58 +++++--- .../test_generate/testcases/reduce_mean.py | 58 +++++--- .../test_generate/testcases/rms_norm.py | 42 ++++-- .../test_generate/testcases/rope.py | 44 ++++-- .../test_generate/testcases/swiglu.py | 29 +++- test/infiniop/libinfiniop/op_register.py | 5 +- test/infiniop/reduce_max.py | 12 +- test/infiniop/reduce_mean.py | 10 +- 36 files changed, 646 insertions(+), 466 deletions(-) diff --git a/src/infiniop-test/include/ops.hpp b/src/infiniop-test/include/ops.hpp index 29e4e633e..bd3a3fd38 100644 --- a/src/infiniop-test/include/ops.hpp +++ b/src/infiniop-test/include/ops.hpp @@ -32,22 +32,21 @@ DECLARE_INFINIOP_TEST(reduce_max) /* * Register all the tests here */ -#define TEST_BUILDER_MAPPINGS \ - { \ - REGISTER_INFINIOP_TEST(gemm) \ - REGISTER_INFINIOP_TEST(random_sample) \ - REGISTER_INFINIOP_TEST(add) \ - REGISTER_INFINIOP_TEST(mul) \ - REGISTER_INFINIOP_TEST(clip) \ - REGISTER_INFINIOP_TEST(swiglu) \ - REGISTER_INFINIOP_TEST(rope) \ - REGISTER_INFINIOP_TEST(rms_norm) \ - REGISTER_INFINIOP_TEST(causal_softmax) \ - REGISTER_INFINIOP_TEST(rearrange) \ - REGISTER_INFINIOP_TEST(sub) \ - REGISTER_INFINIOP_TEST(reduce_mean) \ - REGISTER_INFINIOP_TEST(reduce_max) \ - } +#define TEST_BUILDER_MAPPINGS \ + { \ + REGISTER_INFINIOP_TEST(gemm) \ + REGISTER_INFINIOP_TEST(random_sample) \ + REGISTER_INFINIOP_TEST(add) \ + REGISTER_INFINIOP_TEST(mul) \ + REGISTER_INFINIOP_TEST(clip) \ + REGISTER_INFINIOP_TEST(swiglu) \ + REGISTER_INFINIOP_TEST(rope) \ + REGISTER_INFINIOP_TEST(rms_norm) \ + REGISTER_INFINIOP_TEST(causal_softmax) \ + REGISTER_INFINIOP_TEST(rearrange) \ + REGISTER_INFINIOP_TEST(sub) \ + REGISTER_INFINIOP_TEST(reduce_mean) \ + REGISTER_INFINIOP_TEST(reduce_max)} namespace infiniop_test { diff --git a/src/infiniop-test/include/test.hpp b/src/infiniop-test/include/test.hpp index 58b56b877..277061029 100644 --- a/src/infiniop-test/include/test.hpp +++ b/src/infiniop-test/include/test.hpp @@ -47,7 +47,7 @@ std::vector> runAllTests( const GGUFFileReader &, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations, - double rtol, double atol, bool equal_nan=false); + double rtol, double atol, bool equal_nan = false); // Run a single test read from a GGUF file std::shared_ptr runTest( @@ -55,8 +55,8 @@ std::shared_ptr runTest( infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations, double rtol, double atol, - size_t test_id, - bool equal_nan=false); + size_t test_id, + bool equal_nan = false); // Check if two tensors are close within given tolerance void allClose(std::shared_ptr actual, std::shared_ptr expected, double rtol = 1e-3, double atol = 1e-3, bool equal_nan = false); @@ -82,38 +82,38 @@ class Test { } // namespace infiniop_test::base // Quick macro for declaring a new testcase -#define DECLARE_INFINIOP_TEST(name) \ - namespace infiniop_test::name { \ - class Test : public infiniop_test::base::Test { \ - double _rtol, _atol; \ - bool _equal_nan; \ - \ - public: \ - static std::string op_name() { return #name; } \ - static std::shared_ptr build( \ - std::unordered_map> attributes, \ - std::unordered_map> tensors, \ - double, double, bool); \ - \ - static std::vector attribute_names(); \ - static std::vector tensor_names(); \ - static std::vector output_names(); \ - \ - std::shared_ptr run( \ - infiniopHandle_t handle, infiniDevice_t device, int device_id, \ - size_t warm_ups, size_t 
iterations) override; \ - \ - std::string toString() const override; \ - \ - ~Test(); \ - \ - private: \ - struct Attributes; \ - Attributes *_attributes; \ - Test() = delete; \ - Test(double rtol, double atol, bool equal_nan = false) \ - : _rtol(rtol), _atol(atol), _equal_nan(equal_nan) {} \ - }; \ +#define DECLARE_INFINIOP_TEST(name) \ + namespace infiniop_test::name { \ + class Test : public infiniop_test::base::Test { \ + double _rtol, _atol; \ + bool _equal_nan; \ + \ + public: \ + static std::string op_name() { return #name; } \ + static std::shared_ptr build( \ + std::unordered_map> attributes, \ + std::unordered_map> tensors, \ + double, double, bool); \ + \ + static std::vector attribute_names(); \ + static std::vector tensor_names(); \ + static std::vector output_names(); \ + \ + std::shared_ptr run( \ + infiniopHandle_t handle, infiniDevice_t device, int device_id, \ + size_t warm_ups, size_t iterations) override; \ + \ + std::string toString() const override; \ + \ + ~Test(); \ + \ + private: \ + struct Attributes; \ + Attributes *_attributes; \ + Test() = delete; \ + Test(double rtol, double atol, bool equal_nan = false) \ + : _rtol(rtol), _atol(atol), _equal_nan(equal_nan) {} \ + }; \ } namespace infiniop_test { diff --git a/src/infiniop-test/src/main.cpp b/src/infiniop-test/src/main.cpp index 9436cd85a..6805bd7f8 100644 --- a/src/infiniop-test/src/main.cpp +++ b/src/infiniop-test/src/main.cpp @@ -1,8 +1,8 @@ #include "gguf.hpp" #include "test.hpp" +#include #include #include -#include struct ParsedArgs { std::string file_path; // Mandatory argument: test.gguf file path infiniDevice_t device_type = INFINI_DEVICE_CPU; // Default to CPU @@ -96,8 +96,9 @@ ParsedArgs parseArgs(int argc, char *argv[]) { args.rtol = std::stod(argv[++i]); } else if (arg == "--equal-nan" && i + 1 < argc) { - args.equal_nan = (strcmp(argv[++i], "True") == 0 || strcmp(argv[i], "true") == 0) - ? true : false; + args.equal_nan = (strcmp(argv[++i], "True") == 0 || strcmp(argv[i], "true") == 0) + ? 
true
+                                 : false;
         } else {
             printUsage();
diff --git a/src/infiniop-test/src/ops/reduce_max.cpp b/src/infiniop-test/src/ops/reduce_max.cpp
index 1b10d0e02..37045ec78 100644
--- a/src/infiniop-test/src/ops/reduce_max.cpp
+++ b/src/infiniop-test/src/ops/reduce_max.cpp
@@ -40,9 +40,9 @@
 
     infiniopReduceMaxDescriptor_t op_desc;
     CHECK_OR(infiniopCreateReduceMaxDescriptor(handle, &op_desc,
-                                             _attributes->output->desc(),
-                                             _attributes->input->desc(),
-                                             _attributes->dim),
+                                               _attributes->output->desc(),
+                                               _attributes->input->desc(),
+                                               _attributes->dim),
              return TEST_FAILED(OP_CREATION_FAILED, "Failed to create ReduceMax descriptor"));
 
     auto input = _attributes->input->to(device, device_id);
@@ -58,10 +58,10 @@
     }
 
     CHECK_OR(infiniopReduceMax(op_desc,
-                             workspace, workspace_size,
-                             output->data(),
-                             input->data(),
-                             nullptr),
+                               workspace, workspace_size,
+                               output->data(),
+                               input->data(),
+                               nullptr),
              return TEST_FAILED(OP_EXECUTION_FAILED, "ReduceMax execution failed"));
 
     try {
@@ -75,10 +75,10 @@
     elapsed_time = benchmark(
         [=]() {
             infiniopReduceMax(op_desc,
-                            workspace, workspace_size,
-                            output->data(),
-                            input->data(),
-                            nullptr);
+                              workspace, workspace_size,
+                              output->data(),
+                              input->data(),
+                              nullptr);
         },
         warm_ups, iterations);
diff --git a/src/infiniop-test/src/ops/reduce_mean.cpp b/src/infiniop-test/src/ops/reduce_mean.cpp
index 4f1aa4bdd..9de22d49d 100644
--- a/src/infiniop-test/src/ops/reduce_mean.cpp
+++ b/src/infiniop-test/src/ops/reduce_mean.cpp
@@ -40,9 +40,9 @@
 
     infiniopReduceMeanDescriptor_t op_desc;
     CHECK_OR(infiniopCreateReduceMeanDescriptor(handle, &op_desc,
-                                              _attributes->output->desc(),
-                                              _attributes->input->desc(),
-                                              _attributes->dim),
+                                                _attributes->output->desc(),
+                                                _attributes->input->desc(),
+                                                _attributes->dim),
              return TEST_FAILED(OP_CREATION_FAILED, "Failed to create ReduceMean descriptor"));
 
     auto input = _attributes->input->to(device, device_id);
@@ -58,10 +58,10 @@
     }
 
     CHECK_OR(infiniopReduceMean(op_desc,
-                              workspace, workspace_size,
-                              output->data(),
-                              input->data(),
-                              nullptr),
+                                workspace, workspace_size,
+                                output->data(),
+                                input->data(),
+                                nullptr),
              return TEST_FAILED(OP_EXECUTION_FAILED, "ReduceMean execution failed"));
 
     try {
@@ -75,10 +75,10 @@
     elapsed_time = benchmark(
         [=]() {
             infiniopReduceMean(op_desc,
-                             workspace, workspace_size,
-                             output->data(),
-                             input->data(),
-                             nullptr);
+                               workspace, workspace_size,
+                               output->data(),
+                               input->data(),
+                               nullptr);
         },
         warm_ups, iterations);
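The allClose hunk below gives NaNs the same treatment as numpy.allclose: by default any
NaN is a mismatch, while with equal_nan a NaN only fails when it is paired with a
non-NaN on the other side. A standalone NumPy illustration of the intended semantics
(illustration only, not part of the test suite):

    import numpy as np

    a = np.array([1.0, np.nan])
    b = np.array([1.0, np.nan])
    assert not np.allclose(a, b)              # NaN != NaN by default
    assert np.allclose(a, b, equal_nan=True)  # matching NaNs are accepted
    assert not np.allclose(a, np.array([1.0, 2.0]), equal_nan=True)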
diff --git a/src/infiniop-test/src/test.cpp b/src/infiniop-test/src/test.cpp
index d4c0bab49..0cbfe067a 100644
--- a/src/infiniop-test/src/test.cpp
+++ b/src/infiniop-test/src/test.cpp
@@ -158,15 +158,14 @@ void allClose(std::shared_ptr<Tensor> actual_, std::shared_ptr<Tensor> expected_
     for (size_t i = 0; i < total; i++) {
         double a_ = getVal((char *)actual->data() + actual_offset, actual->ggml_type());
         double e_ = getVal((char *)expected->data() + expected_offset, expected->ggml_type());
-        if (std::isnan(a_) || std::isnan(e_)){
-            if ((equal_nan && (std::isnan(a_) != std::isnan(e_))) || !equal_nan){
-                num_failed ++;
+        if (std::isnan(a_) || std::isnan(e_)) {
+            if ((equal_nan && (std::isnan(a_) != std::isnan(e_))) || !equal_nan) {
                 if (num_failed == 0) {
                     first_failed_msg = "First failed at index " + std::to_string(i) + " with value " + std::to_string(a_) + " but should be " + std::to_string(e_) + ".";
                 }
+                num_failed++;
             }
-        }
-        else{
+        } else {
             if (std::fabs(a_ - e_) > atol && std::fabs(a_ - e_) > rtol * std::fmax(std::fabs(a_), std::fabs(e_))) {
                 if (num_failed == 0) {
                     first_failed_msg = "First failed at index " + std::to_string(i) + " with value " + std::to_string(a_) + " but should be " + std::to_string(e_) + ".";
@@ -174,7 +173,7 @@ void allClose(std::shared_ptr<Tensor> actual_, std::shared_ptr<Tensor> expected_
                 num_failed++;
             }
         }
-        
+
         incrementOffset(actual_offset, actual->strides(), ggmlTypeSize(actual->ggml_type()),
                         expected_offset, expected->strides(), ggmlTypeSize(expected->ggml_type()),
                         counter, shape);
diff --git a/src/infiniop/ops/reduce_max/cpu/reduce_max_cpu.cc b/src/infiniop/ops/reduce_max/cpu/reduce_max_cpu.cc
index 409a4ca02..5ed28d75e 100644
--- a/src/infiniop/ops/reduce_max/cpu/reduce_max_cpu.cc
+++ b/src/infiniop/ops/reduce_max/cpu/reduce_max_cpu.cc
@@ -23,7 +23,7 @@ infiniStatus_t reduce_max(const ReduceMaxInfo *info, T *y, const T *x) {
     const size_t batch_size = info->shape[0];
     const size_t channels = info->shape[1];
     const size_t rows = info->shape[2];
-    const size_t cols = info->shape[3];  // reduction dimension
+    const size_t cols = info->shape[3]; // reduction dimension
 
     const ptrdiff_t y_batch_stride = info->y_strides[0];
     const ptrdiff_t y_channel_stride = info->y_strides[1];
@@ -32,20 +32,20 @@ infiniStatus_t reduce_max(const ReduceMaxInfo *info, T *y, const T *x) {
     const ptrdiff_t x_channel_stride = info->x_strides[1];
     const ptrdiff_t x_row_stride = info->x_strides[2];
     const ptrdiff_t x_col_stride = info->x_strides[3];
-    
-    #pragma omp parallel for collapse(3)
+
+#pragma omp parallel for collapse(3)
     for (size_t batch = 0; batch < batch_size; ++batch) {
         for (size_t channel = 0; channel < channels; ++channel) {
             for (size_t row = 0; row < rows; ++row) {
-                const T* input_start = x + batch * x_batch_stride
-                                      + channel * x_channel_stride
-                                      + row * x_row_stride;
-                T* output_ptr = y + batch * y_batch_stride
-                               + channel * y_channel_stride
+                const T *input_start = x + batch * x_batch_stride
+                                       + channel * x_channel_stride
+                                       + row * x_row_stride;
+                T *output_ptr = y + batch * y_batch_stride
+                                + channel * y_channel_stride
                                 + row * y_row_stride;
-                
+
                 float max = op::common_cpu::reduce_op::max(input_start, cols, x_col_stride);
-                
+
                 if constexpr (std::is_same<T, fp16_t>::value || std::is_same<T, bf16_t>::value) {
                     *output_ptr = utils::cast<T>(max);
                 } else {
@@ -54,7 +54,7 @@ infiniStatus_t reduce_max(const ReduceMaxInfo *info, T *y, const T *x) {
             }
         }
     }
-    
+
     return INFINI_STATUS_SUCCESS;
 }
 infiniStatus_t Descriptor::calculate(
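The info.h change shown next normalizes every request to one fixed layout: the reduced
axis is swapped to the last position and the shape is left-padded with size-1 axes
until it is 4-D, so both the CPU and CUDA backends only ever reduce the last axis of a
4-D view. A pure-Python sketch of that bookkeeping (normalize_to_4d is a hypothetical
helper, not code from the repo; strides shown are for a contiguous (4, 2048, 4)
tensor):

    def normalize_to_4d(shape, strides, dim):
        # Sketch of ReduceMaxInfo::create's normalization: move the reduced
        # axis to the end, then left-pad with size-1 axes (stride 0) to 4-D.
        shape, strides = list(shape), list(strides)
        last = len(shape) - 1
        if dim != last:
            shape[dim], shape[last] = shape[last], shape[dim]
            strides[dim], strides[last] = strides[last], strides[dim]
        while len(shape) < 4:
            shape.insert(0, 1)
            strides.insert(0, 0)
        return shape, strides

    # Reducing dim=1 of a (4, 2048, 4) tensor yields the view
    # ([1, 4, 4, 2048], [0, 8192, 1, 4]): last axis is the reduction.
    print(normalize_to_4d([4, 2048, 4], [8192, 4, 1], 1))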
diff --git a/src/infiniop/ops/reduce_max/info.h b/src/infiniop/ops/reduce_max/info.h
index 0babca6e6..99bb5be19 100644
--- a/src/infiniop/ops/reduce_max/info.h
+++ b/src/infiniop/ops/reduce_max/info.h
@@ -29,24 +29,24 @@ class ReduceMaxInfo {
             CHECK_STATUS(INFINI_STATUS_BAD_TENSOR_SHAPE);
         }
         CHECK_REDUCE_SHAPE(x_desc->shape(), dim, y_desc->shape());
-        if (ndim > 4) return INFINI_STATUS_BAD_TENSOR_SHAPE;
-        else if (ndim == 0){
+        if (ndim > 4) {
+            return INFINI_STATUS_BAD_TENSOR_SHAPE;
+        } else if (ndim == 0) {
             std::vector<size_t> shape = {1, 1, 1, 1};
             std::vector<ptrdiff_t> y_strides = {0, 0, 0, 0};
             std::vector<ptrdiff_t> x_strides = {0, 0, 0, 0};
             return utils::Result<ReduceMaxInfo>(ReduceMaxInfo{
                 dtype, shape, y_strides, x_strides});
-        }
-        else{
+        } else {
             std::vector<size_t> shape = x_desc->shape();
             std::vector<ptrdiff_t> y_strides = y_desc->strides();
             std::vector<ptrdiff_t> x_strides = x_desc->strides();
-            if (dim != (shape.size() - 1)){
+            if (dim != (shape.size() - 1)) {
                 std::swap(shape[dim], shape[shape.size() - 1]);
                 std::swap(y_strides[dim], y_strides[shape.size() - 1]);
                 std::swap(x_strides[dim], x_strides[shape.size() - 1]);
             }
-            while (shape.size() < 4){
+            while (shape.size() < 4) {
                 shape.insert(shape.begin(), 1);
                 y_strides.insert(y_strides.begin(), 0);
                 x_strides.insert(x_strides.begin(), 0);
diff --git a/src/infiniop/ops/reduce_max/nvidia/kernel.cuh b/src/infiniop/ops/reduce_max/nvidia/kernel.cuh
index 864db1047..fec2f9341 100644
--- a/src/infiniop/ops/reduce_max/nvidia/kernel.cuh
+++ b/src/infiniop/ops/reduce_max/nvidia/kernel.cuh
@@ -6,7 +6,7 @@ __device__ void ReduceMaxKernel(
     Tdata *y_, const Tdata *x_,
     size_t batch, size_t channels, size_t height, size_t width,
     ptrdiff_t y_stride_b, ptrdiff_t y_stride_c, ptrdiff_t y_stride_h,
-    ptrdiff_t x_stride_b, ptrdiff_t x_stride_c, ptrdiff_t x_stride_h, ptrdiff_t x_stride_w) {    
+    ptrdiff_t x_stride_b, ptrdiff_t x_stride_c, ptrdiff_t x_stride_h, ptrdiff_t x_stride_w) {
 
     Tdata *y = y_ + blockIdx.x * y_stride_b + blockIdx.y * y_stride_c + blockIdx.z * y_stride_h;
     const Tdata *x = x_ + blockIdx.x * x_stride_b + blockIdx.y * x_stride_c + blockIdx.z * x_stride_h;
@@ -16,7 +16,6 @@ __device__ void ReduceMaxKernel(
     if (threadIdx.x == 0) {
         *y = max_0;
     }
-    
 }
 
 #endif // __REDUCE_MAX_KERNEL_CUH__
diff --git a/src/infiniop/ops/reduce_max/nvidia/reduce_max_nvidia.cu b/src/infiniop/ops/reduce_max/nvidia/reduce_max_nvidia.cu
index 3aa8e5ff5..f64f596f4 100644
--- a/src/infiniop/ops/reduce_max/nvidia/reduce_max_nvidia.cu
+++ b/src/infiniop/ops/reduce_max/nvidia/reduce_max_nvidia.cu
@@ -47,7 +47,7 @@ infiniStatus_t launchKernel(void *y, const void *x, infiniDtype_t dtype,
                             ptrdiff_t y_stride_b, ptrdiff_t y_stride_c, ptrdiff_t y_stride_h,
                             ptrdiff_t x_stride_b, ptrdiff_t x_stride_c, ptrdiff_t x_stride_h, ptrdiff_t x_stride_w,
                             cudaStream_t stream) {
-    dim3 grid=dim3(uint32_t(batch_size), uint32_t(channels), uint32_t(height));
+    dim3 grid = dim3(uint32_t(batch_size), uint32_t(channels), uint32_t(height));
     if (dtype == INFINI_DTYPE_F16) {
         ReduceMax<BLOCK_SIZE>
             <<<grid, BLOCK_SIZE, 0, stream>>>((half *)y, (const half *)x,
@@ -80,17 +80,17 @@ infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size,
     if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_1024) {
         CHECK_STATUS(launchKernel<CUDA_BLOCK_SIZE_1024>(
             y, x, _info.dtype, _info.shape[0], _info.shape[1], _info.shape[2], _info.shape[3],
-            _info.y_strides[0], _info.y_strides[1], _info.y_strides[2], 
+            _info.y_strides[0], _info.y_strides[1], _info.y_strides[2],
             _info.x_strides[0], _info.x_strides[1], _info.x_strides[2], _info.x_strides[3], stream));
     } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_512) {
         CHECK_STATUS(launchKernel<CUDA_BLOCK_SIZE_512>(
             y, x, _info.dtype, _info.shape[0], _info.shape[1], _info.shape[2], _info.shape[3],
-            _info.y_strides[0], _info.y_strides[1], _info.y_strides[2], 
+            _info.y_strides[0], _info.y_strides[1], _info.y_strides[2],
             _info.x_strides[0], _info.x_strides[1], _info.x_strides[2], _info.x_strides[3], stream));
     } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_4096) {
         CHECK_STATUS(launchKernel<CUDA_BLOCK_SIZE_4096>(
             y, x, _info.dtype, _info.shape[0], _info.shape[1], _info.shape[2], _info.shape[3],
-            _info.y_strides[0], _info.y_strides[1], _info.y_strides[2], 
+            _info.y_strides[0], _info.y_strides[1], _info.y_strides[2],
             _info.x_strides[0], _info.x_strides[1], _info.x_strides[2], _info.x_strides[3], stream));
     } else {
         return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED;
diff --git a/src/infiniop/ops/reduce_max/operator.cc b/src/infiniop/ops/reduce_max/operator.cc
index c22ebeb1b..fc8a976b6 100644
--- a/src/infiniop/ops/reduce_max/operator.cc
+++ b/src/infiniop/ops/reduce_max/operator.cc
@@ -22,13 +22,13 @@ __C infiniStatus_t infiniopCreateReduceMaxDescriptor(
     infiniopTensorDescriptor_t
input_desc, size_t dim) { -#define CREATE(CASE, NAMESPACE) \ - case CASE: \ - return op::reduce_max::NAMESPACE::Descriptor::create( \ - handle, \ - reinterpret_cast(desc_ptr), \ - output_desc, \ - input_desc, \ +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::reduce_max::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + output_desc, \ + input_desc, \ dim); switch (handle->device) { @@ -44,20 +44,20 @@ __C infiniStatus_t infiniopCreateReduceMaxDescriptor( #ifdef ENABLE_METAX_API CREATE(INFINI_DEVICE_METAX, metax) #endif -// #ifdef ENABLE_ASCEND_API -// CREATE(INFINI_DEVICE_ASCEND, ascend) -// #endif -// #ifdef ENABLE_CAMBRICON_MLU -// case DevCambriconMlu: { -// return bangCreateReduceMaxDescriptor((BangHandle_t)handle, (ReduceMaxBangDescriptor_t *)desc_ptr, output_desc, input_desc, dim); -// // return cnnlCreateReduceMaxDescriptor((BangHandle_t) handle, (ReduceMaxCnnlDescriptor_t *) desc_ptr, output_desc, input_desc, dim); -// } -// #endif -// #ifdef ENABLE_MTHREADS_GPU -// case DevMthreadsGpu: { -// return musaCreateReduceMaxDescriptor((MusaHandle_t)handle, (ReduceMaxMusaDescriptor_t *)desc_ptr, output_desc, input_desc, dim); -// } -// #endif + // #ifdef ENABLE_ASCEND_API + // CREATE(INFINI_DEVICE_ASCEND, ascend) + // #endif + // #ifdef ENABLE_CAMBRICON_MLU + // case DevCambriconMlu: { + // return bangCreateReduceMaxDescriptor((BangHandle_t)handle, (ReduceMaxBangDescriptor_t *)desc_ptr, output_desc, input_desc, dim); + // // return cnnlCreateReduceMaxDescriptor((BangHandle_t) handle, (ReduceMaxCnnlDescriptor_t *) desc_ptr, output_desc, input_desc, dim); + // } + // #endif + // #ifdef ENABLE_MTHREADS_GPU + // case DevMthreadsGpu: { + // return musaCreateReduceMaxDescriptor((MusaHandle_t)handle, (ReduceMaxMusaDescriptor_t *)desc_ptr, output_desc, input_desc, dim); + // } + // #endif } return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; } @@ -82,20 +82,20 @@ __C infiniStatus_t infiniopGetReduceMaxWorkspaceSize(infiniopReduceMaxDescriptor #ifdef ENABLE_METAX_API GET(INFINI_DEVICE_METAX, metax) #endif -// #ifdef ENABLE_ASCEND_API -// GET(INFINI_DEVICE_ASCEND, ascend) -// #endif -// #ifdef ENABLE_CAMBRICON_MLU -// case DevCambriconMlu: { -// return bangGetReduceMaxWorkspaceSize((ReduceMaxBangDescriptor_t)desc, size); -// // return cnnlGetReduceMaxWorkspaceSize((ReduceMaxCnnlDescriptor_t) desc, size); -// } -// #endif -// #ifdef ENABLE_MTHREADS_GPU -// case DevMthreadsGpu: { -// return musaGetReduceMaxWorkspaceSize((ReduceMaxMusaDescriptor_t)desc, size); -// } -// #endif + // #ifdef ENABLE_ASCEND_API + // GET(INFINI_DEVICE_ASCEND, ascend) + // #endif + // #ifdef ENABLE_CAMBRICON_MLU + // case DevCambriconMlu: { + // return bangGetReduceMaxWorkspaceSize((ReduceMaxBangDescriptor_t)desc, size); + // // return cnnlGetReduceMaxWorkspaceSize((ReduceMaxCnnlDescriptor_t) desc, size); + // } + // #endif + // #ifdef ENABLE_MTHREADS_GPU + // case DevMthreadsGpu: { + // return musaGetReduceMaxWorkspaceSize((ReduceMaxMusaDescriptor_t)desc, size); + // } + // #endif } return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; } @@ -107,9 +107,9 @@ __C infiniStatus_t infiniopReduceMax( const void *input, void *stream) { -#define CALCULATE(CASE, NAMESPACE) \ - case CASE: \ - return reinterpret_cast(desc)->calculate( \ +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc)->calculate( \ workspace, workspace_size, output, input, stream); switch (desc->device_type) { @@ -125,29 +125,29 @@ __C infiniStatus_t infiniopReduceMax( #ifdef ENABLE_METAX_API 
CALCULATE(INFINI_DEVICE_METAX, metax) #endif -// #ifdef ENABLE_ASCEND_API -// CALCULATE(INFINI_DEVICE_ASCEND, ascend) -// #endif -// #ifdef ENABLE_CAMBRICON_MLU -// case DevCambriconMlu: { -// return bangReduceMax((ReduceMaxBangDescriptor_t)desc, workspace, workspace_size, output, input, stream); -// // return cnnlReduceMax((ReduceMaxCnnlDescriptor_t) desc, workspace, workspace_size, output, input, stream); -// } -// #endif -// #ifdef ENABLE_MTHREADS_GPU -// case DevMthreadsGpu: { -// return musaReduceMax((ReduceMaxMusaDescriptor_t)desc, workspace, workspace_size, output, input, stream); -// } -// #endif + // #ifdef ENABLE_ASCEND_API + // CALCULATE(INFINI_DEVICE_ASCEND, ascend) + // #endif + // #ifdef ENABLE_CAMBRICON_MLU + // case DevCambriconMlu: { + // return bangReduceMax((ReduceMaxBangDescriptor_t)desc, workspace, workspace_size, output, input, stream); + // // return cnnlReduceMax((ReduceMaxCnnlDescriptor_t) desc, workspace, workspace_size, output, input, stream); + // } + // #endif + // #ifdef ENABLE_MTHREADS_GPU + // case DevMthreadsGpu: { + // return musaReduceMax((ReduceMaxMusaDescriptor_t)desc, workspace, workspace_size, output, input, stream); + // } + // #endif } return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; } __C infiniStatus_t infiniopDestroyReduceMaxDescriptor(infiniopReduceMaxDescriptor_t desc) { -#define DESTROY(CASE, NAMESPACE) \ - case CASE: \ - delete reinterpret_cast(desc); \ +#define DESTROY(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ return INFINI_STATUS_SUCCESS; switch (desc->device_type) { @@ -163,19 +163,19 @@ __C infiniStatus_t infiniopDestroyReduceMaxDescriptor(infiniopReduceMaxDescripto #ifdef ENABLE_METAX_API DESTROY(INFINI_DEVICE_METAX, metax) #endif -// #ifdef ENABLE_ASCEND_API -// DESTROY(INFINI_DEVICE_ASCEND, ascend) -// #endif -// #ifdef ENABLE_CAMBRICON_MLU -// case DevCambriconMlu: { -// return bangDestroyReduceMaxDescriptor((ReduceMaxBangDescriptor_t)desc); -// // return cnnlDestroyReduceMaxDescriptor((ReduceMaxCnnlDescriptor_t) desc); -// } -// #endif -// #ifdef ENABLE_MTHREADS_GPU -// case DevMthreadsGpu: -// return musaDestroyReduceMaxDescriptor((ReduceMaxMusaDescriptor_t)desc); -// #endif + // #ifdef ENABLE_ASCEND_API + // DESTROY(INFINI_DEVICE_ASCEND, ascend) + // #endif + // #ifdef ENABLE_CAMBRICON_MLU + // case DevCambriconMlu: { + // return bangDestroyReduceMaxDescriptor((ReduceMaxBangDescriptor_t)desc); + // // return cnnlDestroyReduceMaxDescriptor((ReduceMaxCnnlDescriptor_t) desc); + // } + // #endif + // #ifdef ENABLE_MTHREADS_GPU + // case DevMthreadsGpu: + // return musaDestroyReduceMaxDescriptor((ReduceMaxMusaDescriptor_t)desc); + // #endif } return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; } \ No newline at end of file diff --git a/src/infiniop/ops/reduce_max/reduce_max.h b/src/infiniop/ops/reduce_max/reduce_max.h index 412f67162..6ef8630f2 100644 --- a/src/infiniop/ops/reduce_max/reduce_max.h +++ b/src/infiniop/ops/reduce_max/reduce_max.h @@ -7,7 +7,7 @@ #define DESCRIPTOR(NAMESPACE) \ \ namespace op::reduce_max::NAMESPACE { \ - class Descriptor final : public InfiniopDescriptor { \ + class Descriptor final : public InfiniopDescriptor { \ struct Opaque; \ Opaque *_opaque; \ ReduceMaxInfo _info; \ diff --git a/src/infiniop/ops/reduce_mean/cpu/reduce_mean_cpu.cc b/src/infiniop/ops/reduce_mean/cpu/reduce_mean_cpu.cc index b56184ef5..5eb2b5419 100644 --- a/src/infiniop/ops/reduce_mean/cpu/reduce_mean_cpu.cc +++ b/src/infiniop/ops/reduce_mean/cpu/reduce_mean_cpu.cc @@ -23,7 +23,7 @@ infiniStatus_t reduce_mean(const 
ReduceMeanInfo *info, T *y, const T *x) {
     const size_t batch_size = info->shape[0];
     const size_t channels = info->shape[1];
     const size_t rows = info->shape[2];
-    const size_t cols = info->shape[3];  // reduction dimension
+    const size_t cols = info->shape[3]; // reduction dimension
 
     const ptrdiff_t y_batch_stride = info->y_strides[0];
     const ptrdiff_t y_channel_stride = info->y_strides[1];
@@ -32,20 +32,20 @@ infiniStatus_t reduce_mean(const ReduceMeanInfo *info, T *y, const T *x) {
     const ptrdiff_t x_channel_stride = info->x_strides[1];
     const ptrdiff_t x_row_stride = info->x_strides[2];
     const ptrdiff_t x_col_stride = info->x_strides[3];
-    
-    #pragma omp parallel for collapse(3)
+
+#pragma omp parallel for collapse(3)
     for (size_t batch = 0; batch < batch_size; ++batch) {
         for (size_t channel = 0; channel < channels; ++channel) {
             for (size_t row = 0; row < rows; ++row) {
-                const T* input_start = x + batch * x_batch_stride
-                                      + channel * x_channel_stride
-                                      + row * x_row_stride;
-                T* output_ptr = y + batch * y_batch_stride
-                               + channel * y_channel_stride
+                const T *input_start = x + batch * x_batch_stride
+                                       + channel * x_channel_stride
+                                       + row * x_row_stride;
+                T *output_ptr = y + batch * y_batch_stride
+                                + channel * y_channel_stride
                                 + row * y_row_stride;
-                
+
                 float mean = op::common_cpu::reduce_op::sum(input_start, cols, x_col_stride) / cols;
-                
+
                 if constexpr (std::is_same<T, fp16_t>::value || std::is_same<T, bf16_t>::value) {
                     *output_ptr = utils::cast<T>(mean);
                 } else {
@@ -54,7 +54,7 @@ infiniStatus_t reduce_mean(const ReduceMeanInfo *info, T *y, const T *x) {
             }
         }
     }
-    
+
     return INFINI_STATUS_SUCCESS;
 }
 infiniStatus_t Descriptor::calculate(
diff --git a/src/infiniop/ops/reduce_mean/info.h b/src/infiniop/ops/reduce_mean/info.h
index a47c81133..6c11e07d3 100644
--- a/src/infiniop/ops/reduce_mean/info.h
+++ b/src/infiniop/ops/reduce_mean/info.h
@@ -29,24 +29,24 @@ class ReduceMeanInfo {
             CHECK_STATUS(INFINI_STATUS_BAD_TENSOR_SHAPE);
         }
         CHECK_REDUCE_SHAPE(x_desc->shape(), dim, y_desc->shape());
-        if (ndim > 4) return INFINI_STATUS_BAD_TENSOR_SHAPE;
-        else if (ndim == 0){
+        if (ndim > 4) {
+            return INFINI_STATUS_BAD_TENSOR_SHAPE;
+        } else if (ndim == 0) {
            std::vector<size_t> shape = {1, 1, 1, 1};
            std::vector<ptrdiff_t> y_strides = {0, 0, 0, 0};
            std::vector<ptrdiff_t> x_strides = {0, 0, 0, 0};
            return utils::Result<ReduceMeanInfo>(ReduceMeanInfo{
                dtype, shape, y_strides, x_strides});
-        }
-        else{
+        } else {
             std::vector<size_t> shape = x_desc->shape();
             std::vector<ptrdiff_t> y_strides = y_desc->strides();
             std::vector<ptrdiff_t> x_strides = x_desc->strides();
-            if (dim != (shape.size() - 1)){
+            if (dim != (shape.size() - 1)) {
                 std::swap(shape[dim], shape[shape.size() - 1]);
                 std::swap(y_strides[dim], y_strides[shape.size() - 1]);
                 std::swap(x_strides[dim], x_strides[shape.size() - 1]);
             }
-            while (shape.size() < 4){
+            while (shape.size() < 4) {
                 shape.insert(shape.begin(), 1);
                 y_strides.insert(y_strides.begin(), 0);
                 x_strides.insert(x_strides.begin(), 0);
diff --git a/src/infiniop/ops/reduce_mean/nvidia/kernel.cuh b/src/infiniop/ops/reduce_mean/nvidia/kernel.cuh
index 54914d98a..e70748605 100644
--- a/src/infiniop/ops/reduce_mean/nvidia/kernel.cuh
+++ b/src/infiniop/ops/reduce_mean/nvidia/kernel.cuh
@@ -6,7 +6,7 @@ __device__ void ReduceMeanKernel(
     Tdata *y_, const Tdata *x_,
     size_t batch, size_t channels, size_t height, size_t width,
     ptrdiff_t y_stride_b, ptrdiff_t y_stride_c, ptrdiff_t y_stride_h,
-    ptrdiff_t x_stride_b, ptrdiff_t x_stride_c, ptrdiff_t x_stride_h, ptrdiff_t x_stride_w) {    
+    ptrdiff_t x_stride_b, ptrdiff_t x_stride_c, ptrdiff_t x_stride_h, ptrdiff_t x_stride_w) {
 
     Tdata *y = y_ + blockIdx.x * y_stride_b + blockIdx.y * y_stride_c
+ blockIdx.z * y_stride_h; const Tdata *x = x_ + blockIdx.x * x_stride_b + blockIdx.y * x_stride_c + blockIdx.z * x_stride_h; @@ -15,7 +15,7 @@ __device__ void ReduceMeanKernel( Tcompute sum_0 = op::common_cuda::reduce_op::sum(x, width, x_stride_w); if (threadIdx.x == 0) { // mean_ = sum_0/width; - *y = sum_0/width; + *y = sum_0 / width; } // __syncthreads(); diff --git a/src/infiniop/ops/reduce_mean/nvidia/reduce_mean_nvidia.cu b/src/infiniop/ops/reduce_mean/nvidia/reduce_mean_nvidia.cu index 26fd4e8ee..bfc26e23d 100644 --- a/src/infiniop/ops/reduce_mean/nvidia/reduce_mean_nvidia.cu +++ b/src/infiniop/ops/reduce_mean/nvidia/reduce_mean_nvidia.cu @@ -47,7 +47,7 @@ infiniStatus_t launchKernel(void *y, const void *x, infiniDtype_t dtype, ptrdiff_t y_stride_b, ptrdiff_t y_stride_c, ptrdiff_t y_stride_h, ptrdiff_t x_stride_b, ptrdiff_t x_stride_c, ptrdiff_t x_stride_h, ptrdiff_t x_stride_w, cudaStream_t stream) { - dim3 grid=dim3(uint32_t(batch_size), uint32_t(channels), uint32_t(height)); + dim3 grid = dim3(uint32_t(batch_size), uint32_t(channels), uint32_t(height)); if (dtype == INFINI_DTYPE_F16) { ReduceMean <<>>((half *)y, (const half *)x, @@ -80,17 +80,17 @@ infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_1024) { CHECK_STATUS(launchKernel( y, x, _info.dtype, _info.shape[0], _info.shape[1], _info.shape[2], _info.shape[3], - _info.y_strides[0], _info.y_strides[1], _info.y_strides[2], + _info.y_strides[0], _info.y_strides[1], _info.y_strides[2], _info.x_strides[0], _info.x_strides[1], _info.x_strides[2], _info.x_strides[3], stream)); } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_512) { CHECK_STATUS(launchKernel( y, x, _info.dtype, _info.shape[0], _info.shape[1], _info.shape[2], _info.shape[3], - _info.y_strides[0], _info.y_strides[1], _info.y_strides[2], + _info.y_strides[0], _info.y_strides[1], _info.y_strides[2], _info.x_strides[0], _info.x_strides[1], _info.x_strides[2], _info.x_strides[3], stream)); } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_4096) { CHECK_STATUS(launchKernel( y, x, _info.dtype, _info.shape[0], _info.shape[1], _info.shape[2], _info.shape[3], - _info.y_strides[0], _info.y_strides[1], _info.y_strides[2], + _info.y_strides[0], _info.y_strides[1], _info.y_strides[2], _info.x_strides[0], _info.x_strides[1], _info.x_strides[2], _info.x_strides[3], stream)); } else { return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; diff --git a/src/infiniop/ops/reduce_mean/operator.cc b/src/infiniop/ops/reduce_mean/operator.cc index 32e4e3619..3696f95d2 100644 --- a/src/infiniop/ops/reduce_mean/operator.cc +++ b/src/infiniop/ops/reduce_mean/operator.cc @@ -22,13 +22,13 @@ __C infiniStatus_t infiniopCreateReduceMeanDescriptor( infiniopTensorDescriptor_t x_desc, size_t dim) { -#define CREATE(CASE, NAMESPACE) \ - case CASE: \ +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ return op::reduce_mean::NAMESPACE::Descriptor::create( \ - handle, \ + handle, \ reinterpret_cast(desc_ptr), \ - y_desc, \ - x_desc, \ + y_desc, \ + x_desc, \ dim); switch (handle->device) { @@ -44,28 +44,28 @@ __C infiniStatus_t infiniopCreateReduceMeanDescriptor( #ifdef ENABLE_METAX_API CREATE(INFINI_DEVICE_METAX, metax) #endif -// #ifdef ENABLE_ASCEND_API -// CREATE(INFINI_DEVICE_ASCEND, ascend) -// #endif -// #ifdef ENABLE_CAMBRICON_MLU -// case DevCambriconMlu: { -// return bangCreateCausalSoftmaxDescriptor((BangHandle_t)handle, (CausalSoftmaxBangDescriptor_t *)desc_ptr, 
y_desc); -// // return cnnlCreateCausalSoftmaxDescriptor((BangHandle_t) handle, (CausalSoftmaxCnnlDescriptor_t *) desc_ptr, y_desc); -// } -// #endif -// #ifdef ENABLE_MTHREADS_GPU -// case DevMthreadsGpu: { -// return musaCreateCausalSoftmaxDescriptor((MusaHandle_t)handle, (CausalSoftmaxMusaDescriptor_t *)desc_ptr, y_desc); -// } -// #endif + // #ifdef ENABLE_ASCEND_API + // CREATE(INFINI_DEVICE_ASCEND, ascend) + // #endif + // #ifdef ENABLE_CAMBRICON_MLU + // case DevCambriconMlu: { + // return bangCreateCausalSoftmaxDescriptor((BangHandle_t)handle, (CausalSoftmaxBangDescriptor_t *)desc_ptr, y_desc); + // // return cnnlCreateCausalSoftmaxDescriptor((BangHandle_t) handle, (CausalSoftmaxCnnlDescriptor_t *) desc_ptr, y_desc); + // } + // #endif + // #ifdef ENABLE_MTHREADS_GPU + // case DevMthreadsGpu: { + // return musaCreateCausalSoftmaxDescriptor((MusaHandle_t)handle, (CausalSoftmaxMusaDescriptor_t *)desc_ptr, y_desc); + // } + // #endif } return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; } __C infiniStatus_t infiniopGetReduceMeanWorkspaceSize(infiniopReduceMeanDescriptor_t desc, size_t *size) { -#define GET(CASE, NAMESPACE) \ - case CASE: \ +#define GET(CASE, NAMESPACE) \ + case CASE: \ *size = reinterpret_cast(desc)->workspaceSize(); \ return INFINI_STATUS_SUCCESS; @@ -82,21 +82,21 @@ __C infiniStatus_t infiniopGetReduceMeanWorkspaceSize(infiniopReduceMeanDescript #ifdef ENABLE_METAX_API GET(INFINI_DEVICE_METAX, metax) #endif -// #ifdef ENABLE_ASCEND_API -// GET(INFINI_DEVICE_ASCEND, ascend) -// #endif -// #ifdef ENABLE_CAMBRICON_MLU -// case DevCambriconMlu: { -// return bangGetCausalSoftmaxWorkspaceSize((CausalSoftmaxBangDescriptor_t)desc, size); -// // return cnnlGetCausalSoftmaxWorkspaceSize((CausalSoftmaxCnnlDescriptor_t) desc, size); -// } + // #ifdef ENABLE_ASCEND_API + // GET(INFINI_DEVICE_ASCEND, ascend) + // #endif + // #ifdef ENABLE_CAMBRICON_MLU + // case DevCambriconMlu: { + // return bangGetCausalSoftmaxWorkspaceSize((CausalSoftmaxBangDescriptor_t)desc, size); + // // return cnnlGetCausalSoftmaxWorkspaceSize((CausalSoftmaxCnnlDescriptor_t) desc, size); + // } -// #endif -// #ifdef ENABLE_MTHREADS_GPU -// case DevMthreadsGpu: { -// return musaGetCausalSoftmaxWorkspaceSize((CausalSoftmaxMusaDescriptor_t)desc, size); -// } -// #endif + // #endif + // #ifdef ENABLE_MTHREADS_GPU + // case DevMthreadsGpu: { + // return musaGetCausalSoftmaxWorkspaceSize((CausalSoftmaxMusaDescriptor_t)desc, size); + // } + // #endif } return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; } @@ -108,8 +108,8 @@ __C infiniStatus_t infiniopReduceMean( const void *x, void *stream) { -#define CALCULATE(CASE, NAMESPACE) \ - case CASE: \ +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ return reinterpret_cast(desc)->calculate( \ workspace, workspace_size, y, x, stream); @@ -126,28 +126,28 @@ __C infiniStatus_t infiniopReduceMean( #ifdef ENABLE_METAX_API CALCULATE(INFINI_DEVICE_METAX, metax) #endif -// #ifdef ENABLE_ASCEND_API -// CALCULATE(INFINI_DEVICE_ASCEND, ascend) -// #endif -// #ifdef ENABLE_CAMBRICON_MLU -// case DevCambriconMlu: { -// return bangCausalSoftmax((CausalSoftmaxBangDescriptor_t)desc, workspace, workspace_size, data, stream); -// // return cnnlCausalSoftmax((CausalSoftmaxCnnlDescriptor_t) desc, workspace, workspace_size, data, stream); -// } -// #endif -// #ifdef ENABLE_MTHREADS_GPU -// case DevMthreadsGpu: { -// return musaCausalSoftmax((CausalSoftmaxMusaDescriptor_t)desc, workspace, workspace_size, data, stream); -// } -// #endif + // #ifdef ENABLE_ASCEND_API + // 
CALCULATE(INFINI_DEVICE_ASCEND, ascend) + // #endif + // #ifdef ENABLE_CAMBRICON_MLU + // case DevCambriconMlu: { + // return bangCausalSoftmax((CausalSoftmaxBangDescriptor_t)desc, workspace, workspace_size, data, stream); + // // return cnnlCausalSoftmax((CausalSoftmaxCnnlDescriptor_t) desc, workspace, workspace_size, data, stream); + // } + // #endif + // #ifdef ENABLE_MTHREADS_GPU + // case DevMthreadsGpu: { + // return musaCausalSoftmax((CausalSoftmaxMusaDescriptor_t)desc, workspace, workspace_size, data, stream); + // } + // #endif } return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; } __C infiniStatus_t infiniopDestroyReduceMeanDescriptor(infiniopReduceMeanDescriptor_t desc) { -#define DESTROY(CASE, NAMESPACE) \ - case CASE: \ +#define DESTROY(CASE, NAMESPACE) \ + case CASE: \ delete reinterpret_cast(desc); \ return INFINI_STATUS_SUCCESS; @@ -164,19 +164,19 @@ __C infiniStatus_t infiniopDestroyReduceMeanDescriptor(infiniopReduceMeanDescrip #ifdef ENABLE_METAX_API DESTROY(INFINI_DEVICE_METAX, metax) #endif -// #ifdef ENABLE_ASCEND_API -// DESTROY(INFINI_DEVICE_ASCEND, ascend) -// #endif -// #ifdef ENABLE_CAMBRICON_MLU -// case DevCambriconMlu: { -// return bangDestroyCausalSoftmaxDescriptor((CausalSoftmaxBangDescriptor_t)desc); -// // return cnnlDestroyCausalSoftmaxDescriptor((CausalSoftmaxCnnlDescriptor_t) desc); -// } -// #endif -// #ifdef ENABLE_MTHREADS_GPU -// case DevMthreadsGpu: -// return musaDestroyCausalSoftmaxDescriptor((CausalSoftmaxMusaDescriptor_t)desc); -// #endif + // #ifdef ENABLE_ASCEND_API + // DESTROY(INFINI_DEVICE_ASCEND, ascend) + // #endif + // #ifdef ENABLE_CAMBRICON_MLU + // case DevCambriconMlu: { + // return bangDestroyCausalSoftmaxDescriptor((CausalSoftmaxBangDescriptor_t)desc); + // // return cnnlDestroyCausalSoftmaxDescriptor((CausalSoftmaxCnnlDescriptor_t) desc); + // } + // #endif + // #ifdef ENABLE_MTHREADS_GPU + // case DevMthreadsGpu: + // return musaDestroyCausalSoftmaxDescriptor((CausalSoftmaxMusaDescriptor_t)desc); + // #endif } return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; } diff --git a/src/infiniop/ops/reduce_mean/reduce_mean.h b/src/infiniop/ops/reduce_mean/reduce_mean.h index 58fa566ea..bf2e2dda0 100644 --- a/src/infiniop/ops/reduce_mean/reduce_mean.h +++ b/src/infiniop/ops/reduce_mean/reduce_mean.h @@ -6,16 +6,16 @@ #define DESCRIPTOR(NAMESPACE) \ \ - namespace op::reduce_mean::NAMESPACE { \ + namespace op::reduce_mean::NAMESPACE { \ class Descriptor final : public InfiniopDescriptor { \ struct Opaque; \ Opaque *_opaque; \ - ReduceMeanInfo _info; \ + ReduceMeanInfo _info; \ size_t _workspace_size; \ \ Descriptor( \ Opaque *opaque, \ - ReduceMeanInfo info, \ + ReduceMeanInfo info, \ size_t workspace_size, \ infiniDevice_t device_type, \ int device_id) \ diff --git a/src/infiniop/reduce/cuda/reduce.cuh b/src/infiniop/reduce/cuda/reduce.cuh index b69b46163..cd98d0936 100644 --- a/src/infiniop/reduce/cuda/reduce.cuh +++ b/src/infiniop/reduce/cuda/reduce.cuh @@ -63,10 +63,9 @@ __device__ __forceinline__ Tdata max(const Tdata *data_ptr, size_t count) { // Sum(x) on non-contiguous data of length count template -__device__ __forceinline__ Tcompute sum(const Tdata *data_ptr, - size_t count, - ptrdiff_t stride) -{ +__device__ __forceinline__ Tcompute sum(const Tdata *data_ptr, + size_t count, + ptrdiff_t stride) { Tcompute s = 0; for (size_t i = threadIdx.x; i < count; i += BLOCK_SIZE) { @@ -81,10 +80,9 @@ __device__ __forceinline__ Tcompute sum(const Tdata *data_ptr, // Max(x) on non-contiguous data of length count template -__device__ __forceinline__ 
Tdata max(const Tdata *data_ptr, +__device__ __forceinline__ Tdata max(const Tdata *data_ptr, size_t count, - ptrdiff_t stride) -{ + ptrdiff_t stride) { Tdata max_ = data_ptr[0]; for (size_t i = threadIdx.x; i < count; i += BLOCK_SIZE) { diff --git a/src/infinirt/infinirt_impl.h b/src/infinirt/infinirt_impl.h index 0d6f8cf05..4c41a1198 100644 --- a/src/infinirt/infinirt_impl.h +++ b/src/infinirt/infinirt_impl.h @@ -30,7 +30,6 @@ infiniStatus_t freeAsync(void *ptr, infinirtStream_t stream) IMPL; #define INFINIRT_DEVICE_API_IMPL INFINIRT_DEVICE_API(, ) -#define INFINIRT_DEVICE_API_NOOP INFINIRT_DEVICE_API({ return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; }, \ - {*count = 0; return INFINI_STATUS_SUCCESS; }) +#define INFINIRT_DEVICE_API_NOOP INFINIRT_DEVICE_API({ return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; }, {*count = 0; return INFINI_STATUS_SUCCESS; }) #endif // __INFINIRT_IMPL_H__ diff --git a/src/utils.h b/src/utils.h index f4e63be25..af222baca 100644 --- a/src/utils.h +++ b/src/utils.h @@ -98,7 +98,7 @@ inline std::string infiniDtypeToString(infiniDtype_t dtype) { } } -#define CEIL_DIV(x, y) (((x) + (y)-1) / (y)) +#define CEIL_DIV(x, y) (((x) + (y) - 1) / (y)) namespace utils { diff --git a/src/utils/check.h b/src/utils/check.h index 72a0995fd..76823aa40 100644 --- a/src/utils/check.h +++ b/src/utils/check.h @@ -59,22 +59,22 @@ #define CHECK_SAME_STRIDES(FIRST, ...) CHECK_SAME_VEC(INFINI_STATUS_BAD_TENSOR_STRIDES, FIRST, __VA_ARGS__) -#define CHECK_REDUCE_SHAPE(INPUT_SHAPE, DIM, EXPECTED_SHAPE) \ - do { \ - if (INPUT_SHAPE.empty()) { \ - if (!EXPECTED_SHAPE.empty()) { \ - return INFINI_STATUS_BAD_TENSOR_SHAPE; \ - } \ - break; \ - } \ - if (DIM >= INPUT_SHAPE.size()) { \ - return INFINI_STATUS_BAD_PARAM; \ - } \ - std::vector reduced_shape = INPUT_SHAPE; \ - reduced_shape[DIM] = 1; \ - if (reduced_shape != EXPECTED_SHAPE) { \ - return INFINI_STATUS_BAD_TENSOR_SHAPE; \ - } \ +#define CHECK_REDUCE_SHAPE(INPUT_SHAPE, DIM, EXPECTED_SHAPE) \ + do { \ + if (INPUT_SHAPE.empty()) { \ + if (!EXPECTED_SHAPE.empty()) { \ + return INFINI_STATUS_BAD_TENSOR_SHAPE; \ + } \ + break; \ + } \ + if (DIM >= INPUT_SHAPE.size()) { \ + return INFINI_STATUS_BAD_PARAM; \ + } \ + std::vector reduced_shape = INPUT_SHAPE; \ + reduced_shape[DIM] = 1; \ + if (reduced_shape != EXPECTED_SHAPE) { \ + return INFINI_STATUS_BAD_TENSOR_SHAPE; \ + } \ } while (0) #endif // INFINIUTILS_CHECK_H diff --git a/test/infiniop-test/test_generate/__init__.py b/test/infiniop-test/test_generate/__init__.py index a61f63f7c..8db1e6755 100644 --- a/test/infiniop-test/test_generate/__init__.py +++ b/test/infiniop-test/test_generate/__init__.py @@ -1 +1,8 @@ -from .infiniop_test import InfiniopTestCase, InfiniopTestWriter, np_dtype_to_ggml, gguf_strides, contiguous_gguf_strides, process_zero_stride_tensor +from .infiniop_test import ( + InfiniopTestCase, + InfiniopTestWriter, + np_dtype_to_ggml, + gguf_strides, + contiguous_gguf_strides, + process_zero_stride_tensor, +) diff --git a/test/infiniop-test/test_generate/testcases/add.py b/test/infiniop-test/test_generate/testcases/add.py index b04ba2042..052ef18a7 100644 --- a/test/infiniop-test/test_generate/testcases/add.py +++ b/test/infiniop-test/test_generate/testcases/add.py @@ -4,7 +4,14 @@ from typing import List from numpy.lib.stride_tricks import as_strided -from .. import InfiniopTestWriter, InfiniopTestCase, np_dtype_to_ggml, gguf_strides, contiguous_gguf_strides, process_zero_stride_tensor +from .. 
import ( + InfiniopTestWriter, + InfiniopTestCase, + np_dtype_to_ggml, + gguf_strides, + contiguous_gguf_strides, + process_zero_stride_tensor, +) def add( @@ -26,7 +33,6 @@ def __init__( c: np.ndarray, shape_c: List[int] | None, stride_c: List[int] | None, - ): super().__init__("add") self.a = a @@ -39,7 +45,6 @@ def __init__( self.shape_c = shape_c self.stride_c = stride_c - def write_test(self, test_writer: "InfiniopTestWriter"): super().write_test(test_writer) if self.shape_a is not None: @@ -49,12 +54,22 @@ def write_test(self, test_writer: "InfiniopTestWriter"): if self.shape_c is not None: test_writer.add_array(test_writer.gguf_key("c.shape"), self.shape_c) if self.stride_a is not None: - test_writer.add_array(test_writer.gguf_key("a.strides"), gguf_strides(*self.stride_a)) + test_writer.add_array( + test_writer.gguf_key("a.strides"), gguf_strides(*self.stride_a) + ) if self.stride_b is not None: - test_writer.add_array(test_writer.gguf_key("b.strides"), gguf_strides(*self.stride_b)) + test_writer.add_array( + test_writer.gguf_key("b.strides"), gguf_strides(*self.stride_b) + ) test_writer.add_array( test_writer.gguf_key("c.strides"), - gguf_strides(*self.stride_c if self.stride_c is not None else contiguous_gguf_strides(self.shape_c)) + gguf_strides( + *( + self.stride_c + if self.stride_c is not None + else contiguous_gguf_strides(self.shape_c) + ) + ), ) test_writer.add_tensor( test_writer.gguf_key("a"), self.a, raw_dtype=np_dtype_to_ggml(self.a.dtype) @@ -114,7 +129,6 @@ def write_test(self, test_writer: "InfiniopTestWriter"): stride_c=stride_c, ) test_cases.append(test_case) - + test_writer.add_tests(test_cases) test_writer.save() - \ No newline at end of file diff --git a/test/infiniop-test/test_generate/testcases/causal_softmax.py b/test/infiniop-test/test_generate/testcases/causal_softmax.py index 74c3efcf0..037701865 100644 --- a/test/infiniop-test/test_generate/testcases/causal_softmax.py +++ b/test/infiniop-test/test_generate/testcases/causal_softmax.py @@ -4,7 +4,13 @@ from typing import List from enum import Enum, auto -from .. import InfiniopTestWriter, InfiniopTestCase, np_dtype_to_ggml, gguf_strides, contiguous_gguf_strides +from .. 
import ( + InfiniopTestWriter, + InfiniopTestCase, + np_dtype_to_ggml, + gguf_strides, + contiguous_gguf_strides, +) def causal_softmax(x): @@ -37,8 +43,8 @@ def __init__( super().__init__("causal_softmax") self.x = x self.y = y - self.shape_x=shape_x - self.shape_y=shape_y + self.shape_x = shape_x + self.shape_y = shape_y self.stride_x = stride_x self.stride_y = stride_y @@ -49,10 +55,18 @@ def write_test(self, test_writer: "InfiniopTestWriter"): if self.shape_y is not None: test_writer.add_array(test_writer.gguf_key("y.shape"), self.shape_y) if self.stride_x is not None: - test_writer.add_array(test_writer.gguf_key("x.strides"), gguf_strides(*self.stride_x)) + test_writer.add_array( + test_writer.gguf_key("x.strides"), gguf_strides(*self.stride_x) + ) test_writer.add_array( test_writer.gguf_key("y.strides"), - gguf_strides(*self.stride_y if self.stride_y is not None else contiguous_gguf_strides(self.shape_y)) + gguf_strides( + *( + self.stride_y + if self.stride_y is not None + else contiguous_gguf_strides(self.shape_y) + ) + ), ) test_writer.add_tensor( test_writer.gguf_key("x"), @@ -102,6 +116,6 @@ def write_test(self, test_writer: "InfiniopTestWriter"): stride_y, ) test_cases.append(test_case) - + test_writer.add_tests(test_cases) test_writer.save() diff --git a/test/infiniop-test/test_generate/testcases/clip.py b/test/infiniop-test/test_generate/testcases/clip.py index f08a59929..786153197 100644 --- a/test/infiniop-test/test_generate/testcases/clip.py +++ b/test/infiniop-test/test_generate/testcases/clip.py @@ -2,7 +2,13 @@ import gguf from typing import List, Optional, Tuple -from .. import InfiniopTestWriter, InfiniopTestCase, np_dtype_to_ggml, gguf_strides, contiguous_gguf_strides +from .. import ( + InfiniopTestWriter, + InfiniopTestCase, + np_dtype_to_ggml, + gguf_strides, + contiguous_gguf_strides, +) def clip( @@ -35,7 +41,7 @@ def random_tensor(shape, dtype): Returns: Random tensor with the specified shape and dtype """ - return (np.random.rand(*shape).astype(dtype) * 4.0 - 2.0) + return np.random.rand(*shape).astype(dtype) * 4.0 - 2.0 class ClipTestCase(InfiniopTestCase): @@ -52,7 +58,7 @@ def __init__( max_val: np.ndarray, max_stride: Optional[List[int]], y: np.ndarray, - y_shape: Optional[List[int]], + y_shape: Optional[List[int]], y_stride: Optional[List[int]], ): super().__init__("clip") @@ -63,7 +69,7 @@ def __init__( self.max_val = max_val self.max_stride = max_stride self.y = y - self.y_shape=y_shape + self.y_shape = y_shape self.y_stride = y_stride def write_test(self, test_writer: "InfiniopTestWriter"): @@ -71,57 +77,64 @@ def write_test(self, test_writer: "InfiniopTestWriter"): # Add strides as arrays if they exist if self.x_stride is not None: - test_writer.add_array(test_writer.gguf_key("x.strides"), gguf_strides(*self.x_stride)) + test_writer.add_array( + test_writer.gguf_key("x.strides"), gguf_strides(*self.x_stride) + ) if self.min_stride is not None: - test_writer.add_array(test_writer.gguf_key("min_val.strides"), gguf_strides(*self.min_stride)) + test_writer.add_array( + test_writer.gguf_key("min_val.strides"), gguf_strides(*self.min_stride) + ) if self.max_stride is not None: - test_writer.add_array(test_writer.gguf_key("max_val.strides"), gguf_strides(*self.max_stride)) + test_writer.add_array( + test_writer.gguf_key("max_val.strides"), gguf_strides(*self.max_stride) + ) if self.y_shape is not None: test_writer.add_array(test_writer.gguf_key("y.shape"), self.y_shape) test_writer.add_array( test_writer.gguf_key("y.strides"), - gguf_strides(*self.y_stride 
if self.y_stride is not None else contiguous_gguf_strides(self.y_shape)) + gguf_strides( + *( + self.y_stride + if self.y_stride is not None + else contiguous_gguf_strides(self.y_shape) + ) + ), ) # Add tensors to the test test_writer.add_tensor( - test_writer.gguf_key("x"), - self.x, - raw_dtype=np_dtype_to_ggml(self.x.dtype) + test_writer.gguf_key("x"), self.x, raw_dtype=np_dtype_to_ggml(self.x.dtype) ) test_writer.add_tensor( test_writer.gguf_key("min_val"), self.min_val, - raw_dtype=np_dtype_to_ggml(self.min_val.dtype) + raw_dtype=np_dtype_to_ggml(self.min_val.dtype), ) test_writer.add_tensor( test_writer.gguf_key("max_val"), self.max_val, - raw_dtype=np_dtype_to_ggml(self.max_val.dtype) + raw_dtype=np_dtype_to_ggml(self.max_val.dtype), ) test_writer.add_tensor( - test_writer.gguf_key("y"), - self.y, - raw_dtype=np_dtype_to_ggml(self.y.dtype) + test_writer.gguf_key("y"), self.y, raw_dtype=np_dtype_to_ggml(self.y.dtype) ) # Calculate the expected result ans = clip( self.x.astype(np.float64), self.min_val.astype(np.float64), - self.max_val.astype(np.float64) + self.max_val.astype(np.float64), ) # Add the expected result to the test test_writer.add_tensor( - test_writer.gguf_key("ans"), - ans, - raw_dtype=gguf.GGMLQuantizationType.F64 + test_writer.gguf_key("ans"), ans, raw_dtype=gguf.GGMLQuantizationType.F64 ) + if __name__ == "__main__": test_writer = InfiniopTestWriter("clip.gguf") @@ -130,23 +143,23 @@ def write_test(self, test_writer: "InfiniopTestWriter"): # Test case shapes shapes = [ - (10,), # 1D tensor - (5, 10), # 2D tensor - (2, 3, 4), # 3D tensor - (7, 13), # Prime dimensions - (1, 1), # Minimum shape - (100, 100), # Large shape - (16, 16, 16), # Large 3D + (10,), # 1D tensor + (5, 10), # 2D tensor + (2, 3, 4), # 3D tensor + (7, 13), # Prime dimensions + (1, 1), # Minimum shape + (100, 100), # Large shape + (16, 16, 16), # Large 3D ] # Test case min/max values min_max_values = [ - (-1.0, 1.0), # Standard range - (0.0, 2.0), # Positive range - (-2.0, 0.0), # Negative range - (-1000.0, 1000.0), # Large range - (-0.001, 0.001), # Small range - (0.0, 0.0), # min=max + (-1.0, 1.0), # Standard range + (0.0, 2.0), # Positive range + (-2.0, 0.0), # Negative range + (-1000.0, 1000.0), # Large range + (-0.001, 0.001), # Small range + (0.0, 0.0), # min=max ] # Data types to test @@ -171,7 +184,7 @@ def write_test(self, test_writer: "InfiniopTestWriter"): max_stride=None, y=y, y_shape=shape, - y_stride=None + y_stride=None, ) ) @@ -199,7 +212,7 @@ def write_test(self, test_writer: "InfiniopTestWriter"): max_stride=row_stride, y=y, y_shape=shape, - y_stride=row_stride + y_stride=row_stride, ) ) @@ -219,7 +232,7 @@ def write_test(self, test_writer: "InfiniopTestWriter"): max_stride=col_stride, y=y, y_shape=shape, - y_stride=col_stride + y_stride=col_stride, ) ) @@ -239,7 +252,7 @@ def write_test(self, test_writer: "InfiniopTestWriter"): max_stride=row_stride, y=y, y_shape=shape, - y_stride=col_stride + y_stride=col_stride, ) ) diff --git a/test/infiniop-test/test_generate/testcases/mul.py b/test/infiniop-test/test_generate/testcases/mul.py index 00c427bcb..ad4f6b806 100644 --- a/test/infiniop-test/test_generate/testcases/mul.py +++ b/test/infiniop-test/test_generate/testcases/mul.py @@ -2,30 +2,36 @@ import gguf from typing import List -from .. import InfiniopTestWriter, InfiniopTestCase, np_dtype_to_ggml, gguf_strides, contiguous_gguf_strides +from .. 
import (
+    InfiniopTestWriter,
+    InfiniopTestCase,
+    np_dtype_to_ggml,
+    gguf_strides,
+    contiguous_gguf_strides,
+)
 
-def mul(
-    a: np.ndarray, 
-    b: np.ndarray
-):
+
+def mul(a: np.ndarray, b: np.ndarray):
     return np.multiply(a, b)
 
+
 def random_tensor(shape, dtype):
     rate = 1e-3
     var = 0.5 * rate  # values fall in [-5e-4, 5e-4]
     return rate * np.random.rand(*shape).astype(dtype) - var
 
+
 class MulTestCase(InfiniopTestCase):
     def __init__(
         self,
         a: np.ndarray,
-        shape_a: List[int] | None, 
+        shape_a: List[int] | None,
         stride_a: List[int] | None,
         b: np.ndarray,
-        shape_b: List[int] | None, 
+        shape_b: List[int] | None,
         stride_b: List[int] | None,
         c: np.ndarray,
-        shape_c: List[int] | None, 
+        shape_c: List[int] | None,
         stride_c: List[int] | None,
     ):
         super().__init__("mul")
@@ -39,7 +45,6 @@ def __init__(
         self.shape_c = shape_c
         self.stride_c = stride_c
 
-
     def write_test(self, test_writer: "InfiniopTestWriter"):
         super().write_test(test_writer)
         if self.shape_a is not None:
@@ -49,12 +54,22 @@ def write_test(self, test_writer: "InfiniopTestWriter"):
         if self.shape_c is not None:
             test_writer.add_array(test_writer.gguf_key("c.shape"), self.shape_c)
         if self.stride_a is not None:
-            test_writer.add_array(test_writer.gguf_key("a.strides"), gguf_strides(*self.stride_a))
+            test_writer.add_array(
+                test_writer.gguf_key("a.strides"), gguf_strides(*self.stride_a)
+            )
         if self.stride_b is not None:
-            test_writer.add_array(test_writer.gguf_key("b.strides"), gguf_strides(*self.stride_b))
+            test_writer.add_array(
+                test_writer.gguf_key("b.strides"), gguf_strides(*self.stride_b)
+            )
         test_writer.add_array(
             test_writer.gguf_key("c.strides"),
-            gguf_strides(*self.stride_c if self.stride_c is not None else contiguous_gguf_strides(self.shape_c))
+            gguf_strides(
+                *(
+                    self.stride_c
+                    if self.stride_c is not None
+                    else contiguous_gguf_strides(self.shape_c)
+                )
+            ),
         )
 
         test_writer.add_tensor(
@@ -68,7 +83,7 @@ def write_test(self, test_writer: "InfiniopTestWriter"):
         )
         a_fp64 = self.a.astype(np.float64)
         b_fp64 = self.b.astype(np.float64)
-        
+
         ans_fp64 = np.multiply(a_fp64, b_fp64)
         ans = mul(self.a, self.b)
         test_writer.add_tensor(
@@ -80,7 +95,8 @@ def write_test(self, test_writer: "InfiniopTestWriter"):
             raw_dtype=np_dtype_to_ggml(ans_fp64.dtype),
         )
 
-if __name__ == '__main__':
+
+if __name__ == "__main__":
     test_writer = InfiniopTestWriter("mul.gguf")
     test_cases = []
 
@@ -96,16 +112,15 @@ def write_test(self, test_writer: "InfiniopTestWriter"):
         ((2048, 2560), (2560, 1), (1, 2048), (2560, 1)),
         ((4, 48, 64), (64 * 48, 64, 1), (1, 4, 192), None),
         ((4, 48, 64), None, (1, 4, 192), (48 * 64, 64, 1)),
-    ] 
+    ]
     _TENSOR_DTYPES_ = [np.float32, np.float16]
-    
+
     for dtype in _TENSOR_DTYPES_:
         for shape, stride_a, stride_b, stride_c in _TEST_CASES_:
             a = random_tensor(shape, dtype)
             b = random_tensor(shape, dtype)
             c = np.empty(tuple(0 for _ in shape), dtype=dtype)
-
             test_cases.append(
                 MulTestCase(
                     a=a,
@@ -118,7 +133,7 @@ def write_test(self, test_writer: "InfiniopTestWriter"):
                     shape_c=shape,
                     stride_c=stride_c,
                 )
-            )   
-    
+            )
+
     test_writer.add_tests(test_cases)
     test_writer.save()
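Every generator in this series repeats one idiom for output strides: write the explicit strides when the case supplies them, otherwise fall back to contiguous strides derived from the shape. A minimal sketch of that idiom, assuming contiguous_gguf_strides yields row-major strides in elements (the helper names below are local stand-ins, not the real imports):

    # Stand-ins illustrating the "explicit strides or contiguous fallback"
    # pattern used by each write_test() above; strides are in elements.
    def contiguous_strides(shape):
        # Row-major layout: the last dimension is densest.
        strides, acc = [], 1
        for dim in reversed(shape):
            strides.insert(0, acc)
            acc *= dim
        return tuple(strides)

    def strides_or_contiguous(stride, shape):
        # Mirrors what write_test() passes to gguf_strides for the output tensor.
        return tuple(stride) if stride is not None else contiguous_strides(shape)

    assert contiguous_strides((4, 48, 64)) == (48 * 64, 64, 1)
    assert strides_or_contiguous(None, (2048, 2560)) == (2560, 1)
    assert strides_or_contiguous((1, 2048), (2048, 2560)) == (1, 2048)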
diff --git a/test/infiniop-test/test_generate/testcases/rearrange.py b/test/infiniop-test/test_generate/testcases/rearrange.py
index 9617a1fc0..3d3a0e73b 100644
--- a/test/infiniop-test/test_generate/testcases/rearrange.py
+++ b/test/infiniop-test/test_generate/testcases/rearrange.py
@@ -1,14 +1,21 @@
 import torch
 from typing import List
 
-from .. import InfiniopTestWriter, InfiniopTestCase, np_dtype_to_ggml, gguf_strides, contiguous_gguf_strides
+from .. import (
+    InfiniopTestWriter,
+    InfiniopTestCase,
+    np_dtype_to_ggml,
+    gguf_strides,
+    contiguous_gguf_strides,
+)
+
 
 def row_major_strides(shape):
     """Generate the row-major strides of a tensor.
-    
+
     Args:
         shape: tensor shape
-    
+
     Returns:
         list of row-major strides
     """
     strides.insert(0, stride)
     return strides
 
+
 def column_major_strides(shape):
     """Generate the column-major strides of a tensor.
-    
+
     Args:
         shape: tensor shape
-    
+
     Returns:
         list of column-major strides
     """
     strides.append(stride)
     return strides
 
+
 def rearrange_using_torch(src: torch.Tensor, dst_strides: List[int]) -> torch.Tensor:
     """
     Compute the rearrange reference result using torch
@@ -66,27 +75,35 @@ def __init__(
         self.shape = shape
         self.src_strides = src_strides
         self.dst_strides = dst_strides
-    
+
     def write_test(self, test_writer: "InfiniopTestWriter"):
         super().write_test(test_writer)
-        
+
         # write the shape info
         if self.shape is not None:
             test_writer.add_array(test_writer.gguf_key("src.shape"), self.shape)
             test_writer.add_array(test_writer.gguf_key("dst.shape"), self.shape)
-        
+
         # write the strides info
         if self.src_strides is not None:
-            test_writer.add_array(test_writer.gguf_key("src.strides"), gguf_strides(*self.src_strides))
+            test_writer.add_array(
+                test_writer.gguf_key("src.strides"), gguf_strides(*self.src_strides)
+            )
         test_writer.add_array(
             test_writer.gguf_key("dst.strides"),
-            gguf_strides(*self.dst_strides if self.dst_strides is not None else contiguous_gguf_strides(self.shape))
+            gguf_strides(
+                *(
+                    self.dst_strides
+                    if self.dst_strides is not None
+                    else contiguous_gguf_strides(self.shape)
+                )
+            ),
         )
-        
+
         # convert the torch tensors to numpy for writing to file
         src_numpy = self.src.detach().cpu().numpy()
         dst_numpy = self.dst.detach().cpu().numpy()
-        
+
         # write the tensor data
         test_writer.add_tensor(
             test_writer.gguf_key("src"),
@@ -98,9 +115,13 @@ def write_test(self, test_writer: "InfiniopTestWriter"):
             dst_numpy,
             raw_dtype=np_dtype_to_ggml(dst_numpy.dtype),
         )
-        
+
         # compute and write the expected answer
-        dst_strides_for_ans = self.dst_strides if self.dst_strides is not None else list(contiguous_gguf_strides(self.shape))
+        dst_strides_for_ans = (
+            self.dst_strides
+            if self.dst_strides is not None
+            else list(contiguous_gguf_strides(self.shape))
+        )
         ans_torch = rearrange_using_torch(self.src, dst_strides_for_ans)
         ans_numpy = ans_torch.detach().cpu().numpy()
         test_writer.add_tensor(
@@ -109,6 +130,7 @@ def write_test(self, test_writer: "InfiniopTestWriter"):
             raw_dtype=np_dtype_to_ggml(src_numpy.dtype),
         )
 
+
 if __name__ == "__main__":
     test_writer = InfiniopTestWriter("rearrange.gguf")
     test_cases = []
@@ -117,12 +139,20 @@ def write_test(self, test_writer: "InfiniopTestWriter"):
         # (shape, src_stride, dst_stride)
         ((100, 100), (1, 100), (100, 1)),
         ((4, 4), (1, 4), (4, 1)),
-        ((4, 6, 64), (64, 4*64, 1), (6*64, 64, 1)),
+        ((4, 6, 64), (64, 4 * 64, 1), (6 * 64, 64, 1)),
         ((2000, 2000), (1, 2000), (2000, 1)),
         ((2001, 2001), (1, 2001), (2001, 1)),
         ((2, 2, 2, 4), (16, 8, 4, 1), (16, 8, 1, 2)),
-        ((3, 4, 7, 53, 9), row_major_strides((3, 4, 7, 53, 9)), column_major_strides((3, 4, 7, 53, 9))),
-        ((3, 4, 50, 50, 5, 7), row_major_strides((3, 4, 50, 50, 5, 7)), column_major_strides((3, 4, 50, 50, 5, 7))),
+        (
+            (3, 4, 7, 53, 9),
+            row_major_strides((3, 4, 7, 53, 9)),
+            column_major_strides((3, 4, 7, 53, 9)),
+        ),
+        (
+            (3, 4, 50, 50, 5, 7),
+            row_major_strides((3, 4, 50, 50, 5, 7)),
+            column_major_strides((3, 4, 50, 50, 5, 7)),
+        ),
     ]
 
     _TENSOR_DTYPES_ = [torch.float32, torch.float16]
@@ -132,7 +162,7 @@ def write_test(self, test_writer: "InfiniopTestWriter"):
             src = torch.rand(*shape, dtype=dtype)
             # create the destination tensor with the correct shape
             dst = torch.empty(shape, dtype=dtype)
-            
+
             test_case = RearrangeTestCase(
                 src=src,
                 dst=dst,
@@ -140,7 +170,7 @@ def write_test(self, test_writer: "InfiniopTestWriter"):
                 src_strides=src_strides,
                 dst_strides=dst_strides,
             )
-            test_cases.append(test_case)    
+            test_cases.append(test_case)
 
     test_writer.add_tests(test_cases)
-    test_writer.save() 
+    test_writer.save()
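The two stride helpers above drive the mixed-layout cases in this file. A compact, behavior-equivalent sketch with a worked example (strides in elements, matching what the generator hands to gguf_strides):

    def row_major(shape):
        strides, acc = [], 1
        for dim in reversed(shape):
            strides.insert(0, acc)  # innermost (last) dimension is densest
            acc *= dim
        return strides

    def column_major(shape):
        strides, acc = [], 1
        for dim in shape:
            strides.append(acc)  # outermost (first) dimension is densest
            acc *= dim
        return strides

    assert row_major((2, 3, 4)) == [12, 4, 1]
    assert column_major((2, 3, 4)) == [1, 2, 6]

The 5-D and 6-D cases pair these two layouts so the rearrange kernel has to perform a genuine strided transpose-copy rather than a plain memcpy.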
diff --git a/test/infiniop-test/test_generate/testcases/reduce_max.py b/test/infiniop-test/test_generate/testcases/reduce_max.py
index 487da489a..868b8b3c4 100644
--- a/test/infiniop-test/test_generate/testcases/reduce_max.py
+++ b/test/infiniop-test/test_generate/testcases/reduce_max.py
@@ -4,7 +4,13 @@ from typing import List
 from enum import Enum, auto
 
-from .. import InfiniopTestWriter, InfiniopTestCase, np_dtype_to_ggml, gguf_strides, contiguous_gguf_strides
+from .. import (
+    InfiniopTestWriter,
+    InfiniopTestCase,
+    np_dtype_to_ggml,
+    gguf_strides,
+    contiguous_gguf_strides,
+)
 
 
 def reduce_max(x, dim):
@@ -31,8 +37,8 @@ def __init__(
         super().__init__("reduce_max")
         self.x = x
         self.y = y
-        self.shape_x=shape_x
-        self.shape_y=shape_y
+        self.shape_x = shape_x
+        self.shape_y = shape_y
         self.stride_x = stride_x
         self.stride_y = stride_y
         self.dim = dim
@@ -45,10 +51,18 @@ def write_test(self, test_writer: "InfiniopTestWriter"):
         if self.shape_y is not None:
             test_writer.add_array(test_writer.gguf_key("output.shape"), self.shape_y)
         if self.stride_x is not None:
-            test_writer.add_array(test_writer.gguf_key("input.strides"), gguf_strides(*self.stride_x))
+            test_writer.add_array(
+                test_writer.gguf_key("input.strides"), gguf_strides(*self.stride_x)
+            )
         test_writer.add_array(
             test_writer.gguf_key("output.strides"),
-            gguf_strides(*self.stride_y if self.stride_y is not None else contiguous_gguf_strides(self.shape_y))
+            gguf_strides(
+                *(
+                    self.stride_y
+                    if self.stride_y is not None
+                    else contiguous_gguf_strides(self.shape_y)
+                )
+            ),
         )
         test_writer.add_uint64(test_writer.gguf_key("dim"), self.dim)
         test_writer.add_tensor(
@@ -61,9 +75,7 @@ def write_test(self, test_writer: "InfiniopTestWriter"):
             self.y,
             raw_dtype=np_dtype_to_ggml(self.y.dtype),
         )
-        ans = reduce_max(
-            self.x.astype(np.float64), self.dim
-        )
+        ans = reduce_max(self.x.astype(np.float64), self.dim)
         test_writer.add_tensor(
             test_writer.gguf_key("ans"), ans, raw_dtype=gguf.GGMLQuantizationType.F64
         )
@@ -77,20 +89,20 @@ def write_test(self, test_writer: "InfiniopTestWriter"):
 # ==============================================================================
 # These are not meant to be imported from other modules
 _TEST_CASES_ = [
-    # y_shape, x_shape, y_stride, x_stride, dim
-    # ((0,), (0,), (0,), (0,), 0),
-    ((1, ), (32, ), None, None, 0),
-    ((1, 4), (1, 4), None, None, 0),
-    ((1, 1), (1, 4), None, None, 1),
-    ((16, 1), (16, 2048), None, None, 1),
-    ((1, 16), (2048, 16), None, None, 0),
-    ((16, 1), (16, 2048), (4096, 1), (4096, 1), 1),
-    ((1, 2048), (16, 2048), (4096, 1), (4096, 1), 0),
-    ((4, 4, 1), (4, 4, 2048), None, None, 2),
-    ((1, 4, 4), (2048, 4, 4), None, None, 0),
-    ((4, 1, 4), (4, 2048, 4), (45056, 5632, 1), (32768, 8, 1), 1),
-    ((1, 8, 4, 8), (16, 8, 4, 8), (256, 32, 8, 1), (256, 32, 8, 1), 0),
-]
+    # y_shape, x_shape, y_stride, x_stride, dim
+    # ((0,), (0,), (0,), (0,), 0),
+    ((1,), (32,), None, None, 0),
+    ((1, 4), (1, 4), None, None, 0),
+    ((1, 1), (1, 4), None, None, 1),
+    ((16, 1), (16, 2048), None, None, 1),
+    ((1, 16), (2048, 16), None, None, 0),
+    ((16, 1), (16, 2048), (4096, 1), (4096, 1), 1),
+    ((1, 2048), (16, 2048), (4096, 1), (4096, 1), 0),
+    ((4, 4, 1), (4, 4, 2048), None, None, 2),
+    ((1, 4, 4), (2048, 4, 
4), None, None, 0), + ((4, 1, 4), (4, 2048, 4), (45056, 5632, 1), (32768, 8, 1), 1), + ((1, 8, 4, 8), (16, 8, 4, 8), (256, 32, 8, 1), (256, 32, 8, 1), 0), + ] _TENSOR_DTYPES_ = [np.float16, np.float32] for dtype in _TENSOR_DTYPES_: @@ -107,6 +119,6 @@ def write_test(self, test_writer: "InfiniopTestWriter"): dim, ) test_cases.append(test_case) - + test_writer.add_tests(test_cases) test_writer.save() diff --git a/test/infiniop-test/test_generate/testcases/reduce_mean.py b/test/infiniop-test/test_generate/testcases/reduce_mean.py index 74421d83d..a452a8a89 100644 --- a/test/infiniop-test/test_generate/testcases/reduce_mean.py +++ b/test/infiniop-test/test_generate/testcases/reduce_mean.py @@ -4,7 +4,13 @@ from typing import List from enum import Enum, auto -from .. import InfiniopTestWriter, InfiniopTestCase, np_dtype_to_ggml, gguf_strides, contiguous_gguf_strides +from .. import ( + InfiniopTestWriter, + InfiniopTestCase, + np_dtype_to_ggml, + gguf_strides, + contiguous_gguf_strides, +) def reduce_mean(x, dim): @@ -31,8 +37,8 @@ def __init__( super().__init__("reduce_mean") self.x = x self.y = y - self.shape_x=shape_x - self.shape_y=shape_y + self.shape_x = shape_x + self.shape_y = shape_y self.stride_x = stride_x self.stride_y = stride_y self.dim = dim @@ -45,10 +51,18 @@ def write_test(self, test_writer: "InfiniopTestWriter"): if self.shape_y is not None: test_writer.add_array(test_writer.gguf_key("output.shape"), self.shape_y) if self.stride_x is not None: - test_writer.add_array(test_writer.gguf_key("input.strides"), gguf_strides(*self.stride_x)) + test_writer.add_array( + test_writer.gguf_key("input.strides"), gguf_strides(*self.stride_x) + ) test_writer.add_array( test_writer.gguf_key("output.strides"), - gguf_strides(*self.stride_y if self.stride_y is not None else contiguous_gguf_strides(self.shape_y)) + gguf_strides( + *( + self.stride_y + if self.stride_y is not None + else contiguous_gguf_strides(self.shape_y) + ) + ), ) test_writer.add_uint64(test_writer.gguf_key("dim"), self.dim) test_writer.add_tensor( @@ -61,9 +75,7 @@ def write_test(self, test_writer: "InfiniopTestWriter"): self.y, raw_dtype=np_dtype_to_ggml(self.y.dtype), ) - ans = reduce_mean( - self.x.astype(np.float64), self.dim - ) + ans = reduce_mean(self.x.astype(np.float64), self.dim) test_writer.add_tensor( test_writer.gguf_key("ans"), ans, raw_dtype=gguf.GGMLQuantizationType.F64 ) @@ -77,20 +89,20 @@ def write_test(self, test_writer: "InfiniopTestWriter"): # ============================================================================== # These are not meant to be imported from other modules _TEST_CASES_ = [ - # y_shape, x_shape, y_stride, x_stride, dim - # ((0,), (0,), (0,), (0,), 0), - ((1, ), (32, ), None, None, 0), - ((1, 4), (1, 4), None, None, 0), - ((1, 1), (1, 4), None, None, 1), - ((16, 1), (16, 2048), None, None, 1), - ((1, 16), (2048, 16), None, None, 0), - ((16, 1), (16, 2048), (4096, 1), (4096, 1), 1), - ((1, 2048), (16, 2048), (4096, 1), (4096, 1), 0), - ((4, 4, 1), (4, 4, 2048), None, None, 2), - ((1, 4, 4), (2048, 4, 4), None, None, 0), - ((4, 1, 4), (4, 2048, 4), (45056, 5632, 1), (32768, 8, 1), 1), - ((1, 8, 4, 8), (16, 8, 4, 8), (256, 32, 8, 1), (256, 32, 8, 1), 0), -] + # y_shape, x_shape, y_stride, x_stride, dim + # ((0,), (0,), (0,), (0,), 0), + ((1,), (32,), None, None, 0), + ((1, 4), (1, 4), None, None, 0), + ((1, 1), (1, 4), None, None, 1), + ((16, 1), (16, 2048), None, None, 1), + ((1, 16), (2048, 16), None, None, 0), + ((16, 1), (16, 2048), (4096, 1), (4096, 1), 1), + ((1, 2048), (16, 
2048), (4096, 1), (4096, 1), 0),
+    ((4, 4, 1), (4, 4, 2048), None, None, 2),
+    ((1, 4, 4), (2048, 4, 4), None, None, 0),
+    ((4, 1, 4), (4, 2048, 4), (45056, 5632, 1), (32768, 8, 1), 1),
+    ((1, 8, 4, 8), (16, 8, 4, 8), (256, 32, 8, 1), (256, 32, 8, 1), 0),
+    ]
 _TENSOR_DTYPES_ = [np.float16, np.float32]
 
     for dtype in _TENSOR_DTYPES_:
@@ -107,6 +119,6 @@ def write_test(self, test_writer: "InfiniopTestWriter"):
                 dim,
             )
             test_cases.append(test_case)
-    
+
     test_writer.add_tests(test_cases)
     test_writer.save()
diff --git a/test/infiniop-test/test_generate/testcases/rms_norm.py b/test/infiniop-test/test_generate/testcases/rms_norm.py
index 681ebafc4..9332c090a 100644
--- a/test/infiniop-test/test_generate/testcases/rms_norm.py
+++ b/test/infiniop-test/test_generate/testcases/rms_norm.py
@@ -1,11 +1,19 @@
 import numpy as np
 from typing import List
 
-from .. import InfiniopTestWriter, InfiniopTestCase, np_dtype_to_ggml, gguf_strides, contiguous_gguf_strides
+from .. import (
+    InfiniopTestWriter,
+    InfiniopTestCase,
+    np_dtype_to_ggml,
+    gguf_strides,
+    contiguous_gguf_strides,
+)
+
 
 def random_tensor(shape: tuple, dtype: np.dtype) -> np.ndarray:
     return np.random.uniform(-1.0, 1.0, shape).astype(dtype) * 0.001
 
+
 def rms_norm(x: np.ndarray, w: np.ndarray, epsilon: float) -> np.ndarray:
     """
     Compute the rms_norm result using numpy
@@ -16,13 +24,14 @@ def rms_norm(x: np.ndarray, w: np.ndarray, epsilon: float) -> np.ndarray:
     Returns:
         the output tensor, same shape as input
     """
-    squared = x ** 2
+    squared = x**2
     mean = np.mean(squared, axis=-1, keepdims=True)
     rms = np.sqrt(mean + epsilon)
-    
+
     normalized = x / rms
     return normalized * w
 
+
 class RMSNormTestCase(InfiniopTestCase):
     def __init__(
         self,
@@ -40,9 +49,9 @@ def __init__(
         self.y = y
         self.shape = shape
         self.epsilon = epsilon
-        self.x_strides=x_strides
-        self.y_strides=y_strides
-        
+        self.x_strides = x_strides
+        self.y_strides = y_strides
+
     def write_test(self, test_writer: "InfiniopTestWriter"):
         super().write_test(test_writer)
         test_writer.add_float32(test_writer.gguf_key("epsilon"), self.epsilon)
@@ -50,10 +59,18 @@ def write_test(self, test_writer: "InfiniopTestWriter"):
         test_writer.add_array(test_writer.gguf_key("x.shape"), self.shape)
         test_writer.add_array(test_writer.gguf_key("y.shape"), self.shape)
         if self.x_strides is not None:
-            test_writer.add_array(test_writer.gguf_key("x.strides"), gguf_strides(*self.x_strides))
+            test_writer.add_array(
+                test_writer.gguf_key("x.strides"), gguf_strides(*self.x_strides)
+            )
         test_writer.add_array(
             test_writer.gguf_key("y.strides"),
-            gguf_strides(*self.y_strides if self.y_strides is not None else contiguous_gguf_strides(self.shape))
+            gguf_strides(
+                *(
+                    self.y_strides
+                    if self.y_strides is not None
+                    else contiguous_gguf_strides(self.shape)
+                )
+            ),
         )
         test_writer.add_tensor(
             test_writer.gguf_key("x"),
@@ -70,13 +87,16 @@ def write_test(self, test_writer: "InfiniopTestWriter"):
             self.y,
             raw_dtype=np_dtype_to_ggml(self.y.dtype),
         )
-        ans = rms_norm(self.x.astype(np.float64), self.w.astype(np.float64), self.epsilon)
+        ans = rms_norm(
+            self.x.astype(np.float64), self.w.astype(np.float64), self.epsilon
+        )
         test_writer.add_tensor(
             test_writer.gguf_key("ans"),
             ans,
             raw_dtype=np_dtype_to_ggml(np.float64),
         )
 
+
 if __name__ == "__main__":
     test_writer = InfiniopTestWriter("rms_norm.gguf")
     test_cases = []
@@ -112,9 +132,9 @@ def write_test(self, test_writer: "InfiniopTestWriter"):
                 shape=shape,
                 x_strides=x_strides,
                 y_strides=y_strides,
-                epsilon=epsilon
+                epsilon=epsilon,
             )
-            test_cases.append(test_case)    
+            test_cases.append(test_case)
 
     test_writer.add_tests(test_cases)
test_writer.save() diff --git a/test/infiniop-test/test_generate/testcases/rope.py b/test/infiniop-test/test_generate/testcases/rope.py index 85d9685dd..27f5a06db 100644 --- a/test/infiniop-test/test_generate/testcases/rope.py +++ b/test/infiniop-test/test_generate/testcases/rope.py @@ -4,11 +4,17 @@ from typing import List -from .. import InfiniopTestWriter, InfiniopTestCase, np_dtype_to_ggml, gguf_strides, contiguous_gguf_strides +from .. import ( + InfiniopTestWriter, + InfiniopTestCase, + np_dtype_to_ggml, + gguf_strides, + contiguous_gguf_strides, +) def rotary_embedding(t, sin, cos): - dh = t.shape[2] + dh = t.shape[2] assert dh % 2 == 0, "Embedding dimension must be even." t_even = t[..., 0::2] # [seq_len, n_head, dh // 2] @@ -30,7 +36,9 @@ def rotary_embedding(t, sin, cos): def sin_cos_table(pos, dim, theta, dtype): assert dim % 2 == 0, "Embedding dimension must be even." - freqs = 1.0 / (theta ** (np.arange(0, dim, 2)[: (dim // 2)].astype(np.float32) / dim)) + freqs = 1.0 / ( + theta ** (np.arange(0, dim, 2)[: (dim // 2)].astype(np.float32) / dim) + ) angles = np.outer(pos, freqs) @@ -79,19 +87,33 @@ def write_test(self, test_writer: "InfiniopTestWriter"): test_writer.add_array(test_writer.gguf_key("x.shape"), self.shape_x) test_writer.add_array( test_writer.gguf_key("y.strides"), - gguf_strides(*self.stride_y if self.stride_y is not None else contiguous_gguf_strides(self.shape_y)) + gguf_strides( + *( + self.stride_y + if self.stride_y is not None + else contiguous_gguf_strides(self.shape_y) + ) + ), ) if self.stride_x is not None: - test_writer.add_array(test_writer.gguf_key("x.strides"), gguf_strides(*self.stride_x)) + test_writer.add_array( + test_writer.gguf_key("x.strides"), gguf_strides(*self.stride_x) + ) test_writer.add_tensor( - test_writer.gguf_key("pos_ids"), self.pos_ids, raw_dtype=np_dtype_to_ggml(self.pos_ids.dtype) + test_writer.gguf_key("pos_ids"), + self.pos_ids, + raw_dtype=np_dtype_to_ggml(self.pos_ids.dtype), ) test_writer.add_tensor( - test_writer.gguf_key("sin_table"), self.sin_table, raw_dtype=np_dtype_to_ggml(self.sin_table.dtype) + test_writer.gguf_key("sin_table"), + self.sin_table, + raw_dtype=np_dtype_to_ggml(self.sin_table.dtype), ) test_writer.add_tensor( - test_writer.gguf_key("cos_table"), self.cos_table, raw_dtype=np_dtype_to_ggml(self.cos_table.dtype) + test_writer.gguf_key("cos_table"), + self.cos_table, + raw_dtype=np_dtype_to_ggml(self.cos_table.dtype), ) ans = rotary_embedding( self.x.astype(np.float64), @@ -103,8 +125,6 @@ def write_test(self, test_writer: "InfiniopTestWriter"): ) - - if __name__ == "__main__": # ============================================================================== # Configuration (Internal Use Only) @@ -130,7 +150,9 @@ def write_test(self, test_writer: "InfiniopTestWriter"): x = np.random.rand(*shape).astype(dtype) y = np.empty(tuple(0 for _ in shape), dtype=dtype) pos_ids = np.arange(0, x.shape[0], dtype=np.int32) - sin_table, cos_table = sin_cos_table(pos_ids, x.shape[2], theta=1e5, dtype=dtype) + sin_table, cos_table = sin_cos_table( + pos_ids, x.shape[2], theta=1e5, dtype=dtype + ) test_case = RoPETestCase( y=y, x=x, diff --git a/test/infiniop-test/test_generate/testcases/swiglu.py b/test/infiniop-test/test_generate/testcases/swiglu.py index cb692b613..aa3450fed 100644 --- a/test/infiniop-test/test_generate/testcases/swiglu.py +++ b/test/infiniop-test/test_generate/testcases/swiglu.py @@ -2,7 +2,14 @@ import gguf from typing import List -from .. 
import InfiniopTestWriter, InfiniopTestCase, np_dtype_to_ggml, gguf_strides, contiguous_gguf_strides, process_zero_stride_tensor +from .. import ( + InfiniopTestWriter, + InfiniopTestCase, + np_dtype_to_ggml, + gguf_strides, + contiguous_gguf_strides, + process_zero_stride_tensor, +) def swiglu( @@ -26,7 +33,6 @@ def __init__( c: np.ndarray, shape_c: List[int] | None, stride_c: List[int] | None, - ): super().__init__("swiglu") self.a = a @@ -39,7 +45,6 @@ def __init__( self.shape_c = shape_c self.stride_c = stride_c - def write_test(self, test_writer: "InfiniopTestWriter"): super().write_test(test_writer) if self.shape_a is not None: @@ -47,14 +52,24 @@ def write_test(self, test_writer: "InfiniopTestWriter"): if self.shape_b is not None: test_writer.add_array(test_writer.gguf_key("b.shape"), self.shape_b) if self.shape_c is not None: - test_writer.add_array(test_writer.gguf_key("c.shape"), self.shape_c) + test_writer.add_array(test_writer.gguf_key("c.shape"), self.shape_c) if self.stride_a is not None: - test_writer.add_array(test_writer.gguf_key("a.strides"), gguf_strides(*self.stride_a)) + test_writer.add_array( + test_writer.gguf_key("a.strides"), gguf_strides(*self.stride_a) + ) if self.stride_b is not None: - test_writer.add_array(test_writer.gguf_key("b.strides"), gguf_strides(*self.stride_b)) + test_writer.add_array( + test_writer.gguf_key("b.strides"), gguf_strides(*self.stride_b) + ) test_writer.add_array( test_writer.gguf_key("c.strides"), - gguf_strides(*self.stride_c if self.stride_c is not None else contiguous_gguf_strides(self.shape_c)) + gguf_strides( + *( + self.stride_c + if self.stride_c is not None + else contiguous_gguf_strides(self.shape_c) + ) + ), ) test_writer.add_tensor( test_writer.gguf_key("a"), self.a, raw_dtype=np_dtype_to_ggml(self.a.dtype) diff --git a/test/infiniop/libinfiniop/op_register.py b/test/infiniop/libinfiniop/op_register.py index 4e3ad70db..72bf31f48 100644 --- a/test/infiniop/libinfiniop/op_register.py +++ b/test/infiniop/libinfiniop/op_register.py @@ -454,6 +454,7 @@ def swiglu_(lib): infiniopOperatorDescriptor_t, ] + @OpRegister.operator def conv_(lib): lib.infiniopCreateConvDescriptor.restype = c_int32 @@ -490,6 +491,7 @@ def conv_(lib): infiniopOperatorDescriptor_t, ] + @OpRegister.operator def reduce_mean_(lib): lib.infiniopCreateReduceMeanDescriptor.restype = c_int32 @@ -522,6 +524,7 @@ def reduce_mean_(lib): infiniopOperatorDescriptor_t, ] + @OpRegister.operator def reduce_max_(lib): lib.infiniopCreateReduceMaxDescriptor.restype = c_int32 @@ -552,4 +555,4 @@ def reduce_max_(lib): lib.infiniopDestroyReduceMaxDescriptor.restype = c_int32 lib.infiniopDestroyReduceMaxDescriptor.argtypes = [ infiniopOperatorDescriptor_t, - ] \ No newline at end of file + ] diff --git a/test/infiniop/reduce_max.py b/test/infiniop/reduce_max.py index 8e27ae484..00c86f76a 100644 --- a/test/infiniop/reduce_max.py +++ b/test/infiniop/reduce_max.py @@ -25,7 +25,7 @@ _TEST_CASES_ = [ # y_shape, x_shape, y_stride, x_stride, dim ((), (), None, None, 0), - ((1, ), (32, ), None, None, 0), + ((1,), (32,), None, None, 0), ((1, 4), (1, 4), None, None, 0), ((1, 1), (1, 4), None, None, 1), ((16, 1), (16, 2048), None, None, 1), @@ -56,7 +56,7 @@ def reduce_max(x, dim): - return x.max(dim=dim,keepdim=True)[0] + return x.max(dim=dim, keepdim=True)[0] def test( @@ -77,7 +77,7 @@ def test( x = TestTensor(x_shape, x_stride, dtype, device) ans = reduce_max(x.torch_tensor(), dim) - + y = TestTensor(y_shape, y_stride, dtype, device) if sync is not None: @@ -86,7 +86,11 @@ def test( 
     descriptor = infiniopOperatorDescriptor_t()
     check_error(
         LIBINFINIOP.infiniopCreateReduceMaxDescriptor(
-            handle, ctypes.byref(descriptor), y.descriptor, x.descriptor, ctypes.c_size_t(dim)
+            handle,
+            ctypes.byref(descriptor),
+            y.descriptor,
+            x.descriptor,
+            ctypes.c_size_t(dim),
         )
     )
 
diff --git a/test/infiniop/reduce_mean.py b/test/infiniop/reduce_mean.py
index 3b752b671..c097cd296 100644
--- a/test/infiniop/reduce_mean.py
+++ b/test/infiniop/reduce_mean.py
@@ -25,7 +25,7 @@
 _TEST_CASES_ = [
     # y_shape, x_shape, y_stride, x_stride, dim
     ((), (), None, None, 0),
-    ((1, ), (32, ), None, None, 0),
+    ((1,), (32,), None, None, 0),
     ((1, 4), (1, 4), None, None, 0),
     ((1, 1), (1, 4), None, None, 1),
     ((16, 1), (16, 2048), None, None, 1),
@@ -56,7 +56,7 @@
 
 def reduce_mean(x, dim):
-    return x.mean(dim=dim,keepdim=True)
+    return x.mean(dim=dim, keepdim=True)
 
 
 def test(
@@ -86,7 +86,11 @@ def test(
     descriptor = infiniopOperatorDescriptor_t()
     check_error(
         LIBINFINIOP.infiniopCreateReduceMeanDescriptor(
-            handle, ctypes.byref(descriptor), y.descriptor, x.descriptor, ctypes.c_size_t(dim)
+            handle,
+            ctypes.byref(descriptor),
+            y.descriptor,
+            x.descriptor,
+            ctypes.c_size_t(dim),
         )
     )
 
From 8cd37e3fd6563209f78a75da953ad6144712b2a6 Mon Sep 17 00:00:00 2001
From: S-hhhhh <2320230838@mail.nankai.edu.cn>
Date: Sun, 24 Aug 2025 15:56:20 +0800
Subject: [PATCH 10/11] formatted

---
 src/infiniop-test/include/ops.hpp | 33 ++++++++++++++++---------------
 src/utils.h                       |  4 ++--
 2 files changed, 19 insertions(+), 18 deletions(-)

diff --git a/src/infiniop-test/include/ops.hpp b/src/infiniop-test/include/ops.hpp
index bd3a3fd38..aee602aac 100644
--- a/src/infiniop-test/include/ops.hpp
+++ b/src/infiniop-test/include/ops.hpp
@@ -32,21 +32,22 @@ DECLARE_INFINIOP_TEST(reduce_max)
 /*
  * Register all the tests here
  */
-#define TEST_BUILDER_MAPPINGS                  \
-    {                                          \
-        REGISTER_INFINIOP_TEST(gemm)           \
-        REGISTER_INFINIOP_TEST(random_sample)  \
-        REGISTER_INFINIOP_TEST(add)            \
-        REGISTER_INFINIOP_TEST(mul)            \
-        REGISTER_INFINIOP_TEST(clip)           \
-        REGISTER_INFINIOP_TEST(swiglu)         \
-        REGISTER_INFINIOP_TEST(rope)           \
-        REGISTER_INFINIOP_TEST(rms_norm)       \
-        REGISTER_INFINIOP_TEST(causal_softmax) \
-        REGISTER_INFINIOP_TEST(rearrange)      \
-        REGISTER_INFINIOP_TEST(sub)            \
-        REGISTER_INFINIOP_TEST(reduce_mean)    \
-        REGISTER_INFINIOP_TEST(reduce_max)}
+#define TEST_BUILDER_MAPPINGS                      \
+    {                                              \
+        REGISTER_INFINIOP_TEST(gemm)               \
+        REGISTER_INFINIOP_TEST(random_sample)      \
+        REGISTER_INFINIOP_TEST(add)                \
+        REGISTER_INFINIOP_TEST(mul)                \
+        REGISTER_INFINIOP_TEST(clip)               \
+        REGISTER_INFINIOP_TEST(swiglu)             \
+        REGISTER_INFINIOP_TEST(rope)               \
+        REGISTER_INFINIOP_TEST(rms_norm)           \
+        REGISTER_INFINIOP_TEST(causal_softmax)     \
+        REGISTER_INFINIOP_TEST(rearrange)          \
+        REGISTER_INFINIOP_TEST(sub)                \
+        REGISTER_INFINIOP_TEST(reduce_mean)        \
+        REGISTER_INFINIOP_TEST(reduce_max)         \
+    }
 
 namespace infiniop_test {
@@ -67,4 +68,4 @@ bool check_names(
 } // namespace infiniop_test
 
-#endif
+#endif
\ No newline at end of file
diff --git a/src/utils.h b/src/utils.h
index af222baca..e721f05a6 100644
--- a/src/utils.h
+++ b/src/utils.h
@@ -98,7 +98,7 @@ inline std::string infiniDtypeToString(infiniDtype_t dtype) {
     }
 }
 
-#define CEIL_DIV(x, y) (((x) + (y) - 1) / (y))
+#define CEIL_DIV(x, y) (((x) + (y)-1) / (y))
 
 namespace utils {
 
@@ -108,4 +108,4 @@ inline size_t align(size_t size, size_t alignment) {
 
 } // namespace utils
 
-#endif
+#endif
\ No newline at end of file
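A side effect of this formatting pass worth flagging: the utils.h hunk reverts the CEIL_DIV spacing introduced in the first patch, most likely because clang-format parses the macro parameter (y) as a C-style cast and glues it to the minus sign. Both spellings expand identically; the ceiling-division identity the macro relies on can be spot-checked quickly (a throwaway check, not part of the patch):

    # CEIL_DIV(x, y) = ((x) + (y) - 1) / (y) equals ceil(x / y) for positive ints.
    import math

    for x in range(1, 200):
        for y in range(1, 50):
            assert (x + y - 1) // y == math.ceil(x / y)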
From 4d68243d644c9b564d67383c7edd54f97109bf84 Mon Sep 17 00:00:00 2001
From: S-hhhhh <2320230838@mail.nankai.edu.cn>
Date: Sun, 24 Aug 2025 22:46:52 +0800
Subject: [PATCH 11/11] fix: windows build

---
 .../ops/reduce_max/cpu/reduce_max_cpu.cc      | 34 ++++++++++++++++---
 .../ops/reduce_mean/cpu/reduce_mean_cpu.cc    | 34 ++++++++++++++++---
 2 files changed, 58 insertions(+), 10 deletions(-)

diff --git a/src/infiniop/ops/reduce_max/cpu/reduce_max_cpu.cc b/src/infiniop/ops/reduce_max/cpu/reduce_max_cpu.cc
index 5ed28d75e..55a340226 100644
--- a/src/infiniop/ops/reduce_max/cpu/reduce_max_cpu.cc
+++ b/src/infiniop/ops/reduce_max/cpu/reduce_max_cpu.cc
@@ -20,11 +20,7 @@ infiniStatus_t Descriptor::create(
 template <typename T>
 infiniStatus_t reduce_max(const ReduceMaxInfo *info, T *y, const T *x) {
-    const size_t batch_size = info->shape[0];
-    const size_t channels = info->shape[1];
-    const size_t rows = info->shape[2];
     const size_t cols = info->shape[3]; // the reduction dimension
-
     const ptrdiff_t y_batch_stride = info->y_strides[0];
     const ptrdiff_t y_channel_stride = info->y_strides[1];
     const ptrdiff_t y_row_stride = info->y_strides[2];
@@ -32,7 +28,35 @@ infiniStatus_t reduce_max(const ReduceMaxInfo *info, T *y, const T *x) {
     const ptrdiff_t x_channel_stride = info->x_strides[1];
     const ptrdiff_t x_row_stride = info->x_strides[2];
     const ptrdiff_t x_col_stride = info->x_strides[3];
+#ifdef _WIN32
+    const ptrdiff_t batch_size = static_cast<ptrdiff_t>(info->shape[0]);
+    const ptrdiff_t channels = static_cast<ptrdiff_t>(info->shape[1]);
+    const ptrdiff_t rows = static_cast<ptrdiff_t>(info->shape[2]);
+#pragma omp parallel for
+    for (ptrdiff_t batch = 0; batch < batch_size; ++batch) {
+        for (ptrdiff_t channel = 0; channel < channels; ++channel) {
+            for (ptrdiff_t row = 0; row < rows; ++row) {
+                const T *input_start = x + batch * x_batch_stride
+                                     + channel * x_channel_stride
+                                     + row * x_row_stride;
+                T *output_ptr = y + batch * y_batch_stride
+                              + channel * y_channel_stride
+                              + row * y_row_stride;
+
+                float max = op::common_cpu::reduce_op::max(input_start, cols, x_col_stride);
+                if constexpr (std::is_same<T, fp16_t>::value || std::is_same<T, bf16_t>::value) {
+                    *output_ptr = utils::cast<T>(max);
+                } else {
+                    *output_ptr = max;
+                }
+            }
+        }
+    }
+#else
+    const size_t batch_size = info->shape[0];
+    const size_t channels = info->shape[1];
+    const size_t rows = info->shape[2];
 #pragma omp parallel for collapse(3)
     for (size_t batch = 0; batch < batch_size; ++batch) {
         for (size_t channel = 0; channel < channels; ++channel) {
@@ -54,7 +78,7 @@ infiniStatus_t reduce_max(const ReduceMaxInfo *info, T *y, const T *x) {
             }
         }
     }
-
+#endif //_WIN32
     return INFINI_STATUS_SUCCESS;
 }
 infiniStatus_t Descriptor::calculate(
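The _WIN32 branch above exists because MSVC implements only OpenMP 2.0, which requires signed loop counters and has no collapse clause; hence the size_t to ptrdiff_t switch and the single parallel for over the outer loop. Both branches walk the same strided layout. A reviewer-side numpy sketch of what they compute, useful for cross-checking (the function name and flat-buffer framing here are ours, not project code):

    import numpy as np

    def reduce_max_ref(flat, shape, strides):
        # shape/strides are (batch, channel, row, col) in elements; the
        # reduction runs over the col axis, mirroring the loops above.
        b, c, r, k = shape
        sb, sc, sr, sk = strides
        out = np.empty((b, c, r), dtype=flat.dtype)
        for i in range(b):
            for j in range(c):
                for m in range(r):
                    base = i * sb + j * sc + m * sr
                    out[i, j, m] = max(flat[base + n * sk] for n in range(k))
        return out

    # Contiguous check: shape (2, 3, 4, 5) has row-major strides (60, 20, 5, 1).
    x = np.arange(2 * 3 * 4 * 5, dtype=np.float32)
    assert np.array_equal(
        reduce_max_ref(x, (2, 3, 4, 5), (60, 20, 5, 1)),
        x.reshape(2, 3, 4, 5).max(axis=-1),
    )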
diff --git a/src/infiniop/ops/reduce_mean/cpu/reduce_mean_cpu.cc b/src/infiniop/ops/reduce_mean/cpu/reduce_mean_cpu.cc
index 5eb2b5419..7853afdbb 100644
--- a/src/infiniop/ops/reduce_mean/cpu/reduce_mean_cpu.cc
+++ b/src/infiniop/ops/reduce_mean/cpu/reduce_mean_cpu.cc
@@ -20,11 +20,7 @@ infiniStatus_t Descriptor::create(
 template <typename T>
 infiniStatus_t reduce_mean(const ReduceMeanInfo *info, T *y, const T *x) {
-    const size_t batch_size = info->shape[0];
-    const size_t channels = info->shape[1];
-    const size_t rows = info->shape[2];
     const size_t cols = info->shape[3]; // the reduction dimension
-
     const ptrdiff_t y_batch_stride = info->y_strides[0];
     const ptrdiff_t y_channel_stride = info->y_strides[1];
     const ptrdiff_t y_row_stride = info->y_strides[2];
@@ -32,7 +28,35 @@ infiniStatus_t reduce_mean(const ReduceMeanInfo *info, T *y, const T *x) {
     const ptrdiff_t x_channel_stride = info->x_strides[1];
     const ptrdiff_t x_row_stride = info->x_strides[2];
     const ptrdiff_t x_col_stride = info->x_strides[3];
+#ifdef _WIN32
+    const ptrdiff_t batch_size = static_cast<ptrdiff_t>(info->shape[0]);
+    const ptrdiff_t channels = static_cast<ptrdiff_t>(info->shape[1]);
+    const ptrdiff_t rows = static_cast<ptrdiff_t>(info->shape[2]);
+#pragma omp parallel for
+    for (ptrdiff_t batch = 0; batch < batch_size; ++batch) {
+        for (ptrdiff_t channel = 0; channel < channels; ++channel) {
+            for (ptrdiff_t row = 0; row < rows; ++row) {
+                const T *input_start = x + batch * x_batch_stride
+                                     + channel * x_channel_stride
+                                     + row * x_row_stride;
+                T *output_ptr = y + batch * y_batch_stride
+                              + channel * y_channel_stride
+                              + row * y_row_stride;
+
+                float mean = op::common_cpu::reduce_op::sum(input_start, cols, x_col_stride) / cols;
+                if constexpr (std::is_same<T, fp16_t>::value || std::is_same<T, bf16_t>::value) {
+                    *output_ptr = utils::cast<T>(mean);
+                } else {
+                    *output_ptr = mean;
+                }
+            }
+        }
+    }
+#else
+    const size_t batch_size = info->shape[0];
+    const size_t channels = info->shape[1];
+    const size_t rows = info->shape[2];
 #pragma omp parallel for collapse(3)
     for (size_t batch = 0; batch < batch_size; ++batch) {
         for (size_t channel = 0; channel < channels; ++channel) {
@@ -54,7 +78,7 @@ infiniStatus_t reduce_mean(const ReduceMeanInfo *info, T *y, const T *x) {
             }
         }
     }
-
+#endif //_WIN32
    return INFINI_STATUS_SUCCESS;
 }
 infiniStatus_t Descriptor::calculate(