From 3d3308c43b194bc37a01fa0a6ad94273f3c5368b Mon Sep 17 00:00:00 2001
From: "Niu, Xiaoguang" 
Date: Wed, 15 May 2024 10:07:35 +0800
Subject: [PATCH 01/29] move code from dnn-compiler

---
 .../DataFlow/ConstantSubgraphAnalysis.h       | 127 +++++
 .../Dialect/OnednnGraph/OnednnGraphDialect.td |   1 +
 include/gc/Transforms/Passes.h                |   5 +
 include/gc/Transforms/Passes.td               |  16 +
 lib/gc/Analysis/CMakeLists.txt                |  16 +
 .../DataFlow/ConstantSubgraphAnalysis.cpp     | 180 +++++++
 lib/gc/CMakeLists.txt                         |   1 +
 .../OnednnGraph/OnednnGraphDialect.cpp        |   6 +
 lib/gc/Transforms/CMakeLists.txt              |   2 +
 lib/gc/Transforms/CSA.cpp                     |  51 ++
 lib/gc/Transforms/CST.cpp                     | 496 ++++++++++++++++++
 src/gc-opt/CMakeLists.txt                     |   3 +-
 .../test_constant_weights_folding.mlir        |  76 +++
 13 files changed, 979 insertions(+), 1 deletion(-)
 create mode 100644 include/gc/Analysis/DataFlow/ConstantSubgraphAnalysis.h
 create mode 100644 lib/gc/Analysis/CMakeLists.txt
 create mode 100644 lib/gc/Analysis/DataFlow/ConstantSubgraphAnalysis.cpp
 create mode 100644 lib/gc/Transforms/CSA.cpp
 create mode 100644 lib/gc/Transforms/CST.cpp
 create mode 100644 test/gc/Transforms/test_constant_weights_folding.mlir

diff --git a/include/gc/Analysis/DataFlow/ConstantSubgraphAnalysis.h b/include/gc/Analysis/DataFlow/ConstantSubgraphAnalysis.h
new file mode 100644
index 000000000..fcb2939d8
--- /dev/null
+++ b/include/gc/Analysis/DataFlow/ConstantSubgraphAnalysis.h
@@ -0,0 +1,127 @@
+//===- ConstantSubgraphAnalysis.h - Constant subgraph analysis ------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the constant subgraph analysis. It contains:
+// 1. the lattice value class that represents operations with constant inputs
+//    and outputs in the program, and
+// 2. a sparse constant subgraph analysis.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MLIR_ANALYSIS_DATAFLOW_CONSTANTSUBGRAPHANALYSIS_H
+#define MLIR_ANALYSIS_DATAFLOW_CONSTANTSUBGRAPHANALYSIS_H
+
+#include "mlir/Analysis/DataFlow/SparseAnalysis.h"
+#include <cassert>
+
+namespace mlir {
+namespace dataflow {
+
+//===----------------------------------------------------------------------===//
+// InConstantSubgraph
+//===----------------------------------------------------------------------===//
+
+/// This lattice represents a boolean value indicating whether an operation
+/// has constant inputs and constant outputs, and is hence part of a constant
+/// subgraph.
+class InConstantSubgraph {
+public:
+  /// Construct as uninitialized.
+  explicit InConstantSubgraph() = default;
+
+  /// Construct with a known state.
+  explicit InConstantSubgraph(bool initialized, bool inConstantSubgraph)
+      : initialized(initialized), inConstantSubgraph(inConstantSubgraph) {}
+
+  /// Get the state. Asserts that the state has been initialized.
+  bool getInConstantSubgraph() const {
+    assert(!isUninitialized());
+    return inConstantSubgraph;
+  }
+
+  /// Compare.
+  bool operator==(const InConstantSubgraph &rhs) const {
+    return initialized == rhs.initialized &&
+           inConstantSubgraph == rhs.inConstantSubgraph;
+  }
+
+  void print(raw_ostream &os) const;
+
+  /// Get the uninitialized state. This happens when the
+  /// state hasn't been set during the analysis.
+  static InConstantSubgraph getUninitialized() { return InConstantSubgraph{}; }
+
+  /// Whether the state is uninitialized.
+  bool isUninitialized() const { return !initialized; }
+
+  /// Get the unknown state.
+  static InConstantSubgraph getUnknown() {
+    return InConstantSubgraph{/*initialized=*/false,
+                              /*inConstantSubgraph=*/false};
+  }
+
+  // Join two states.
+  static InConstantSubgraph join(const InConstantSubgraph &lhs,
+                                 const InConstantSubgraph &rhs) {
+    // If one is uninitialized, use the other.
+    if (lhs.isUninitialized())
+      return rhs;
+    if (rhs.isUninitialized())
+      return lhs;
+
+    // Both are initialized; intersect them.
+    return InConstantSubgraph(true, lhs.getInConstantSubgraph() &&
+                                        rhs.getInConstantSubgraph());
+  }
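+
+  // Illustrative join semantics (example only, not part of the class):
+  //   InConstantSubgraph a(/*initialized=*/true, /*inConstantSubgraph=*/true);
+  //   InConstantSubgraph b(/*initialized=*/true, /*inConstantSubgraph=*/false);
+  //   join(a, b)                  -> initialized, not in constant subgraph
+  //   join(a, getUninitialized()) -> same state as `a`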
+
+private:
+  bool initialized = false;
+  bool inConstantSubgraph = false;
+};
+
+//===----------------------------------------------------------------------===//
+// ConstantSubgraphAnalysis
+//===----------------------------------------------------------------------===//
+
+class ConstantSubgraphAnalysis
+    : public SparseForwardDataFlowAnalysis<Lattice<InConstantSubgraph>> {
+public:
+  using SparseForwardDataFlowAnalysis::SparseForwardDataFlowAnalysis;
+
+  void visitOperation(Operation *op,
+                      ArrayRef<const Lattice<InConstantSubgraph> *> operands,
+                      ArrayRef<Lattice<InConstantSubgraph> *> results) override;
+
+  void setToEntryState(Lattice<InConstantSubgraph> *lattice) override;
+};
+
+//===----------------------------------------------------------------------===//
+// RunConstantSubgraphAnalysis
+//===----------------------------------------------------------------------===//
+
+/// Runs constant subgraph analysis on the IR defined by `op`.
+struct RunConstantSubgraphAnalysis {
+public:
+  RunConstantSubgraphAnalysis();
+
+  void run(Operation *op);
+
+  bool getInConstantSubgraph(Value val);
+
+private:
+  /// Stores the result of the analysis.
+  DataFlowSolver solver;
+
+  void getConstantSubgraph(DataFlowSolver &solver, Operation *topFunc);
+};
+} // end namespace dataflow
+} // end namespace mlir
+
+#endif // MLIR_ANALYSIS_DATAFLOW_CONSTANTSUBGRAPHANALYSIS_H
diff --git a/include/gc/Dialect/OnednnGraph/OnednnGraphDialect.td b/include/gc/Dialect/OnednnGraph/OnednnGraphDialect.td
index 16615a4d3..1f6fbe77b 100644
--- a/include/gc/Dialect/OnednnGraph/OnednnGraphDialect.td
+++ b/include/gc/Dialect/OnednnGraph/OnednnGraphDialect.td
@@ -24,6 +24,7 @@ def OnednnGraphDialect : Dialect {
     let cppNamespace = "::mlir::onednn_graph";
 
     let useDefaultTypePrinterParser = 1;
+    let hasOperationAttrVerify = 1;
 }
 
 #endif // ONEDNNGRAPH_DIALECT
diff --git a/include/gc/Transforms/Passes.h b/include/gc/Transforms/Passes.h
index 243a6f4f6..34d2fd487 100644
--- a/include/gc/Transforms/Passes.h
+++ b/include/gc/Transforms/Passes.h
@@ -15,8 +15,13 @@ namespace mlir {
 namespace gc {
 
 #define GEN_PASS_DECL
+#define GEN_PASS_DECL_CSA
+#define GEN_PASS_DECL_CST
 #include "gc/Transforms/Passes.h.inc"
 
+std::unique_ptr<Pass> createCSAPass();
+std::unique_ptr<Pass> createCSTPass();
+
 #define GEN_PASS_REGISTRATION
 #include "gc/Transforms/Passes.h.inc"
 } // namespace gc
diff --git a/include/gc/Transforms/Passes.td b/include/gc/Transforms/Passes.td
index 7274534b7..23593ded3 100644
--- a/include/gc/Transforms/Passes.td
+++ b/include/gc/Transforms/Passes.td
@@ -17,4 +17,20 @@ def TileLinalgNamed : Pass<"tile-named-linalg", "func::FuncOp"> {
     ["linalg::LinalgDialect", "scf::SCFDialect", "tensor::TensorDialect"];
 }
 
+def CSA : Pass<"csa"> {
+  let summary = "Constant Subgraph Analysis";
+  let description = [{
+    This pass implements a constant subgraph analysis.
+  }];
+  let constructor = "mlir::gc::createCSAPass()";
+}
+
+def CST : Pass<"cst"> {
+  let summary = "Constant Subgraph Transform";
+  let description = [{
+    This pass implements a constant subgraph transform.
+  }];
+  let constructor = "mlir::gc::createCSTPass()";
+}
+
 #endif // GC_DIALECT_GC_PASSES
diff --git a/lib/gc/Analysis/CMakeLists.txt b/lib/gc/Analysis/CMakeLists.txt
new file mode 100644
index 000000000..42c3d5541
--- /dev/null
+++ b/lib/gc/Analysis/CMakeLists.txt
@@ -0,0 +1,16 @@
+add_mlir_library(GCAnalysis
+  DataFlow/ConstantSubgraphAnalysis.cpp
+
+  ADDITIONAL_HEADER_DIRS
+    ${PROJECT_SOURCE_DIR}/include/
+
+  DEPENDS
+    GraphCompilerPassIncGen
+
+  LINK_LIBS PUBLIC
+    ${mlir_dialect_libs}
+    MLIRIR
+    MLIRSupport
+    MLIRBufferizationToMemRef
+    MLIRBufferizationPipelines
+  )
diff --git a/lib/gc/Analysis/DataFlow/ConstantSubgraphAnalysis.cpp b/lib/gc/Analysis/DataFlow/ConstantSubgraphAnalysis.cpp
new file mode 100644
index 000000000..2de9e5b4a
--- /dev/null
+++ b/lib/gc/Analysis/DataFlow/ConstantSubgraphAnalysis.cpp
@@ -0,0 +1,180 @@
+//===- ConstantSubgraphAnalysis.cpp - Constant subgraph analysis ----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+#include "gc/Analysis/DataFlow/ConstantSubgraphAnalysis.h"
+#include "mlir/Analysis/DataFlow/DeadCodeAnalysis.h"
+#include "mlir/Analysis/DataFlow/SparseAnalysis.h"
+#include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/ControlFlow/IR/ControlFlowOps.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/SCF/IR/SCF.h"
+#include "mlir/IR/Builders.h"
+#include "mlir/IR/BuiltinAttributes.h"
+#include "mlir/IR/Dialect.h"
+#include "mlir/IR/OpDefinition.h"
+#include "mlir/IR/Operation.h"
+#include "mlir/IR/Value.h"
+#include "mlir/Pass/Pass.h"
+#include "mlir/Support/LLVM.h"
+#include "mlir/Support/LogicalResult.h"
+#include "mlir/Transforms/Passes.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Debug.h"
+#include <cassert>
+
+#define DEBUG_TYPE "in-constant-subgraph"
+
+using namespace mlir;
+using namespace mlir::dataflow;
+
+//===----------------------------------------------------------------------===//
+// InConstantSubgraph
+//===----------------------------------------------------------------------===//
+
+void InConstantSubgraph::print(raw_ostream &os) const {
+  if (isUninitialized()) {
+    os << "<UNINITIALIZED>";
+    return;
+  }
+  os << getInConstantSubgraph();
+}
+
+//===----------------------------------------------------------------------===//
+// ConstantSubgraphAnalysis
+//===----------------------------------------------------------------------===//
+
+void ConstantSubgraphAnalysis::visitOperation(
+    Operation *op, ArrayRef<const Lattice<InConstantSubgraph> *> operands,
+    ArrayRef<Lattice<InConstantSubgraph> *> results) {
+  LLVM_DEBUG(llvm::dbgs() << "ConstantSubgraphAnalysis: Visiting operation:\n"
+                          << *op << "\n");
+
+  bool in = true;
+  if (op->hasTrait<OpTrait::ConstantLike>()) {
+    LLVM_DEBUG(llvm::dbgs() << "Curr op is a Constant op\n");
+    in = true;
+  } else if (operands.size() == 0) { // For example, tensor.empty()
+    LLVM_DEBUG(llvm::dbgs() << "Curr op has 0 operands, constant\n");
+    in = true;
+  } else {
+    LLVM_DEBUG(llvm::dbgs() << "Curr op has " << operands.size()
+                            << " operands, check if constant\n");
+    for (auto *operandLattice : operands) {
+      auto operandState = operandLattice->getValue().getInConstantSubgraph();
+      LLVM_DEBUG(llvm::dbgs() << "Operand: " << operandLattice->getPoint()
+                              << ", lattice value: " << operandState << "\n");
+      if (!operandState) {
+        in = false;
+        break;
+      }
+    }
+  }
+
+  // The lattices of the results should still be uninitialized here.
+  if (!in) {
+    LLVM_DEBUG(llvm::dbgs() << "Curr op not in constant subgraph\n");
+    for (auto lattice : results) {
+      propagateIfChanged(lattice,
+                         lattice->join(InConstantSubgraph(true, false)));
+    }
+  } else {
+    LLVM_DEBUG(llvm::dbgs() << "Curr op in constant subgraph\n");
+    for (auto lattice : results) {
+      propagateIfChanged(lattice,
+                         lattice->join(InConstantSubgraph(true, true)));
+    }
+  }
+}
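+
+// For illustration (assumed IR, not part of the analysis): given
+//   %0 = arith.constant dense<0.0> : tensor<8xf32> // ConstantLike -> in
+//   %1 = tensor.empty() : tensor<8xf32>            // zero operands -> in
+//   %2 = linalg.add ins(%0, %arg0 : ...) outs(...) // %arg0 not constant
+// the results of %0 and %1 are marked as inside the constant subgraph, while
+// %2 is not, because one of its operand lattices is false.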
+
+void ConstantSubgraphAnalysis::setToEntryState(
+    Lattice<InConstantSubgraph> *lattice) {
+  if (auto blockArg = dyn_cast<BlockArgument>(lattice->getPoint())) {
+    auto parent_op = blockArg.getParentBlock()->getParentOp();
+    auto parent_op_attr = parent_op->getAttrDictionary();
+    std::optional<NamedAttribute> const_args =
+        parent_op_attr.getNamed("onednn_graph.const_args");
+    if (const_args.has_value()) {
+      ArrayAttr const_args_indexes =
+          llvm::dyn_cast<ArrayAttr>(const_args->getValue());
+      for (auto id : const_args_indexes) {
+        auto idint = llvm::cast<IntegerAttr>(id).getInt();
+        if (blockArg.getArgNumber() == idint) {
+          LLVM_DEBUG(llvm::dbgs() << "Block argument: " << blockArg
+                                  << " is marked as constant\n");
+          propagateIfChanged(lattice,
+                             lattice->join(InConstantSubgraph(true, true)));
+          return;
+        }
+      }
+    }
+    propagateIfChanged(lattice, lattice->join(InConstantSubgraph(true, false)));
+  } else {
+    propagateIfChanged(lattice,
+                       lattice->join(InConstantSubgraph::getUninitialized()));
+  }
+}
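+
+// Constant arguments are communicated through an attribute on the entry
+// function, as in the lit tests, e.g.:
+//   func.func @entry(...) attributes {
+//       onednn_graph.const_args = [1 : i32, 2 : i32]} { ... }
+// Arguments #1 and #2 then enter the analysis as constants; all other
+// block arguments start as non-constant.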
+
+//===----------------------------------------------------------------------===//
+// RunConstantSubgraphAnalysis
+//===----------------------------------------------------------------------===//
+
+/// Get the operations whose inputs and outputs are all constant values.
+/// These operations will be put into a separate subgraph.
+void RunConstantSubgraphAnalysis::getConstantSubgraph(DataFlowSolver &solver,
+                                                      Operation *topFunc) {
+  OpBuilder builder(topFunc->getContext());
+  SmallVector<Operation *> constantOperations;
+
+  Block &block = topFunc->getRegions().front().getBlocks().front();
+  for (Operation &op : llvm::make_early_inc_range(block)) {
+    // If all the result values of an op are constant, we mark the op as
+    // constant.
+    bool resultsAllConstant = true;
+    if (op.getNumResults() == 0) {
+      continue;
+    }
+    for (Value res : op.getResults()) {
+      auto *lattice = solver.lookupState<Lattice<InConstantSubgraph>>(res);
+      if (!lattice || lattice->getValue().isUninitialized()) {
+        resultsAllConstant = false;
+        break;
+      }
+      const InConstantSubgraph &latticeValue = lattice->getValue();
+      if (!latticeValue.getInConstantSubgraph()) {
+        resultsAllConstant = false;
+        break;
+      }
+    }
+    if (resultsAllConstant) {
+      op.setAttr("onednn_graph.in_const_subgraph", builder.getBoolAttr(true));
+      constantOperations.push_back(&op);
+    }
+  }
+
+  if (constantOperations.empty()) {
+    return;
+  }
+}
+
+RunConstantSubgraphAnalysis::RunConstantSubgraphAnalysis() {
+  solver.load<DeadCodeAnalysis>();
+  solver.load<ConstantSubgraphAnalysis>();
+}
+
+void RunConstantSubgraphAnalysis::run(Operation *topFunc) {
+  if (failed(solver.initializeAndRun(topFunc))) {
+    return;
+  }
+  getConstantSubgraph(solver, topFunc);
+}
+
+bool RunConstantSubgraphAnalysis::getInConstantSubgraph(Value val) {
+  auto *lattice = solver.lookupState<Lattice<InConstantSubgraph>>(val);
+  const InConstantSubgraph &latticeValue = lattice->getValue();
+  return latticeValue.getInConstantSubgraph();
+}
\ No newline at end of file
diff --git a/lib/gc/CMakeLists.txt b/lib/gc/CMakeLists.txt
index f5ed3a6e5..308db5f30 100644
--- a/lib/gc/CMakeLists.txt
+++ b/lib/gc/CMakeLists.txt
@@ -3,4 +3,5 @@ if(GC_MLIR_CXX_FLAGS)
 endif()
 
 add_subdirectory(Dialect)
+add_subdirectory(Analysis)
 add_subdirectory(Transforms)
\ No newline at end of file
diff --git a/lib/gc/Dialect/OnednnGraph/OnednnGraphDialect.cpp b/lib/gc/Dialect/OnednnGraph/OnednnGraphDialect.cpp
index 434fa8a57..6469cc12a 100644
--- a/lib/gc/Dialect/OnednnGraph/OnednnGraphDialect.cpp
+++ b/lib/gc/Dialect/OnednnGraph/OnednnGraphDialect.cpp
@@ -18,3 +18,9 @@ void OnednnGraphDialect::initialize() {
 #include "gc/Dialect/OnednnGraph/OnednnGraphOps.cpp.inc"
       >();
 }
+
+LogicalResult
+OnednnGraphDialect::verifyOperationAttribute(Operation *op,
+                                             NamedAttribute attr) {
+  return success();
+}
\ No newline at end of file
diff --git a/lib/gc/Transforms/CMakeLists.txt b/lib/gc/Transforms/CMakeLists.txt
index df8a14d01..6421d20c2 100644
--- a/lib/gc/Transforms/CMakeLists.txt
+++ b/lib/gc/Transforms/CMakeLists.txt
@@ -1,5 +1,7 @@
 add_mlir_library(GCPasses
   TileNamed.cpp
+  CSA.cpp
+  CST.cpp
 
   ADDITIONAL_HEADER_DIRS
     ${PROJECT_SOURCE_DIR}/include
diff --git a/lib/gc/Transforms/CSA.cpp b/lib/gc/Transforms/CSA.cpp
new file mode 100644
index 000000000..5175be2f5
--- /dev/null
+++ b/lib/gc/Transforms/CSA.cpp
@@ -0,0 +1,51 @@
+//===- CSA.cpp - Constant Subgraph Analysis -----------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This transformation pass performs a constant subgraph analysis
+// in MLIR.
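+//
+// The pass is registered as "csa" (see Passes.td); the lit tests drive it
+// together with the constant subgraph transform, e.g.:
+//   gc-opt -pass-pipeline="builtin.module(csa,cst)" input.mlir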
+//
+//===----------------------------------------------------------------------===//
+#include "gc/Analysis/DataFlow/ConstantSubgraphAnalysis.h"
+#include "mlir/IR/Builders.h"
+#include "mlir/IR/Dialect.h"
+#include "mlir/Pass/Pass.h"
+#include "mlir/Transforms/Passes.h"
+
+namespace mlir {
+namespace gc {
+#define GEN_PASS_DEF_CSA
+#include "gc/Transforms/Passes.h.inc"
+} // namespace gc
+
+using namespace mlir;
+using namespace mlir::dataflow;
+
+namespace gc {
+
+struct CSA : public impl::CSABase<CSA> {
+  void runOnOperation() override;
+};
+
+void CSA::runOnOperation() {
+  Operation *op = getOperation();
+  auto &func =
+      op->getRegions().front().getBlocks().front().getOperations().front();
+
+  // Hard-code: mark arguments #1..#4 as constant.
+  // OpBuilder builder(op->getContext());
+  // func.setAttr("onednn_graph.const_args",
+  //              builder.getI32ArrayAttr({1,2,3,4}));
+
+  RunConstantSubgraphAnalysis csa;
+  (void)csa.run(&func);
+}
+
+std::unique_ptr<Pass> createCSAPass() { return std::make_unique<CSA>(); }
+
+} // namespace gc
+} // namespace mlir
\ No newline at end of file
diff --git a/lib/gc/Transforms/CST.cpp b/lib/gc/Transforms/CST.cpp
new file mode 100644
index 000000000..2dac0d860
--- /dev/null
+++ b/lib/gc/Transforms/CST.cpp
@@ -0,0 +1,496 @@
+//===- CST.cpp - Constant Subgraph Transform -----------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This transformation pass performs a constant subgraph transform in MLIR.
+//
+//===----------------------------------------------------------------------===//
+
+#include <deque>
+#include <unordered_set>
+
+#include "mlir/Transforms/Passes.h"
+
+#include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h"
+#include "mlir/Dialect/Bufferization/Transforms/Bufferize.h"
+#include "mlir/Dialect/ControlFlow/IR/ControlFlowOps.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/Linalg/IR/Linalg.h"
+#include "mlir/Dialect/MemRef/IR/MemRef.h"
+#include "mlir/Dialect/Tensor/IR/Tensor.h"
+#include "mlir/IR/Builders.h"
+#include "mlir/IR/Dialect.h"
+#include "mlir/Pass/Pass.h"
+#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
+#include "llvm/Support/Debug.h"
+
+namespace mlir {
+namespace gc {
+#define GEN_PASS_DEF_CST
+#include "gc/Transforms/Passes.h.inc"
+} // namespace gc
+
+using namespace mlir;
+
+namespace gc {
+
+struct CST : public impl::CSTBase<CST> {
+  void runOnOperation() override;
+};
+
+bool isInConstantSubgraph(Operation *op) {
+  auto opNamespace = op->getDialect()->getNamespace();
+  if (opNamespace == linalg::LinalgDialect::getDialectNamespace() ||
+      opNamespace == tensor::TensorDialect::getDialectNamespace() ||
+      opNamespace == arith::ArithDialect::getDialectNamespace()) {
+    if (op->getAttr("onednn_graph.in_const_subgraph")) {
+      return true;
+    }
+  }
+  return false;
+}
+
+int64_t getTensorSize(TensorType t) {
+  Type eleType = t.getElementType();
+  unsigned bitWidth = eleType.getIntOrFloatBitWidth() / 8; // bytes
+  ArrayRef<int64_t> shape = t.getShape();
+  int64_t size = bitWidth;
+  for (auto s : shape) {
+    size *= s;
+  }
+  return size;
+}
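+
+// For example (illustrative numbers): a tensor<2x8x32x32xbf16> holds
+// 2 * 8 * 32 * 32 = 16384 bf16 elements of 2 bytes each, so getTensorSize
+// returns 32768 bytes.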
+
+bool canMoveBefore(Operation *op) {
+  if (op->getDialect()->getNamespace() ==
+      arith::ArithDialect::getDialectNamespace()) {
+    return true;
+  }
+
+  if (op->getDialect()->getNamespace() !=
+      linalg::LinalgDialect::getDialectNamespace()) {
+    return false;
+  }
+
+  auto linalgOp = dyn_cast<linalg::LinalgOp>(op);
+
+  SmallVector<AffineMap> indexingMaps = linalgOp.getIndexingMapsArray();
+  for (auto &affineMap : indexingMaps) {
+    if (!affineMap.isIdentity()) {
+      return false;
+    }
+  }
+
+  SmallVector<utils::IteratorType> iterTypes = linalgOp.getIteratorTypesArray();
+  for (auto &iterType : iterTypes) {
+    if (iterType != utils::IteratorType::parallel) {
+      return false;
+    }
+  }
+
+  if (op->getNumOperands() > 1) {
+    // int64_t numInputs = linalgOp.getNumDpsInputs();
+    int64_t numInits = linalgOp.getNumDpsInits();
+    // The defining op of each init should be tensor.empty().
+    for (int64_t i = 0; i < numInits; ++i) {
+      OpOperand *outOperand = linalgOp.getDpsInitOperand(i);
+      auto parentOp = outOperand->get().getDefiningOp();
+      if (!isa<tensor::EmptyOp>(parentOp)) {
+        return false;
+      }
+    }
+  }
+
+  return true;
+}
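+
+// A movable op accepted by canMoveBefore looks like this sketch (assumed IR,
+// with #identity denoting an identity affine_map): identity indexing maps,
+// all-parallel iterators, and inits produced by tensor.empty():
+//   %empty = tensor.empty() : tensor<8x32xf32>
+//   %r = linalg.generic {indexing_maps = [#identity, #identity],
+//                        iterator_types = ["parallel", "parallel"]}
+//     ins(%x : tensor<8x32xf32>) outs(%empty : tensor<8x32xf32>) {...}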
+
+void postponeBroadcast(Block &block) {
+  // auto bcOps = block.getOps<linalg::BroadcastOp>();
+  // for (linalg::BroadcastOp bcOp : bcOps) {}
+  SmallVector<Operation *> constBcOps;
+  for (Operation &op : block.getOperations()) {
+    if (isa<linalg::BroadcastOp>(&op)) {
+      Operation *bcOp = &op;
+      if (isInConstantSubgraph(bcOp)) {
+        constBcOps.push_back(bcOp);
+      }
+    }
+  }
+
+  for (auto bcOp : constBcOps) {
+    // For the topology v -> pack -> bc -> mul -> matmul, we transform it to
+    // v -> pack -> mul -> bc -> matmul, so that we can fold v -> pack -> mul.
+    // Note that we require the topology to be sequential and all the Values
+    // to have exactly one user.
+
+    // Go upwards until a BlockArgument is reached.
+    SmallVector<Operation *> prevOps;
+    Operation *currOp = bcOp;
+    while (true) {
+      if (currOp->getNumOperands() != 1) {
+        break;
+      }
+      Value operand = currOp->getOperand(0);
+      if (isa<BlockArgument>(operand)) {
+        break;
+      } else {
+        currOp = operand.getDefiningOp();
+        prevOps.push_back(currOp);
+      }
+    }
+
+    // Go downwards to the last constant op.
+    SmallVector<Operation *> postOps;
+    currOp = bcOp;
+    while (true) {
+      if (currOp->getNumResults() != 1 || !currOp->hasOneUse()) {
+        break;
+      }
+      Value input = currOp->getResult(0);
+      currOp = *(input.getUsers().begin());
+      Value output = currOp->getResult(0);
+      // NOTE: we require the input and output shapes of currOp to be the
+      // same. Operations from the tensor dialect, like
+      // pack/unpack/concat/collapse_shape/expand_shape/reshape/pad, are not
+      // supported, so we simply restrict currOp to be from arith or linalg.
+      if (!isa<TensorType>(input.getType()) ||
+          !isa<TensorType>(output.getType()) ||
+          dyn_cast<TensorType>(input.getType()).getShape() !=
+              dyn_cast<TensorType>(output.getType()).getShape() ||
+          !canMoveBefore(currOp)) {
+        break;
+      }
+      if (!isInConstantSubgraph(currOp)) {
+        break;
+      } else {
+        postOps.push_back(currOp);
+      }
+    }
+    if (postOps.empty()) {
+      continue;
+    }
+
+    // Move bcOp after the last constant op.
+    SmallVector<Operation *> newPostOps;
+    Value operand = static_cast<Value>(bcOp->getOperand(0));
+    ArrayRef<int64_t> shapeBeforeBc =
+        dyn_cast<TensorType>(operand.getType()).getShape();
+    size_t postOpId = 0;
+    for (Operation *postOp : postOps) {
+      SmallVector<Type> newOperandTypes;
+      for (auto oriType : postOp->getOperandTypes()) {
+        TensorType tt = dyn_cast<TensorType>(oriType);
+        newOperandTypes.push_back(
+            tt.cloneWith(shapeBeforeBc, tt.getElementType()));
+      }
+      SmallVector<Type> newResultTypes;
+      for (auto oriType : postOp->getResultTypes()) {
+        TensorType tt = dyn_cast<TensorType>(oriType);
+        newResultTypes.push_back(
+            tt.cloneWith(shapeBeforeBc, tt.getElementType()));
+      }
+      auto *newPostOp = Operation::create(
+          postOp->getLoc(), postOp->getName(), newResultTypes,
+          postOp->getOperands(),
+          /*postOp->getAttrDictionary()*/ std::nullopt,
+          /*postOp->getPropertiesStorage()*/ nullptr, postOp->getSuccessors(),
+          postOp->getNumRegions());
+      for (auto [oldRegion, newRegion] :
+           llvm::zip(postOp->getRegions(), newPostOp->getRegions())) {
+        newRegion.takeBody(oldRegion);
+      }
+
+      if (postOpId == 0) {
+        // Only the first post op needs to replace its operand. The others only
+        // need to call postOp->replaceAllUsesWith(newPostOp->getResults()).
+        newPostOp->getOperand(0).replaceAllUsesWith(operand);
+      }
+      ++postOpId;
+
+      newPostOp->setAttr("onednn_graph.in_const_subgraph",
+                         postOp->getAttr("onednn_graph.in_const_subgraph"));
+      if (postOp->getDialect()->getNamespace() ==
+          linalg::LinalgDialect::getDialectNamespace()) {
+        newPostOp->setAttr("operandSegmentSizes",
+                           postOp->getAttr("operandSegmentSizes"));
+
+        OpBuilder builder(postOp->getContext());
+        size_t indexingMapsSize =
+            dyn_cast<linalg::LinalgOp>(postOp).getIndexingMapsArray().size();
+        unsigned rank = shapeBeforeBc.size();
+        SmallVector<AffineMap> indexingMaps(
+            indexingMapsSize, builder.getMultiDimIdentityMap(rank));
+        auto indexingMapsAttr = builder.getAffineMapArrayAttr(indexingMaps);
+        newPostOp->setAttr("indexing_maps", indexingMapsAttr);
+
+        SmallVector<utils::IteratorType> iterTypes =
+            dyn_cast<linalg::LinalgOp>(postOp).getIteratorTypesArray();
+        iterTypes.resize(rank);
+        auto iterTypesAttr =
+            builder.getArrayAttr(llvm::to_vector(llvm::map_range(
+                iterTypes, [&](utils::IteratorType iter) -> mlir::Attribute {
+                  return linalg::IteratorTypeAttr::get(builder.getContext(),
+                                                       iter);
+                })));
+        newPostOp->setAttr("iterator_types", iterTypesAttr);
+      } else {
+        // Ops from other dialects.
+      }
+
+      // Modify the output operands of postOp. Here we simply assume that the
+      // value comes from tensor.empty().
+      if (postOp->getNumOperands() > 0) {
+        for (size_t i = 1; i < postOp->getNumOperands(); ++i) {
+          auto outOperand = postOp->getOperand(i);
+          outOperand.setType(newOperandTypes.front());
+        }
+      }
+
+      block.getOperations().push_back(newPostOp);
+      newPostOp->moveAfter(postOp);
+      newPostOps.push_back(newPostOp);
+      postOp->replaceAllUsesWith(newPostOp->getResults());
+
+      operand = static_cast<Value>(newPostOp->getResult(0));
+    }
+
+    auto nextOp = *(newPostOps.back()->getUsers().begin());
+    nextOp->getOperand(0).replaceAllUsesWith(bcOp->getResult(0));
+    bcOp->moveAfter(newPostOps.back());
+    bcOp->getOperand(0).replaceUsesWithIf(operand, [&](OpOperand &val) {
+      Operation *op = val.getOwner();
+      return op == bcOp;
+    });
+
+    for (auto it = postOps.rbegin(); it != postOps.rend(); ++it) {
+      (*it)->erase();
+    }
+  }
+}
+
+static constexpr int DATA_SIZE_EXPANDING_THRESHOLD = 8;
+
+// Operate on tensors. Create fold() and compute() on the module. The folded
+// weights and the first-run flag are maintained by the upper-level runtime.
+void CST::runOnOperation() {
+  Operation *topOp = getOperation();
+  MLIRContext *context = topOp->getContext();
+  // A ModuleOp contains a single region, which contains a single block.
+  auto moduleOp = dyn_cast<ModuleOp>(topOp);
+  SymbolTable symbolTable(moduleOp);
+  auto &topFunc =
+      topOp->getRegions().front().getBlocks().front().getOperations().front();
+  OpBuilder builder(context);
+
+  auto topFuncAttr = topFunc.getAttrDictionary();
+  std::optional<NamedAttribute> constArgs =
+      topFuncAttr.getNamed("onednn_graph.const_args");
+  std::unordered_set<int64_t> constArgsIndexes;
+  if (constArgs.has_value()) {
+    ArrayAttr constArgsArray = llvm::dyn_cast<ArrayAttr>(constArgs->getValue());
+    for (auto id : constArgsArray) {
+      constArgsIndexes.insert(llvm::cast<IntegerAttr>(id).getInt());
+    }
+  } else {
+    return;
+  }
+  if (constArgsIndexes.empty()) {
+    return;
+  }
+
+  Region &region = topFunc.getRegions().front();
+  Block &block = region.getBlocks().front();
+
+  postponeBroadcast(block);
+
+  SmallVector<Operation *> constOps;
+  for (Operation &op : llvm::make_early_inc_range(block)) {
+    if (isInConstantSubgraph(&op)) {
+      constOps.push_back(&op);
+    }
+  }
+
+  std::string funcName("fold");
+  SmallVector<Type> inputTypes; // types of constant weights
+  // values of constant weights in original block
+  SmallVector<Value> inputValues;
+  SmallVector<Type> outputTypes; // types of folded constant weights
+  // values of folded constant weights in original block
+  SmallVector<Value> outputValues;
+  Value v;
+  // TODO: handle complicated topologies. Currently we only handle the simple
+  // topology where one constant weight input produces exactly one constant
+  // output, and each constant weight contributes to only one constant output.
+  for (size_t id = 0; id < block.getNumArguments(); ++id) {
+    if (constArgsIndexes.count(id) == 1) {
+      auto arg = block.getArgument(id);
+      if (!isa<TensorType>(arg.getType())) {
+        continue;
+      }
+      inputTypes.push_back(arg.getType());
+      v = dyn_cast<Value>(arg);
+      inputValues.push_back(v);
+      SmallVector<Value> valuesOnTheWay = {v}; // the constant tensors
+      // For v -> pack1 -> pack2 -> matmul, we need the output type of pack2.
+      while (!v.getUsers().empty()) {
+        // v.getUsers().size() should be 1
+        Operation *user = *(v.getUsers().begin());
+        if (!isInConstantSubgraph(user)) {
+          outputTypes.push_back(v.getType());
+          outputValues.push_back(v);
+          break;
+        }
+        // user should have only 1 output value
+        OpResult result = *(user->result_begin());
+        v = dyn_cast<Value>(result);
+        valuesOnTheWay.push_back(v);
+      }
+
+      // If the data size of outputValue is much greater than the size of
+      // inputValue, do not fold it. Compare the data size changes during the
+      // traversal to find the last op that satisfies this condition.
+      int64_t initSize =
+          getTensorSize(dyn_cast<TensorType>(valuesOnTheWay[0].getType()));
+      if (!isa<TensorType>(outputTypes.back()) ||
+          initSize * DATA_SIZE_EXPANDING_THRESHOLD <
+              getTensorSize(dyn_cast<TensorType>(outputTypes.back()))) {
+        size_t lastIdx = 0;
+        for (size_t i = 1; i < valuesOnTheWay.size(); ++i) {
+          int64_t size =
+              getTensorSize(dyn_cast<TensorType>(valuesOnTheWay[i].getType()));
+          if (initSize * DATA_SIZE_EXPANDING_THRESHOLD > size) {
+            lastIdx = i;
+          }
+        }
+        if (lastIdx == 0) { // no suitable value found
+          inputTypes.pop_back();
+          outputTypes.pop_back();
+          inputValues.pop_back();
+          outputValues.pop_back();
+          constArgsIndexes.erase(id);
+        } else {
+          outputTypes.back() = valuesOnTheWay[lastIdx].getType();
+          outputValues.back() = valuesOnTheWay[lastIdx];
+        }
+      }
+    }
+  }
+  if (inputTypes.size() != outputTypes.size()) {
+    return;
+  }
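+
+  // Worked example of the cut-off above (illustrative numbers): a
+  // tensor<256xbf16> bias (512 bytes) broadcast to tensor<2x8x32x32xbf16>
+  // (32768 bytes) expands by 64x, exceeding DATA_SIZE_EXPANDING_THRESHOLD
+  // (8), so folding stops at the last value still within the 8x budget.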
+
+  FunctionType foldFuncType =
+      FunctionType::get(context, inputTypes, outputTypes);
+  auto foldFunc =
+      builder.create<func::FuncOp>(topFunc.getLoc(), funcName, foldFuncType);
+  Block *foldBlock = foldFunc.addEntryBlock();
+  // values of folded constant weights in foldBlock
+  SmallVector<Value> outputValuesInFold;
+  IRMapping mapper;
+  for (Operation *op : constOps) {
+    foldBlock->getOperations().push_back(op->clone(mapper));
+  }
+  // the order of outputValuesInFold follows the order of the corresponding
+  // inputValues
+  for (auto &v : outputValues) {
+    auto foldedV = mapper.lookupOrNull(v);
+    outputValuesInFold.push_back(foldedV);
+    v.replaceUsesWithIf(foldedV, [&](OpOperand &val) {
+      Operation *op = val.getOwner();
+      return op->getBlock() == foldBlock;
+    });
+  }
+
+  auto returnOp =
+      builder.create<func::ReturnOp>(topOp->getLoc(), outputValuesInFold);
+  foldBlock->getOperations().push_back(returnOp);
+  for (size_t i = 0; i < inputValues.size(); ++i) {
+    inputValues[i].replaceUsesWithIf(foldBlock->getArgument(i),
+                                     [&](OpOperand &val) {
+                                       Operation *op = val.getOwner();
+                                       return op->getBlock() == foldBlock;
+                                     });
+  }
+
+  foldFunc.setVisibility(SymbolTable::Visibility::Public);
+  moduleOp.push_back(foldFunc);
+  symbolTable.insert(foldFunc);
+
+  // modify the BlockArguments of block
+  size_t oriNumArgs = block.getNumArguments();
+  size_t argIdx = 0;
+  for (size_t id = 0; id < oriNumArgs; ++id) {
+    if (constArgsIndexes.count(id) == 1) {
+      auto loc = block.getArgument(id).getLoc();
+      BlockArgument foldArg =
+          block.insertArgument(id, outputTypes[argIdx], loc);
+      outputValues[argIdx].replaceUsesWithIf(foldArg, [&](OpOperand &val) {
+        Operation *op = val.getOwner();
+        return op->getBlock() == &block;
+      });
+
+      std::deque<Value> dq;
+      SmallVector<Operation *> opsToErase;
+      dq.push_back(block.getArgument(id + 1));
+      while (!dq.empty()) {
+        Value v = dq.front();
+        dq.pop_front();
+        for (Operation *op : v.getUsers()) {
+          for (auto res : op->getResults()) {
+            dq.push_back(res);
+          }
+          opsToErase.push_back(op);
+        }
+      }
+
+      for (auto it = opsToErase.rbegin(); it != opsToErase.rend(); ++it) {
+        (*it)->erase();
+      }
+      block.eraseArgument(id + 1);
+      ++argIdx;
+    }
+  }
+
+  // modify the compute func signature
+  func::FuncOp computeFunc = cast<func::FuncOp>(topFunc);
+  FunctionType computeFuncType = computeFunc.getFunctionType();
+  computeFunc.setType(FunctionType::get(context, block.getArgumentTypes(),
+                                        computeFuncType.getResults()));
+
+  // Delete dead operations via the dialects' canonicalization patterns.
+  RewritePatternSet owningPatterns(context);
+  for (auto *dialect : context->getLoadedDialects())
+    dialect->getCanonicalizationPatterns(owningPatterns);
+
+  ArrayRef<std::string> disabledPatterns, enabledPatterns;
+  std::shared_ptr<FrozenRewritePatternSet> patterns =
+      std::make_shared<FrozenRewritePatternSet>(
+          std::move(owningPatterns), disabledPatterns, enabledPatterns);
+  GreedyRewriteConfig config;
+  LogicalResult converged =
+      applyPatternsAndFoldGreedily(topOp, *patterns, config);
+  (void)converged;
+
+  // clean up the constant-related attrs on ops
+  for (auto &op : block.getOperations()) {
+    if (op.getAttr("onednn_graph.in_const_subgraph")) {
+      op.removeAttr("onednn_graph.in_const_subgraph");
+    }
+  }
+  for (auto &op : foldBlock->getOperations()) {
+    if (op.getAttr("onednn_graph.in_const_subgraph")) {
+      op.removeAttr("onednn_graph.in_const_subgraph");
+    }
+  }
+}
+
+std::unique_ptr<Pass> createCSTPass() { return std::make_unique<CST>(); }
+
+} // namespace gc
+} // namespace mlir
diff --git a/src/gc-opt/CMakeLists.txt b/src/gc-opt/CMakeLists.txt
index ff33375de..b07c4dfe6 100644
--- a/src/gc-opt/CMakeLists.txt
+++ b/src/gc-opt/CMakeLists.txt
@@ -2,7 +2,8 @@ set(gc_opt_libs
         ${dialect_libs}
         ${conversion_libs}
         MLIROptLib
-        GCPasses)
+        GCPasses
+        GCAnalysis)
 if(GC_MLIR_CXX_FLAGS)
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${GC_MLIR_CXX_FLAGS}")
 endif()
diff --git a/test/gc/Transforms/test_constant_weights_folding.mlir b/test/gc/Transforms/test_constant_weights_folding.mlir
new file mode 100644
index 000000000..52885ae7d
--- /dev/null
+++ b/test/gc/Transforms/test_constant_weights_folding.mlir
@@ -0,0 +1,76 @@
+// RUN: gc-opt --split-input-file -pass-pipeline="builtin.module(csa,cst)" %s | FileCheck %s
+
+// CHECK-LABEL: func.func @entry
+#map = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d2, d4, d6)>
+#map1 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d1, d2, d6 floordiv 2, d5, d3)>
+#map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d4, d5)>
+#map3 = affine_map<(d0, d1, d2, d3) -> (d1, d3)>
+#map4 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
+module {
+  // COM: A two-layer MLP. arg0: input feature. arg1: weight of #1 linear. arg2: bias of #1 linear.
+  // COM: arg3: weight of #2 linear. arg4: bias of #2 linear.
+  func.func @entry(%arg0: tensor<64x512xbf16>, %arg1: tensor<512x256xbf16>, %arg2: tensor<256xbf16>, %arg3: tensor<256x1024xbf16>, %arg4: tensor<1024xbf16>) -> tensor<64x1024xbf16> attributes {onednn_graph.const_args = [1 : i32, 2 : i32, 3 : i32, 4 : i32]} {
+    %1 = tensor.empty() : tensor<2x16x32x32xbf16>
+    %packed_arg0 = tensor.pack %arg0 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %1 : tensor<64x512xbf16> -> tensor<2x16x32x32xbf16>
+    %2 = tensor.empty() : tensor<8x16x32x32xbf16>
+    %packed_arg1 = tensor.pack %arg1 outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %2 : tensor<512x256xbf16> -> tensor<8x16x32x32xbf16>
+    %3 = tensor.empty() : tensor<8x16x16x32x2xbf16>
+    %packed_packed_arg1 = tensor.pack %packed_arg1 inner_dims_pos = [2] inner_tiles = [2] into %3 : tensor<8x16x32x32xbf16> -> tensor<8x16x16x32x2xbf16>
+    %4 = tensor.empty() : tensor<2x8x32x32xbf16>
+    %cst_0 = arith.constant 0.000000e+00 : bf16
+    %5 = linalg.fill ins(%cst_0 : bf16) outs(%4 : tensor<2x8x32x32xbf16>) -> tensor<2x8x32x32xbf16>
+    %6 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction", "reduction", "parallel", "parallel", "reduction"]} ins(%packed_arg0, %packed_packed_arg1 : tensor<2x16x32x32xbf16>, tensor<8x16x16x32x2xbf16>) outs(%5 : tensor<2x8x32x32xbf16>) {
+    ^bb0(%in: bf16, %in_0: bf16, %out: bf16):
+      %44 = arith.mulf %in, %in_0 : bf16
+      %55 = arith.addf %out, %44 : bf16
+      linalg.yield %55 : bf16
+    } -> tensor<2x8x32x32xbf16>
+    %15 = tensor.empty() : tensor<8x32xbf16>
+    %packed_arg2 = tensor.pack %arg2 outer_dims_perm = [0] inner_dims_pos = [0] inner_tiles = [32] into %15 : tensor<256xbf16> -> tensor<8x32xbf16>
+    %bc_arg2_init = tensor.empty() : tensor<2x8x32x32xbf16>
+    %bc_arg2 = linalg.broadcast ins(%packed_arg2 : tensor<8x32xbf16>) outs(%bc_arg2_init : tensor<2x8x32x32xbf16>) dimensions = [0, 2]
+    %extf32 = arith.extf %bc_arg2 : tensor<2x8x32x32xbf16> to tensor<2x8x32x32xf32>
+    %cst_2 = arith.constant 2.000000e+00 : f32
+    %extf32_mul2_init = tensor.empty() : tensor<2x8x32x32xf32>
+    %extf32_mul2 = linalg.generic {indexing_maps = [#map4, #map4], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extf32 : tensor<2x8x32x32xf32>) outs(%extf32_mul2_init : tensor<2x8x32x32xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %8 = arith.mulf %in, %cst_2 : f32
+      linalg.yield %8 : f32
+    } -> tensor<2x8x32x32xf32>
+    %truncbf16 = arith.truncf %extf32_mul2 : tensor<2x8x32x32xf32> to tensor<2x8x32x32xbf16>
+    %7 = linalg.generic {indexing_maps = [#map4, #map4], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%truncbf16 : tensor<2x8x32x32xbf16>) outs(%6 : tensor<2x8x32x32xbf16>) {
+    ^bb0(%in: bf16, %out: bf16):
+      %45 = arith.addf %in, %out : bf16
+      linalg.yield %45 : bf16
+    } -> tensor<2x8x32x32xbf16>
+    %8 = tensor.empty() : tensor<32x8x32x32xbf16>
+    %packed_arg3 = tensor.pack %arg3 outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %8 : tensor<256x1024xbf16> -> tensor<32x8x32x32xbf16>
+    %9 = tensor.empty() : tensor<32x8x16x32x2xbf16>
+    %packed_packed_arg3 = tensor.pack %packed_arg3 inner_dims_pos = [2] inner_tiles = [2] into %9 : tensor<32x8x32x32xbf16> -> tensor<32x8x16x32x2xbf16>
+    %10 = tensor.empty() : tensor<2x32x32x32xbf16>
+    %11 = linalg.fill ins(%cst_0 : bf16) outs(%10 : tensor<2x32x32x32xbf16>) -> tensor<2x32x32x32xbf16>
+    %12 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction", "reduction", "parallel",
"parallel", "reduction"]} ins(%7, %packed_packed_arg3 : tensor<2x8x32x32xbf16>, tensor<32x8x16x32x2xbf16>) outs(%11 : tensor<2x32x32x32xbf16>) { + ^bb0(%in: bf16, %in_0: bf16, %out: bf16): + %46 = arith.mulf %in, %in_0 : bf16 + %56 = arith.addf %out, %46 : bf16 + linalg.yield %56 : bf16 + } -> tensor<2x32x32x32xbf16> + %16 = tensor.empty() : tensor<32x32xbf16> + %packed_arg4 = tensor.pack %arg4 outer_dims_perm = [0] inner_dims_pos = [0] inner_tiles = [32] into %16 : tensor<1024xbf16> -> tensor<32x32xbf16> + %13 = linalg.generic {indexing_maps = [#map3, #map4], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%packed_arg4 : tensor<32x32xbf16>) outs(%12 : tensor<2x32x32x32xbf16>) { + ^bb0(%in: bf16, %out: bf16): + %47 = arith.addf %in, %out : bf16 + linalg.yield %47 : bf16 + } -> tensor<2x32x32x32xbf16> + %14 = tensor.empty() : tensor<64x1024xbf16> + %unpack = tensor.unpack %13 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %14 : tensor<2x32x32x32xbf16> -> tensor<64x1024xbf16> + return %unpack : tensor<64x1024xbf16> + } +} +// CHECK: linalg.broadcast +// CHECK: func.func @fold +// CHECK: arith.extf +// CHECK: arith.truncf +// COM: expected output: +// COM: func.func @entry(%arg0: tensor<64x512xbf16>, %arg1: tensor<8x16x16x32x2xbf16>, %arg2: tensor<8x32xbf16>, %arg3: tensor<32x8x16x32x2xbf16>, %arg4: tensor<32x32xbf16>) -> tensor<64x1024xbf16> +// COM: func.func @fold(%arg0: tensor<512x256xbf16>, %arg1: tensor<256xbf16>, %arg2: tensor<256x1024xbf16>, %arg3: tensor<1024xbf16>) -> (tensor<8x16x16x32x2xbf16>, tensor<8x32xbf16>, tensor<32x8x16x32x2xbf16>, tensor<32x32xbf16>) From 6219935a44713c214efc4c7bc7dd5c9d04e74cbe Mon Sep 17 00:00:00 2001 From: "Niu, Xiaoguang" Date: Mon, 27 May 2024 13:43:29 +0800 Subject: [PATCH 02/29] Add single operand check --- lib/gc/Transforms/CST.cpp | 25 +++++++++- .../test_constant_weights_folding-1.mlir | 50 +++++++++++++++++++ 2 files changed, 74 insertions(+), 1 deletion(-) create mode 100644 test/gc/Transforms/test_constant_weights_folding-1.mlir diff --git a/lib/gc/Transforms/CST.cpp b/lib/gc/Transforms/CST.cpp index 2dac0d860..73dd075cc 100644 --- a/lib/gc/Transforms/CST.cpp +++ b/lib/gc/Transforms/CST.cpp @@ -66,6 +66,23 @@ int64_t getTensorSize(TensorType t) { return size; } +bool singleOperand(Operation *op) { + if (op->getNumOperands() > 1) { + Value firstOperand = op->getOperand(0); + for (int64_t i = 1; i < op->getNumOperands(); ++i) { + Value operand = op->getOperand(i); + if (firstOperand == operand) { + continue; + } + auto parentOp = operand.getDefiningOp(); + if (parentOp && !isa(parentOp)) { + return false; + } + } + } + return true; +} + bool canMoveBefore(Operation *op) { if (op->getDialect()->getNamespace() == arith::ArithDialect::getDialectNamespace()) { @@ -341,7 +358,8 @@ void CST::runOnOperation() { while (!v.getUsers().empty()) { // v.getUsers().size() should be 1 Operation *user = *(v.getUsers().begin()); - if (!isInConstantSubgraph(user)) { + // If user is not const or user has multiple operand, we reach the end + if (!isInConstantSubgraph(user) || !singleOperand(user)) { outputTypes.push_back(v.getType()); outputValues.push_back(v); break; @@ -437,6 +455,7 @@ void CST::runOnOperation() { std::deque dq; SmallVector opsToErase; + std::unordered_set opsToEraseSet; dq.push_back(block.getArgument(id + 1)); while (!dq.empty()) { Value v = dq.front(); @@ -445,7 +464,11 @@ void CST::runOnOperation() { for (auto res : op->getResults()) { dq.push_back(res); } + if (opsToEraseSet.count(op)) { + break; + } 
opsToErase.push_back(op); + opsToEraseSet.insert(op); } } diff --git a/test/gc/Transforms/test_constant_weights_folding-1.mlir b/test/gc/Transforms/test_constant_weights_folding-1.mlir new file mode 100644 index 000000000..b446212c5 --- /dev/null +++ b/test/gc/Transforms/test_constant_weights_folding-1.mlir @@ -0,0 +1,50 @@ +// RUN: gc-opt --split-input-file -pass-pipeline="builtin.module(csa,cst)" %s | FileCheck %s + +// CHECK-LABEL: func.func @entry +module { + func.func @entry(%a: tensor<128xf32>, %b: tensor<128xf32>, %c: tensor<128xf32>) -> (tensor<128xf32>) attributes { llvm.emit_c_interface, onednn_graph.const_args = [0 : i32, 1 : i32] } { + %c0 = arith.constant 0 : index + cpuruntime.printf "HI%zu\n" %c0 : index + %ax2 = tensor.empty() : tensor<128xf32> + %2 = linalg.add ins(%a, %a : tensor<128xf32>,tensor<128xf32>) outs(%ax2 : tensor<128xf32>) -> tensor<128xf32> + %bx2 = tensor.empty() : tensor<128xf32> + %3 = linalg.add ins(%b, %b : tensor<128xf32>,tensor<128xf32>) outs(%bx2 : tensor<128xf32>) -> tensor<128xf32> + %ax2pbx2 = tensor.empty() : tensor<128xf32> + %4 = linalg.add ins(%2, %3 : tensor<128xf32>,tensor<128xf32>) outs(%ax2pbx2 : tensor<128xf32>) -> tensor<128xf32> + %ax2pbx2pc = tensor.empty() : tensor<128xf32> + %d = linalg.add ins(%4, %c : tensor<128xf32>,tensor<128xf32>) outs(%ax2pbx2pc : tensor<128xf32>) -> tensor<128xf32> + return %d : tensor<128xf32> + } +} + +// CHECK: cpuruntime.printf +// CHECK: linalg.add +// CHECK: linalg.add +// CHECK: func.func @fold +// CHECK: linalg.add +// CHECK: linalg.add + +// COM: expected output: +// COM: module { +// COM: llvm.mlir.global constant @__num_orig_num_args(4 : i32) : i32 +// COM: llvm.mlir.global constant @__fold_buffer_ids(dense<[2, 114514, 1919810]> : tensor<3 x i64>) : !llvm.array<3 x i64> +// COM: // a,b, foldedA,foldedB +// COM: llvm.mlir.global constant @__fold_args(dense<[4, 0, 1, 4, 5]> : tensor<5xi32>) : !llvm.array<5 x i32> +// COM: // foldedA, foldedB, c, d +// COM: llvm.mlir.global constant @__compute_args(dense<[4, 4, 5, 2, 3]> : tensor<5xi32>) : !llvm.array<5 x i32> +// COM: func.func @fold(%a: tensor<128xf32>, %b: tensor<128xf32>) -> (tensor<128xf32>, tensor<128xf32>) attributes { llvm.emit_c_interface } { +// COM: %c0 = arith.constant 0 : index +// COM: cpuruntime.printf "HI%zu\n" %c0 : index +// COM: %out = tensor.empty() : tensor<128xf32> +// COM: %2 = linalg.add ins(%a, %a : tensor<128xf32>,tensor<128xf32>) outs(%out : tensor<128xf32>) -> tensor<128xf32> +// COM: %out2 = tensor.empty() : tensor<128xf32> +// COM: %3 = linalg.add ins(%b, %b : tensor<128xf32>,tensor<128xf32>) outs(%out2 : tensor<128xf32>) -> tensor<128xf32> +// COM: return %2, %3 : tensor<128xf32>, tensor<128xf32> +// COM: } +// COM: func.func @compute(%ax2: tensor<128xf32>, %bx2: tensor<128xf32>, %c: tensor<128xf32>) -> tensor<128xf32> attributes { llvm.emit_c_interface } { +// COM: %out = tensor.empty() : tensor<128xf32> +// COM: %2 = linalg.add ins(%ax2, %bx2 : tensor<128xf32>,tensor<128xf32>) outs(%out : tensor<128xf32>) -> tensor<128xf32> +// COM: %d = linalg.add ins(%2, %c : tensor<128xf32>,tensor<128xf32>) outs(%2 : tensor<128xf32>) -> tensor<128xf32> +// COM: return %d : tensor<128xf32> +// COM: } +// COM: } \ No newline at end of file From 5eb0ac014cd3fdf48a27eee05c71bb8aed719274 Mon Sep 17 00:00:00 2001 From: "Niu, Xiaoguang" Date: Mon, 27 May 2024 17:01:17 +0800 Subject: [PATCH 03/29] Add cache manager --- lib/gc/Transforms/CST.cpp | 136 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 136 insertions(+) diff --git 
a/lib/gc/Transforms/CST.cpp b/lib/gc/Transforms/CST.cpp index 73dd075cc..b8ceb6a2c 100644 --- a/lib/gc/Transforms/CST.cpp +++ b/lib/gc/Transforms/CST.cpp @@ -20,6 +20,7 @@ #include "mlir/Dialect/Bufferization/Transforms/Bufferize.h" #include "mlir/Dialect/ControlFlow/IR/ControlFlowOps.h" #include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/Dialect/LLVMIR/LLVMDialect.h" #include "mlir/Dialect/Linalg/IR/Linalg.h" #include "mlir/Dialect/MemRef/IR/MemRef.h" #include "mlir/Dialect/Tensor/IR/Tensor.h" @@ -29,6 +30,8 @@ #include "mlir/Transforms/GreedyPatternRewriteDriver.h" #include "llvm/Support/Debug.h" +// #include "gc/ExecutionEngine/CPURuntime/ConstantCache.hpp" + namespace mlir { namespace gc { #define GEN_PASS_DEF_CST @@ -293,6 +296,101 @@ void postponeBroadcast(Block &block) { static constexpr int DATA_SIZE_EXPANDING_THRESHOLD = 8; +// get from dnnl_graph_compiler_context +// void *allocator(size_t size) { return std::aligned_alloc(64, size); } +// void deallocator(void *ptr) { std::free(ptr); } + +// std::shared_ptr create_const_cache_proxy(size_t size) { +// // simply allocate buffer and return +// std::shared_ptr base = +// std::shared_ptr{std::aligned_alloc(64, size), [](void *p) { +// std::free(p); }}; +// return std::make_shared(base, base.get(), size, true); +// } + +size_t divide_and_ceil(size_t x, size_t y) { return (x + y - 1) / y; } + +// Manager +struct const_graph_tensor_cache_manager { + // dnnl_graph_compiler_context *ctx; + + uint64_t cached_tensor_global_id = 0; + + // singleton + static std::shared_ptr get() { + static std::shared_ptr c = + std::make_shared(); + return c; + } + + // alloc and set the buf_base_ and offset_ attributes of cache + std::vector alloc(std::vector buffers_size) { + size_t total_size = 0; + for (size_t i = 0; i < buffers_size.size(); i++) { + total_size += divide_and_ceil(buffers_size[i], 64) * 64; + } + llvm::dbgs() << "Alloc total size: " << total_size << '\n'; + // auto base = create_const_cache_proxy(total_size); + std::vector global_ids(buffers_size.size()); + size_t offset = 0; + for (size_t i = 0; i < buffers_size.size(); i++) { + llvm::dbgs() << "Alloc offset: " << offset << '\n'; + // reg_cached_tensor(cached_tensor_global_id, base, offset); + global_ids[i] = cached_tensor_global_id; + ++cached_tensor_global_id; + offset += divide_and_ceil(buffers_size[i], 64) * 64; + } + return global_ids; + } +}; + +// static void addGlobal(ModuleOp module, Location loc, OpBuilder &builder, +// StringRef name, int64_t value) { +// OpBuilder::InsertionGuard insertGuard(builder); +// builder.setInsertionPointToStart(module.getBody()); + +// auto type = IntegerType::get(builder.getContext(), 8); +// LLVM::GlobalOp global = builder.create( +// loc, type, /*isConstant=*/true, LLVM::Linkage::Internal, name, +// builder.getIndexAttr(value), +// /*alignment=*/0); +// } + +// static void addGlobalArray(ModuleOp module, Location loc, OpBuilder &builder, +// StringRef name, ArrayRef array) { +// OpBuilder::InsertionGuard insertGuard(builder); +// builder.setInsertionPointToStart(module.getBody()); + +// auto type = LLVM::LLVMArrayType::get( +// IntegerType::get(builder.getContext(), 8), array.size()); +// LLVM::GlobalOp global = builder.create( +// loc, type, /*isConstant=*/true, LLVM::Linkage::Internal, name, +// builder.getIndexArrayAttr(array), +// /*alignment=*/0); +// } + +static void addGlobalArray(ModuleOp module, Location loc, OpBuilder &builder, + StringRef name, ArrayRef array) { + OpBuilder::InsertionGuard insertGuard(builder); + 
builder.setInsertionPointToStart(module.getBody()); + + MemRefType type = MemRefType::Builder(array.size(), builder.getIndexType()); + IntegerAttr memrefAlignment = IntegerAttr(); + auto global = builder.create( + loc, name, + /*sym_visibility=*/builder.getStringAttr("public"), + /*type=*/type, + /*initial_value=*/builder.getIndexTensorAttr(array), + /*constant=*/true, + /*alignment=*/memrefAlignment); +} + +static void addGlobal(ModuleOp module, Location loc, OpBuilder &builder, + StringRef name, int64_t value) { + SmallVector array{value}; + addGlobalArray(module, loc, builder, name, array); +} + // Operate on tensors. Create fold() and compute() on module. The // folded weights and first-run flag is maintained by upper-level runtime. void CST::runOnOperation() { @@ -436,15 +534,38 @@ void CST::runOnOperation() { }); } + // Allocate buffer for outputValuesInFold + std::vector buffersSize; + for (Value &tensor : outputValuesInFold) { + llvm::dbgs() << "Allocate buffer for tensor: " << tensor << "\n"; + buffersSize.push_back( + getTensorSize(dyn_cast(tensor.getType()))); + } + auto manager = const_graph_tensor_cache_manager::get(); + SmallVector globalIndexes; + for (auto id : manager->alloc(buffersSize)) { + globalIndexes.push_back(id); + } + globalIndexes.insert(globalIndexes.begin(), globalIndexes.size()); + addGlobalArray(moduleOp, moduleOp.getLoc(), builder, "__fold_buffer_ids", + globalIndexes); + foldFunc.setVisibility(SymbolTable::Visibility::Public); moduleOp.push_back(foldFunc); symbolTable.insert(foldFunc); + SmallVector foldArgs; + SmallVector foldIds; + SmallVector computeArgs; + // modify the BlockArguments of block size_t oriNumArgs = block.getNumArguments(); size_t argIdx = 0; for (size_t id = 0; id < oriNumArgs; ++id) { if (constArgsIndexes.count(id) == 1) { + foldArgs.push_back(id); + foldIds.push_back(argIdx + oriNumArgs); + computeArgs.push_back(argIdx + oriNumArgs); auto loc = block.getArgument(id).getLoc(); BlockArgument foldArg = block.insertArgument(id, outputTypes[argIdx], loc); @@ -477,9 +598,24 @@ void CST::runOnOperation() { } block.eraseArgument(id + 1); ++argIdx; + } else { + computeArgs.push_back(id); } } + for (auto id : foldIds) { + foldArgs.insert(foldArgs.end(), id); + } + foldArgs.insert(foldArgs.begin(), foldArgs.size()); + addGlobalArray(moduleOp, moduleOp.getLoc(), builder, "__fold_args", foldArgs); + + computeArgs.insert(computeArgs.begin(), computeArgs.size()); + addGlobalArray(moduleOp, moduleOp.getLoc(), builder, "__compute_args", + computeArgs); + + addGlobal(moduleOp, moduleOp.getLoc(), builder, "__num_orig_num_args", + oriNumArgs); + // modify the compute func signature func::FuncOp computeFunc = cast(topFunc); FunctionType computeFuncType = computeFunc.getFunctionType(); From c3e186d7e79c286056ef8fa8c51f4452821e8c64 Mon Sep 17 00:00:00 2001 From: "Niu, Xiaoguang" Date: Tue, 28 May 2024 10:21:52 +0800 Subject: [PATCH 04/29] Use llvm global [need to cowork with yijie/mainfunc_wrapper] --- lib/gc/Transforms/CST.cpp | 94 +++++++++++++++++++-------------------- 1 file changed, 47 insertions(+), 47 deletions(-) diff --git a/lib/gc/Transforms/CST.cpp b/lib/gc/Transforms/CST.cpp index b8ceb6a2c..ecf7eff8f 100644 --- a/lib/gc/Transforms/CST.cpp +++ b/lib/gc/Transforms/CST.cpp @@ -30,7 +30,7 @@ #include "mlir/Transforms/GreedyPatternRewriteDriver.h" #include "llvm/Support/Debug.h" -// #include "gc/ExecutionEngine/CPURuntime/ConstantCache.hpp" +#include "gc/ExecutionEngine/CPURuntime/ConstantCache.hpp" namespace mlir { namespace gc { @@ -300,13 +300,13 
@@ static constexpr int DATA_SIZE_EXPANDING_THRESHOLD = 8; // void *allocator(size_t size) { return std::aligned_alloc(64, size); } // void deallocator(void *ptr) { std::free(ptr); } -// std::shared_ptr create_const_cache_proxy(size_t size) { -// // simply allocate buffer and return -// std::shared_ptr base = -// std::shared_ptr{std::aligned_alloc(64, size), [](void *p) { -// std::free(p); }}; -// return std::make_shared(base, base.get(), size, true); -// } +std::shared_ptr create_const_cache_proxy(size_t size) { + // simply allocate buffer and return + std::shared_ptr base = + std::shared_ptr{std::aligned_alloc(64, size), [](void *p) { + std::free(p); }}; + return std::make_shared(base, base.get(), size, true); +} size_t divide_and_ceil(size_t x, size_t y) { return (x + y - 1) / y; } @@ -330,12 +330,12 @@ struct const_graph_tensor_cache_manager { total_size += divide_and_ceil(buffers_size[i], 64) * 64; } llvm::dbgs() << "Alloc total size: " << total_size << '\n'; - // auto base = create_const_cache_proxy(total_size); + auto base = create_const_cache_proxy(total_size); std::vector global_ids(buffers_size.size()); size_t offset = 0; for (size_t i = 0; i < buffers_size.size(); i++) { llvm::dbgs() << "Alloc offset: " << offset << '\n'; - // reg_cached_tensor(cached_tensor_global_id, base, offset); + reg_cached_tensor(cached_tensor_global_id, base, offset); global_ids[i] = cached_tensor_global_id; ++cached_tensor_global_id; offset += divide_and_ceil(buffers_size[i], 64) * 64; @@ -344,52 +344,52 @@ struct const_graph_tensor_cache_manager { } }; -// static void addGlobal(ModuleOp module, Location loc, OpBuilder &builder, -// StringRef name, int64_t value) { -// OpBuilder::InsertionGuard insertGuard(builder); -// builder.setInsertionPointToStart(module.getBody()); - -// auto type = IntegerType::get(builder.getContext(), 8); -// LLVM::GlobalOp global = builder.create( -// loc, type, /*isConstant=*/true, LLVM::Linkage::Internal, name, -// builder.getIndexAttr(value), -// /*alignment=*/0); -// } - -// static void addGlobalArray(ModuleOp module, Location loc, OpBuilder &builder, -// StringRef name, ArrayRef array) { -// OpBuilder::InsertionGuard insertGuard(builder); -// builder.setInsertionPointToStart(module.getBody()); +static void addGlobal(ModuleOp module, Location loc, OpBuilder &builder, + StringRef name, int64_t value) { + OpBuilder::InsertionGuard insertGuard(builder); + builder.setInsertionPointToStart(module.getBody()); -// auto type = LLVM::LLVMArrayType::get( -// IntegerType::get(builder.getContext(), 8), array.size()); -// LLVM::GlobalOp global = builder.create( -// loc, type, /*isConstant=*/true, LLVM::Linkage::Internal, name, -// builder.getIndexArrayAttr(array), -// /*alignment=*/0); -// } + auto type = IntegerType::get(builder.getContext(), 8); + LLVM::GlobalOp global = builder.create( + loc, type, /*isConstant=*/true, LLVM::Linkage::Internal, name, + builder.getIndexAttr(value), + /*alignment=*/0); +} static void addGlobalArray(ModuleOp module, Location loc, OpBuilder &builder, StringRef name, ArrayRef array) { OpBuilder::InsertionGuard insertGuard(builder); builder.setInsertionPointToStart(module.getBody()); - MemRefType type = MemRefType::Builder(array.size(), builder.getIndexType()); - IntegerAttr memrefAlignment = IntegerAttr(); - auto global = builder.create( - loc, name, - /*sym_visibility=*/builder.getStringAttr("public"), - /*type=*/type, - /*initial_value=*/builder.getIndexTensorAttr(array), - /*constant=*/true, - /*alignment=*/memrefAlignment); + auto type = 
LLVM::LLVMArrayType::get( + IntegerType::get(builder.getContext(), 8), array.size()); + LLVM::GlobalOp global = builder.create( + loc, type, /*isConstant=*/true, LLVM::Linkage::Internal, name, + builder.getIndexArrayAttr(array), + /*alignment=*/0); } -static void addGlobal(ModuleOp module, Location loc, OpBuilder &builder, - StringRef name, int64_t value) { - SmallVector array{value}; - addGlobalArray(module, loc, builder, name, array); -} +// static void addGlobalArray(ModuleOp module, Location loc, OpBuilder &builder, +// StringRef name, ArrayRef array) { +// OpBuilder::InsertionGuard insertGuard(builder); +// builder.setInsertionPointToStart(module.getBody()); + +// MemRefType type = MemRefType::Builder(array.size(), builder.getIndexType()); +// IntegerAttr memrefAlignment = IntegerAttr(); +// auto global = builder.create( +// loc, name, +// /*sym_visibility=*/builder.getStringAttr("public"), +// /*type=*/type, +// /*initial_value=*/builder.getIndexTensorAttr(array), +// /*constant=*/true, +// /*alignment=*/memrefAlignment); +// } + +// static void addGlobal(ModuleOp module, Location loc, OpBuilder &builder, +// StringRef name, int64_t value) { +// SmallVector array{value}; +// addGlobalArray(module, loc, builder, name, array); +// } // Operate on tensors. Create fold() and compute() on module. The // folded weights and first-run flag is maintained by upper-level runtime. From 8c50b67e014dec09717ad1217703bb9132703ece Mon Sep 17 00:00:00 2001 From: "Niu, Xiaoguang" Date: Tue, 28 May 2024 16:09:00 +0800 Subject: [PATCH 05/29] Rename; Add llvm dependence --- include/gc/Transforms/Passes.td | 4 ++++ lib/gc/Transforms/CST.cpp | 30 +++++++++++++++--------------- src/gc-opt/CMakeLists.txt | 3 ++- 3 files changed, 21 insertions(+), 16 deletions(-) diff --git a/include/gc/Transforms/Passes.td b/include/gc/Transforms/Passes.td index f56e8b016..5fd0bd7a7 100644 --- a/include/gc/Transforms/Passes.td +++ b/include/gc/Transforms/Passes.td @@ -45,6 +45,10 @@ def CST : Pass<"cst"> { This pass implements a constant subgraph transform. 
}]; let constructor = "mlir::gc::createCSTPass()"; + let dependentDialects = [ + "tensor::TensorDialect", + "linalg::LinalgDialect", + "LLVM::LLVMDialect"]; } #endif // GC_DIALECT_GC_PASSES diff --git a/lib/gc/Transforms/CST.cpp b/lib/gc/Transforms/CST.cpp index ecf7eff8f..5fb48a676 100644 --- a/lib/gc/Transforms/CST.cpp +++ b/lib/gc/Transforms/CST.cpp @@ -300,26 +300,26 @@ static constexpr int DATA_SIZE_EXPANDING_THRESHOLD = 8; // void *allocator(size_t size) { return std::aligned_alloc(64, size); } // void deallocator(void *ptr) { std::free(ptr); } -std::shared_ptr create_const_cache_proxy(size_t size) { +std::shared_ptr createConstCacheProxy(size_t size) { // simply allocate buffer and return std::shared_ptr base = std::shared_ptr{std::aligned_alloc(64, size), [](void *p) { std::free(p); }}; - return std::make_shared(base, base.get(), size, true); + return std::make_shared(base, base.get(), size, true); } -size_t divide_and_ceil(size_t x, size_t y) { return (x + y - 1) / y; } +size_t divideAndCeil(size_t x, size_t y) { return (x + y - 1) / y; } // Manager -struct const_graph_tensor_cache_manager { +struct constGraphTensorCacheManager { // dnnl_graph_compiler_context *ctx; - uint64_t cached_tensor_global_id = 0; + uint64_t cachedTensorGlobalId = 0; // singleton - static std::shared_ptr get() { - static std::shared_ptr c = - std::make_shared(); + static std::shared_ptr get() { + static std::shared_ptr c = + std::make_shared(); return c; } @@ -327,18 +327,18 @@ struct const_graph_tensor_cache_manager { std::vector alloc(std::vector buffers_size) { size_t total_size = 0; for (size_t i = 0; i < buffers_size.size(); i++) { - total_size += divide_and_ceil(buffers_size[i], 64) * 64; + total_size += divideAndCeil(buffers_size[i], 64) * 64; } llvm::dbgs() << "Alloc total size: " << total_size << '\n'; - auto base = create_const_cache_proxy(total_size); + auto base = createConstCacheProxy(total_size); std::vector global_ids(buffers_size.size()); size_t offset = 0; for (size_t i = 0; i < buffers_size.size(); i++) { llvm::dbgs() << "Alloc offset: " << offset << '\n'; - reg_cached_tensor(cached_tensor_global_id, base, offset); - global_ids[i] = cached_tensor_global_id; - ++cached_tensor_global_id; - offset += divide_and_ceil(buffers_size[i], 64) * 64; + regCachedTensor(cachedTensorGlobalId, base, offset); + global_ids[i] = cachedTensorGlobalId; + ++cachedTensorGlobalId; + offset += divideAndCeil(buffers_size[i], 64) * 64; } return global_ids; } @@ -541,7 +541,7 @@ void CST::runOnOperation() { buffersSize.push_back( getTensorSize(dyn_cast(tensor.getType()))); } - auto manager = const_graph_tensor_cache_manager::get(); + auto manager = constGraphTensorCacheManager::get(); SmallVector globalIndexes; for (auto id : manager->alloc(buffersSize)) { globalIndexes.push_back(id); diff --git a/src/gc-opt/CMakeLists.txt b/src/gc-opt/CMakeLists.txt index ac7ed4ead..6b8def4be 100644 --- a/src/gc-opt/CMakeLists.txt +++ b/src/gc-opt/CMakeLists.txt @@ -17,7 +17,8 @@ set(gc_opt_libs ${conversion_libs} ${MLIR_LINK_COMPONENTS} GCPasses - GCAnalysis) + GCAnalysis + GCCpuRuntime) if(GC_MLIR_CXX_FLAGS) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${GC_MLIR_CXX_FLAGS}") From 25f611eceb7d57fb3365d30d72aa38e782405228 Mon Sep 17 00:00:00 2001 From: "Niu, Xiaoguang" Date: Tue, 28 May 2024 16:56:17 +0800 Subject: [PATCH 06/29] Change dtype --- lib/gc/Transforms/CST.cpp | 97 ++++++++++++++++++--------------------- 1 file changed, 44 insertions(+), 53 deletions(-) diff --git a/lib/gc/Transforms/CST.cpp b/lib/gc/Transforms/CST.cpp index 
5fb48a676..9beaca812 100644 --- a/lib/gc/Transforms/CST.cpp +++ b/lib/gc/Transforms/CST.cpp @@ -302,9 +302,8 @@ static constexpr int DATA_SIZE_EXPANDING_THRESHOLD = 8; std::shared_ptr createConstCacheProxy(size_t size) { // simply allocate buffer and return - std::shared_ptr base = - std::shared_ptr{std::aligned_alloc(64, size), [](void *p) { - std::free(p); }}; + std::shared_ptr base = std::shared_ptr{ + std::aligned_alloc(64, size), [](void *p) { std::free(p); }}; return std::make_shared(base, base.get(), size, true); } @@ -324,72 +323,63 @@ struct constGraphTensorCacheManager { } // alloc and set the buf_base_ and offset_ attributes of cache - std::vector alloc(std::vector buffers_size) { - size_t total_size = 0; - for (size_t i = 0; i < buffers_size.size(); i++) { - total_size += divideAndCeil(buffers_size[i], 64) * 64; + std::vector alloc(std::vector buffersSize) { + size_t totalSize = 0; + for (size_t i = 0; i < buffersSize.size(); i++) { + totalSize += divideAndCeil(buffersSize[i], 64) * 64; } - llvm::dbgs() << "Alloc total size: " << total_size << '\n'; - auto base = createConstCacheProxy(total_size); - std::vector global_ids(buffers_size.size()); + llvm::dbgs() << "Alloc total size: " << totalSize << '\n'; + auto base = createConstCacheProxy(totalSize); + std::vector globalIds(buffersSize.size()); size_t offset = 0; - for (size_t i = 0; i < buffers_size.size(); i++) { + for (size_t i = 0; i < buffersSize.size(); i++) { llvm::dbgs() << "Alloc offset: " << offset << '\n'; regCachedTensor(cachedTensorGlobalId, base, offset); - global_ids[i] = cachedTensorGlobalId; + globalIds[i] = cachedTensorGlobalId; ++cachedTensorGlobalId; - offset += divideAndCeil(buffers_size[i], 64) * 64; + offset += divideAndCeil(buffersSize[i], 64) * 64; } - return global_ids; + return globalIds; } }; -static void addGlobal(ModuleOp module, Location loc, OpBuilder &builder, - StringRef name, int64_t value) { +static void addGlobalI32(ModuleOp module, Location loc, OpBuilder &builder, + StringRef name, int32_t value) { OpBuilder::InsertionGuard insertGuard(builder); builder.setInsertionPointToStart(module.getBody()); - auto type = IntegerType::get(builder.getContext(), 8); + auto type = IntegerType::get(builder.getContext(), 32); LLVM::GlobalOp global = builder.create( loc, type, /*isConstant=*/true, LLVM::Linkage::Internal, name, - builder.getIndexAttr(value), + builder.getI32IntegerAttr(value), /*alignment=*/0); } -static void addGlobalArray(ModuleOp module, Location loc, OpBuilder &builder, - StringRef name, ArrayRef array) { +static void addGlobalI64Array(ModuleOp module, Location loc, OpBuilder &builder, + StringRef name, ArrayRef array) { OpBuilder::InsertionGuard insertGuard(builder); builder.setInsertionPointToStart(module.getBody()); auto type = LLVM::LLVMArrayType::get( - IntegerType::get(builder.getContext(), 8), array.size()); + IntegerType::get(builder.getContext(), 64), array.size()); LLVM::GlobalOp global = builder.create( loc, type, /*isConstant=*/true, LLVM::Linkage::Internal, name, - builder.getIndexArrayAttr(array), + builder.getI64ArrayAttr(array), /*alignment=*/0); } -// static void addGlobalArray(ModuleOp module, Location loc, OpBuilder &builder, -// StringRef name, ArrayRef array) { -// OpBuilder::InsertionGuard insertGuard(builder); -// builder.setInsertionPointToStart(module.getBody()); - -// MemRefType type = MemRefType::Builder(array.size(), builder.getIndexType()); -// IntegerAttr memrefAlignment = IntegerAttr(); -// auto global = builder.create( -// loc, name, -// 
/*sym_visibility=*/builder.getStringAttr("public"), -// /*type=*/type, -// /*initial_value=*/builder.getIndexTensorAttr(array), -// /*constant=*/true, -// /*alignment=*/memrefAlignment); -// } - -// static void addGlobal(ModuleOp module, Location loc, OpBuilder &builder, -// StringRef name, int64_t value) { -// SmallVector array{value}; -// addGlobalArray(module, loc, builder, name, array); -// } +static void addGlobalI32Array(ModuleOp module, Location loc, OpBuilder &builder, + StringRef name, ArrayRef array) { + OpBuilder::InsertionGuard insertGuard(builder); + builder.setInsertionPointToStart(module.getBody()); + + auto type = LLVM::LLVMArrayType::get( + IntegerType::get(builder.getContext(), 32), array.size()); + LLVM::GlobalOp global = builder.create( + loc, type, /*isConstant=*/true, LLVM::Linkage::Internal, name, + builder.getI32ArrayAttr(array), + /*alignment=*/0); +} // Operate on tensors. Create fold() and compute() on module. The // folded weights and first-run flag is maintained by upper-level runtime. @@ -547,16 +537,16 @@ void CST::runOnOperation() { globalIndexes.push_back(id); } globalIndexes.insert(globalIndexes.begin(), globalIndexes.size()); - addGlobalArray(moduleOp, moduleOp.getLoc(), builder, "__fold_buffer_ids", - globalIndexes); + addGlobalI64Array(moduleOp, moduleOp.getLoc(), builder, "__fold_buffer_ids", + globalIndexes); foldFunc.setVisibility(SymbolTable::Visibility::Public); moduleOp.push_back(foldFunc); symbolTable.insert(foldFunc); - SmallVector foldArgs; - SmallVector foldIds; - SmallVector computeArgs; + SmallVector foldArgs; + SmallVector foldIds; + SmallVector computeArgs; // modify the BlockArguments of block size_t oriNumArgs = block.getNumArguments(); @@ -607,14 +597,15 @@ void CST::runOnOperation() { foldArgs.insert(foldArgs.end(), id); } foldArgs.insert(foldArgs.begin(), foldArgs.size()); - addGlobalArray(moduleOp, moduleOp.getLoc(), builder, "__fold_args", foldArgs); + addGlobalI32Array(moduleOp, moduleOp.getLoc(), builder, "__fold_args", + foldArgs); computeArgs.insert(computeArgs.begin(), computeArgs.size()); - addGlobalArray(moduleOp, moduleOp.getLoc(), builder, "__compute_args", - computeArgs); + addGlobalI32Array(moduleOp, moduleOp.getLoc(), builder, "__compute_args", + computeArgs); - addGlobal(moduleOp, moduleOp.getLoc(), builder, "__num_orig_num_args", - oriNumArgs); + addGlobalI32(moduleOp, moduleOp.getLoc(), builder, "__num_orig_num_args", + oriNumArgs); // modify the compute func signature func::FuncOp computeFunc = cast(topFunc); From 43639154e9e5f231667795e30923aa7eb7fdfba6 Mon Sep 17 00:00:00 2001 From: "Niu, Xiaoguang" Date: Wed, 29 May 2024 11:22:45 +0800 Subject: [PATCH 07/29] Fix visibility and type --- lib/gc/Transforms/CST.cpp | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/lib/gc/Transforms/CST.cpp b/lib/gc/Transforms/CST.cpp index 9beaca812..dc5c332e5 100644 --- a/lib/gc/Transforms/CST.cpp +++ b/lib/gc/Transforms/CST.cpp @@ -343,41 +343,43 @@ struct constGraphTensorCacheManager { } }; -static void addGlobalI32(ModuleOp module, Location loc, OpBuilder &builder, +static void addGlobalI32(ModuleOp &module, Location loc, OpBuilder &builder, StringRef name, int32_t value) { OpBuilder::InsertionGuard insertGuard(builder); builder.setInsertionPointToStart(module.getBody()); auto type = IntegerType::get(builder.getContext(), 32); LLVM::GlobalOp global = builder.create( - loc, type, /*isConstant=*/true, LLVM::Linkage::Internal, name, + loc, type, /*isConstant=*/true, 
LLVM::Linkage::External, name, builder.getI32IntegerAttr(value), /*alignment=*/0); } -static void addGlobalI64Array(ModuleOp module, Location loc, OpBuilder &builder, - StringRef name, ArrayRef array) { +static void addGlobalI64Array(ModuleOp &module, Location loc, + OpBuilder &builder, StringRef name, + ArrayRef array) { OpBuilder::InsertionGuard insertGuard(builder); builder.setInsertionPointToStart(module.getBody()); auto type = LLVM::LLVMArrayType::get( IntegerType::get(builder.getContext(), 64), array.size()); LLVM::GlobalOp global = builder.create( - loc, type, /*isConstant=*/true, LLVM::Linkage::Internal, name, - builder.getI64ArrayAttr(array), + loc, type, /*isConstant=*/true, LLVM::Linkage::External, name, + builder.getI64TensorAttr(array), /*alignment=*/0); } -static void addGlobalI32Array(ModuleOp module, Location loc, OpBuilder &builder, - StringRef name, ArrayRef array) { +static void addGlobalI32Array(ModuleOp &module, Location loc, + OpBuilder &builder, StringRef name, + ArrayRef array) { OpBuilder::InsertionGuard insertGuard(builder); builder.setInsertionPointToStart(module.getBody()); auto type = LLVM::LLVMArrayType::get( IntegerType::get(builder.getContext(), 32), array.size()); LLVM::GlobalOp global = builder.create( - loc, type, /*isConstant=*/true, LLVM::Linkage::Internal, name, - builder.getI32ArrayAttr(array), + loc, type, /*isConstant=*/true, LLVM::Linkage::External, name, + builder.getI32TensorAttr(array), /*alignment=*/0); } @@ -493,7 +495,7 @@ void CST::runOnOperation() { FunctionType foldFuncType = FunctionType::get(context, inputTypes, outputTypes); - auto foldFunc = + func::FuncOp foldFunc = builder.create(topFunc.getLoc(), funcName, foldFuncType); Block *foldBlock = foldFunc.addEntryBlock(); // values of folded constant weights in foldBlock @@ -541,6 +543,8 @@ void CST::runOnOperation() { globalIndexes); foldFunc.setVisibility(SymbolTable::Visibility::Public); + foldFunc->setAttr(LLVM::LLVMDialect::getEmitCWrapperAttrName(), + UnitAttr::get(context)); moduleOp.push_back(foldFunc); symbolTable.insert(foldFunc); From 94f28137c2d71428deeb71e4948a7375a4be216c Mon Sep 17 00:00:00 2001 From: "Niu, Xiaoguang" Date: Thu, 30 May 2024 15:49:12 +0800 Subject: [PATCH 08/29] Support complex topo --- lib/gc/Transforms/CST.cpp | 161 +++++++++++------- .../test_constant_weights_folding-1.mlir | 43 +++-- .../test_constant_weights_folding.mlir | 12 +- 3 files changed, 128 insertions(+), 88 deletions(-) diff --git a/lib/gc/Transforms/CST.cpp b/lib/gc/Transforms/CST.cpp index dc5c332e5..c60cea97e 100644 --- a/lib/gc/Transforms/CST.cpp +++ b/lib/gc/Transforms/CST.cpp @@ -30,7 +30,7 @@ #include "mlir/Transforms/GreedyPatternRewriteDriver.h" #include "llvm/Support/Debug.h" -#include "gc/ExecutionEngine/CPURuntime/ConstantCache.hpp" +// #include "gc/ExecutionEngine/CPURuntime/ConstantCache.hpp" namespace mlir { namespace gc { @@ -300,12 +300,12 @@ static constexpr int DATA_SIZE_EXPANDING_THRESHOLD = 8; // void *allocator(size_t size) { return std::aligned_alloc(64, size); } // void deallocator(void *ptr) { std::free(ptr); } -std::shared_ptr createConstCacheProxy(size_t size) { - // simply allocate buffer and return - std::shared_ptr base = std::shared_ptr{ - std::aligned_alloc(64, size), [](void *p) { std::free(p); }}; 
+// return std::make_shared(base, base.get(), size, true); +// } size_t divideAndCeil(size_t x, size_t y) { return (x + y - 1) / y; } @@ -329,12 +329,12 @@ struct constGraphTensorCacheManager { totalSize += divideAndCeil(buffersSize[i], 64) * 64; } llvm::dbgs() << "Alloc total size: " << totalSize << '\n'; - auto base = createConstCacheProxy(totalSize); + // auto base = createConstCacheProxy(totalSize); std::vector globalIds(buffersSize.size()); size_t offset = 0; for (size_t i = 0; i < buffersSize.size(); i++) { llvm::dbgs() << "Alloc offset: " << offset << '\n'; - regCachedTensor(cachedTensorGlobalId, base, offset); + // regCachedTensor(cachedTensorGlobalId, base, offset); globalIds[i] = cachedTensorGlobalId; ++cachedTensorGlobalId; offset += divideAndCeil(buffersSize[i], 64) * 64; @@ -431,11 +431,11 @@ void CST::runOnOperation() { // values of folded constant weights in original block SmallVector outputValues; Value v; - // TODO: solve complicated topology. Currently we only handle simple topology - // where one constant weight input will and only will produce one constant - // output and each constant weight only contributes to one constant output. + // Support complicated topology. for (size_t id = 0; id < block.getNumArguments(); ++id) { if (constArgsIndexes.count(id) == 1) { + // The constant ops are all single-input single-output. + bool simpleTopo = true; auto arg = block.getArgument(id); if (!isa(arg.getType())) { continue; @@ -444,54 +444,72 @@ void CST::runOnOperation() { v = dyn_cast(arg); inputValues.push_back(v); SmallVector valuesOnTheWay = {v}; // the constant tensors + std::deque dq; + dq.push_back(v); // For v -> pack1 -> pack2 -> matmul, we need the type of output of pack2 - while (!v.getUsers().empty()) { - // v.getUsers().size() should be 1 - Operation *user = *(v.getUsers().begin()); - // If user is not const or user has multiple operand, we reach the end - if (!isInConstantSubgraph(user) || !singleOperand(user)) { - outputTypes.push_back(v.getType()); - outputValues.push_back(v); - break; + while (!dq.empty()) { + v = dq.front(); + dq.pop_front(); + // if the children ops of v are not all constant, we end at v + if (std::any_of(v.getUsers().begin(), v.getUsers().end(), + [](Operation *child) { + return !isInConstantSubgraph(child); + })) { + if (std::find(outputValues.begin(), outputValues.end(), v) == + outputValues.end()) { + outputTypes.push_back(v.getType()); + outputValues.push_back(v); + } + continue; + } + if (!v.hasOneUse()) { + simpleTopo = false; + } + // the children ops of v are all constant, we push their results to + // the queue + for (Operation *child : v.getUsers()) { + if (!singleOperand(child) || child->getResults().size() > 1) { + simpleTopo = false; + } + for (OpResult result : child->getResults()) { + auto r = dyn_cast(result); + dq.push_back(r); + valuesOnTheWay.push_back(r); + } } - // user should has only 1 output value - OpResult result = *(user->result_begin()); - v = dyn_cast(result); - valuesOnTheWay.push_back(v); } // If the data size of outputValue is much greater than the size of inputValue, do // not fold it. Compare data size changes during the traverse to find the last // op that satisfies this condition. 
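A minimal standalone sketch of that size check, with plain integers standing in for getTensorSize() over valuesOnTheWay; lastFoldableIndex is a hypothetical name, and the 8x factor is the DATA_SIZE_EXPANDING_THRESHOLD defined earlier in this file:

#include <cstdint>
#include <vector>

static constexpr int64_t kExpandingThreshold = 8; // DATA_SIZE_EXPANDING_THRESHOLD

// Index of the last value on the constant chain whose size stays below
// kExpandingThreshold * initial size; 0 means nothing qualifies (do not fold).
size_t lastFoldableIndex(const std::vector<int64_t> &sizes) {
  int64_t initSize = sizes[0];
  size_t lastIdx = 0;
  for (size_t i = 1; i < sizes.size(); ++i)
    if (initSize * kExpandingThreshold > sizes[i])
      lastIdx = i;
  return lastIdx;
}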
- int64_t initSize = - getTensorSize(dyn_cast(valuesOnTheWay[0].getType())); - if (!isa(outputTypes.back()) || - initSize * DATA_SIZE_EXPANDING_THRESHOLD < - getTensorSize(dyn_cast(outputTypes.back()))) { - size_t lastIdx = 0; - for (size_t i = 1; i < valuesOnTheWay.size(); ++i) { - int64_t size = - getTensorSize(dyn_cast(valuesOnTheWay[i].getType())); - if (initSize * DATA_SIZE_EXPANDING_THRESHOLD > size) { - lastIdx = i; + if (simpleTopo) { + int64_t initSize = + getTensorSize(dyn_cast(valuesOnTheWay[0].getType())); + if (!isa(outputTypes.back()) || + initSize * DATA_SIZE_EXPANDING_THRESHOLD < + getTensorSize(dyn_cast(outputTypes.back()))) { + size_t lastIdx = 0; + for (size_t i = 1; i < valuesOnTheWay.size(); ++i) { + int64_t size = getTensorSize( + dyn_cast(valuesOnTheWay[i].getType())); + if (initSize * DATA_SIZE_EXPANDING_THRESHOLD > size) { + lastIdx = i; + } + } + if (lastIdx == 0) { // no suitable value found + inputTypes.pop_back(); + outputTypes.pop_back(); + inputValues.pop_back(); + outputValues.pop_back(); + constArgsIndexes.erase(id); + } else { + outputTypes.back() = valuesOnTheWay[lastIdx].getType(); + outputValues.back() = valuesOnTheWay[lastIdx]; } - } - if (lastIdx == 0) { // no suitable value found - inputTypes.pop_back(); - outputTypes.pop_back(); - inputValues.pop_back(); - outputValues.pop_back(); - constArgsIndexes.erase(id); - } else { - outputTypes.back() = valuesOnTheWay[lastIdx].getType(); - outputValues.back() = valuesOnTheWay[lastIdx]; } } } } - if (inputTypes.size() != outputTypes.size()) { - return; - } FunctionType foldFuncType = FunctionType::get(context, inputTypes, outputTypes); @@ -548,30 +566,34 @@ void CST::runOnOperation() { moduleOp.push_back(foldFunc); symbolTable.insert(foldFunc); + // the indexes of args to the folding func. SmallVector foldArgs; + // the indexes of folded args. SmallVector foldIds; + // the indexes of args to the computing func. 
SmallVector computeArgs; // modify the BlockArguments of block size_t oriNumArgs = block.getNumArguments(); - size_t argIdx = 0; + // Add the folded args to the end of BlockArguments list + for (size_t id = 0; id < outputValues.size(); ++id) { + auto loc = block.getArgument(id).getLoc(); + BlockArgument foldArg = + block.insertArgument(oriNumArgs + id, outputTypes[id], loc); + outputValues[id].replaceUsesWithIf(foldArg, [&](OpOperand &val) { + Operation *op = val.getOwner(); + return op->getBlock() == █ + }); + foldIds.push_back(id + oriNumArgs); + } + // Erase the operations on constant args for (size_t id = 0; id < oriNumArgs; ++id) { if (constArgsIndexes.count(id) == 1) { foldArgs.push_back(id); - foldIds.push_back(argIdx + oriNumArgs); - computeArgs.push_back(argIdx + oriNumArgs); - auto loc = block.getArgument(id).getLoc(); - BlockArgument foldArg = - block.insertArgument(id, outputTypes[argIdx], loc); - outputValues[argIdx].replaceUsesWithIf(foldArg, [&](OpOperand &val) { - Operation *op = val.getOwner(); - return op->getBlock() == █ - }); - std::deque dq; SmallVector opsToErase; std::unordered_set opsToEraseSet; - dq.push_back(block.getArgument(id + 1)); + dq.push_back(block.getArgument(id)); while (!dq.empty()) { Value v = dq.front(); dq.pop_front(); @@ -586,16 +608,26 @@ void CST::runOnOperation() { opsToEraseSet.insert(op); } } - for (auto it = opsToErase.rbegin(); it != opsToErase.rend(); ++it) { (*it)->erase(); } - block.eraseArgument(id + 1); - ++argIdx; } else { computeArgs.push_back(id); } } + // Erase the constant args in BlockArguments list + llvm::BitVector argsToErase; + for (size_t id = 0; id < oriNumArgs; ++id) { + if (constArgsIndexes.count(id) == 1) { + argsToErase.push_back(true); + } else { + argsToErase.push_back(false); + } + } + for (size_t id = 0; id < outputValues.size(); ++id) { + argsToErase.push_back(false); + } + block.eraseArguments(argsToErase); for (auto id : foldIds) { foldArgs.insert(foldArgs.end(), id); @@ -604,6 +636,9 @@ void CST::runOnOperation() { addGlobalI32Array(moduleOp, moduleOp.getLoc(), builder, "__fold_args", foldArgs); + for (auto id : foldIds) { + computeArgs.insert(computeArgs.end(), id); + } computeArgs.insert(computeArgs.begin(), computeArgs.size()); addGlobalI32Array(moduleOp, moduleOp.getLoc(), builder, "__compute_args", computeArgs); diff --git a/test/gc/Transforms/test_constant_weights_folding-1.mlir b/test/gc/Transforms/test_constant_weights_folding-1.mlir index b446212c5..940255f60 100644 --- a/test/gc/Transforms/test_constant_weights_folding-1.mlir +++ b/test/gc/Transforms/test_constant_weights_folding-1.mlir @@ -19,32 +19,31 @@ module { // CHECK: cpuruntime.printf // CHECK: linalg.add -// CHECK: linalg.add // CHECK: func.func @fold // CHECK: linalg.add // CHECK: linalg.add +// CHECK: linalg.add // COM: expected output: // COM: module { -// COM: llvm.mlir.global constant @__num_orig_num_args(4 : i32) : i32 -// COM: llvm.mlir.global constant @__fold_buffer_ids(dense<[2, 114514, 1919810]> : tensor<3 x i64>) : !llvm.array<3 x i64> -// COM: // a,b, foldedA,foldedB -// COM: llvm.mlir.global constant @__fold_args(dense<[4, 0, 1, 4, 5]> : tensor<5xi32>) : !llvm.array<5 x i32> -// COM: // foldedA, foldedB, c, d -// COM: llvm.mlir.global constant @__compute_args(dense<[4, 4, 5, 2, 3]> : tensor<5xi32>) : !llvm.array<5 x i32> -// COM: func.func @fold(%a: tensor<128xf32>, %b: tensor<128xf32>) -> (tensor<128xf32>, tensor<128xf32>) attributes { llvm.emit_c_interface } { -// COM: %c0 = arith.constant 0 : index -// COM: cpuruntime.printf 
"HI%zu\n" %c0 : index -// COM: %out = tensor.empty() : tensor<128xf32> -// COM: %2 = linalg.add ins(%a, %a : tensor<128xf32>,tensor<128xf32>) outs(%out : tensor<128xf32>) -> tensor<128xf32> -// COM: %out2 = tensor.empty() : tensor<128xf32> -// COM: %3 = linalg.add ins(%b, %b : tensor<128xf32>,tensor<128xf32>) outs(%out2 : tensor<128xf32>) -> tensor<128xf32> -// COM: return %2, %3 : tensor<128xf32>, tensor<128xf32> -// COM: } -// COM: func.func @compute(%ax2: tensor<128xf32>, %bx2: tensor<128xf32>, %c: tensor<128xf32>) -> tensor<128xf32> attributes { llvm.emit_c_interface } { -// COM: %out = tensor.empty() : tensor<128xf32> -// COM: %2 = linalg.add ins(%ax2, %bx2 : tensor<128xf32>,tensor<128xf32>) outs(%out : tensor<128xf32>) -> tensor<128xf32> -// COM: %d = linalg.add ins(%2, %c : tensor<128xf32>,tensor<128xf32>) outs(%2 : tensor<128xf32>) -> tensor<128xf32> -// COM: return %d : tensor<128xf32> -// COM: } +// COM: llvm.mlir.global external constant @__num_orig_num_args(3 : i32) {addr_space = 0 : i32} : i32 +// COM: llvm.mlir.global external constant @__compute_args(dense<[2, 2, 3]> : tensor<3xi32>) {addr_space = 0 : i32} : !llvm.array<3 x i32> +// COM: llvm.mlir.global external constant @__fold_args(dense<[3, 0, 1, 3]> : tensor<4xi32>) {addr_space = 0 : i32} : !llvm.array<4 x i32> +// COM: llvm.mlir.global external constant @__fold_buffer_ids(dense<[1, 0]> : tensor<2xi64>) {addr_space = 0 : i32} : !llvm.array<2 x i64> +// COM: func.func @entry(%arg0: tensor<128xf32>, %arg1: tensor<128xf32>) -> tensor<128xf32> attributes {llvm.emit_c_interface, onednn_graph.const_args = [0 : i32, 1 : i32]} { +// COM: %c0 = arith.constant 0 : index +// COM: cpuruntime.printf "HI%zu\0A" %c0 : index +// COM: %0 = tensor.empty() : tensor<128xf32> +// COM: %1 = linalg.add ins(%arg1, %arg0 : tensor<128xf32>, tensor<128xf32>) outs(%0 : tensor<128xf32>) -> tensor<128xf32> +// COM: return %1 : tensor<128xf32> +// COM: } +// COM: func.func @fold(%arg0: tensor<128xf32>, %arg1: tensor<128xf32>) -> tensor<128xf32> attributes {llvm.emit_c_interface} { +// COM: %0 = tensor.empty() : tensor<128xf32> +// COM: %1 = linalg.add ins(%arg0, %arg0 : tensor<128xf32>, tensor<128xf32>) outs(%0 : tensor<128xf32>) -> tensor<128xf32> +// COM: %2 = tensor.empty() : tensor<128xf32> +// COM: %3 = linalg.add ins(%arg1, %arg1 : tensor<128xf32>, tensor<128xf32>) outs(%2 : tensor<128xf32>) -> tensor<128xf32> +// COM: %4 = tensor.empty() : tensor<128xf32> +// COM: %5 = linalg.add ins(%1, %3 : tensor<128xf32>, tensor<128xf32>) outs(%4 : tensor<128xf32>) -> tensor<128xf32> +// COM: return %5 : tensor<128xf32> +// COM: } // COM: } \ No newline at end of file diff --git a/test/gc/Transforms/test_constant_weights_folding.mlir b/test/gc/Transforms/test_constant_weights_folding.mlir index 52885ae7d..485c11e4f 100644 --- a/test/gc/Transforms/test_constant_weights_folding.mlir +++ b/test/gc/Transforms/test_constant_weights_folding.mlir @@ -9,7 +9,7 @@ module { // COM: A two-layer mlp. arg0: input feature. arg1: weight of #1 linear. arg2: bias of #1 linear. // COM: arg3: weight of #2 linear. arg4: bias of #2 linear. 
- func.func @entry(%arg0: tensor<64x512xbf16>, %arg1: tensor<512x256xbf16>, %arg2: tensor<256xbf16>, %arg3: tensor<256x1024xbf16>, %arg4: tensor<1024xbf16>) -> tensor<64x1024xbf16> attributes {onednn_graph.const_args = [1 : i32, 2 : i32, 3 : i32, 4 : i32]} { + func.func @entry(%arg0: tensor<64x512xbf16>, %arg1: tensor<512x256xbf16>, %arg2: tensor<256xbf16>, %arg3: tensor<256x1024xbf16>, %arg4: tensor<1024xbf16>) -> tensor<64x1024xbf16> attributes {llvm.emit_c_interface, onednn_graph.const_args = [1 : i32, 2 : i32, 3 : i32, 4 : i32]} { %1 = tensor.empty() : tensor<2x16x32x32xbf16> %packed_arg0 = tensor.pack %arg0 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %1 : tensor<64x512xbf16> -> tensor<2x16x32x32xbf16> %2 = tensor.empty() : tensor<8x16x32x32xbf16> @@ -71,6 +71,12 @@ module { // CHECK: func.func @fold // CHECK: arith.extf // CHECK: arith.truncf + // COM: expected output: -// COM: func.func @entry(%arg0: tensor<64x512xbf16>, %arg1: tensor<8x16x16x32x2xbf16>, %arg2: tensor<8x32xbf16>, %arg3: tensor<32x8x16x32x2xbf16>, %arg4: tensor<32x32xbf16>) -> tensor<64x1024xbf16> -// COM: func.func @fold(%arg0: tensor<512x256xbf16>, %arg1: tensor<256xbf16>, %arg2: tensor<256x1024xbf16>, %arg3: tensor<1024xbf16>) -> (tensor<8x16x16x32x2xbf16>, tensor<8x32xbf16>, tensor<32x8x16x32x2xbf16>, tensor<32x32xbf16>) +// COM: module { +// COM: llvm.mlir.global external constant @__num_orig_num_args(5 : i32) {addr_space = 0 : i32} : i32 +// COM: llvm.mlir.global external constant @__compute_args(dense<[5, 0, 5, 6, 7, 8]> : tensor<6xi32>) {addr_space = 0 : i32} : !llvm.array<6 x i32> +// COM: llvm.mlir.global external constant @__fold_args(dense<[8, 1, 2, 3, 4, 5, 6, 7, 8]> : tensor<9xi32>) {addr_space = 0 : i32} : !llvm.array<9 x i32> +// COM: llvm.mlir.global external constant @__fold_buffer_ids(dense<[4, 0, 1, 2, 3]> : tensor<5xi64>) {addr_space = 0 : i32} : !llvm.array<5 x i64> +// COM: func.func @entry(%arg0: tensor<64x512xbf16>, %arg1: tensor<8x16x16x32x2xbf16>, %arg2: tensor<8x32xbf16>, %arg3: tensor<32x8x16x32x2xbf16>, %arg4: tensor<32x32xbf16>) -> tensor<64x1024xbf16> attributes {llvm.emit_c_interface, onednn_graph.const_args = [1 : i32, 2 : i32, 3 : i32, 4 : i32]} +// COM: func.func @fold(%arg0: tensor<512x256xbf16>, %arg1: tensor<256xbf16>, %arg2: tensor<256x1024xbf16>, %arg3: tensor<1024xbf16>) -> (tensor<8x16x16x32x2xbf16>, tensor<8x32xbf16>, tensor<32x8x16x32x2xbf16>, tensor<32x32xbf16>) attributes {llvm.emit_c_interface} From 0f67f75deb7874bf7cb8c0f2f9d4fede92ad3d47 Mon Sep 17 00:00:00 2001 From: "Niu, Xiaoguang" Date: Mon, 3 Jun 2024 16:18:09 +0800 Subject: [PATCH 09/29] Rename --- ...hAnalysis.h => ConstantSubgraphAnalyser.h} | 18 +++++++------- include/gc/Transforms/Passes.h | 8 +++---- include/gc/Transforms/Passes.td | 12 +++++----- lib/gc/Analysis/CMakeLists.txt | 2 +- ...lysis.cpp => ConstantSubgraphAnalyser.cpp} | 24 +++++++++---------- lib/gc/Transforms/CMakeLists.txt | 4 ++-- .../{CSA.cpp => ConstantSubgraphAnalysis.cpp} | 20 +++++++++------- .../{CST.cpp => ConstantTensorFolding.cpp} | 14 +++++++---- src/gc-opt/CMakeLists.txt | 3 +-- ...ir => test_constant_tensor_folding-1.mlir} | 2 +- ...mlir => test_constant_tensor_folding.mlir} | 2 +- 11 files changed, 58 insertions(+), 51 deletions(-) rename include/gc/Analysis/DataFlow/{ConstantSubgraphAnalysis.h => ConstantSubgraphAnalyser.h} (90%) rename lib/gc/Analysis/DataFlow/{ConstantSubgraphAnalysis.cpp => ConstantSubgraphAnalyser.cpp} (90%) rename lib/gc/Transforms/{CSA.cpp => ConstantSubgraphAnalysis.cpp} (67%) rename 
lib/gc/Transforms/{CST.cpp => ConstantTensorFolding.cpp} (98%) rename test/gc/Transforms/{test_constant_weights_folding-1.mlir => test_constant_tensor_folding-1.mlir} (97%) rename test/gc/Transforms/{test_constant_weights_folding.mlir => test_constant_tensor_folding.mlir} (98%) diff --git a/include/gc/Analysis/DataFlow/ConstantSubgraphAnalysis.h b/include/gc/Analysis/DataFlow/ConstantSubgraphAnalyser.h similarity index 90% rename from include/gc/Analysis/DataFlow/ConstantSubgraphAnalysis.h rename to include/gc/Analysis/DataFlow/ConstantSubgraphAnalyser.h index fcb2939d8..a5a199914 100644 --- a/include/gc/Analysis/DataFlow/ConstantSubgraphAnalysis.h +++ b/include/gc/Analysis/DataFlow/ConstantSubgraphAnalyser.h @@ -1,4 +1,4 @@ -//===- ConstantSubgraphAnalysis.h - Constant subgraph analysis ------===// +//===- ConstantSubgraphAnalyser.h - Constant subgraph analysis ------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -13,8 +13,8 @@ // //===----------------------------------------------------------------------===// -#ifndef MLIR_ANALYSIS_DATAFLOW_CONSTANTSUBGRAPHANALYSIS_H -#define MLIR_ANALYSIS_DATAFLOW_CONSTANTSUBGRAPHANALYSIS_H +#ifndef MLIR_ANALYSIS_DATAFLOW_CONSTANTSUBGRAPHANALYSER_H +#define MLIR_ANALYSIS_DATAFLOW_CONSTANTSUBGRAPHANALYSER_H #include "mlir/Analysis/DataFlow/SparseAnalysis.h" #include @@ -87,10 +87,10 @@ class InConstantSubgraph { }; //===----------------------------------------------------------------------===// -// ConstantSubgraphAnalysis +// ConstantSubgraphAnalyser //===----------------------------------------------------------------------===// -class ConstantSubgraphAnalysis +class ConstantSubgraphAnalyser : public SparseForwardDataFlowAnalysis> { public: using SparseForwardDataFlowAnalysis::SparseForwardDataFlowAnalysis; @@ -103,13 +103,13 @@ class ConstantSubgraphAnalysis }; //===----------------------------------------------------------------------===// -// RunConstantSubgraphAnalysis +// RunConstantSubgraphAnalyser //===----------------------------------------------------------------------===// /// Runs constant subgraph analysis on the IR defined by `op`. 
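A minimal usage sketch of the renamed helper, assuming an Operation *topFunc for the entry function and a Value v of interest; it mirrors the driver pass further down in this patch:

RunConstantSubgraphAnalyser analyser;
analyser.run(topFunc);
bool isConstant = analyser.getInConstantSubgraph(v);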
-struct RunConstantSubgraphAnalysis { +struct RunConstantSubgraphAnalyser { public: - RunConstantSubgraphAnalysis(); + RunConstantSubgraphAnalyser(); void run(Operation *op); @@ -124,4 +124,4 @@ struct RunConstantSubgraphAnalysis { } // end namespace dataflow } // end namespace mlir -#endif // MLIR_ANALYSIS_DATAFLOW_CONSTANTSUBGRAPHANALYSIS_H +#endif // MLIR_ANALYSIS_DATAFLOW_CONSTANTSUBGRAPHANALYSER_H diff --git a/include/gc/Transforms/Passes.h b/include/gc/Transforms/Passes.h index 34d2fd487..84096279f 100644 --- a/include/gc/Transforms/Passes.h +++ b/include/gc/Transforms/Passes.h @@ -15,12 +15,12 @@ namespace mlir { namespace gc { #define GEN_PASS_DECL -#define GEN_PASS_DECL_CSA -#define GEN_PASS_DECL_CST +#define GEN_PASS_DECL_CONSTANTSUBGRAPHANALYSIS +#define GEN_PASS_DECL_CONSTANTTENSORFOLDING #include "gc/Transforms/Passes.h.inc" -std::unique_ptr createCSAPass(); -std::unique_ptr createCSTPass(); +std::unique_ptr createConstantSubgraphAnalysisPass(); +std::unique_ptr createConstantTensorFoldingPass(); #define GEN_PASS_REGISTRATION #include "gc/Transforms/Passes.h.inc" diff --git a/include/gc/Transforms/Passes.td b/include/gc/Transforms/Passes.td index 5fd0bd7a7..bba7ea0d7 100644 --- a/include/gc/Transforms/Passes.td +++ b/include/gc/Transforms/Passes.td @@ -31,20 +31,20 @@ def ConvertOneDNNGraphToLinalg : Pass<"convert-onednn-graph-to-linalg"> { ]; } -def CSA : Pass<"csa"> { +def ConstantSubgraphAnalysis : Pass<"constant-subgraph-analysis"> { let summary = "Constant Subgraph Analysis"; let description = [{ This pass implements a constant subgraph analysis. }]; - let constructor = "mlir::gc::createCSAPass()"; + let constructor = "mlir::gc::createConstantSubgraphAnalysisPass()"; } -def CST : Pass<"cst"> { - let summary = "Constant Subgraph Transform"; +def ConstantTensorFolding : Pass<"constant-tensor-folding"> { + let summary = "Constant Tensor Folding Transform"; let description = [{ - This pass implements a constant subgraph transform. + This pass implements a constant tensor folding transform. }]; - let constructor = "mlir::gc::createCSTPass()"; + let constructor = "mlir::gc::createConstantTensorFoldingPass()"; let dependentDialects = [ "tensor::TensorDialect", "linalg::LinalgDialect", diff --git a/lib/gc/Analysis/CMakeLists.txt b/lib/gc/Analysis/CMakeLists.txt index 42c3d5541..9b5994f3d 100644 --- a/lib/gc/Analysis/CMakeLists.txt +++ b/lib/gc/Analysis/CMakeLists.txt @@ -1,5 +1,5 @@ add_mlir_library(GCAnalysis - DataFlow/ConstantSubgraphAnalysis.cpp + DataFlow/ConstantSubgraphAnalyser.cpp ADDITIONAL_HEADER_DIRS ${PROJECT_SOURCE_DIR}/include/ diff --git a/lib/gc/Analysis/DataFlow/ConstantSubgraphAnalysis.cpp b/lib/gc/Analysis/DataFlow/ConstantSubgraphAnalyser.cpp similarity index 90% rename from lib/gc/Analysis/DataFlow/ConstantSubgraphAnalysis.cpp rename to lib/gc/Analysis/DataFlow/ConstantSubgraphAnalyser.cpp index 2de9e5b4a..741af4697 100644 --- a/lib/gc/Analysis/DataFlow/ConstantSubgraphAnalysis.cpp +++ b/lib/gc/Analysis/DataFlow/ConstantSubgraphAnalyser.cpp @@ -1,11 +1,11 @@ -//===- ConstantSubgraphAnalysis.cpp - Constant subgraph analysis ----===// +//===- ConstantSubgraphAnalyser.cpp - Constant subgraph analysis ----===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -#include "gc/Analysis/DataFlow/ConstantSubgraphAnalysis.h" +#include "gc/Analysis/DataFlow/ConstantSubgraphAnalyser.h" #include "mlir/Analysis/DataFlow/DeadCodeAnalysis.h" #include "mlir/Analysis/DataFlow/SparseAnalysis.h" #include "mlir/Dialect/Arith/IR/Arith.h" @@ -46,13 +46,13 @@ void InConstantSubgraph::print(raw_ostream &os) const { } //===----------------------------------------------------------------------===// -// ConstantSubgraphAnalysis +// ConstantSubgraphAnalyser //===----------------------------------------------------------------------===// -void ConstantSubgraphAnalysis::visitOperation( +void ConstantSubgraphAnalyser::visitOperation( Operation *op, ArrayRef *> operands, ArrayRef *> results) { - LLVM_DEBUG(llvm::dbgs() << "ConstantSubgraphAnalysis: Visiting operation:\n" + LLVM_DEBUG(llvm::dbgs() << "ConstantSubgraphAnalyser: Visiting operation:\n" << *op << "\n"); bool in = true; @@ -92,7 +92,7 @@ void ConstantSubgraphAnalysis::visitOperation( } } -void ConstantSubgraphAnalysis::setToEntryState( +void ConstantSubgraphAnalyser::setToEntryState( Lattice *lattice) { if (auto blockArg = cast(lattice->getPoint())) { auto parent_op = blockArg.getParentBlock()->getParentOp(); @@ -121,12 +121,12 @@ void ConstantSubgraphAnalysis::setToEntryState( } //===----------------------------------------------------------------------===// -// RunConstantSubgraphAnalysis +// RunConstantSubgraphAnalyser //===----------------------------------------------------------------------===// /// Get the operations whose inputs and outputs are all constant values. /// These operations will be put into a separate subgraph. -void RunConstantSubgraphAnalysis::getConstantSubgraph(DataFlowSolver &solver, +void RunConstantSubgraphAnalyser::getConstantSubgraph(DataFlowSolver &solver, Operation *topFunc) { OpBuilder builder(topFunc->getContext()); SmallVector constantOperations; @@ -161,19 +161,19 @@ void RunConstantSubgraphAnalysis::getConstantSubgraph(DataFlowSolver &solver, } } -RunConstantSubgraphAnalysis::RunConstantSubgraphAnalysis() { +RunConstantSubgraphAnalyser::RunConstantSubgraphAnalyser() { solver.load(); - solver.load(); + solver.load(); } -void RunConstantSubgraphAnalysis::run(Operation *topFunc) { +void RunConstantSubgraphAnalyser::run(Operation *topFunc) { if (failed(solver.initializeAndRun(topFunc))) { return; } getConstantSubgraph(solver, topFunc); } -bool RunConstantSubgraphAnalysis::getInConstantSubgraph(Value val) { +bool RunConstantSubgraphAnalyser::getInConstantSubgraph(Value val) { auto *lattice = solver.lookupState>(val); const InConstantSubgraph &latticeValue = lattice->getValue(); return latticeValue.getInConstantSubgraph(); diff --git a/lib/gc/Transforms/CMakeLists.txt b/lib/gc/Transforms/CMakeLists.txt index 86a58b407..205538e63 100644 --- a/lib/gc/Transforms/CMakeLists.txt +++ b/lib/gc/Transforms/CMakeLists.txt @@ -7,8 +7,8 @@ gc_set_mlir_link_components(MLIR_LINK_COMPONENTS add_mlir_library(GCPasses OneDNNGraphToLinalg.cpp TileNamed.cpp - CSA.cpp - CST.cpp + ConstantSubgraphAnalysis.cpp + ConstantTensorFolding.cpp ADDITIONAL_HEADER_DIRS ${PROJECT_SOURCE_DIR}/include diff --git a/lib/gc/Transforms/CSA.cpp b/lib/gc/Transforms/ConstantSubgraphAnalysis.cpp similarity index 67% rename from lib/gc/Transforms/CSA.cpp rename to lib/gc/Transforms/ConstantSubgraphAnalysis.cpp index 5175be2f5..b78ecd956 100644 --- a/lib/gc/Transforms/CSA.cpp +++ 
b/lib/gc/Transforms/ConstantSubgraphAnalysis.cpp @@ -1,4 +1,5 @@ -//===- CSA.cpp - Constant Subgraph Analysis -----------------===// +//===- ConstantSubgraphAnalysis.cpp - Constant Subgraph Analysis +//-----------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -10,7 +11,7 @@ // in MLIR. // //===----------------------------------------------------------------------===// -#include "gc/Analysis/DataFlow/ConstantSubgraphAnalysis.h" +#include "gc/Analysis/DataFlow/ConstantSubgraphAnalyser.h" #include "mlir/IR/Builders.h" #include "mlir/IR/Dialect.h" #include "mlir/Pass/Pass.h" @@ -18,7 +19,7 @@ namespace mlir { namespace gc { -#define GEN_PASS_DEF_CSA +#define GEN_PASS_DEF_CONSTANTSUBGRAPHANALYSIS #include "gc/Transforms/Passes.h.inc" } // namespace gc @@ -27,11 +28,12 @@ using namespace mlir::dataflow; namespace gc { -struct CSA : public impl::CSABase { +struct ConstantSubgraphAnalysis + : public impl::ConstantSubgraphAnalysisBase { void runOnOperation() override; }; -void CSA::runOnOperation() { +void ConstantSubgraphAnalysis::runOnOperation() { Operation *op = getOperation(); auto &func = op->getRegions().front().getBlocks().front().getOperations().front(); @@ -41,11 +43,13 @@ void CSA::runOnOperation() { // func.setAttr("onednn_graph.const_args", // builder.getI32ArrayAttr({1,2,3,4})); - RunConstantSubgraphAnalysis csa; - (void)csa.run(&func); + RunConstantSubgraphAnalyser runAnalyser; + (void)runAnalyser.run(&func); } -std::unique_ptr createCSAPass() { return std::make_unique(); } +std::unique_ptr createConstantSubgraphAnalysisPass() { + return std::make_unique(); +} } // namespace gc } // namespace mlir \ No newline at end of file diff --git a/lib/gc/Transforms/CST.cpp b/lib/gc/Transforms/ConstantTensorFolding.cpp similarity index 98% rename from lib/gc/Transforms/CST.cpp rename to lib/gc/Transforms/ConstantTensorFolding.cpp index c60cea97e..49e69f7d8 100644 --- a/lib/gc/Transforms/CST.cpp +++ b/lib/gc/Transforms/ConstantTensorFolding.cpp @@ -1,4 +1,5 @@ -//===- CST.cpp - Constant Subgraph Transform -----------------===// +//===- ConstantTensorFolding.cpp - Constant Subgraph Transform +//-----------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -34,7 +35,7 @@ namespace mlir { namespace gc { -#define GEN_PASS_DEF_CST +#define GEN_PASS_DEF_CONSTANTTENSORFOLDING #include "gc/Transforms/Passes.h.inc" } // namespace gc @@ -42,7 +43,8 @@ using namespace mlir; namespace gc { -struct CST : public impl::CSTBase { +struct ConstantTensorFolding + : public impl::ConstantTensorFoldingBase { void runOnOperation() override; }; @@ -385,7 +387,7 @@ static void addGlobalI32Array(ModuleOp &module, Location loc, // Operate on tensors. Create fold() and compute() on module. The // folded weights and first-run flag is maintained by upper-level runtime. -void CST::runOnOperation() { +void ConstantTensorFolding::runOnOperation() { Operation *topOp = getOperation(); MLIRContext *context = topOp->getContext(); // A ModuleOp contains a single region, which contains a single block. 
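With the renames in place, the two passes chain exactly as the updated RUN lines below exercise them. A short sketch of building the same pipeline programmatically; buildConstantFoldingPipeline is a hypothetical wrapper around the factory functions declared in gc/Transforms/Passes.h:

#include "gc/Transforms/Passes.h"
#include "mlir/Pass/PassManager.h"

// Equivalent of -pass-pipeline="builtin.module(constant-subgraph-analysis,constant-tensor-folding)".
void buildConstantFoldingPipeline(mlir::PassManager &pm) {
  pm.addPass(mlir::gc::createConstantSubgraphAnalysisPass());
  pm.addPass(mlir::gc::createConstantTensorFoldingPass());
}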
@@ -679,7 +681,9 @@ void CST::runOnOperation() { } } -std::unique_ptr createCSTPass() { return std::make_unique(); } +std::unique_ptr createConstantTensorFoldingPass() { + return std::make_unique(); +} } // namespace gc } // namespace mlir diff --git a/src/gc-opt/CMakeLists.txt b/src/gc-opt/CMakeLists.txt index 6b8def4be..ac7ed4ead 100644 --- a/src/gc-opt/CMakeLists.txt +++ b/src/gc-opt/CMakeLists.txt @@ -17,8 +17,7 @@ set(gc_opt_libs ${conversion_libs} ${MLIR_LINK_COMPONENTS} GCPasses - GCAnalysis - GCCpuRuntime) + GCAnalysis) if(GC_MLIR_CXX_FLAGS) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${GC_MLIR_CXX_FLAGS}") diff --git a/test/gc/Transforms/test_constant_weights_folding-1.mlir b/test/gc/Transforms/test_constant_tensor_folding-1.mlir similarity index 97% rename from test/gc/Transforms/test_constant_weights_folding-1.mlir rename to test/gc/Transforms/test_constant_tensor_folding-1.mlir index 940255f60..d54b56bad 100644 --- a/test/gc/Transforms/test_constant_weights_folding-1.mlir +++ b/test/gc/Transforms/test_constant_tensor_folding-1.mlir @@ -1,4 +1,4 @@ -// RUN: gc-opt --split-input-file -pass-pipeline="builtin.module(csa,cst)" %s | FileCheck %s +// RUN: gc-opt --split-input-file -pass-pipeline="builtin.module(constant-subgraph-analysis,constant-tensor-folding)" %s | FileCheck %s // CHECK-LABEL: func.func @entry module { diff --git a/test/gc/Transforms/test_constant_weights_folding.mlir b/test/gc/Transforms/test_constant_tensor_folding.mlir similarity index 98% rename from test/gc/Transforms/test_constant_weights_folding.mlir rename to test/gc/Transforms/test_constant_tensor_folding.mlir index 485c11e4f..1256c52cf 100644 --- a/test/gc/Transforms/test_constant_weights_folding.mlir +++ b/test/gc/Transforms/test_constant_tensor_folding.mlir @@ -1,4 +1,4 @@ -// RUN: gc-opt --split-input-file -pass-pipeline="builtin.module(csa,cst)" %s | FileCheck %s +// RUN: gc-opt --split-input-file -pass-pipeline="builtin.module(constant-subgraph-analysis,constant-tensor-folding)" %s | FileCheck %s // CHECK-LABEL: func.func @entry #map = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d2, d4, d6)> From d7663a51a9f435e2203d9ec15bc9fa6b316dde54 Mon Sep 17 00:00:00 2001 From: "Niu, Xiaoguang" Date: Tue, 4 Jun 2024 09:48:58 +0800 Subject: [PATCH 10/29] Split into short functions --- lib/gc/Transforms/ConstantTensorFolding.cpp | 164 ++++++++++++-------- 1 file changed, 99 insertions(+), 65 deletions(-) diff --git a/lib/gc/Transforms/ConstantTensorFolding.cpp b/lib/gc/Transforms/ConstantTensorFolding.cpp index 49e69f7d8..59a2c75f5 100644 --- a/lib/gc/Transforms/ConstantTensorFolding.cpp +++ b/lib/gc/Transforms/ConstantTensorFolding.cpp @@ -312,15 +312,15 @@ static constexpr int DATA_SIZE_EXPANDING_THRESHOLD = 8; size_t divideAndCeil(size_t x, size_t y) { return (x + y - 1) / y; } // Manager -struct constGraphTensorCacheManager { +struct ConstGraphTensorCacheManager { // dnnl_graph_compiler_context *ctx; uint64_t cachedTensorGlobalId = 0; // singleton - static std::shared_ptr get() { - static std::shared_ptr c = - std::make_shared(); + static std::shared_ptr get() { + static std::shared_ptr c = + std::make_shared(); return c; } @@ -385,18 +385,7 @@ static void addGlobalI32Array(ModuleOp &module, Location loc, /*alignment=*/0); } -// Operate on tensors. Create fold() and compute() on module. The -// folded weights and first-run flag is maintained by upper-level runtime. 
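The packing scheme behind ConstGraphTensorCacheManager::alloc stands alone: every folded tensor is rounded up to a 64-byte multiple and placed at an increasing offset inside one aligned base allocation. A self-contained sketch; PackedBuffers and packBuffers are hypothetical names, and the aligned_alloc backing store is an assumption matching the commented-out createConstCacheProxy:

#include <cstddef>
#include <cstdlib>
#include <memory>
#include <vector>

static size_t divideAndCeil(size_t x, size_t y) { return (x + y - 1) / y; }

struct PackedBuffers {
  std::shared_ptr<void> base;  // one 64-byte-aligned backing allocation
  std::vector<size_t> offsets; // per-tensor byte offset into base
};

PackedBuffers packBuffers(const std::vector<size_t> &buffersSize) {
  size_t totalSize = 0;
  for (size_t s : buffersSize)
    totalSize += divideAndCeil(s, 64) * 64; // round each chunk up to 64 bytes
  PackedBuffers out;
  // totalSize is a multiple of 64, as std::aligned_alloc requires.
  out.base = std::shared_ptr<void>(std::aligned_alloc(64, totalSize),
                                   [](void *p) { std::free(p); });
  size_t offset = 0;
  for (size_t s : buffersSize) {
    out.offsets.push_back(offset);
    offset += divideAndCeil(s, 64) * 64;
  }
  return out;
}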
-void ConstantTensorFolding::runOnOperation() { - Operation *topOp = getOperation(); - MLIRContext *context = topOp->getContext(); - // A ModuleOp contains a single region, which contains a single block. - auto moduleOp = dyn_cast(topOp); - SymbolTable symbolTable(moduleOp); - auto &topFunc = - topOp->getRegions().front().getBlocks().front().getOperations().front(); - OpBuilder builder(context); - +std::unordered_set getConstArgsIndexes(Operation &topFunc) { auto topFuncAttr = topFunc.getAttrDictionary(); std::optional constArgs = topFuncAttr.getNamed("onednn_graph.const_args"); @@ -406,32 +395,16 @@ void ConstantTensorFolding::runOnOperation() { for (auto id : constArgsArray) { constArgsIndexes.insert(llvm::cast(id).getInt()); } - } else { - return; - } - if (constArgsIndexes.empty()) { - return; - } - - Region ®ion = topFunc.getRegions().front(); - Block &block = region.getBlocks().front(); - - postponeBroadcast(block); - - SmallVector constOps; - for (Operation &op : llvm::make_early_inc_range(block)) { - if (isInConstantSubgraph(&op)) { - constOps.push_back(&op); - } } + return constArgsIndexes; +} - std::string funcName("fold"); - SmallVector inputTypes; // types of constant weights - // values of constant weights in original block - SmallVector inputValues; - SmallVector outputTypes; // types of folded constant weights - // values of folded constant weights in original block - SmallVector outputValues; +void getInputsAndOutputs(Block &block, + std::unordered_set &constArgsIndexes, + SmallVector &inputTypes, + SmallVector &inputValues, + SmallVector &outputTypes, + SmallVector &outputValues) { Value v; // Support complicated topology. for (size_t id = 0; id < block.getNumArguments(); ++id) { @@ -512,11 +485,19 @@ void ConstantTensorFolding::runOnOperation() { } } } +} +func::FuncOp buildFoldFunc(MLIRContext *context, OpBuilder &builder, + Operation *topOp, SmallVector constOps, + SmallVector &inputTypes, + SmallVector &inputValues, + SmallVector &outputTypes, + SmallVector &outputValues) { + std::string funcName("fold"); FunctionType foldFuncType = FunctionType::get(context, inputTypes, outputTypes); func::FuncOp foldFunc = - builder.create(topFunc.getLoc(), funcName, foldFuncType); + builder.create(topOp->getLoc(), funcName, foldFuncType); Block *foldBlock = foldFunc.addEntryBlock(); // values of folded constant weights in foldBlock SmallVector outputValuesInFold; @@ -535,17 +516,6 @@ void ConstantTensorFolding::runOnOperation() { }); } - auto returnOp = - builder.create(topOp->getLoc(), outputValuesInFold); - foldBlock->getOperations().push_back(returnOp); - for (size_t i = 0; i < inputValues.size(); ++i) { - inputValues[i].replaceUsesWithIf(foldBlock->getArgument(i), - [&](OpOperand &val) { - Operation *op = val.getOwner(); - return op->getBlock() == foldBlock; - }); - } - // Allocate buffer for outputValuesInFold std::vector buffersSize; for (Value &tensor : outputValuesInFold) { @@ -553,21 +523,43 @@ void ConstantTensorFolding::runOnOperation() { buffersSize.push_back( getTensorSize(dyn_cast(tensor.getType()))); } - auto manager = constGraphTensorCacheManager::get(); + auto manager = ConstGraphTensorCacheManager::get(); SmallVector globalIndexes; for (auto id : manager->alloc(buffersSize)) { globalIndexes.push_back(id); } globalIndexes.insert(globalIndexes.begin(), globalIndexes.size()); + auto moduleOp = dyn_cast(topOp); addGlobalI64Array(moduleOp, moduleOp.getLoc(), builder, "__fold_buffer_ids", globalIndexes); + auto returnOp = + builder.create(topOp->getLoc(), 
outputValuesInFold); + foldBlock->getOperations().push_back(returnOp); + for (size_t i = 0; i < inputValues.size(); ++i) { + inputValues[i].replaceUsesWithIf(foldBlock->getArgument(i), + [&](OpOperand &val) { + Operation *op = val.getOwner(); + return op->getBlock() == foldBlock; + }); + } + foldFunc.setVisibility(SymbolTable::Visibility::Public); foldFunc->setAttr(LLVM::LLVMDialect::getEmitCWrapperAttrName(), UnitAttr::get(context)); + moduleOp.push_back(foldFunc); + SymbolTable symbolTable(moduleOp); symbolTable.insert(foldFunc); + return foldFunc; +} + +void modifyComputeFunc(MLIRContext *context, OpBuilder &builder, + Operation *topOp, Operation &func, Block &block, + std::unordered_set &constArgsIndexes, + SmallVector &outputTypes, + SmallVector &outputValues) { // the indexes of args to the folding func. SmallVector foldArgs; // the indexes of folded args. @@ -631,6 +623,13 @@ void ConstantTensorFolding::runOnOperation() { } block.eraseArguments(argsToErase); + // modify the compute func signature + func::FuncOp computeFunc = cast(func); + FunctionType computeFuncType = computeFunc.getFunctionType(); + computeFunc.setType(FunctionType::get(context, block.getArgumentTypes(), + computeFuncType.getResults())); + + auto moduleOp = dyn_cast(topOp); for (auto id : foldIds) { foldArgs.insert(foldArgs.end(), id); } @@ -647,13 +646,9 @@ void ConstantTensorFolding::runOnOperation() { addGlobalI32(moduleOp, moduleOp.getLoc(), builder, "__num_orig_num_args", oriNumArgs); +} - // modify the compute func signature - func::FuncOp computeFunc = cast(topFunc); - FunctionType computeFuncType = computeFunc.getFunctionType(); - computeFunc.setType(FunctionType::get(context, block.getArgumentTypes(), - computeFuncType.getResults())); - +void canonicalizeAndClean(MLIRContext *context, Operation *topOp) { // Delete dead operations by dialects' canonicalizer RewritePatternSet owningPatterns(context); for (auto *dialect : context->getLoadedDialects()) @@ -669,16 +664,55 @@ void ConstantTensorFolding::runOnOperation() { (void)converged; // clean up the constant-related attrs on ops - for (auto &op : block.getOperations()) { - if (op.getAttr("onednn_graph.in_const_subgraph")) { - op.removeAttr("onednn_graph.in_const_subgraph"); + topOp->walk([&](Operation *op) { + if (op->getAttr("onednn_graph.in_const_subgraph")) { + op->removeAttr("onednn_graph.in_const_subgraph"); } + }); +} + +// Operate on tensors. Create fold() and compute() on module. The +// folded weights and first-run flag is maintained by upper-level runtime. 
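Taken together, buildFoldFunc and modifyComputeFunc emit a small ABI for that runtime: __num_orig_num_args plus three arrays whose first element is their own length. A hedged sketch of decoding them once the symbols are resolved; FoldingPlan and decodePlan are hypothetical names, and the symbol lookup itself is assumed to be done by the execution engine:

#include <cstdint>
#include <vector>

struct FoldingPlan {
  int32_t numOrigArgs;              // __num_orig_num_args
  std::vector<int32_t> foldArgs;    // arg indexes passed to fold()
  std::vector<int32_t> computeArgs; // arg indexes passed to compute()
  std::vector<int64_t> bufferIds;   // __fold_buffer_ids: cached-tensor ids
};

// Each pointer addresses one of the globals; element 0 of every array is
// its length, exactly as the pass prepends it.
FoldingPlan decodePlan(const int32_t *numOrig, const int32_t *fold,
                       const int32_t *compute, const int64_t *buffers) {
  FoldingPlan plan;
  plan.numOrigArgs = *numOrig;
  plan.foldArgs.assign(fold + 1, fold + 1 + fold[0]);
  plan.computeArgs.assign(compute + 1, compute + 1 + compute[0]);
  plan.bufferIds.assign(buffers + 1, buffers + 1 + buffers[0]);
  return plan;
}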
+void ConstantTensorFolding::runOnOperation() { + Operation *topOp = getOperation(); + MLIRContext *context = topOp->getContext(); + auto &topFunc = + topOp->getRegions().front().getBlocks().front().getOperations().front(); + OpBuilder builder(context); + Region ®ion = topFunc.getRegions().front(); + Block &block = region.getBlocks().front(); + + std::unordered_set constArgsIndexes = getConstArgsIndexes(topFunc); + if (constArgsIndexes.empty()) { + return; } - for (auto &op : foldBlock->getOperations()) { - if (op.getAttr("onednn_graph.in_const_subgraph")) { - op.removeAttr("onednn_graph.in_const_subgraph"); + + postponeBroadcast(block); + + SmallVector constOps; + for (Operation &op : llvm::make_early_inc_range(block)) { + if (isInConstantSubgraph(&op)) { + constOps.push_back(&op); } } + + SmallVector inputTypes; // types of constant weights + // values of constant weights in original block + SmallVector inputValues; + SmallVector outputTypes; // types of folded constant weights + // values of folded constant weights in original block + SmallVector outputValues; + getInputsAndOutputs(block, constArgsIndexes, inputTypes, inputValues, + outputTypes, outputValues); + + func::FuncOp foldFunc = + buildFoldFunc(context, builder, topOp, constOps, inputTypes, inputValues, + outputTypes, outputValues); + + modifyComputeFunc(context, builder, topOp, topFunc, block, constArgsIndexes, + outputTypes, outputValues); + + canonicalizeAndClean(context, topOp); } std::unique_ptr createConstantTensorFoldingPass() { From 3f34e971f8f72e1dec4d2d48d428965e82ddbdd2 Mon Sep 17 00:00:00 2001 From: "Niu, Xiaoguang" Date: Wed, 5 Jun 2024 11:13:31 +0800 Subject: [PATCH 11/29] Add a test --- .../test_constant_tensor_folding-1.mlir | 32 ++++++++++++------- 1 file changed, 21 insertions(+), 11 deletions(-) diff --git a/test/gc/Transforms/test_constant_tensor_folding-1.mlir b/test/gc/Transforms/test_constant_tensor_folding-1.mlir index d54b56bad..8324c9aae 100644 --- a/test/gc/Transforms/test_constant_tensor_folding-1.mlir +++ b/test/gc/Transforms/test_constant_tensor_folding-1.mlir @@ -2,7 +2,7 @@ // CHECK-LABEL: func.func @entry module { - func.func @entry(%a: tensor<128xf32>, %b: tensor<128xf32>, %c: tensor<128xf32>) -> (tensor<128xf32>) attributes { llvm.emit_c_interface, onednn_graph.const_args = [0 : i32, 1 : i32] } { + func.func @entry(%a: tensor<128xf32>, %b: tensor<128xf32>, %c: tensor<128xf32>) -> (tensor<128xf32>, tensor<128xf32>) attributes { llvm.emit_c_interface, onednn_graph.const_args = [0 : i32, 1 : i32] } { %c0 = arith.constant 0 : index cpuruntime.printf "HI%zu\n" %c0 : index %ax2 = tensor.empty() : tensor<128xf32> @@ -11,39 +11,49 @@ module { %3 = linalg.add ins(%b, %b : tensor<128xf32>,tensor<128xf32>) outs(%bx2 : tensor<128xf32>) -> tensor<128xf32> %ax2pbx2 = tensor.empty() : tensor<128xf32> %4 = linalg.add ins(%2, %3 : tensor<128xf32>,tensor<128xf32>) outs(%ax2pbx2 : tensor<128xf32>) -> tensor<128xf32> + %ax2mbx2 = tensor.empty() : tensor<128xf32> + %5 = linalg.mul ins(%2, %3 : tensor<128xf32>,tensor<128xf32>) outs(%ax2mbx2 : tensor<128xf32>) -> tensor<128xf32> %ax2pbx2pc = tensor.empty() : tensor<128xf32> - %d = linalg.add ins(%4, %c : tensor<128xf32>,tensor<128xf32>) outs(%ax2pbx2pc : tensor<128xf32>) -> tensor<128xf32> - return %d : tensor<128xf32> + %6 = linalg.add ins(%4, %c : tensor<128xf32>,tensor<128xf32>) outs(%ax2pbx2pc : tensor<128xf32>) -> tensor<128xf32> + %ax2mbx2mc = tensor.empty() : tensor<128xf32> + %7 = linalg.mul ins(%5, %c : tensor<128xf32>,tensor<128xf32>) outs(%ax2mbx2mc : 
tensor<128xf32>) -> tensor<128xf32> + return %6, %7 : tensor<128xf32>, tensor<128xf32> } } // CHECK: cpuruntime.printf // CHECK: linalg.add +// CHECK: linalg.mul // CHECK: func.func @fold // CHECK: linalg.add // CHECK: linalg.add // CHECK: linalg.add +// CHECK: linalg.mul // COM: expected output: // COM: module { // COM: llvm.mlir.global external constant @__num_orig_num_args(3 : i32) {addr_space = 0 : i32} : i32 -// COM: llvm.mlir.global external constant @__compute_args(dense<[2, 2, 3]> : tensor<3xi32>) {addr_space = 0 : i32} : !llvm.array<3 x i32> -// COM: llvm.mlir.global external constant @__fold_args(dense<[3, 0, 1, 3]> : tensor<4xi32>) {addr_space = 0 : i32} : !llvm.array<4 x i32> -// COM: llvm.mlir.global external constant @__fold_buffer_ids(dense<[1, 0]> : tensor<2xi64>) {addr_space = 0 : i32} : !llvm.array<2 x i64> -// COM: func.func @entry(%arg0: tensor<128xf32>, %arg1: tensor<128xf32>) -> tensor<128xf32> attributes {llvm.emit_c_interface, onednn_graph.const_args = [0 : i32, 1 : i32]} { +// COM: llvm.mlir.global external constant @__compute_args(dense<[3, 2, 3, 4]> : tensor<4xi32>) {addr_space = 0 : i32} : !llvm.array<4 x i32> +// COM: llvm.mlir.global external constant @__fold_args(dense<[4, 0, 1, 3, 4]> : tensor<5xi32>) {addr_space = 0 : i32} : !llvm.array<5 x i32> +// COM: llvm.mlir.global external constant @__fold_buffer_ids(dense<[2, 0, 1]> : tensor<3xi64>) {addr_space = 0 : i32} : !llvm.array<3 x i64> +// COM: func.func @entry(%arg0: tensor<128xf32>, %arg1: tensor<128xf32>, %arg2: tensor<128xf32>) -> (tensor<128xf32>, tensor<128xf32>) attributes {llvm.emit_c_interface, onednn_graph.const_args = [0 : i32, 1 : i32]} { // COM: %c0 = arith.constant 0 : index // COM: cpuruntime.printf "HI%zu\0A" %c0 : index // COM: %0 = tensor.empty() : tensor<128xf32> -// COM: %1 = linalg.add ins(%arg1, %arg0 : tensor<128xf32>, tensor<128xf32>) outs(%0 : tensor<128xf32>) -> tensor<128xf32> -// COM: return %1 : tensor<128xf32> +// COM: %1 = linalg.add ins(%arg2, %arg0 : tensor<128xf32>, tensor<128xf32>) outs(%0 : tensor<128xf32>) -> tensor<128xf32> +// COM: %2 = tensor.empty() : tensor<128xf32> +// COM: %3 = linalg.mul ins(%arg1, %arg0 : tensor<128xf32>, tensor<128xf32>) outs(%2 : tensor<128xf32>) -> tensor<128xf32> +// COM: return %1, %3 : tensor<128xf32>, tensor<128xf32> // COM: } -// COM: func.func @fold(%arg0: tensor<128xf32>, %arg1: tensor<128xf32>) -> tensor<128xf32> attributes {llvm.emit_c_interface} { +// COM: func.func @fold(%arg0: tensor<128xf32>, %arg1: tensor<128xf32>) -> (tensor<128xf32>, tensor<128xf32>) attributes {llvm.emit_c_interface} { // COM: %0 = tensor.empty() : tensor<128xf32> // COM: %1 = linalg.add ins(%arg0, %arg0 : tensor<128xf32>, tensor<128xf32>) outs(%0 : tensor<128xf32>) -> tensor<128xf32> // COM: %2 = tensor.empty() : tensor<128xf32> // COM: %3 = linalg.add ins(%arg1, %arg1 : tensor<128xf32>, tensor<128xf32>) outs(%2 : tensor<128xf32>) -> tensor<128xf32> // COM: %4 = tensor.empty() : tensor<128xf32> // COM: %5 = linalg.add ins(%1, %3 : tensor<128xf32>, tensor<128xf32>) outs(%4 : tensor<128xf32>) -> tensor<128xf32> -// COM: return %5 : tensor<128xf32> +// COM: %6 = tensor.empty() : tensor<128xf32> +// COM: %7 = linalg.mul ins(%1, %3 : tensor<128xf32>, tensor<128xf32>) outs(%6 : tensor<128xf32>) -> tensor<128xf32> +// COM: return %7, %5 : tensor<128xf32>, tensor<128xf32> // COM: } // COM: } \ No newline at end of file From 22c3d76a69f5745cb06a12e0bb9bcb50dea25e4e Mon Sep 17 00:00:00 2001 From: "Niu, Xiaoguang" Date: Tue, 11 Jun 2024 10:22:58 +0800 Subject: [PATCH 
12/29] Adapt to constant PropertyType --- .../DataFlow/ConstantSubgraphAnalyser.cpp | 47 ++++++++++++------- lib/gc/Transforms/ConstantTensorFolding.cpp | 27 +++++++---- .../test_constant_tensor_folding-1.mlir | 4 +- .../test_constant_tensor_folding.mlir | 6 +-- 4 files changed, 55 insertions(+), 29 deletions(-) diff --git a/lib/gc/Analysis/DataFlow/ConstantSubgraphAnalyser.cpp b/lib/gc/Analysis/DataFlow/ConstantSubgraphAnalyser.cpp index 741af4697..e01190bc3 100644 --- a/lib/gc/Analysis/DataFlow/ConstantSubgraphAnalyser.cpp +++ b/lib/gc/Analysis/DataFlow/ConstantSubgraphAnalyser.cpp @@ -6,6 +6,11 @@ // //===----------------------------------------------------------------------===// #include "gc/Analysis/DataFlow/ConstantSubgraphAnalyser.h" + +#include "gc/Dialect/OneDNNGraph/OneDNNGraphDialect.h" +#include "gc/Dialect/OneDNNGraph/OneDNNGraphTypes.h" +#include "gc/Dialect/OneDNNGraph/Utils/Utils.h" + #include "mlir/Analysis/DataFlow/DeadCodeAnalysis.h" #include "mlir/Analysis/DataFlow/SparseAnalysis.h" #include "mlir/Dialect/Arith/IR/Arith.h" @@ -96,22 +101,32 @@ void ConstantSubgraphAnalyser::setToEntryState( Lattice *lattice) { if (auto blockArg = cast(lattice->getPoint())) { auto parent_op = blockArg.getParentBlock()->getParentOp(); - auto parent_op_attr = parent_op->getAttrDictionary(); - std::optional const_args = - parent_op_attr.getNamed("onednn_graph.const_args"); - if (const_args.has_value()) { - ArrayAttr const_args_indexes = - llvm::dyn_cast(const_args->getValue()); - for (auto id : const_args_indexes) { - auto idint = llvm::cast(id).getInt(); - if (blockArg.getArgNumber() == idint) { - LLVM_DEBUG(llvm::dbgs() << "Block argument: " << blockArg - << " is marked as constant\n"); - propagateIfChanged(lattice, - lattice->join(InConstantSubgraph(true, true))); - return; - } - } + // auto parent_op_attr = parent_op->getAttrDictionary(); + // std::optional const_args = + // parent_op_attr.getNamed("onednn_graph.const_args"); + // if (const_args.has_value()) { + // ArrayAttr const_args_indexes = + // llvm::dyn_cast(const_args->getValue()); + // for (auto id : const_args_indexes) { + // auto idint = llvm::cast(id).getInt(); + // if (blockArg.getArgNumber() == idint) { + // LLVM_DEBUG(llvm::dbgs() << "Block argument: " << blockArg + // << " is marked as constant\n"); + // propagateIfChanged(lattice, + // lattice->join(InConstantSubgraph(true, true))); + // return; + // } + // } + // } + auto funcOp = cast(parent_op); + mlir::onednn_graph::LogicalTensorInfo info(funcOp); + if (info.queryPropertyType(blockArg) == + mlir::onednn_graph::PropertyType::constant) { + LLVM_DEBUG(llvm::dbgs() << "Block argument: " << blockArg + << " is marked as constant\n"); + propagateIfChanged(lattice, + lattice->join(InConstantSubgraph(true, true))); + return; } propagateIfChanged(lattice, lattice->join(InConstantSubgraph(true, false))); } else { diff --git a/lib/gc/Transforms/ConstantTensorFolding.cpp b/lib/gc/Transforms/ConstantTensorFolding.cpp index 59a2c75f5..f0ed58449 100644 --- a/lib/gc/Transforms/ConstantTensorFolding.cpp +++ b/lib/gc/Transforms/ConstantTensorFolding.cpp @@ -14,8 +14,11 @@ #include #include -#include "mlir/Transforms/Passes.h" +#include "gc/Dialect/OneDNNGraph/OneDNNGraphDialect.h" +#include "gc/Dialect/OneDNNGraph/OneDNNGraphTypes.h" +#include "gc/Dialect/OneDNNGraph/Utils/Utils.h" +#include "mlir/Transforms/Passes.h" #include "mlir/Dialect/Arith/IR/Arith.h" #include "mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h" #include "mlir/Dialect/Bufferization/Transforms/Bufferize.h" 
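
Note: the hunks in this patch swap the source of constant-argument information. Previously the analysis read an `onednn_graph.const_args` index list off the entry function; from this patch it instead queries the OneDNNGraph logical-tensor metadata via `LogicalTensorInfo::queryPropertyType`. A minimal sketch of the two markings on an entry function follows; the `#onednn_graph.property_type<constant>` / `<variable>` payload spellings are assumptions inferred from `PropertyType::constant` in the C++ below, not confirmed by this patch:

    // Index-list form (consumed before this patch, and again after the
    // revert in patch 13):
    func.func @entry(%a: tensor<128xf32>, %b: tensor<128xf32>)
        attributes {onednn_graph.const_args = [0 : i32]} { ... }

    // Per-argument property-type form (consumed by this patch; payloads assumed):
    func.func @entry(%a: tensor<128xf32>, %b: tensor<128xf32>)
        attributes {onednn_graph.property_types = [
            #onednn_graph.property_type<constant>,
            #onednn_graph.property_type<variable>]} { ... }
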
@@ -386,14 +389,22 @@ static void addGlobalI32Array(ModuleOp &module, Location loc, } std::unordered_set getConstArgsIndexes(Operation &topFunc) { - auto topFuncAttr = topFunc.getAttrDictionary(); - std::optional constArgs = - topFuncAttr.getNamed("onednn_graph.const_args"); std::unordered_set constArgsIndexes; - if (constArgs.has_value()) { - ArrayAttr constArgsArray = llvm::dyn_cast(constArgs->getValue()); - for (auto id : constArgsArray) { - constArgsIndexes.insert(llvm::cast(id).getInt()); + // auto topFuncAttr = topFunc.getAttrDictionary(); + // std::optional constArgs = + // topFuncAttr.getNamed("onednn_graph.const_args"); + // if (constArgs.has_value()) { + // ArrayAttr constArgsArray = llvm::dyn_cast(constArgs->getValue()); + // for (auto id : constArgsArray) { + // constArgsIndexes.insert(llvm::cast(id).getInt()); + // } + // } + auto funcOp = cast(topFunc); + mlir::onednn_graph::LogicalTensorInfo info(funcOp); + for (int i = 0; i < funcOp.getArguments().size(); ++i) { + if (info.queryPropertyType(funcOp.getArguments()[i]) == + mlir::onednn_graph::PropertyType::constant) { + constArgsIndexes.insert(i); } } return constArgsIndexes; diff --git a/test/gc/Transforms/test_constant_tensor_folding-1.mlir b/test/gc/Transforms/test_constant_tensor_folding-1.mlir index 8324c9aae..ec84937dc 100644 --- a/test/gc/Transforms/test_constant_tensor_folding-1.mlir +++ b/test/gc/Transforms/test_constant_tensor_folding-1.mlir @@ -2,7 +2,7 @@ // CHECK-LABEL: func.func @entry module { - func.func @entry(%a: tensor<128xf32>, %b: tensor<128xf32>, %c: tensor<128xf32>) -> (tensor<128xf32>, tensor<128xf32>) attributes { llvm.emit_c_interface, onednn_graph.const_args = [0 : i32, 1 : i32] } { + func.func @entry(%a: tensor<128xf32>, %b: tensor<128xf32>, %c: tensor<128xf32>) -> (tensor<128xf32>, tensor<128xf32>) attributes { llvm.emit_c_interface, onednn_graph.const_args = [0 : i32, 1 : i32], onednn_graph.property_types = [#onednn_graph.property_type, #onednn_graph.property_type, #onednn_graph.property_type] } { %c0 = arith.constant 0 : index cpuruntime.printf "HI%zu\n" %c0 : index %ax2 = tensor.empty() : tensor<128xf32> @@ -36,7 +36,7 @@ module { // COM: llvm.mlir.global external constant @__compute_args(dense<[3, 2, 3, 4]> : tensor<4xi32>) {addr_space = 0 : i32} : !llvm.array<4 x i32> // COM: llvm.mlir.global external constant @__fold_args(dense<[4, 0, 1, 3, 4]> : tensor<5xi32>) {addr_space = 0 : i32} : !llvm.array<5 x i32> // COM: llvm.mlir.global external constant @__fold_buffer_ids(dense<[2, 0, 1]> : tensor<3xi64>) {addr_space = 0 : i32} : !llvm.array<3 x i64> -// COM: func.func @entry(%arg0: tensor<128xf32>, %arg1: tensor<128xf32>, %arg2: tensor<128xf32>) -> (tensor<128xf32>, tensor<128xf32>) attributes {llvm.emit_c_interface, onednn_graph.const_args = [0 : i32, 1 : i32]} { +// COM: func.func @entry(%arg0: tensor<128xf32>, %arg1: tensor<128xf32>, %arg2: tensor<128xf32>) -> (tensor<128xf32>, tensor<128xf32>) attributes {llvm.emit_c_interface, onednn_graph.const_args = [0 : i32, 1 : i32], onednn_graph.property_types = [#onednn_graph.property_type, #onednn_graph.property_type, #onednn_graph.property_type]} { // COM: %c0 = arith.constant 0 : index // COM: cpuruntime.printf "HI%zu\0A" %c0 : index // COM: %0 = tensor.empty() : tensor<128xf32> diff --git a/test/gc/Transforms/test_constant_tensor_folding.mlir b/test/gc/Transforms/test_constant_tensor_folding.mlir index 1256c52cf..2c82b3e67 100644 --- a/test/gc/Transforms/test_constant_tensor_folding.mlir +++ b/test/gc/Transforms/test_constant_tensor_folding.mlir 
@@ -9,7 +9,7 @@ module { // COM: A two-layer mlp. arg0: input feature. arg1: weight of #1 linear. arg2: bias of #1 linear. // COM: arg3: weight of #2 linear. arg4: bias of #2 linear. - func.func @entry(%arg0: tensor<64x512xbf16>, %arg1: tensor<512x256xbf16>, %arg2: tensor<256xbf16>, %arg3: tensor<256x1024xbf16>, %arg4: tensor<1024xbf16>) -> tensor<64x1024xbf16> attributes {llvm.emit_c_interface, onednn_graph.const_args = [1 : i32, 2 : i32, 3 : i32, 4 : i32]} { + func.func @entry(%arg0: tensor<64x512xbf16>, %arg1: tensor<512x256xbf16>, %arg2: tensor<256xbf16>, %arg3: tensor<256x1024xbf16>, %arg4: tensor<1024xbf16>) -> tensor<64x1024xbf16> attributes {llvm.emit_c_interface, onednn_graph.const_args = [1 : i32, 2 : i32, 3 : i32, 4 : i32], onednn_graph.property_types = [#onednn_graph.property_type, #onednn_graph.property_type, #onednn_graph.property_type, #onednn_graph.property_type, #onednn_graph.property_type]} { %1 = tensor.empty() : tensor<2x16x32x32xbf16> %packed_arg0 = tensor.pack %arg0 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %1 : tensor<64x512xbf16> -> tensor<2x16x32x32xbf16> %2 = tensor.empty() : tensor<8x16x32x32xbf16> @@ -78,5 +78,5 @@ module { // COM: llvm.mlir.global external constant @__compute_args(dense<[5, 0, 5, 6, 7, 8]> : tensor<6xi32>) {addr_space = 0 : i32} : !llvm.array<6 x i32> // COM: llvm.mlir.global external constant @__fold_args(dense<[8, 1, 2, 3, 4, 5, 6, 7, 8]> : tensor<9xi32>) {addr_space = 0 : i32} : !llvm.array<9 x i32> // COM: llvm.mlir.global external constant @__fold_buffer_ids(dense<[4, 0, 1, 2, 3]> : tensor<5xi64>) {addr_space = 0 : i32} : !llvm.array<5 x i64> -// COM: func.func @entry(%arg0: tensor<64x512xbf16>, %arg1: tensor<8x16x16x32x2xbf16>, %arg2: tensor<8x32xbf16>, %arg3: tensor<32x8x16x32x2xbf16>, %arg4: tensor<32x32xbf16>) -> tensor<64x1024xbf16> attributes {llvm.emit_c_interface, onednn_graph.const_args = [1 : i32, 2 : i32, 3 : i32, 4 : i32]} -// COM: func.func @fold(%arg0: tensor<512x256xbf16>, %arg1: tensor<256xbf16>, %arg2: tensor<256x1024xbf16>, %arg3: tensor<1024xbf16>) -> (tensor<8x16x16x32x2xbf16>, tensor<8x32xbf16>, tensor<32x8x16x32x2xbf16>, tensor<32x32xbf16>) attributes {llvm.emit_c_interface} +// COM: func.func @entry(%arg0: tensor<64x512xbf16>, %arg1: tensor<8x16x16x32x2xbf16>, %arg2: tensor<8x32xbf16>, %arg3: tensor<32x8x16x32x2xbf16>, %arg4: tensor<32x32xbf16>) -> tensor<64x1024xbf16> +// COM: func.func @fold(%arg0: tensor<512x256xbf16>, %arg1: tensor<256xbf16>, %arg2: tensor<256x1024xbf16>, %arg3: tensor<1024xbf16>) -> (tensor<8x16x16x32x2xbf16>, tensor<8x32xbf16>, tensor<32x8x16x32x2xbf16>, tensor<32x32xbf16>) From 9218762cf8f1a6ea4c8981e0ad6504348a4693d7 Mon Sep 17 00:00:00 2001 From: "Niu, Xiaoguang" Date: Tue, 23 Jul 2024 20:22:26 -0700 Subject: [PATCH 13/29] Revert "Adapt to constant PropertyType" This reverts commit 22c3d76a69f5745cb06a12e0bb9bcb50dea25e4e. 
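
Note: with this revert the analysis reads `onednn_graph.const_args` again. Patch 15 later in the series then splits that single index list into two function attributes, as in the tests it adds (sketch; argument and result types elided):

    func.func @entry(...) attributes {
        compiletime_const_args_index = [1 : i32],
        runtime_const_args_index = [2 : i32]} { ... }

Presumably the first list marks arguments whose contents are available while compiling, and the second marks arguments that are constant across runs but only bound at runtime; patch 16 builds a separate fold function for each.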
--- .../DataFlow/ConstantSubgraphAnalyser.cpp | 47 +++++++------------ lib/gc/Transforms/ConstantTensorFolding.cpp | 27 ++++------- .../test_constant_tensor_folding-1.mlir | 4 +- .../test_constant_tensor_folding.mlir | 6 +-- 4 files changed, 29 insertions(+), 55 deletions(-) diff --git a/lib/gc/Analysis/DataFlow/ConstantSubgraphAnalyser.cpp b/lib/gc/Analysis/DataFlow/ConstantSubgraphAnalyser.cpp index e01190bc3..741af4697 100644 --- a/lib/gc/Analysis/DataFlow/ConstantSubgraphAnalyser.cpp +++ b/lib/gc/Analysis/DataFlow/ConstantSubgraphAnalyser.cpp @@ -6,11 +6,6 @@ // //===----------------------------------------------------------------------===// #include "gc/Analysis/DataFlow/ConstantSubgraphAnalyser.h" - -#include "gc/Dialect/OneDNNGraph/OneDNNGraphDialect.h" -#include "gc/Dialect/OneDNNGraph/OneDNNGraphTypes.h" -#include "gc/Dialect/OneDNNGraph/Utils/Utils.h" - #include "mlir/Analysis/DataFlow/DeadCodeAnalysis.h" #include "mlir/Analysis/DataFlow/SparseAnalysis.h" #include "mlir/Dialect/Arith/IR/Arith.h" @@ -101,32 +96,22 @@ void ConstantSubgraphAnalyser::setToEntryState( Lattice *lattice) { if (auto blockArg = cast(lattice->getPoint())) { auto parent_op = blockArg.getParentBlock()->getParentOp(); - // auto parent_op_attr = parent_op->getAttrDictionary(); - // std::optional const_args = - // parent_op_attr.getNamed("onednn_graph.const_args"); - // if (const_args.has_value()) { - // ArrayAttr const_args_indexes = - // llvm::dyn_cast(const_args->getValue()); - // for (auto id : const_args_indexes) { - // auto idint = llvm::cast(id).getInt(); - // if (blockArg.getArgNumber() == idint) { - // LLVM_DEBUG(llvm::dbgs() << "Block argument: " << blockArg - // << " is marked as constant\n"); - // propagateIfChanged(lattice, - // lattice->join(InConstantSubgraph(true, true))); - // return; - // } - // } - // } - auto funcOp = cast(parent_op); - mlir::onednn_graph::LogicalTensorInfo info(funcOp); - if (info.queryPropertyType(blockArg) == - mlir::onednn_graph::PropertyType::constant) { - LLVM_DEBUG(llvm::dbgs() << "Block argument: " << blockArg - << " is marked as constant\n"); - propagateIfChanged(lattice, - lattice->join(InConstantSubgraph(true, true))); - return; + auto parent_op_attr = parent_op->getAttrDictionary(); + std::optional const_args = + parent_op_attr.getNamed("onednn_graph.const_args"); + if (const_args.has_value()) { + ArrayAttr const_args_indexes = + llvm::dyn_cast(const_args->getValue()); + for (auto id : const_args_indexes) { + auto idint = llvm::cast(id).getInt(); + if (blockArg.getArgNumber() == idint) { + LLVM_DEBUG(llvm::dbgs() << "Block argument: " << blockArg + << " is marked as constant\n"); + propagateIfChanged(lattice, + lattice->join(InConstantSubgraph(true, true))); + return; + } + } } propagateIfChanged(lattice, lattice->join(InConstantSubgraph(true, false))); } else { diff --git a/lib/gc/Transforms/ConstantTensorFolding.cpp b/lib/gc/Transforms/ConstantTensorFolding.cpp index f0ed58449..59a2c75f5 100644 --- a/lib/gc/Transforms/ConstantTensorFolding.cpp +++ b/lib/gc/Transforms/ConstantTensorFolding.cpp @@ -14,11 +14,8 @@ #include #include -#include "gc/Dialect/OneDNNGraph/OneDNNGraphDialect.h" -#include "gc/Dialect/OneDNNGraph/OneDNNGraphTypes.h" -#include "gc/Dialect/OneDNNGraph/Utils/Utils.h" - #include "mlir/Transforms/Passes.h" + #include "mlir/Dialect/Arith/IR/Arith.h" #include "mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h" #include "mlir/Dialect/Bufferization/Transforms/Bufferize.h" @@ -389,22 +386,14 @@ static void addGlobalI32Array(ModuleOp &module, 
Location loc, } std::unordered_set getConstArgsIndexes(Operation &topFunc) { + auto topFuncAttr = topFunc.getAttrDictionary(); + std::optional constArgs = + topFuncAttr.getNamed("onednn_graph.const_args"); std::unordered_set constArgsIndexes; - // auto topFuncAttr = topFunc.getAttrDictionary(); - // std::optional constArgs = - // topFuncAttr.getNamed("onednn_graph.const_args"); - // if (constArgs.has_value()) { - // ArrayAttr constArgsArray = llvm::dyn_cast(constArgs->getValue()); - // for (auto id : constArgsArray) { - // constArgsIndexes.insert(llvm::cast(id).getInt()); - // } - // } - auto funcOp = cast(topFunc); - mlir::onednn_graph::LogicalTensorInfo info(funcOp); - for (int i = 0; i < funcOp.getArguments().size(); ++i) { - if (info.queryPropertyType(funcOp.getArguments()[i]) == - mlir::onednn_graph::PropertyType::constant) { - constArgsIndexes.insert(i); + if (constArgs.has_value()) { + ArrayAttr constArgsArray = llvm::dyn_cast(constArgs->getValue()); + for (auto id : constArgsArray) { + constArgsIndexes.insert(llvm::cast(id).getInt()); } } return constArgsIndexes; diff --git a/test/gc/Transforms/test_constant_tensor_folding-1.mlir b/test/gc/Transforms/test_constant_tensor_folding-1.mlir index ec84937dc..8324c9aae 100644 --- a/test/gc/Transforms/test_constant_tensor_folding-1.mlir +++ b/test/gc/Transforms/test_constant_tensor_folding-1.mlir @@ -2,7 +2,7 @@ // CHECK-LABEL: func.func @entry module { - func.func @entry(%a: tensor<128xf32>, %b: tensor<128xf32>, %c: tensor<128xf32>) -> (tensor<128xf32>, tensor<128xf32>) attributes { llvm.emit_c_interface, onednn_graph.const_args = [0 : i32, 1 : i32], onednn_graph.property_types = [#onednn_graph.property_type, #onednn_graph.property_type, #onednn_graph.property_type] } { + func.func @entry(%a: tensor<128xf32>, %b: tensor<128xf32>, %c: tensor<128xf32>) -> (tensor<128xf32>, tensor<128xf32>) attributes { llvm.emit_c_interface, onednn_graph.const_args = [0 : i32, 1 : i32] } { %c0 = arith.constant 0 : index cpuruntime.printf "HI%zu\n" %c0 : index %ax2 = tensor.empty() : tensor<128xf32> @@ -36,7 +36,7 @@ module { // COM: llvm.mlir.global external constant @__compute_args(dense<[3, 2, 3, 4]> : tensor<4xi32>) {addr_space = 0 : i32} : !llvm.array<4 x i32> // COM: llvm.mlir.global external constant @__fold_args(dense<[4, 0, 1, 3, 4]> : tensor<5xi32>) {addr_space = 0 : i32} : !llvm.array<5 x i32> // COM: llvm.mlir.global external constant @__fold_buffer_ids(dense<[2, 0, 1]> : tensor<3xi64>) {addr_space = 0 : i32} : !llvm.array<3 x i64> -// COM: func.func @entry(%arg0: tensor<128xf32>, %arg1: tensor<128xf32>, %arg2: tensor<128xf32>) -> (tensor<128xf32>, tensor<128xf32>) attributes {llvm.emit_c_interface, onednn_graph.const_args = [0 : i32, 1 : i32], onednn_graph.property_types = [#onednn_graph.property_type, #onednn_graph.property_type, #onednn_graph.property_type]} { +// COM: func.func @entry(%arg0: tensor<128xf32>, %arg1: tensor<128xf32>, %arg2: tensor<128xf32>) -> (tensor<128xf32>, tensor<128xf32>) attributes {llvm.emit_c_interface, onednn_graph.const_args = [0 : i32, 1 : i32]} { // COM: %c0 = arith.constant 0 : index // COM: cpuruntime.printf "HI%zu\0A" %c0 : index // COM: %0 = tensor.empty() : tensor<128xf32> diff --git a/test/gc/Transforms/test_constant_tensor_folding.mlir b/test/gc/Transforms/test_constant_tensor_folding.mlir index 2c82b3e67..1256c52cf 100644 --- a/test/gc/Transforms/test_constant_tensor_folding.mlir +++ b/test/gc/Transforms/test_constant_tensor_folding.mlir @@ -9,7 +9,7 @@ module { // COM: A two-layer mlp. arg0: input feature. 
arg1: weight of #1 linear. arg2: bias of #1 linear. // COM: arg3: weight of #2 linear. arg4: bias of #2 linear. - func.func @entry(%arg0: tensor<64x512xbf16>, %arg1: tensor<512x256xbf16>, %arg2: tensor<256xbf16>, %arg3: tensor<256x1024xbf16>, %arg4: tensor<1024xbf16>) -> tensor<64x1024xbf16> attributes {llvm.emit_c_interface, onednn_graph.const_args = [1 : i32, 2 : i32, 3 : i32, 4 : i32], onednn_graph.property_types = [#onednn_graph.property_type, #onednn_graph.property_type, #onednn_graph.property_type, #onednn_graph.property_type, #onednn_graph.property_type]} { + func.func @entry(%arg0: tensor<64x512xbf16>, %arg1: tensor<512x256xbf16>, %arg2: tensor<256xbf16>, %arg3: tensor<256x1024xbf16>, %arg4: tensor<1024xbf16>) -> tensor<64x1024xbf16> attributes {llvm.emit_c_interface, onednn_graph.const_args = [1 : i32, 2 : i32, 3 : i32, 4 : i32]} { %1 = tensor.empty() : tensor<2x16x32x32xbf16> %packed_arg0 = tensor.pack %arg0 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %1 : tensor<64x512xbf16> -> tensor<2x16x32x32xbf16> %2 = tensor.empty() : tensor<8x16x32x32xbf16> @@ -78,5 +78,5 @@ module { // COM: llvm.mlir.global external constant @__compute_args(dense<[5, 0, 5, 6, 7, 8]> : tensor<6xi32>) {addr_space = 0 : i32} : !llvm.array<6 x i32> // COM: llvm.mlir.global external constant @__fold_args(dense<[8, 1, 2, 3, 4, 5, 6, 7, 8]> : tensor<9xi32>) {addr_space = 0 : i32} : !llvm.array<9 x i32> // COM: llvm.mlir.global external constant @__fold_buffer_ids(dense<[4, 0, 1, 2, 3]> : tensor<5xi64>) {addr_space = 0 : i32} : !llvm.array<5 x i64> -// COM: func.func @entry(%arg0: tensor<64x512xbf16>, %arg1: tensor<8x16x16x32x2xbf16>, %arg2: tensor<8x32xbf16>, %arg3: tensor<32x8x16x32x2xbf16>, %arg4: tensor<32x32xbf16>) -> tensor<64x1024xbf16> -// COM: func.func @fold(%arg0: tensor<512x256xbf16>, %arg1: tensor<256xbf16>, %arg2: tensor<256x1024xbf16>, %arg3: tensor<1024xbf16>) -> (tensor<8x16x16x32x2xbf16>, tensor<8x32xbf16>, tensor<32x8x16x32x2xbf16>, tensor<32x32xbf16>) +// COM: func.func @entry(%arg0: tensor<64x512xbf16>, %arg1: tensor<8x16x16x32x2xbf16>, %arg2: tensor<8x32xbf16>, %arg3: tensor<32x8x16x32x2xbf16>, %arg4: tensor<32x32xbf16>) -> tensor<64x1024xbf16> attributes {llvm.emit_c_interface, onednn_graph.const_args = [1 : i32, 2 : i32, 3 : i32, 4 : i32]} +// COM: func.func @fold(%arg0: tensor<512x256xbf16>, %arg1: tensor<256xbf16>, %arg2: tensor<256x1024xbf16>, %arg3: tensor<1024xbf16>) -> (tensor<8x16x16x32x2xbf16>, tensor<8x32xbf16>, tensor<32x8x16x32x2xbf16>, tensor<32x32xbf16>) attributes {llvm.emit_c_interface} From 4e447dd52e67899deba8d17321667ea62beda70d Mon Sep 17 00:00:00 2001 From: "Niu, Xiaoguang" Date: Tue, 23 Jul 2024 22:17:07 -0700 Subject: [PATCH 14/29] Fix link --- lib/gc/ExecutionEngine/Driver/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/gc/ExecutionEngine/Driver/CMakeLists.txt b/lib/gc/ExecutionEngine/Driver/CMakeLists.txt index d04dbbb4e..688607b56 100644 --- a/lib/gc/ExecutionEngine/Driver/CMakeLists.txt +++ b/lib/gc/ExecutionEngine/Driver/CMakeLists.txt @@ -37,6 +37,7 @@ add_mlir_library(GCJitWrapper ${dialect_libs} ${conversion_libs} GCPasses + GCAnalysis GCGPUPasses ) From d4d81a62b030e205f04274cc51fec57d54fa15bf Mon Sep 17 00:00:00 2001 From: "Niu, Xiaoguang" Date: Wed, 24 Jul 2024 23:08:43 -0700 Subject: [PATCH 15/29] Fold arith.constant --- .../DataFlow/ConstantSubgraphAnalyser.cpp | 45 ++++++----- .../Transforms/ConstantSubgraphAnalysis.cpp | 2 +- lib/gc/Transforms/ConstantTensorFolding.cpp | 76 ++++++++++++++++--- 
 .../test_constant_tensor_folding-1.mlir       |  4 +-
 .../test_constant_tensor_folding-2.mlir       | 61 +++++++++++++++
 .../test_constant_tensor_folding.mlir         |  4 +-
 6 files changed, 160 insertions(+), 32 deletions(-)
 create mode 100644 test/gc/Transforms/test_constant_tensor_folding-2.mlir

diff --git a/lib/gc/Analysis/DataFlow/ConstantSubgraphAnalyser.cpp b/lib/gc/Analysis/DataFlow/ConstantSubgraphAnalyser.cpp
index 741af4697..584c7e8ce 100644
--- a/lib/gc/Analysis/DataFlow/ConstantSubgraphAnalyser.cpp
+++ b/lib/gc/Analysis/DataFlow/ConstantSubgraphAnalyser.cpp
@@ -5,6 +5,9 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
+#include <deque>
+#include <unordered_set>
+
 #include "gc/Analysis/DataFlow/ConstantSubgraphAnalyser.h"
 #include "mlir/Analysis/DataFlow/DeadCodeAnalysis.h"
 #include "mlir/Analysis/DataFlow/SparseAnalysis.h"
@@ -25,7 +28,6 @@
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/Debug.h"
-#include <unordered_set>
 
 #define DEBUG_TYPE "in-constant-subgraph"
 
@@ -95,24 +97,33 @@ void ConstantSubgraphAnalyser::visitOperation(
 void ConstantSubgraphAnalyser::setToEntryState(
     Lattice<InConstantSubgraph> *lattice) {
   if (auto blockArg = cast<BlockArgument>(lattice->getPoint())) {
-    auto parent_op = blockArg.getParentBlock()->getParentOp();
-    auto parent_op_attr = parent_op->getAttrDictionary();
-    std::optional<NamedAttribute> const_args =
-        parent_op_attr.getNamed("onednn_graph.const_args");
-    if (const_args.has_value()) {
-      ArrayAttr const_args_indexes =
-          llvm::dyn_cast<ArrayAttr>(const_args->getValue());
-      for (auto id : const_args_indexes) {
-        auto idint = llvm::cast<IntegerAttr>(id).getInt();
-        if (blockArg.getArgNumber() == idint) {
-          LLVM_DEBUG(llvm::dbgs() << "Block argument: " << blockArg
-                                  << " is marked as constant\n");
-          propagateIfChanged(lattice,
-                             lattice->join(InConstantSubgraph(true, true)));
-          return;
-        }
+    auto parentOp = blockArg.getParentBlock()->getParentOp();
+    auto parentOpAttr = parentOp->getAttrDictionary();
+
+    std::unordered_set<int64_t> constArgsIndexes;
+    std::optional<NamedAttribute> compiletimeConstArgs =
+        parentOpAttr.getNamed("compiletime_const_args_index");
+    if (compiletimeConstArgs.has_value()) {
+      for (auto id :
+           llvm::dyn_cast<ArrayAttr>(compiletimeConstArgs->getValue())) {
+        constArgsIndexes.insert(llvm::cast<IntegerAttr>(id).getInt());
+      }
+    }
+    std::optional<NamedAttribute> runtimeConstArgs =
+        parentOpAttr.getNamed("runtime_const_args_index");
+    if (runtimeConstArgs.has_value()) {
+      for (auto id : llvm::dyn_cast<ArrayAttr>(runtimeConstArgs->getValue())) {
+        constArgsIndexes.insert(llvm::cast<IntegerAttr>(id).getInt());
       }
     }
+
+    if (constArgsIndexes.count(blockArg.getArgNumber())) {
+      LLVM_DEBUG(llvm::dbgs() << "Block argument: " << blockArg
+                              << " is marked as constant\n");
+      propagateIfChanged(lattice,
+                         lattice->join(InConstantSubgraph(true, true)));
+      return;
+    }
     propagateIfChanged(lattice, lattice->join(InConstantSubgraph(true, false)));
   } else {
     propagateIfChanged(lattice,
diff --git a/lib/gc/Transforms/ConstantSubgraphAnalysis.cpp b/lib/gc/Transforms/ConstantSubgraphAnalysis.cpp
index b78ecd956..d4f183326 100644
--- a/lib/gc/Transforms/ConstantSubgraphAnalysis.cpp
+++ b/lib/gc/Transforms/ConstantSubgraphAnalysis.cpp
@@ -40,7 +40,7 @@ void ConstantSubgraphAnalysis::runOnOperation() {
 
   // Hard-code: set the #1 argument to be constant.
   // OpBuilder builder(op->getContext());
-  // func.setAttr("onednn_graph.const_args",
+  // func.setAttr("runtime_const_args_index",
   //              builder.getI32ArrayAttr({1,2,3,4}));
 
   RunConstantSubgraphAnalyser runAnalyser;
diff --git a/lib/gc/Transforms/ConstantTensorFolding.cpp b/lib/gc/Transforms/ConstantTensorFolding.cpp
index 59a2c75f5..2df13adcc 100644
--- a/lib/gc/Transforms/ConstantTensorFolding.cpp
+++ b/lib/gc/Transforms/ConstantTensorFolding.cpp
@@ -355,6 +355,7 @@ static void addGlobalI32(ModuleOp &module, Location loc, OpBuilder &builder,
       loc, type, /*isConstant=*/true, LLVM::Linkage::External, name,
       builder.getI32IntegerAttr(value),
       /*alignment=*/0);
+  (void)global;
 }
 
 static void addGlobalI64Array(ModuleOp &module, Location loc,
@@ -369,6 +370,7 @@ static void addGlobalI64Array(ModuleOp &module, Location loc,
       loc, type, /*isConstant=*/true, LLVM::Linkage::External, name,
       builder.getI64TensorAttr(array),
       /*alignment=*/0);
+  (void)global;
 }
 
 static void addGlobalI32Array(ModuleOp &module, Location loc,
@@ -383,22 +385,74 @@ static void addGlobalI32Array(ModuleOp &module, Location loc,
       loc, type, /*isConstant=*/true, LLVM::Linkage::External, name,
       builder.getI32TensorAttr(array),
       /*alignment=*/0);
+  (void)global;
 }
 
 std::unordered_set<int64_t> getConstArgsIndexes(Operation &topFunc) {
   auto topFuncAttr = topFunc.getAttrDictionary();
-  std::optional<NamedAttribute> constArgs =
-      topFuncAttr.getNamed("onednn_graph.const_args");
   std::unordered_set<int64_t> constArgsIndexes;
-  if (constArgs.has_value()) {
-    ArrayAttr constArgsArray = llvm::dyn_cast<ArrayAttr>(constArgs->getValue());
-    for (auto id : constArgsArray) {
+  std::optional<NamedAttribute> compiletimeConstArgs =
+      topFuncAttr.getNamed("compiletime_const_args_index");
+  if (compiletimeConstArgs.has_value()) {
+    for (auto id :
+         llvm::dyn_cast<ArrayAttr>(compiletimeConstArgs->getValue())) {
+      constArgsIndexes.insert(llvm::cast<IntegerAttr>(id).getInt());
+    }
+  }
+  std::optional<NamedAttribute> runtimeConstArgs =
+      topFuncAttr.getNamed("runtime_const_args_index");
+  if (runtimeConstArgs.has_value()) {
+    for (auto id : llvm::dyn_cast<ArrayAttr>(runtimeConstArgs->getValue())) {
      constArgsIndexes.insert(llvm::cast<IntegerAttr>(id).getInt());
    }
  }
  return constArgsIndexes;
 }
 
+void getArithConstantOutputs(Block &block, SmallVector<Type> &outputTypes,
+                             SmallVector<Value> &outputValues) {
+  for (Operation &op : block.getOperations()) {
+    if (isa<arith::ConstantOp>(&op)) {
+      Operation *constOp = &op;
+      auto constTensor = constOp->getResults().front();
+      if (!isa<TensorType>(constTensor.getType())) {
+        continue;
+      }
+      auto v = dyn_cast<Value>(constTensor);
+      SmallVector<Value> valuesOnTheWay = {v}; // the constant tensors
+      std::deque<Value> dq;
+      dq.push_back(v);
+      // For v -> pack1 -> pack2 -> matmul, we need the type of output of pack2
+      while (!dq.empty()) {
+        v = dq.front();
+        dq.pop_front();
+        // if the children ops of v are not all constant, we end at v
+        if (std::any_of(v.getUsers().begin(), v.getUsers().end(),
+                        [](Operation *child) {
+                          return !isInConstantSubgraph(child);
+                        })) {
+          if (std::find(outputValues.begin(), outputValues.end(), v) ==
+              outputValues.end()) {
+            outputTypes.push_back(v.getType());
+            outputValues.push_back(v);
+          }
+          continue;
+        }
+
+        // the children ops of v are all constant, we push their results to
+        // the queue
+        for (Operation *child : v.getUsers()) {
+          for (OpResult result : child->getResults()) {
+            auto r = dyn_cast<Value>(result);
+            dq.push_back(r);
+            valuesOnTheWay.push_back(r);
+          }
+        }
+      }
+    }
+  }
+}
+
 void getInputsAndOutputs(Block &block,
                          std::unordered_set<int64_t> &constArgsIndexes,
                          SmallVector<Type> &inputTypes,
@@ -499,7 +553,7 @@ func::FuncOp buildFoldFunc(MLIRContext *context, OpBuilder &builder,
   func::FuncOp
foldFunc = builder.create(topOp->getLoc(), funcName, foldFuncType); Block *foldBlock = foldFunc.addEntryBlock(); - // values of folded constant weights in foldBlock + // values of folded constant tensors in foldBlock SmallVector outputValuesInFold; IRMapping mapper; for (Operation *op : constOps) { @@ -696,18 +750,20 @@ void ConstantTensorFolding::runOnOperation() { } } - SmallVector inputTypes; // types of constant weights - // values of constant weights in original block + SmallVector inputTypes; // types of constant tensors + // values of constant tensors in original block SmallVector inputValues; - SmallVector outputTypes; // types of folded constant weights - // values of folded constant weights in original block + SmallVector outputTypes; // types of folded constant tensors + // values of folded constant tensors in original block SmallVector outputValues; + getArithConstantOutputs(block, outputTypes, outputValues); getInputsAndOutputs(block, constArgsIndexes, inputTypes, inputValues, outputTypes, outputValues); func::FuncOp foldFunc = buildFoldFunc(context, builder, topOp, constOps, inputTypes, inputValues, outputTypes, outputValues); + (void)foldFunc; modifyComputeFunc(context, builder, topOp, topFunc, block, constArgsIndexes, outputTypes, outputValues); diff --git a/test/gc/Transforms/test_constant_tensor_folding-1.mlir b/test/gc/Transforms/test_constant_tensor_folding-1.mlir index 8324c9aae..fa4fcb210 100644 --- a/test/gc/Transforms/test_constant_tensor_folding-1.mlir +++ b/test/gc/Transforms/test_constant_tensor_folding-1.mlir @@ -2,7 +2,7 @@ // CHECK-LABEL: func.func @entry module { - func.func @entry(%a: tensor<128xf32>, %b: tensor<128xf32>, %c: tensor<128xf32>) -> (tensor<128xf32>, tensor<128xf32>) attributes { llvm.emit_c_interface, onednn_graph.const_args = [0 : i32, 1 : i32] } { + func.func @entry(%a: tensor<128xf32>, %b: tensor<128xf32>, %c: tensor<128xf32>) -> (tensor<128xf32>, tensor<128xf32>) attributes { llvm.emit_c_interface, runtime_const_args_index = [0 : i32, 1 : i32] } { %c0 = arith.constant 0 : index cpuruntime.printf "HI%zu\n" %c0 : index %ax2 = tensor.empty() : tensor<128xf32> @@ -36,7 +36,7 @@ module { // COM: llvm.mlir.global external constant @__compute_args(dense<[3, 2, 3, 4]> : tensor<4xi32>) {addr_space = 0 : i32} : !llvm.array<4 x i32> // COM: llvm.mlir.global external constant @__fold_args(dense<[4, 0, 1, 3, 4]> : tensor<5xi32>) {addr_space = 0 : i32} : !llvm.array<5 x i32> // COM: llvm.mlir.global external constant @__fold_buffer_ids(dense<[2, 0, 1]> : tensor<3xi64>) {addr_space = 0 : i32} : !llvm.array<3 x i64> -// COM: func.func @entry(%arg0: tensor<128xf32>, %arg1: tensor<128xf32>, %arg2: tensor<128xf32>) -> (tensor<128xf32>, tensor<128xf32>) attributes {llvm.emit_c_interface, onednn_graph.const_args = [0 : i32, 1 : i32]} { +// COM: func.func @entry(%arg0: tensor<128xf32>, %arg1: tensor<128xf32>, %arg2: tensor<128xf32>) -> (tensor<128xf32>, tensor<128xf32>) attributes {llvm.emit_c_interface, runtime_const_args_index = [0 : i32, 1 : i32]} { // COM: %c0 = arith.constant 0 : index // COM: cpuruntime.printf "HI%zu\0A" %c0 : index // COM: %0 = tensor.empty() : tensor<128xf32> diff --git a/test/gc/Transforms/test_constant_tensor_folding-2.mlir b/test/gc/Transforms/test_constant_tensor_folding-2.mlir new file mode 100644 index 000000000..8d9e4ed53 --- /dev/null +++ b/test/gc/Transforms/test_constant_tensor_folding-2.mlir @@ -0,0 +1,61 @@ +// RUN: gc-opt --split-input-file -pass-pipeline="builtin.module(constant-subgraph-analysis,constant-tensor-folding)" 
%s | FileCheck %s + +// CHECK-LABEL: func.func @entry +#map = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d2, d4, d6)> +#map1 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d1, d2, d6 floordiv 2, d5, d3)> +#map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d4, d5)> +module { + // COM: A three-layer mlp. %arg0: input feature. %arg1, %arg2, %arg3: weight of #1, #2 and #3 linear. + func.func @entry(%arg0: tensor<64x32xbf16>, %arg2: tensor<32x256xbf16>, %arg3: tensor<256x1024xbf16>) + -> tensor<64x1024xbf16> attributes {llvm.emit_c_interface, compiletime_const_args_index = [1 : i32], runtime_const_args_index = [2 : i32]} { + %1 = tensor.empty() : tensor<2x1x32x32xbf16> + %packed_arg0 = tensor.pack %arg0 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %1 : tensor<64x32xbf16> -> tensor<2x1x32x32xbf16> + + %arg1 = arith.constant dense<"0x99571CBE05BA1C3D926AFCBD782B34BE67A737BEBF181ABE3C4E253B5D32F73D1566963D9B6F313EB74C0FBD430B253AE8E2E23DB0CBC53C46C014BE2E0981BD4D313F3E833D37BEAB70E13D65B6CA3DB194983D1E60983D950B71BD1815FDBB32DF9A3DD106EBBDB4A8233E841EC3BDE8C7C13D3734073EFF067DBD070206BEF6AF633DB209843C2135C3BD4F85B83C43BD1CBE04841A3E3E78BD3DE9D0D0BCF660093ED074083E14D43E3ECDA735BE8C8C0E3E40C60FBE4F73C9BDB4358DBD263D323C64E61EBEE535D23D238F013C727EA73DBDBAA1BD79D53EBE392981BDC06B453D10E37D3D2D2B41BEE1FA6BBD410E513D05588BBD514AB0BB0624243E3D80993C8E6A113EE57CFD3D23FE37BE001573BD86AD143E7F052D3E97C07DBD19B4113D3E87F6BDB971E83DFEA12BBC5D51F9BD4F203A3ED454043E22775BBD2EE8313EB027D03D8FEFD7BD0E56B7BDBF963FBE5B64E93D9291FBBD027101BE573DFD3D0CD6EB3D809B863DA9E8263E9EF2A43D717AB73D3CF597BD9FB7243DC603003D61780E3E3992293D8B1B25BE6B0024BE806DCB3D5BAB91BD9A33AFBDD5BC3BBE6D920FBE0D90F53D4513383E2219A0BBE8B6FBBD341C42BD42F235BED91A1ABDC3AEB0BD5AC1383DE0EADC3D303D11BE850D263E8281163E5CB78A3D19EB34BE33150F3E84F8EE3D18FC823DB26CCBBD09AB06BED909FFBA605EFE3B9014B7BD1606DA3D75ACE13D0910753C33C6843DE9951CBECD220ABD0EF2BF3D14BB2E3C798718BD60A53A3E8B83E53D18663DBE4D07CABD37CE043EA6B18E3D3D0F303EE392073EC92A1ABED6900E3E72D3E73D8CEF803D1B4D3D3E997D283E210F923BC2D131BECEAF913DB981EFBDCBCCCCBA2B6711BE4E32FE3C5D5D33BD2F34313EB7EC48BC26CDFD3D07170B3E1CD816BE310DD2BD9E03023E1EA8F3BD8B99EEBBFC97433E047F8DBDDD6BA03DA3B2433E34D7C0BC7FDB89BA1980333EF3FC8D3DC05C203E9C7213BD8385403E2F971A3E4357CF3DB39BFBBC784FF8BC7DBD0C3E8301E23D77BF1ABB04F3243CFBA3B1BD5A46C6BD1745A8BDD6950ABD939CC5BDB4226EBCAC622EBD6748FBBDAFF9D53DF29D433E41991C3D4DD7353EE2EF8E3D21EF3B3DF679973D31DEFDBDF0AF303E8D34DFBB31B895BD6A633A3EACE125BEE94E95BDA58043BEC9F233BE915F03BD1B7C8F3DE1D367BDD7BBD63D6E990A3E23222F3D4B6CD73DB869C53D8697383E3A86853D973F2C3EFC3827BC4E87FA3DD5903BBE4BB8403E34A9A33D41C8843D4BC8FABD3CD5E8BD4946233D955052BDA5F841BC6C81AFBD5DD8883DB71A753CD0A1263D88690ABE35DAA73CA3557D3D8C09D23D5A27273DECEFDBBCD220023EE036ACBD6CD2443E8F630FBEBC43B73DF03AA4BDC709133E1B94E73D362CE4BCB15F33BE3139443E5FCF62BD0E3C1B3EE99DF93D9E1BB3BA70DB213E38EBDDBC47F10CBEF817293DAD3DEB3B730942BE535C87BD448D7B3B1C8094BD97962B3D5B0F3B3EA3F42A3E4ED46DBD6D72C33C687CC63DEA34C53D1CCC3EBEDCA640BE638ABCBD4B63AFBDA699063E92861E3E98219FBC8E0B233ED3ED573DC856B8BD13880F3EFA0763BD5A8C89BD194519BE89C6CF3D73A219BC5ECBD43D41EFA33D27D8493D756B1ABEC796C93D9A25133C6A5A363E13FB8DBD601755BD3935FABD14D6883D0EF2D33DB8E914BD527347397200433DE72A3F3B62C52F3ED164EF3CD8806FBD05528B3D89701EBE0A09C23DA19B103D05922EBE7A100E3E31C0503D8ED53BBE08463E3E5168013E55F3E53D782EC53DA8BBD93C1711223E05FDB2BDA740113EA27A20BD1685A23D7E35293E02BD8B3CC43F163E4AE6613DE4280F3EEEF20BBE965C1DBEFAAD233E75754E3D
96C33BBCB6D7013E0D8E7ABD703C82BDEA0875BC6F57A6BCE83609BE8A8EB53DAB7D3C3E39A50ABEB878A33D9FCEA1BC124AD33C22C34A3DB5F338BE0307BF3C2F0881BD7E15E8BDBEE8C8BDBBFFA63C342F303E15B1CCBB2590153EEA05EF3DE778F2BCE9E1233ECEC244BDBF92D5BDECDEAE3C29750CBDD969FCBD7DC236BE571D1DBEC8FA7DBC243BAD3C38673D3ED15943BEFE4D913D5329273E18AB2EBE19AB5F3D30A62F3E94303CBE1421DABCBE6E133E355D073EEC76633DEB2AB83DA2BF16BC9A46C2BD4EB47EBC4C82343EC1D1E63D13D314BED232E3BD3E5CF1BDC78F9EBD6483233E7290293E514A163E255F0FBE1AEF7BBD5259173EF12524BEDF47793C886BE8BD57B408BE351980BD0FF71ABD24643ABEA79920BED2603A3EEB75393EC6D52B3E458B29BC22C45ABC02BB40BCED4BDEBCA6E9CABC11FB213EC4FB363E5AC2DCBDAD6B4F3CBB85B1BD8093343E487518BEDFA316BD7FFFAEBB9375963DF68A88BD6876013C9FA1C63D95CDB23C911721BE04B5F9BD1B7C8F3DE1D367BDD7BBD63D6E990A3E23222F3D4B6CD73DB869C53D8697383E3A86853D973F2C3EFC3827BC4E87FA3DD5903BBE4BB8403E34A9A33D41C8843D4BC8FABD3CD5E8BD4946233D955052BDA5F841BC6C81AFBD5DD8883DB71A753CD0A1263D88690ABE35DAA73CA3557D3D8C09D23D5A27273DECEFDBBCD220023EE036ACBD6CD2443E8F630FBEBC43B73DF03AA4BDC709133E1B94E73D362CE4BCB15F33BE3139443E5FCF62BD0E3C1B3EE99DF93D9E1BB3BA70DB213E38EBDDBC47F10CBEF817293DAD3997D283E210F923BC2D131BECEAF913DB981EFBDCBCCCCBA2B6711BE4E32FE3C5D5D33BD2F34313EB7EC48BC26CDFD3D07170B3E1CD816BE310DD2BD9E03023E1EA8F3BD8B99EEBBFC97433E047F8DBDDD6BA03DA3B2433E34D7C0BC7FDB89BA1980333EF3EB7EC48B383DE0E383DE0E383DE0E383DE0"> : tensor<32x32xbf16> + %2 = tensor.empty() : tensor<1x1x32x32xbf16> + %packed_arg1 = tensor.pack %arg1 outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %2 : tensor<32x32xbf16> -> tensor<1x1x32x32xbf16> + %3 = tensor.empty() : tensor<1x1x16x32x2xbf16> + %packed_packed_arg1 = tensor.pack %packed_arg1 inner_dims_pos = [2] inner_tiles = [2] into %3 : tensor<1x1x32x32xbf16> -> tensor<1x1x16x32x2xbf16> + + %4 = tensor.empty() : tensor<2x1x32x32xbf16> + %cst_0 = arith.constant 0.000000e+00 : bf16 + %5 = linalg.fill ins(%cst_0 : bf16) outs(%4 : tensor<2x1x32x32xbf16>) -> tensor<2x1x32x32xbf16> + %6 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction", "reduction", "parallel", "parallel", "reduction"]} ins(%packed_arg0, %packed_packed_arg1 : tensor<2x1x32x32xbf16>, tensor<1x1x16x32x2xbf16>) outs(%5 : tensor<2x1x32x32xbf16>) { + ^bb0(%in: bf16, %in_0: bf16, %out: bf16): + %44 = arith.mulf %in, %in_0 : bf16 + %55 = arith.addf %out, %44 : bf16 + linalg.yield %55 : bf16 + } -> tensor<2x1x32x32xbf16> + + %7 = tensor.empty() : tensor<8x1x32x32xbf16> + %packed_arg2 = tensor.pack %arg2 outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %7 : tensor<32x256xbf16> -> tensor<8x1x32x32xbf16> + %8 = tensor.empty() : tensor<8x1x16x32x2xbf16> + %packed_packed_arg2 = tensor.pack %packed_arg2 inner_dims_pos = [2] inner_tiles = [2] into %8 : tensor<8x1x32x32xbf16> -> tensor<8x1x16x32x2xbf16> + %9 = tensor.empty() : tensor<2x8x32x32xbf16> + %10 = linalg.fill ins(%cst_0 : bf16) outs(%9 : tensor<2x8x32x32xbf16>) -> tensor<2x8x32x32xbf16> + %11 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction", "reduction", "parallel", "parallel", "reduction"]} ins(%6, %packed_packed_arg2 : tensor<2x1x32x32xbf16>, tensor<8x1x16x32x2xbf16>) outs(%10 : tensor<2x8x32x32xbf16>) { + ^bb0(%in: bf16, %in_0: bf16, %out: bf16): + %44 = arith.mulf %in, %in_0 : bf16 + %55 = arith.addf %out, %44 : bf16 + linalg.yield %55 : bf16 + } -> tensor<2x8x32x32xbf16> + + %12 = tensor.empty() : tensor<32x8x32x32xbf16> 
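+    // COM: The tensor.pack pairs in this function implement two-level weight
+    // COM: blocking: the first pack tiles the plain 2-D weight into 32x32
+    // COM: blocks, and the second pack (inner_tiles = [2] on the reduction
+    // COM: dim) pairs adjacent rows, producing the trailing x2 (VNNI-style
+    // COM: bf16) dim that the linalg.generic contractions index via
+    // COM: d6 floordiv 2.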
+ %packed_arg3 = tensor.pack %arg3 outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %12 : tensor<256x1024xbf16> -> tensor<32x8x32x32xbf16> + %13 = tensor.empty() : tensor<32x8x16x32x2xbf16> + %packed_packed_arg3 = tensor.pack %packed_arg3 inner_dims_pos = [2] inner_tiles = [2] into %13 : tensor<32x8x32x32xbf16> -> tensor<32x8x16x32x2xbf16> + + %14 = tensor.empty() : tensor<2x32x32x32xbf16> + %15 = linalg.fill ins(%cst_0 : bf16) outs(%14 : tensor<2x32x32x32xbf16>) -> tensor<2x32x32x32xbf16> + %16 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction", "reduction", "parallel", "parallel", "reduction"]} ins(%11, %packed_packed_arg3 : tensor<2x8x32x32xbf16>, tensor<32x8x16x32x2xbf16>) outs(%15 : tensor<2x32x32x32xbf16>) { + ^bb0(%in: bf16, %in_0: bf16, %out: bf16): + %46 = arith.mulf %in, %in_0 : bf16 + %56 = arith.addf %out, %46 : bf16 + linalg.yield %56 : bf16 + } -> tensor<2x32x32x32xbf16> + + %17 = tensor.empty() : tensor<64x1024xbf16> + %unpack = tensor.unpack %16 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %17 : tensor<2x32x32x32xbf16> -> tensor<64x1024xbf16> + return %unpack : tensor<64x1024xbf16> + } +} diff --git a/test/gc/Transforms/test_constant_tensor_folding.mlir b/test/gc/Transforms/test_constant_tensor_folding.mlir index 1256c52cf..d55f42039 100644 --- a/test/gc/Transforms/test_constant_tensor_folding.mlir +++ b/test/gc/Transforms/test_constant_tensor_folding.mlir @@ -9,7 +9,7 @@ module { // COM: A two-layer mlp. arg0: input feature. arg1: weight of #1 linear. arg2: bias of #1 linear. // COM: arg3: weight of #2 linear. arg4: bias of #2 linear. - func.func @entry(%arg0: tensor<64x512xbf16>, %arg1: tensor<512x256xbf16>, %arg2: tensor<256xbf16>, %arg3: tensor<256x1024xbf16>, %arg4: tensor<1024xbf16>) -> tensor<64x1024xbf16> attributes {llvm.emit_c_interface, onednn_graph.const_args = [1 : i32, 2 : i32, 3 : i32, 4 : i32]} { + func.func @entry(%arg0: tensor<64x512xbf16>, %arg1: tensor<512x256xbf16>, %arg2: tensor<256xbf16>, %arg3: tensor<256x1024xbf16>, %arg4: tensor<1024xbf16>) -> tensor<64x1024xbf16> attributes {llvm.emit_c_interface, runtime_const_args_index = [1 : i32, 2 : i32, 3 : i32, 4 : i32]} { %1 = tensor.empty() : tensor<2x16x32x32xbf16> %packed_arg0 = tensor.pack %arg0 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %1 : tensor<64x512xbf16> -> tensor<2x16x32x32xbf16> %2 = tensor.empty() : tensor<8x16x32x32xbf16> @@ -78,5 +78,5 @@ module { // COM: llvm.mlir.global external constant @__compute_args(dense<[5, 0, 5, 6, 7, 8]> : tensor<6xi32>) {addr_space = 0 : i32} : !llvm.array<6 x i32> // COM: llvm.mlir.global external constant @__fold_args(dense<[8, 1, 2, 3, 4, 5, 6, 7, 8]> : tensor<9xi32>) {addr_space = 0 : i32} : !llvm.array<9 x i32> // COM: llvm.mlir.global external constant @__fold_buffer_ids(dense<[4, 0, 1, 2, 3]> : tensor<5xi64>) {addr_space = 0 : i32} : !llvm.array<5 x i64> -// COM: func.func @entry(%arg0: tensor<64x512xbf16>, %arg1: tensor<8x16x16x32x2xbf16>, %arg2: tensor<8x32xbf16>, %arg3: tensor<32x8x16x32x2xbf16>, %arg4: tensor<32x32xbf16>) -> tensor<64x1024xbf16> attributes {llvm.emit_c_interface, onednn_graph.const_args = [1 : i32, 2 : i32, 3 : i32, 4 : i32]} +// COM: func.func @entry(%arg0: tensor<64x512xbf16>, %arg1: tensor<8x16x16x32x2xbf16>, %arg2: tensor<8x32xbf16>, %arg3: tensor<32x8x16x32x2xbf16>, %arg4: tensor<32x32xbf16>) -> tensor<64x1024xbf16> attributes {llvm.emit_c_interface, runtime_const_args_index = [1 : i32, 2 : i32, 3 : i32, 4 : i32]} // COM: 
func.func @fold(%arg0: tensor<512x256xbf16>, %arg1: tensor<256xbf16>, %arg2: tensor<256x1024xbf16>, %arg3: tensor<1024xbf16>) -> (tensor<8x16x16x32x2xbf16>, tensor<8x32xbf16>, tensor<32x8x16x32x2xbf16>, tensor<32x32xbf16>) attributes {llvm.emit_c_interface} From afec52ae8f788972cef6a9573330aa15e0a60526 Mon Sep 17 00:00:00 2001 From: "Niu, Xiaoguang" Date: Thu, 25 Jul 2024 01:40:16 -0700 Subject: [PATCH 16/29] Add compile_time_fold and runtime_fold. --- lib/gc/Transforms/ConstantTensorFolding.cpp | 97 +++++++++++++-------- 1 file changed, 62 insertions(+), 35 deletions(-) diff --git a/lib/gc/Transforms/ConstantTensorFolding.cpp b/lib/gc/Transforms/ConstantTensorFolding.cpp index 2df13adcc..ce75e4409 100644 --- a/lib/gc/Transforms/ConstantTensorFolding.cpp +++ b/lib/gc/Transforms/ConstantTensorFolding.cpp @@ -11,11 +11,10 @@ // //===----------------------------------------------------------------------===// +#include "mlir/Transforms/Passes.h" #include #include -#include "mlir/Transforms/Passes.h" - #include "mlir/Dialect/Arith/IR/Arith.h" #include "mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h" #include "mlir/Dialect/Bufferization/Transforms/Bufferize.h" @@ -388,21 +387,15 @@ static void addGlobalI32Array(ModuleOp &module, Location loc, (void)global; } -std::unordered_set getConstArgsIndexes(Operation &topFunc) { +std::unordered_set getConstArgsIndexes(Operation &topFunc, + bool compiletime) { auto topFuncAttr = topFunc.getAttrDictionary(); std::unordered_set constArgsIndexes; - std::optional compiletimeConstArgs = - topFuncAttr.getNamed("compiletime_const_args_index"); - if (compiletimeConstArgs.has_value()) { - for (auto id : - llvm::dyn_cast(compiletimeConstArgs->getValue())) { - constArgsIndexes.insert(llvm::cast(id).getInt()); - } - } - std::optional runtimeConstArgs = - topFuncAttr.getNamed("runtime_const_args_index"); - if (runtimeConstArgs.has_value()) { - for (auto id : llvm::dyn_cast(runtimeConstArgs->getValue())) { + std::string attrName = + compiletime ? 
"compiletime_const_args_index" : "runtime_const_args_index"; + std::optional constArgs = topFuncAttr.getNamed(attrName); + if (constArgs.has_value()) { + for (auto id : llvm::dyn_cast(constArgs->getValue())) { constArgsIndexes.insert(llvm::cast(id).getInt()); } } @@ -542,16 +535,16 @@ void getInputsAndOutputs(Block &block, } func::FuncOp buildFoldFunc(MLIRContext *context, OpBuilder &builder, - Operation *topOp, SmallVector constOps, + Operation *topOp, std::string name, + SmallVector constOps, SmallVector &inputTypes, SmallVector &inputValues, SmallVector &outputTypes, SmallVector &outputValues) { - std::string funcName("fold"); FunctionType foldFuncType = FunctionType::get(context, inputTypes, outputTypes); func::FuncOp foldFunc = - builder.create(topOp->getLoc(), funcName, foldFuncType); + builder.create(topOp->getLoc(), name, foldFuncType); Block *foldBlock = foldFunc.addEntryBlock(); // values of folded constant tensors in foldBlock SmallVector outputValuesInFold; @@ -584,8 +577,8 @@ func::FuncOp buildFoldFunc(MLIRContext *context, OpBuilder &builder, } globalIndexes.insert(globalIndexes.begin(), globalIndexes.size()); auto moduleOp = dyn_cast(topOp); - addGlobalI64Array(moduleOp, moduleOp.getLoc(), builder, "__fold_buffer_ids", - globalIndexes); + addGlobalI64Array(moduleOp, moduleOp.getLoc(), builder, + "__" + name + "_buffer_ids_", globalIndexes); auto returnOp = builder.create(topOp->getLoc(), outputValuesInFold); @@ -736,8 +729,11 @@ void ConstantTensorFolding::runOnOperation() { Region ®ion = topFunc.getRegions().front(); Block &block = region.getBlocks().front(); - std::unordered_set constArgsIndexes = getConstArgsIndexes(topFunc); - if (constArgsIndexes.empty()) { + std::unordered_set compiletimeConstArgsIndexes = + getConstArgsIndexes(topFunc, true); + std::unordered_set runtimeConstArgsIndexes = + getConstArgsIndexes(topFunc, false); + if (compiletimeConstArgsIndexes.empty() && runtimeConstArgsIndexes.empty()) { return; } @@ -750,21 +746,52 @@ void ConstantTensorFolding::runOnOperation() { } } - SmallVector inputTypes; // types of constant tensors + // ===== build compile time folding function ===== + SmallVector compiletimeInputTypes; // types of constant tensors // values of constant tensors in original block - SmallVector inputValues; - SmallVector outputTypes; // types of folded constant tensors + SmallVector compiletimeInputValues; + SmallVector compiletimeOutputTypes; // types of folded constant tensors // values of folded constant tensors in original block - SmallVector outputValues; - getArithConstantOutputs(block, outputTypes, outputValues); - getInputsAndOutputs(block, constArgsIndexes, inputTypes, inputValues, - outputTypes, outputValues); - - func::FuncOp foldFunc = - buildFoldFunc(context, builder, topOp, constOps, inputTypes, inputValues, - outputTypes, outputValues); - (void)foldFunc; - + SmallVector compiletimeOutputValues; + getArithConstantOutputs(block, compiletimeOutputTypes, + compiletimeOutputValues); + getInputsAndOutputs(block, compiletimeConstArgsIndexes, compiletimeInputTypes, + compiletimeInputValues, compiletimeOutputTypes, + compiletimeOutputValues); + + func::FuncOp compiletimeFoldFunc = + buildFoldFunc(context, builder, topOp, "compiletime_fold", constOps, + compiletimeInputTypes, compiletimeInputValues, + compiletimeOutputTypes, compiletimeOutputValues); + (void)compiletimeFoldFunc; + canonicalizeAndClean(context, compiletimeFoldFunc.getOperation()); + + // ===== build runtime folding function ===== + SmallVector runtimeInputTypes; // types of 
constant tensors + // values of constant tensors in original block + SmallVector runtimeInputValues; + SmallVector runtimeOutputTypes; // types of folded constant tensors + // values of folded constant tensors in original block + SmallVector runtimeOutputValues; + getInputsAndOutputs(block, runtimeConstArgsIndexes, runtimeInputTypes, + runtimeInputValues, runtimeOutputTypes, + runtimeOutputValues); + + func::FuncOp runtimeFoldFunc = buildFoldFunc( + context, builder, topOp, "runtime_fold", constOps, runtimeInputTypes, + runtimeInputValues, runtimeOutputTypes, runtimeOutputValues); + (void)runtimeFoldFunc; + canonicalizeAndClean(context, runtimeFoldFunc.getOperation()); + + // ===== build computing function ===== + std::unordered_set constArgsIndexes = compiletimeConstArgsIndexes; + constArgsIndexes.merge(runtimeConstArgsIndexes); + SmallVector outputTypes = compiletimeOutputTypes; + outputTypes.insert(outputTypes.end(), runtimeOutputTypes.begin(), + runtimeOutputTypes.end()); + SmallVector outputValues = compiletimeOutputValues; + outputValues.insert(outputValues.end(), runtimeOutputValues.begin(), + runtimeOutputValues.end()); modifyComputeFunc(context, builder, topOp, topFunc, block, constArgsIndexes, outputTypes, outputValues); From 9c4fd70a0d1a1ce23a233f9b0d6a4d3481821bb4 Mon Sep 17 00:00:00 2001 From: "Niu, Xiaoguang" Date: Thu, 25 Jul 2024 19:38:14 -0700 Subject: [PATCH 17/29] Fix license and tidy --- .../gc/Analysis/DataFlow/ConstantSubgraphAnalyser.h | 4 ++-- .../Analysis/DataFlow/ConstantSubgraphAnalyser.cpp | 13 ++++++------- lib/gc/Transforms/ConstantSubgraphAnalysis.cpp | 5 ++--- lib/gc/Transforms/ConstantTensorFolding.cpp | 13 ++++++------- 4 files changed, 16 insertions(+), 19 deletions(-) diff --git a/include/gc/Analysis/DataFlow/ConstantSubgraphAnalyser.h b/include/gc/Analysis/DataFlow/ConstantSubgraphAnalyser.h index a5a199914..d2dc4ffa4 100644 --- a/include/gc/Analysis/DataFlow/ConstantSubgraphAnalyser.h +++ b/include/gc/Analysis/DataFlow/ConstantSubgraphAnalyser.h @@ -1,6 +1,6 @@ -//===- ConstantSubgraphAnalyser.h - Constant subgraph analysis ------===// +//===-- ConstantSubgraphAnalyser.h - Constant subgraph ----------*- C++ -*-===// // -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // diff --git a/lib/gc/Analysis/DataFlow/ConstantSubgraphAnalyser.cpp b/lib/gc/Analysis/DataFlow/ConstantSubgraphAnalyser.cpp index 584c7e8ce..640b3ef59 100644 --- a/lib/gc/Analysis/DataFlow/ConstantSubgraphAnalyser.cpp +++ b/lib/gc/Analysis/DataFlow/ConstantSubgraphAnalyser.cpp @@ -1,6 +1,6 @@ -//===- ConstantSubgraphAnalyser.cpp - Constant subgraph analysis ----===// +//===-- ConstantSubgraphAnalyser.cpp - Constant subgraph -------*- C++ -*-===// // -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // @@ -44,7 +44,6 @@ void InConstantSubgraph::print(raw_ostream &os) const { return; } os << getInConstantSubgraph(); - return; } //===----------------------------------------------------------------------===// @@ -61,7 +60,7 @@ void ConstantSubgraphAnalyser::visitOperation( if (op->hasTrait()) { LLVM_DEBUG(llvm::dbgs() << "Curr op is a Constant op\n"); in = true; - } else if (operands.size() == 0) { // For example, tensor.empty() + } else if (operands.empty()) { // For example, tensor.empty() LLVM_DEBUG(llvm::dbgs() << "Curr op has 0 operand, constant\n"); in = true; } else { @@ -177,11 +176,11 @@ RunConstantSubgraphAnalyser::RunConstantSubgraphAnalyser() { solver.load(); } -void RunConstantSubgraphAnalyser::run(Operation *topFunc) { - if (failed(solver.initializeAndRun(topFunc))) { +void RunConstantSubgraphAnalyser::run(Operation *op) { + if (failed(solver.initializeAndRun(op))) { return; } - getConstantSubgraph(solver, topFunc); + getConstantSubgraph(solver, op); } bool RunConstantSubgraphAnalyser::getInConstantSubgraph(Value val) { diff --git a/lib/gc/Transforms/ConstantSubgraphAnalysis.cpp b/lib/gc/Transforms/ConstantSubgraphAnalysis.cpp index d4f183326..ed481720b 100644 --- a/lib/gc/Transforms/ConstantSubgraphAnalysis.cpp +++ b/lib/gc/Transforms/ConstantSubgraphAnalysis.cpp @@ -1,7 +1,6 @@ -//===- ConstantSubgraphAnalysis.cpp - Constant Subgraph Analysis -//-----------------===// +//===-- ConstantSubgraphAnalysis.cpp - Constant Subgraph --------*- C++ -*-===// // -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // diff --git a/lib/gc/Transforms/ConstantTensorFolding.cpp b/lib/gc/Transforms/ConstantTensorFolding.cpp index ce75e4409..5b9e7a5b4 100644 --- a/lib/gc/Transforms/ConstantTensorFolding.cpp +++ b/lib/gc/Transforms/ConstantTensorFolding.cpp @@ -1,7 +1,6 @@ -//===- ConstantTensorFolding.cpp - Constant Subgraph Transform -//-----------------===// +//===-- ConstantTensorFolding.cpp - Constant Folding ------------*- C++ -*-===// // -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
@@ -326,8 +325,8 @@ struct ConstGraphTensorCacheManager {
 
   // alloc and set the buf_base_ and offset_ attributes of cache
   std::vector<int64_t> alloc(std::vector<size_t> buffersSize) {
     size_t totalSize = 0;
-    for (size_t i = 0; i < buffersSize.size(); i++) {
-      totalSize += divideAndCeil(buffersSize[i], 64) * 64;
+    for (size_t size : buffersSize) {
+      totalSize += divideAndCeil(size, 64) * 64;
     }
     llvm::dbgs() << "Alloc total size: " << totalSize << '\n';
     // auto base = createConstCacheProxy(totalSize);
@@ -535,8 +534,8 @@ void getInputsAndOutputs(Block &block,
 }
 
 func::FuncOp buildFoldFunc(MLIRContext *context, OpBuilder &builder,
-                           Operation *topOp, std::string name,
-                           SmallVector<Operation *> constOps,
+                           Operation *topOp, const std::string &name,
+                           const SmallVector<Operation *> &constOps,
                            SmallVector<Type> &inputTypes,
                            SmallVector<Value> &inputValues,
                            SmallVector<Type> &outputTypes,

From fad5f92f94f5681b62abc4f522871625b41b0d50 Mon Sep 17 00:00:00 2001
From: "Niu, Xiaoguang"
Date: Fri, 26 Jul 2024 01:54:29 -0700
Subject: [PATCH 18/29] Fix link

---
 CMakeLists.txt                                         |  2 ++
 lib/gc/Analysis/CMakeLists.txt                         |  6 ++++++
 lib/gc/CAPI/CMakeLists.txt                             |  1 +
 .../Transforms/test_constant_tensor_folding-1.mlir     |  2 +-
 .../Transforms/test_constant_tensor_folding-2.mlir     | 14 ++++++++++++++
 .../Transforms/test_constant_tensor_folding.mlir       |  2 +-
 6 files changed, 25 insertions(+), 2 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 636b33ad2..07164d7da 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -106,6 +106,8 @@ endif()
 set(GC_LIB_LINKED_LIBS
   GCJitWrapper
   GCCpuRuntime
+  GCPasses
+  GCAnalysis
 )
 add_mlir_library(graph_compiler SHARED ${GC_LIB_SOURCES})
 target_include_directories(graph_compiler PUBLIC ${GC_LIB_INCLUDES})
diff --git a/lib/gc/Analysis/CMakeLists.txt b/lib/gc/Analysis/CMakeLists.txt
index 9b5994f3d..403748041 100644
--- a/lib/gc/Analysis/CMakeLists.txt
+++ b/lib/gc/Analysis/CMakeLists.txt
@@ -1,3 +1,7 @@
+gc_set_mlir_link_components(MLIR_LINK_COMPONENTS
+  MLIRIR
+  MLIRSupport)
+
 add_mlir_library(GCAnalysis
   DataFlow/ConstantSubgraphAnalyser.cpp
 
@@ -14,3 +18,5 @@ add_mlir_library(GCAnalysis
     MLIRBufferizationToMemRef
     MLIRBufferizationPipelines
   )
+
+set_property(GLOBAL APPEND PROPERTY GC_PASS_LIBS GCAnalysis)
diff --git a/lib/gc/CAPI/CMakeLists.txt b/lib/gc/CAPI/CMakeLists.txt
index 1d2e7687e..aca399ad7 100644
--- a/lib/gc/CAPI/CMakeLists.txt
+++ b/lib/gc/CAPI/CMakeLists.txt
@@ -6,5 +6,6 @@ add_mlir_public_c_api_library(GcCAPI
   MLIRCPURuntimeDialect
   GCPasses
   GCGPUPasses
+  GCAnalysis
   MLIRCPURuntimeTransforms
 )
\ No newline at end of file
diff --git a/test/gc/Transforms/test_constant_tensor_folding-1.mlir b/test/gc/Transforms/test_constant_tensor_folding-1.mlir
index fa4fcb210..0664edafb 100644
--- a/test/gc/Transforms/test_constant_tensor_folding-1.mlir
+++ b/test/gc/Transforms/test_constant_tensor_folding-1.mlir
@@ -24,7 +24,7 @@ module {
 // CHECK: cpuruntime.printf
 // CHECK: linalg.add
 // CHECK: linalg.mul
-// CHECK: func.func @fold
+// CHECK: func.func @runtime_fold
 // CHECK: linalg.add
 // CHECK: linalg.add
 // CHECK: linalg.add
diff --git a/test/gc/Transforms/test_constant_tensor_folding-2.mlir b/test/gc/Transforms/test_constant_tensor_folding-2.mlir
index 8d9e4ed53..85208815e 100644
--- a/test/gc/Transforms/test_constant_tensor_folding-2.mlir
+++ b/test/gc/Transforms/test_constant_tensor_folding-2.mlir
@@ -59,3 +59,17 @@ module {
     return %unpack : tensor<64x1024xbf16>
   }
 }
+
+// COM: 1 pack in entry for input feature,
+// COM: 4 packs in compiletime_fold for 2 weights,
+// COM: 2 packs in runtime_fold for 1 weight
+
+// CHECK: tensor.pack
+// CHECK: func.func @compiletime_fold
+// CHECK: tensor.pack
+// CHECK: tensor.pack
+// CHECK: tensor.pack
+// CHECK: tensor.pack
+// CHECK: func.func @runtime_fold
+// CHECK: tensor.pack
+// CHECK: tensor.pack
diff --git a/test/gc/Transforms/test_constant_tensor_folding.mlir b/test/gc/Transforms/test_constant_tensor_folding.mlir
index d55f42039..71f475c00 100644
--- a/test/gc/Transforms/test_constant_tensor_folding.mlir
+++ b/test/gc/Transforms/test_constant_tensor_folding.mlir
@@ -68,7 +68,7 @@ module {
   }
 }
 // CHECK: linalg.broadcast
-// CHECK: func.func @fold
+// CHECK: func.func @runtime_fold
 // CHECK: arith.extf
 // CHECK: arith.truncf

From 57f887dbee1671337ab3d367ee8573568b6fbaaa Mon Sep 17 00:00:00 2001
From: "Niu, Xiaoguang"
Date: Sun, 28 Jul 2024 19:34:20 -0700
Subject: [PATCH 19/29] Only enable runtime folding

---
 lib/gc/Transforms/ConstantTensorFolding.cpp            | 124 +++++++++++-------
 .../test_constant_tensor_folding-2.mlir                |  16 ++-
 2 files changed, 89 insertions(+), 51 deletions(-)

diff --git a/lib/gc/Transforms/ConstantTensorFolding.cpp b/lib/gc/Transforms/ConstantTensorFolding.cpp
index 5b9e7a5b4..3f38dda77 100644
--- a/lib/gc/Transforms/ConstantTensorFolding.cpp
+++ b/lib/gc/Transforms/ConstantTensorFolding.cpp
@@ -745,54 +745,82 @@ void ConstantTensorFolding::runOnOperation() {
     }
   }
 
-  // ===== build compile time folding function =====
-  SmallVector<Type> compiletimeInputTypes; // types of constant tensors
-  // values of constant tensors in original block
-  SmallVector<Value> compiletimeInputValues;
-  SmallVector<Type> compiletimeOutputTypes; // types of folded constant tensors
-  // values of folded constant tensors in original block
-  SmallVector<Value> compiletimeOutputValues;
-  getArithConstantOutputs(block, compiletimeOutputTypes,
-                          compiletimeOutputValues);
-  getInputsAndOutputs(block, compiletimeConstArgsIndexes, compiletimeInputTypes,
-                      compiletimeInputValues, compiletimeOutputTypes,
-                      compiletimeOutputValues);
-
-  func::FuncOp compiletimeFoldFunc =
-      buildFoldFunc(context, builder, topOp, "compiletime_fold", constOps,
-                    compiletimeInputTypes, compiletimeInputValues,
-                    compiletimeOutputTypes, compiletimeOutputValues);
-  (void)compiletimeFoldFunc;
-  canonicalizeAndClean(context, compiletimeFoldFunc.getOperation());
-
-  // ===== build runtime folding function =====
-  SmallVector<Type> runtimeInputTypes; // types of constant tensors
-  // values of constant tensors in original block
-  SmallVector<Value> runtimeInputValues;
-  SmallVector<Type> runtimeOutputTypes; // types of folded constant tensors
-  // values of folded constant tensors in original block
-  SmallVector<Value> runtimeOutputValues;
-  getInputsAndOutputs(block, runtimeConstArgsIndexes, runtimeInputTypes,
-                      runtimeInputValues, runtimeOutputTypes,
-                      runtimeOutputValues);
-
-  func::FuncOp runtimeFoldFunc = buildFoldFunc(
-      context, builder, topOp, "runtime_fold", constOps, runtimeInputTypes,
-      runtimeInputValues, runtimeOutputTypes, runtimeOutputValues);
-  (void)runtimeFoldFunc;
-  canonicalizeAndClean(context, runtimeFoldFunc.getOperation());
-
-  // ===== build computing function =====
-  std::unordered_set<size_t> constArgsIndexes = compiletimeConstArgsIndexes;
-  constArgsIndexes.merge(runtimeConstArgsIndexes);
-  SmallVector<Type> outputTypes = compiletimeOutputTypes;
-  outputTypes.insert(outputTypes.end(), runtimeOutputTypes.begin(),
-                     runtimeOutputTypes.end());
-  SmallVector<Value> outputValues = compiletimeOutputValues;
-  outputValues.insert(outputValues.end(), runtimeOutputValues.begin(),
-                      runtimeOutputValues.end());
-  modifyComputeFunc(context, builder, topOp, topFunc, block, constArgsIndexes,
-                    outputTypes, outputValues);
+  bool enableCompiletimeFolding = false;
+  if (enableCompiletimeFolding) {
+    // ===== build compile time folding function =====
+    SmallVector<Type> compiletimeInputTypes; // types of constant tensors
+    // values of constant tensors in original block
+    SmallVector<Value> compiletimeInputValues;
+    SmallVector<Type>
+        compiletimeOutputTypes; // types of folded constant tensors
+    // values of folded constant tensors in original block
+    SmallVector<Value> compiletimeOutputValues;
+    getArithConstantOutputs(block, compiletimeOutputTypes,
+                            compiletimeOutputValues);
+    getInputsAndOutputs(block, compiletimeConstArgsIndexes,
+                        compiletimeInputTypes, compiletimeInputValues,
+                        compiletimeOutputTypes, compiletimeOutputValues);
+
+    func::FuncOp compiletimeFoldFunc =
+        buildFoldFunc(context, builder, topOp, "compiletime_fold", constOps,
+                      compiletimeInputTypes, compiletimeInputValues,
+                      compiletimeOutputTypes, compiletimeOutputValues);
+    (void)compiletimeFoldFunc;
+    canonicalizeAndClean(context, compiletimeFoldFunc.getOperation());
+
+    // ===== build runtime folding function =====
+    SmallVector<Type> runtimeInputTypes; // types of constant tensors
+    // values of constant tensors in original block
+    SmallVector<Value> runtimeInputValues;
+    SmallVector<Type> runtimeOutputTypes; // types of folded constant tensors
+    // values of folded constant tensors in original block
+    SmallVector<Value> runtimeOutputValues;
+    getInputsAndOutputs(block, runtimeConstArgsIndexes, runtimeInputTypes,
+                        runtimeInputValues, runtimeOutputTypes,
+                        runtimeOutputValues);
+
+    func::FuncOp runtimeFoldFunc = buildFoldFunc(
+        context, builder, topOp, "runtime_fold", constOps, runtimeInputTypes,
+        runtimeInputValues, runtimeOutputTypes, runtimeOutputValues);
+    (void)runtimeFoldFunc;
+    canonicalizeAndClean(context, runtimeFoldFunc.getOperation());
+
+    // ===== build computing function =====
+    std::unordered_set<size_t> constArgsIndexes = compiletimeConstArgsIndexes;
+    constArgsIndexes.merge(runtimeConstArgsIndexes);
+    SmallVector<Type> outputTypes = compiletimeOutputTypes;
+    outputTypes.insert(outputTypes.end(), runtimeOutputTypes.begin(),
+                       runtimeOutputTypes.end());
+    SmallVector<Value> outputValues = compiletimeOutputValues;
+    outputValues.insert(outputValues.end(), runtimeOutputValues.begin(),
+                        runtimeOutputValues.end());
+    modifyComputeFunc(context, builder, topOp, topFunc, block,
+                      constArgsIndexes, outputTypes, outputValues);
+  } else {
+    std::unordered_set<size_t> constArgsIndexes = compiletimeConstArgsIndexes;
+    constArgsIndexes.merge(runtimeConstArgsIndexes);
+
+    // ===== build runtime folding function =====
+    SmallVector<Type> inputTypes; // types of constant tensors
+    // values of constant tensors in original block
+    SmallVector<Value> inputValues;
+    SmallVector<Type> outputTypes; // types of folded constant tensors
+    // values of folded constant tensors in original block
+    SmallVector<Value> outputValues;
+    getArithConstantOutputs(block, outputTypes, outputValues);
+    getInputsAndOutputs(block, constArgsIndexes, inputTypes, inputValues,
+                        outputTypes, outputValues);
+
+    func::FuncOp foldFunc =
+        buildFoldFunc(context, builder, topOp, "runtime_fold", constOps,
+                      inputTypes, inputValues, outputTypes, outputValues);
+    (void)foldFunc;
+    canonicalizeAndClean(context, foldFunc.getOperation());
+
+    // ===== build computing function =====
+    modifyComputeFunc(context, builder, topOp, topFunc, block,
+                      constArgsIndexes, outputTypes, outputValues);
+  }
 
   canonicalizeAndClean(context, topOp);
 }
diff --git a/test/gc/Transforms/test_constant_tensor_folding-2.mlir b/test/gc/Transforms/test_constant_tensor_folding-2.mlir
index 85208815e..a5e123085 100644
--- a/test/gc/Transforms/test_constant_tensor_folding-2.mlir
+++ b/test/gc/Transforms/test_constant_tensor_folding-2.mlir
@@ -60,16 +60,26 @@ module {
   }
 }
 
+// COM: If compile time folding is enabled,
 // COM: 1 pack in entry for input feature,
 // COM: 4 packs in compiletime_fold for 2 weights,
-// COM: 2 packs in runtime_fold for 1 weight
+// COM: 2 packs in runtime_fold for 1 weight:
+// COM: CHECK: tensor.pack
+// COM: CHECK: func.func @compiletime_fold
+// COM: CHECK: tensor.pack
+// COM: CHECK: tensor.pack
+// COM: CHECK: tensor.pack
+// COM: CHECK: tensor.pack
+// COM: CHECK: func.func @runtime_fold
+// COM: CHECK: tensor.pack
+// COM: CHECK: tensor.pack
+// COM: else,
 
 // CHECK: tensor.pack
-// CHECK: func.func @compiletime_fold
+// CHECK: func.func @runtime_fold
 // CHECK: tensor.pack
 // CHECK: tensor.pack
 // CHECK: tensor.pack
 // CHECK: tensor.pack
-// CHECK: func.func @runtime_fold
 // CHECK: tensor.pack
 // CHECK: tensor.pack

From 1fc3b9f2c28e4ce99d15956b56fd2794ea4362a0 Mon Sep 17 00:00:00 2001
From: "Niu, Xiaoguang"
Date: Sun, 28 Jul 2024 22:33:05 -0700
Subject: [PATCH 20/29] Rename and polish

---
 .../DataFlow/ConstantSubgraphAnalyser.h       | 66 +++++++++----------
 .../DataFlow/ConstantSubgraphAnalyser.cpp     | 41 ++++++------
 .../Transforms/ConstantSubgraphAnalysis.cpp   |  2 +-
 lib/gc/Transforms/ConstantTensorFolding.cpp   | 16 +++--
 4 files changed, 64 insertions(+), 61 deletions(-)

diff --git a/include/gc/Analysis/DataFlow/ConstantSubgraphAnalyser.h b/include/gc/Analysis/DataFlow/ConstantSubgraphAnalyser.h
index d2dc4ffa4..288ee74c4 100644
--- a/include/gc/Analysis/DataFlow/ConstantSubgraphAnalyser.h
+++ b/include/gc/Analysis/DataFlow/ConstantSubgraphAnalyser.h
@@ -5,68 +5,66 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
-//
-// This file implements constant subgraph analysis. In this file are:
-// 1. the lattice value class that represents operations with constant inputs
-// and outputs in the program, and
-// 2. a sparse constant subgraph analysis.
-//
-//===----------------------------------------------------------------------===//
+///
+/// This file implements constant subgraph analysis. In this file are:
+/// 1. the lattice value class that represents operations with constant inputs
+/// and outputs in the program, and
+/// 2. a sparse constant subgraph analysis.
+///
+///===----------------------------------------------------------------------===//
 
 #ifndef MLIR_ANALYSIS_DATAFLOW_CONSTANTSUBGRAPHANALYSER_H
 #define MLIR_ANALYSIS_DATAFLOW_CONSTANTSUBGRAPHANALYSER_H
 
 #include "mlir/Analysis/DataFlow/SparseAnalysis.h"
-#include
 
 namespace mlir {
 namespace dataflow {
 
//===----------------------------------------------------------------------===//
-// InConstantSubgraph
+// IsConstantTensor
 //===----------------------------------------------------------------------===//
 
-/// This lattice represents a boolean integer indicating if an operation is
-/// with constant inputs and constant outputs and hence in constant subgraph.
-class InConstantSubgraph {
+/// This lattice represents a boolean indicating if a value is constant.
+class IsConstantTensor {
 public:
   /// Construct as uninitialized.
-  explicit InConstantSubgraph() = default;
+  explicit IsConstantTensor() = default;
 
   /// Construct with a known state.
-  explicit InConstantSubgraph(bool initialized, bool inConstantSubgraph)
-      : initialized(initialized), inConstantSubgraph(inConstantSubgraph) {}
+  explicit IsConstantTensor(bool initialized, bool isConstantTensor)
+      : initialized(initialized), isConstantTensor(isConstantTensor) {}
 
-  /// Get the state. Returns null if no value was determined.
-  bool getInConstantSubgraph() const {
+  /// Get the state. Must be initialized before.
+  bool getIsConstantTensor() const {
     assert(!isUninitialized());
-    return inConstantSubgraph;
+    return isConstantTensor;
   }
 
   /// Compare.
-  bool operator==(const InConstantSubgraph &rhs) const {
+  bool operator==(const IsConstantTensor &rhs) const {
     return initialized == rhs.initialized &&
-           inConstantSubgraph == rhs.inConstantSubgraph;
+           isConstantTensor == rhs.isConstantTensor;
   }
 
   void print(raw_ostream &os) const;
 
   /// Get uninitialized state. This happens when the
   /// state hasn't been set during the analysis.
-  static InConstantSubgraph getUninitialized() { return InConstantSubgraph{}; }
+  static IsConstantTensor getUninitialized() { return IsConstantTensor{}; }
 
   /// Whether the state is uninitialized.
   bool isUninitialized() const { return !initialized; }
 
   /// Get unknown state.
-  static InConstantSubgraph getUnknown() {
-    return InConstantSubgraph{/*initialized=*/false,
-                              /*inConstantSubgraph=*/false};
+  static IsConstantTensor getUnknown() {
+    return IsConstantTensor{/*initialized=*/false,
+                            /*isConstantTensor=*/false};
   }
 
   // Join two states.
-  static InConstantSubgraph join(const InConstantSubgraph &lhs,
-                                 const InConstantSubgraph &rhs) {
+  static IsConstantTensor join(const IsConstantTensor &lhs,
+                               const IsConstantTensor &rhs) {
     // if one is uninitialized, use another
     if (lhs.isUninitialized())
       return rhs;
@@ -75,15 +73,15 @@ class InConstantSubgraph {
 
     // both are initialized, intersect them
     if (!lhs.isUninitialized() && !rhs.isUninitialized()) {
-      return InConstantSubgraph(true, lhs.getInConstantSubgraph() &&
-                                          rhs.getInConstantSubgraph());
+      return IsConstantTensor(true, lhs.getIsConstantTensor() &&
+                                        rhs.getIsConstantTensor());
     }
     return getUninitialized();
   }
 
 private:
   bool initialized = false;
-  bool inConstantSubgraph = false;
+  bool isConstantTensor = false;
 };
 
 //===----------------------------------------------------------------------===//
@@ -91,15 +89,15 @@ class InConstantSubgraph {
 
 class ConstantSubgraphAnalyser
-    : public SparseForwardDataFlowAnalysis<Lattice<InConstantSubgraph>> {
+    : public SparseForwardDataFlowAnalysis<Lattice<IsConstantTensor>> {
 public:
   using SparseForwardDataFlowAnalysis::SparseForwardDataFlowAnalysis;
 
   void visitOperation(Operation *op,
-                      ArrayRef<const Lattice<InConstantSubgraph> *> operands,
-                      ArrayRef<Lattice<InConstantSubgraph> *> results) override;
+                      ArrayRef<const Lattice<IsConstantTensor> *> operands,
+                      ArrayRef<Lattice<IsConstantTensor> *> results) override;
 
-  void setToEntryState(Lattice<InConstantSubgraph> *lattice) override;
+  void setToEntryState(Lattice<IsConstantTensor> *lattice) override;
 };
 
 //===----------------------------------------------------------------------===//
@@ -113,7 +111,7 @@ struct RunConstantSubgraphAnalyser {
 
   void run(Operation *op);
 
-  bool getInConstantSubgraph(Value val);
+  bool getIsConstantTensor(Value val);
 
 private:
   /// Stores the result of the analysis.
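
For context on the lattice above: `join` treats the uninitialized state as the
identity element and intersects two initialized states, so a value reachable
along both a constant and a non-constant path ends up classified as
non-constant. A minimal sketch of that behavior (hypothetical `joinExamples`
helper; it assumes only the IsConstantTensor class from the header above):

    #include <cassert>
    #include "gc/Analysis/DataFlow/ConstantSubgraphAnalyser.h"

    void joinExamples() {
      using mlir::dataflow::IsConstantTensor;
      IsConstantTensor cst(true, true), nonCst(true, false), none;
      // Uninitialized is the identity: join(none, x) == x.
      assert(IsConstantTensor::join(none, cst) == cst);
      // Two initialized states intersect (logical AND):
      // constant meets non-constant -> non-constant.
      assert(IsConstantTensor::join(cst, nonCst) == nonCst);
      assert(IsConstantTensor::join(cst, cst) == cst);
    }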
diff --git a/lib/gc/Analysis/DataFlow/ConstantSubgraphAnalyser.cpp b/lib/gc/Analysis/DataFlow/ConstantSubgraphAnalyser.cpp
index 640b3ef59..ff291d6b0 100644
--- a/lib/gc/Analysis/DataFlow/ConstantSubgraphAnalyser.cpp
+++ b/lib/gc/Analysis/DataFlow/ConstantSubgraphAnalyser.cpp
@@ -35,15 +35,15 @@ using namespace mlir;
 using namespace mlir::dataflow;
 
 //===----------------------------------------------------------------------===//
-// InConstantSubgraph
+// IsConstantTensor
 //===----------------------------------------------------------------------===//
 
-void InConstantSubgraph::print(raw_ostream &os) const {
+void IsConstantTensor::print(raw_ostream &os) const {
   if (isUninitialized()) {
     os << "<uninitialized>";
     return;
   }
-  os << getInConstantSubgraph();
+  os << getIsConstantTensor();
 }
 
 //===----------------------------------------------------------------------===//
@@ -51,8 +51,8 @@ void InConstantSubgraph::print(raw_ostream &os) const {
 //===----------------------------------------------------------------------===//
 
 void ConstantSubgraphAnalyser::visitOperation(
-    Operation *op, ArrayRef<const Lattice<InConstantSubgraph> *> operands,
-    ArrayRef<Lattice<InConstantSubgraph> *> results) {
+    Operation *op, ArrayRef<const Lattice<IsConstantTensor> *> operands,
+    ArrayRef<Lattice<IsConstantTensor> *> results) {
   LLVM_DEBUG(llvm::dbgs() << "ConstantSubgraphAnalyser: Visiting operation:\n"
                           << *op << "\n");
 
@@ -67,7 +67,7 @@ void ConstantSubgraphAnalyser::visitOperation(
   LLVM_DEBUG(llvm::dbgs() << "Curr op has " << operands.size()
                           << " operands, check if constant\n");
   for (auto *operandLattice : operands) {
-    auto operandState = operandLattice->getValue().getInConstantSubgraph();
+    auto operandState = operandLattice->getValue().getIsConstantTensor();
     LLVM_DEBUG(llvm::dbgs() << "Operand: " << operandLattice->getPoint()
                             << ", lattice value: " << operandState << "\n");
     if (!operandState) {
@@ -81,20 +81,18 @@ void ConstantSubgraphAnalyser::visitOperation(
   if (!in) {
     LLVM_DEBUG(llvm::dbgs() << "Curr op not in constant subgraph\n");
     for (auto lattice : results) {
-      propagateIfChanged(lattice,
-                         lattice->join(InConstantSubgraph(true, false)));
+      propagateIfChanged(lattice, lattice->join(IsConstantTensor(true, false)));
     }
   } else {
     LLVM_DEBUG(llvm::dbgs() << "Curr op in constant subgraph\n");
     for (auto lattice : results) {
-      propagateIfChanged(lattice,
-                         lattice->join(InConstantSubgraph(true, true)));
+      propagateIfChanged(lattice, lattice->join(IsConstantTensor(true, true)));
     }
   }
 }
 
 void ConstantSubgraphAnalyser::setToEntryState(
-    Lattice<InConstantSubgraph> *lattice) {
+    Lattice<IsConstantTensor> *lattice) {
   if (auto blockArg = cast<BlockArgument>(lattice->getPoint())) {
     auto parentOp = blockArg.getParentBlock()->getParentOp();
     auto parentOpAttr = parentOp->getAttrDictionary();
@@ -119,14 +117,13 @@ void ConstantSubgraphAnalyser::setToEntryState(
     if (constArgsIndexes.count(blockArg.getArgNumber())) {
       LLVM_DEBUG(llvm::dbgs() << "Block argument: " << blockArg
                               << " is marked as constant\n");
-      propagateIfChanged(lattice,
-                         lattice->join(InConstantSubgraph(true, true)));
+      propagateIfChanged(lattice, lattice->join(IsConstantTensor(true, true)));
       return;
     }
-    propagateIfChanged(lattice, lattice->join(InConstantSubgraph(true, false)));
+    propagateIfChanged(lattice, lattice->join(IsConstantTensor(true, false)));
   } else {
     propagateIfChanged(lattice,
-                       lattice->join(InConstantSubgraph::getUninitialized()));
+                       lattice->join(IsConstantTensor::getUninitialized()));
   }
 }
 
@@ -149,13 +146,13 @@ void RunConstantSubgraphAnalyser::getConstantSubgraph(DataFlowSolver &solver,
       continue;
     }
     for (Value res : op.getResults()) {
-      auto *lattice = solver.lookupState<Lattice<InConstantSubgraph>>(res);
+      auto *lattice = solver.lookupState<Lattice<IsConstantTensor>>(res);
       if (!lattice || lattice->getValue().isUninitialized()) {
         resultsAllConstant = false;
         break;
       }
-      const InConstantSubgraph &latticeValue = lattice->getValue();
-      if (!latticeValue.getInConstantSubgraph()) {
+      const IsConstantTensor &latticeValue = lattice->getValue();
+      if (!latticeValue.getIsConstantTensor()) {
         resultsAllConstant = false;
         break;
       }
@@ -183,8 +180,8 @@ void RunConstantSubgraphAnalyser::run(Operation *op) {
   getConstantSubgraph(solver, op);
 }
 
-bool RunConstantSubgraphAnalyser::getInConstantSubgraph(Value val) {
-  auto *lattice = solver.lookupState<Lattice<InConstantSubgraph>>(val);
-  const InConstantSubgraph &latticeValue = lattice->getValue();
-  return latticeValue.getInConstantSubgraph();
+bool RunConstantSubgraphAnalyser::getIsConstantTensor(Value val) {
+  auto *lattice = solver.lookupState<Lattice<IsConstantTensor>>(val);
+  const IsConstantTensor &latticeValue = lattice->getValue();
+  return latticeValue.getIsConstantTensor();
 }
\ No newline at end of file
diff --git a/lib/gc/Transforms/ConstantSubgraphAnalysis.cpp b/lib/gc/Transforms/ConstantSubgraphAnalysis.cpp
index ed481720b..511d76f21 100644
--- a/lib/gc/Transforms/ConstantSubgraphAnalysis.cpp
+++ b/lib/gc/Transforms/ConstantSubgraphAnalysis.cpp
@@ -37,7 +37,7 @@ void ConstantSubgraphAnalysis::runOnOperation() {
   auto &func =
       op->getRegions().front().getBlocks().front().getOperations().front();
 
-  // Hard-code: set the #1 argument to be constant.
+  // Hard-coded example: mark some arguments as constant.
   // OpBuilder builder(op->getContext());
   // func.setAttr("runtime_const_args_index",
   //              builder.getI32ArrayAttr({1,2,3,4}));
diff --git a/lib/gc/Transforms/ConstantTensorFolding.cpp b/lib/gc/Transforms/ConstantTensorFolding.cpp
index 3f38dda77..d7174ec6e 100644
--- a/lib/gc/Transforms/ConstantTensorFolding.cpp
+++ b/lib/gc/Transforms/ConstantTensorFolding.cpp
@@ -10,10 +10,11 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "mlir/Transforms/Passes.h"
 #include
 #include
 
+#include "mlir/Transforms/Passes.h"
+
 #include "mlir/Dialect/Arith/IR/Arith.h"
 #include "mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h"
 #include "mlir/Dialect/Bufferization/Transforms/Bufferize.h"
@@ -31,6 +32,8 @@
 
 // #include "gc/ExecutionEngine/CPURuntime/ConstantCache.hpp"
 
+#define DEBUG_TYPE "constant-tensor-folding"
+
 namespace mlir {
 namespace gc {
 #define GEN_PASS_DEF_CONSTANTTENSORFOLDING
@@ -69,6 +72,10 @@ int64_t getTensorSize(TensorType t) {
   return size;
 }
 
+/// @brief Whether op is effectively single-operand: it has only one operand,
+/// or all of its operands are the same value, or each operand is either that
+/// same value or produced by a tensor.EmptyOp.
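+/// For example, `linalg.transpose ins(%w) outs(%e)` where `%e` comes from a
+/// tensor.EmptyOp is treated as having the single real operand `%w`.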
+/// @param op the operation to check
+/// @return true if op is effectively single-operand
 bool singleOperand(Operation *op) {
   if (op->getNumOperands() > 1) {
     Value firstOperand = op->getOperand(0);
@@ -328,12 +335,12 @@ struct ConstGraphTensorCacheManager {
     for (size_t size : buffersSize) {
       totalSize += divideAndCeil(size, 64) * 64;
     }
-    llvm::dbgs() << "Alloc total size: " << totalSize << '\n';
+    LLVM_DEBUG(llvm::dbgs() << "Alloc total size: " << totalSize << '\n');
     // auto base = createConstCacheProxy(totalSize);
     std::vector<int64_t> globalIds(buffersSize.size());
     size_t offset = 0;
     for (size_t i = 0; i < buffersSize.size(); i++) {
-      llvm::dbgs() << "Alloc offset: " << offset << '\n';
+      LLVM_DEBUG(llvm::dbgs() << "Alloc offset: " << offset << '\n');
       // regCachedTensor(cachedTensorGlobalId, base, offset);
       globalIds[i] = cachedTensorGlobalId;
       ++cachedTensorGlobalId;
@@ -565,7 +572,8 @@ func::FuncOp buildFoldFunc(MLIRContext *context, OpBuilder &builder,
   // Allocate buffer for outputValuesInFold
   std::vector<size_t> buffersSize;
   for (Value &tensor : outputValuesInFold) {
-    llvm::dbgs() << "Allocate buffer for tensor: " << tensor << "\n";
+    LLVM_DEBUG(llvm::dbgs()
+               << "Allocate buffer for tensor: " << tensor << "\n");
     buffersSize.push_back(
         getTensorSize(dyn_cast<TensorType>(tensor.getType())));
   }

From bfc12c71ce4c2a184a0b20206ba604b9d3efb524 Mon Sep 17 00:00:00 2001
From: "Niu, Xiaoguang"
Date: Tue, 6 Aug 2024 19:58:13 -0700
Subject: [PATCH 21/29] Add accuracy tests on mlp

---
 .../test_constant_tensor_folding_bf16_4D5D.py | 101 +++++++
 ...constant_tensor_folding_bf16_two_layers.py | 258 ++++++++++++++++++
 .../test_constant_tensor_folding_f32_4D4D.py  |  96 +++++++
 ..._constant_tensor_folding_f32_two_layers.py | 225 +++++++++++++++
 4 files changed, 680 insertions(+)
 create mode 100644 test/gc/Transforms/test_constant_tensor_folding_bf16_4D5D.py
 create mode 100644 test/gc/Transforms/test_constant_tensor_folding_bf16_two_layers.py
 create mode 100644 test/gc/Transforms/test_constant_tensor_folding_f32_4D4D.py
 create mode 100644 test/gc/Transforms/test_constant_tensor_folding_f32_two_layers.py

diff --git a/test/gc/Transforms/test_constant_tensor_folding_bf16_4D5D.py b/test/gc/Transforms/test_constant_tensor_folding_bf16_4D5D.py
new file mode 100644
index 000000000..0fafbd080
--- /dev/null
+++ b/test/gc/Transforms/test_constant_tensor_folding_bf16_4D5D.py
@@ -0,0 +1,101 @@
+################################################################################
+# Copyright (C) 2024 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+# SPDX-License-Identifier: Apache-2.0 +################################################################################ + +from enum import Flag +import os +import sys +import ml_dtypes +import numpy as np +from gc_mlir import ir +from gc_mlir.graph_compiler import GraphCompiler +from numpy.testing import assert_allclose + +project_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +if project_dir not in sys.path: + sys.path.insert(0, project_dir) + +import torch +# from bench import py_timeit_bench +from utils import get_mlir_args + +if __name__ == "__main__": + with ir.Context() as ctx: + ctx.allow_unregistered_dialects = True + + M = 64 + N = 256 + K = 512 + MBlock = 32 + NBlock = 32 + KBlock = 32 + vnni_size = 2 + shapeA = [M // MBlock, K // KBlock, MBlock, KBlock] + shapeB = [N // NBlock, K // KBlock, KBlock // vnni_size, NBlock, vnni_size] + shapeC = [M // MBlock, N // NBlock, MBlock, NBlock] + + block_start = "{" + block_end = "}" + mlir_str = f''' +#map = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d2, d4, d6)> +#map1 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d1, d2, d6 floordiv 2, d5, d3)> +#map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d4, d5)> +module {block_start} + func.func @entry(%arg0: tensor<{M // MBlock}x{K // KBlock}x{MBlock}x{KBlock}xbf16>, %cst: tensor<{N // NBlock}x{K // KBlock}x{KBlock // vnni_size}x{NBlock}x{vnni_size}xbf16>) -> tensor<{M // MBlock}x{N // NBlock}x{MBlock}x{NBlock}xbf16> attributes {block_start}llvm.emit_c_interface{block_end} {block_start} + %cst_0 = arith.constant 0.000000e+00 : bf16 + %0 = tensor.empty() : tensor<{M // MBlock}x{N // NBlock}x{MBlock}x{NBlock}xbf16> + %1 = linalg.fill ins(%cst_0 : bf16) outs(%0 : tensor<{M // MBlock}x{N // NBlock}x{MBlock}x{NBlock}xbf16>) -> tensor<{M // MBlock}x{N // NBlock}x{MBlock}x{NBlock}xbf16> + %2 = linalg.generic {block_start}indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction", "reduction", "parallel", "parallel", "reduction"]{block_end} ins(%arg0, %cst : tensor<{M // MBlock}x{K // KBlock}x{MBlock}x{KBlock}xbf16>, tensor<{N // NBlock}x{K // KBlock}x{KBlock // vnni_size}x{NBlock}x{vnni_size}xbf16>) outs(%1 : tensor<{M // MBlock}x{N // NBlock}x{MBlock}x{NBlock}xbf16>) {block_start} + ^bb0(%in: bf16, %in_1: bf16, %out: bf16): + %3 = arith.mulf %in, %in_1 : bf16 + %4 = arith.addf %out, %3 : bf16 + linalg.yield %4 : bf16 + {block_end} -> tensor<{M // MBlock}x{N // NBlock}x{MBlock}x{NBlock}xbf16> + return %2 : tensor<{M // MBlock}x{N // NBlock}x{MBlock}x{NBlock}xbf16> + {block_end} +{block_end} + ''' + print(mlir_str) + + # 4D x 5D, inputs transposed + module_in = ir.Module.parse(mlir_str) + + # entry(%transposed: tensor<2x16x32x32xbf16>, %transposed_5: tensor<8x16x16x32x2xbf16>) -> tensor<2x8x32x32xbf16> + torch_arg0 = torch.rand((M, K), dtype=torch.bfloat16) + torch_arg1 = torch.rand((K, N), dtype=torch.bfloat16) + ref_res = torch_arg0 @ torch_arg1 + + passes = "any(gc-cpu-pipeline)" + shared_libs = [ + os.environ["MLIR_C_RUNNER_UTILS"], + os.environ["MLIR_RUNNER_UTILS"], + ] + compiler = GraphCompiler(passes) + ctx.enable_multithreading(False) + + arg0 = torch_arg0.view(shapeA).permute([0, 2, 1, 3]).contiguous() # MK -> MKmk + np_arg0 = arg0.view(dtype=torch.uint16).numpy().view(ml_dtypes.bfloat16) + arg1 = torch_arg1.view(shapeB).permute([3, 0, 1, 4, 2]).contiguous() # KN -> NKkn2k + np_arg1 = arg1.view(dtype=torch.uint16).numpy().view(ml_dtypes.bfloat16) + gc_res = np.ones(shapeC, dtype=ml_dtypes.bfloat16) + + entry = "entry" + 
mlir_args = get_mlir_args(module_in, entry, [np_arg0, np_arg1, gc_res]) + engine_in = compiler.compile_and_jit(module_in, ir_printing=False) + engine_in.invoke(entry, *mlir_args) + gc_res = np.reshape(np.transpose(gc_res, (0, 2, 1, 3)), (M, N)) # MNmn -> MN + + assert_allclose(gc_res.astype(np.float32), ref_res.to(torch.float32).numpy(), rtol=1e-5, atol=1e-5) diff --git a/test/gc/Transforms/test_constant_tensor_folding_bf16_two_layers.py b/test/gc/Transforms/test_constant_tensor_folding_bf16_two_layers.py new file mode 100644 index 000000000..d444416e7 --- /dev/null +++ b/test/gc/Transforms/test_constant_tensor_folding_bf16_two_layers.py @@ -0,0 +1,258 @@ +################################################################################ +# Copyright (C) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions +# and limitations under the License. +# SPDX-License-Identifier: Apache-2.0 +################################################################################ + +import os +import sys + +import numpy as np +import ml_dtypes + +from gc_mlir import ir +from gc_mlir.graph_compiler import GraphCompiler +from numpy.testing import assert_allclose + +project_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +if project_dir not in sys.path: + sys.path.insert(0, project_dir) + +import torch +# from bench import py_timeit_bench +from utils import get_mlir_args + +if __name__ == "__main__": + with ir.Context() as ctx: + ctx.allow_unregistered_dialects = True + # ctx.enable_multithreading = False + module_in = ir.Module.parse( + """ +#map = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d2, d4, d6)> +#map1 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d1, d2, d6 floordiv 2, d5, d3)> +#map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d4, d5)> +#map3 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> +#map4 = affine_map<(d0, d1, d2, d3) -> (d1, d3)> +module { + func.func @entry(%arg0: tensor<64x512xbf16>, %arg1: tensor<512x256xbf16>, %arg2: tensor<256xbf16>, %arg3: tensor<256x1024xbf16>, %arg4: tensor<1024xbf16>) -> tensor<64x1024xbf16> attributes {llvm.emit_c_interface, runtime_const_args_index = [1 : i32, 2 : i32, 3 : i32, 4 : i32]} { + %0 = tensor.empty() : tensor<2x16x32x32xbf16> + %cst = arith.constant 0.000000e+00 : bf16 + %padded = tensor.pad %arg0 low[0, 0] high[0, 0] { + ^bb0(%arg5: index, %arg6: index): + tensor.yield %cst : bf16 + } : tensor<64x512xbf16> to tensor<64x512xbf16> + %expanded = tensor.expand_shape %padded [[0, 1], [2, 3]] output_shape [2, 32, 16, 32] : tensor<64x512xbf16> into tensor<2x32x16x32xbf16> + %transposed = linalg.transpose ins(%expanded : tensor<2x32x16x32xbf16>) outs(%0 : tensor<2x16x32x32xbf16>) permutation = [0, 2, 1, 3] + %1 = tensor.empty() : tensor<8x16x32x32xbf16> + %padded_0 = tensor.pad %arg1 low[0, 0] high[0, 0] { + ^bb0(%arg5: index, %arg6: index): + tensor.yield %cst : bf16 + } : tensor<512x256xbf16> to tensor<512x256xbf16> + %expanded_1 = tensor.expand_shape %padded_0 [[0, 1], [2, 3]] output_shape [16, 32, 8, 32] : tensor<512x256xbf16> into 
tensor<16x32x8x32xbf16> + %transposed_2 = linalg.transpose ins(%expanded_1 : tensor<16x32x8x32xbf16>) outs(%1 : tensor<8x16x32x32xbf16>) permutation = [2, 0, 1, 3] + %2 = tensor.empty() : tensor<8x16x16x32x2xbf16> + %padded_3 = tensor.pad %transposed_2 low[0, 0, 0, 0] high[0, 0, 0, 0] { + ^bb0(%arg5: index, %arg6: index, %arg7: index, %arg8: index): + tensor.yield %cst : bf16 + } : tensor<8x16x32x32xbf16> to tensor<8x16x32x32xbf16> + %expanded_4 = tensor.expand_shape %padded_3 [[0], [1], [2, 3], [4]] output_shape [8, 16, 16, 2, 32] : tensor<8x16x32x32xbf16> into tensor<8x16x16x2x32xbf16> + %transposed_5 = linalg.transpose ins(%expanded_4 : tensor<8x16x16x2x32xbf16>) outs(%2 : tensor<8x16x16x32x2xbf16>) permutation = [0, 1, 2, 4, 3] + %3 = tensor.empty() : tensor<2x8x32x32xbf16> + %4 = linalg.fill ins(%cst : bf16) outs(%3 : tensor<2x8x32x32xbf16>) -> tensor<2x8x32x32xbf16> + %5 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction", "reduction", "parallel", "parallel", "reduction"]} ins(%transposed, %transposed_5 : tensor<2x16x32x32xbf16>, tensor<8x16x16x32x2xbf16>) outs(%4 : tensor<2x8x32x32xbf16>) { + ^bb0(%in: bf16, %in_19: bf16, %out: bf16): + %17 = arith.mulf %in, %in_19 : bf16 + %18 = arith.addf %out, %17 : bf16 + linalg.yield %18 : bf16 + } -> tensor<2x8x32x32xbf16> + %6 = tensor.empty() : tensor<8x32xbf16> + %padded_6 = tensor.pad %arg2 low[0] high[0] { + ^bb0(%arg5: index): + tensor.yield %cst : bf16 + } : tensor<256xbf16> to tensor<256xbf16> + %expanded_7 = tensor.expand_shape %padded_6 [[0, 1]] output_shape [8, 32] : tensor<256xbf16> into tensor<8x32xbf16> + %transposed_8 = linalg.transpose ins(%expanded_7 : tensor<8x32xbf16>) outs(%6 : tensor<8x32xbf16>) permutation = [0, 1] + %broadcasted = linalg.broadcast ins(%transposed_8 : tensor<8x32xbf16>) outs(%3 : tensor<2x8x32x32xbf16>) dimensions = [0, 2] + %7 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%broadcasted : tensor<2x8x32x32xbf16>) outs(%5 : tensor<2x8x32x32xbf16>) { + ^bb0(%in: bf16, %out: bf16): + %17 = arith.addf %in, %out : bf16 + linalg.yield %17 : bf16 + } -> tensor<2x8x32x32xbf16> + %8 = tensor.empty() : tensor<32x8x32x32xbf16> + %padded_9 = tensor.pad %arg3 low[0, 0] high[0, 0] { + ^bb0(%arg5: index, %arg6: index): + tensor.yield %cst : bf16 + } : tensor<256x1024xbf16> to tensor<256x1024xbf16> + %expanded_10 = tensor.expand_shape %padded_9 [[0, 1], [2, 3]] output_shape [8, 32, 32, 32] : tensor<256x1024xbf16> into tensor<8x32x32x32xbf16> + %transposed_11 = linalg.transpose ins(%expanded_10 : tensor<8x32x32x32xbf16>) outs(%8 : tensor<32x8x32x32xbf16>) permutation = [2, 0, 1, 3] + %9 = tensor.empty() : tensor<32x8x16x32x2xbf16> + %padded_12 = tensor.pad %transposed_11 low[0, 0, 0, 0] high[0, 0, 0, 0] { + ^bb0(%arg5: index, %arg6: index, %arg7: index, %arg8: index): + tensor.yield %cst : bf16 + } : tensor<32x8x32x32xbf16> to tensor<32x8x32x32xbf16> + %expanded_13 = tensor.expand_shape %padded_12 [[0], [1], [2, 3], [4]] output_shape [32, 8, 16, 2, 32] : tensor<32x8x32x32xbf16> into tensor<32x8x16x2x32xbf16> + %transposed_14 = linalg.transpose ins(%expanded_13 : tensor<32x8x16x2x32xbf16>) outs(%9 : tensor<32x8x16x32x2xbf16>) permutation = [0, 1, 2, 4, 3] + %10 = tensor.empty() : tensor<2x32x32x32xbf16> + %11 = linalg.fill ins(%cst : bf16) outs(%10 : tensor<2x32x32x32xbf16>) -> tensor<2x32x32x32xbf16> + %12 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", 
"parallel", "reduction", "reduction", "parallel", "parallel", "reduction"]} ins(%7, %transposed_14 : tensor<2x8x32x32xbf16>, tensor<32x8x16x32x2xbf16>) outs(%11 : tensor<2x32x32x32xbf16>) { + ^bb0(%in: bf16, %in_19: bf16, %out: bf16): + %17 = arith.mulf %in, %in_19 : bf16 + %18 = arith.addf %out, %17 : bf16 + linalg.yield %18 : bf16 + } -> tensor<2x32x32x32xbf16> + %13 = tensor.empty() : tensor<32x32xbf16> + %padded_15 = tensor.pad %arg4 low[0] high[0] { + ^bb0(%arg5: index): + tensor.yield %cst : bf16 + } : tensor<1024xbf16> to tensor<1024xbf16> + %expanded_16 = tensor.expand_shape %padded_15 [[0, 1]] output_shape [32, 32] : tensor<1024xbf16> into tensor<32x32xbf16> + %transposed_17 = linalg.transpose ins(%expanded_16 : tensor<32x32xbf16>) outs(%13 : tensor<32x32xbf16>) permutation = [0, 1] + %14 = linalg.generic {indexing_maps = [#map4, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%transposed_17 : tensor<32x32xbf16>) outs(%12 : tensor<2x32x32x32xbf16>) { + ^bb0(%in: bf16, %out: bf16): + %17 = arith.addf %in, %out : bf16 + linalg.yield %17 : bf16 + } -> tensor<2x32x32x32xbf16> + %15 = tensor.empty() : tensor<64x1024xbf16> + %transposed_18 = linalg.transpose ins(%14 : tensor<2x32x32x32xbf16>) outs(%10 : tensor<2x32x32x32xbf16>) permutation = [0, 2, 1, 3] + %collapsed = tensor.collapse_shape %transposed_18 [[0, 1], [2, 3]] : tensor<2x32x32x32xbf16> into tensor<64x1024xbf16> + %extracted_slice = tensor.extract_slice %collapsed[0, 0] [64, 1024] [1, 1] : tensor<64x1024xbf16> to tensor<64x1024xbf16> + %16 = linalg.copy ins(%extracted_slice : tensor<64x1024xbf16>) outs(%15 : tensor<64x1024xbf16>) -> tensor<64x1024xbf16> + return %16 : tensor<64x1024xbf16> + } +} + """ + ) + module_out = ir.Module.parse( + """ +#map = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d2, d4, d6)> +#map1 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d1, d2, d6 floordiv 2, d5, d3)> +#map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d4, d5)> +#map3 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> +#map4 = affine_map<(d0, d1, d2, d3) -> (d1, d3)> +module { + llvm.mlir.global external constant @__num_orig_num_args(5 : i32) {addr_space = 0 : i32} : i32 + llvm.mlir.global external constant @__compute_args(dense<[5, 0, 5, 6, 7, 8]> : tensor<6xi32>) {addr_space = 0 : i32} : !llvm.array<6 x i32> + llvm.mlir.global external constant @__fold_args(dense<[8, 1, 2, 3, 4, 5, 6, 7, 8]> : tensor<9xi32>) {addr_space = 0 : i32} : !llvm.array<9 x i32> + llvm.mlir.global external constant @__runtime_fold_buffer_ids_(dense<[4, 0, 1, 2, 3]> : tensor<5xi64>) {addr_space = 0 : i32} : !llvm.array<5 x i64> + func.func @entry(%arg0: tensor<64x512xbf16>, %arg1: tensor<8x16x16x32x2xbf16>, %arg2: tensor<8x32xbf16>, %arg3: tensor<32x8x16x32x2xbf16>, %arg4: tensor<32x32xbf16>) -> tensor<64x1024xbf16> attributes {llvm.emit_c_interface, runtime_const_args_index = [1 : i32, 2 : i32, 3 : i32, 4 : i32]} { + %cst = arith.constant 0.000000e+00 : bf16 + %0 = tensor.empty() : tensor<2x16x32x32xbf16> + %expanded = tensor.expand_shape %arg0 [[0, 1], [2, 3]] output_shape [2, 32, 16, 32] : tensor<64x512xbf16> into tensor<2x32x16x32xbf16> + %transposed = linalg.transpose ins(%expanded : tensor<2x32x16x32xbf16>) outs(%0 : tensor<2x16x32x32xbf16>) permutation = [0, 2, 1, 3] + %1 = tensor.empty() : tensor<2x8x32x32xbf16> + %2 = linalg.fill ins(%cst : bf16) outs(%1 : tensor<2x8x32x32xbf16>) -> tensor<2x8x32x32xbf16> + %3 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", 
"reduction", "reduction", "parallel", "parallel", "reduction"]} ins(%transposed, %arg1 : tensor<2x16x32x32xbf16>, tensor<8x16x16x32x2xbf16>) outs(%2 : tensor<2x8x32x32xbf16>) { + ^bb0(%in: bf16, %in_1: bf16, %out: bf16): + %11 = arith.mulf %in, %in_1 : bf16 + %12 = arith.addf %out, %11 : bf16 + linalg.yield %12 : bf16 + } -> tensor<2x8x32x32xbf16> + %broadcasted = linalg.broadcast ins(%arg2 : tensor<8x32xbf16>) outs(%1 : tensor<2x8x32x32xbf16>) dimensions = [0, 2] + %4 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%broadcasted : tensor<2x8x32x32xbf16>) outs(%3 : tensor<2x8x32x32xbf16>) { + ^bb0(%in: bf16, %out: bf16): + %11 = arith.addf %in, %out : bf16 + linalg.yield %11 : bf16 + } -> tensor<2x8x32x32xbf16> + %5 = tensor.empty() : tensor<2x32x32x32xbf16> + %6 = linalg.fill ins(%cst : bf16) outs(%5 : tensor<2x32x32x32xbf16>) -> tensor<2x32x32x32xbf16> + %7 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction", "reduction", "parallel", "parallel", "reduction"]} ins(%4, %arg3 : tensor<2x8x32x32xbf16>, tensor<32x8x16x32x2xbf16>) outs(%6 : tensor<2x32x32x32xbf16>) { + ^bb0(%in: bf16, %in_1: bf16, %out: bf16): + %11 = arith.mulf %in, %in_1 : bf16 + %12 = arith.addf %out, %11 : bf16 + linalg.yield %12 : bf16 + } -> tensor<2x32x32x32xbf16> + %8 = linalg.generic {indexing_maps = [#map4, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg4 : tensor<32x32xbf16>) outs(%7 : tensor<2x32x32x32xbf16>) { + ^bb0(%in: bf16, %out: bf16): + %11 = arith.addf %in, %out : bf16 + linalg.yield %11 : bf16 + } -> tensor<2x32x32x32xbf16> + %9 = tensor.empty() : tensor<64x1024xbf16> + %transposed_0 = linalg.transpose ins(%8 : tensor<2x32x32x32xbf16>) outs(%5 : tensor<2x32x32x32xbf16>) permutation = [0, 2, 1, 3] + %collapsed = tensor.collapse_shape %transposed_0 [[0, 1], [2, 3]] : tensor<2x32x32x32xbf16> into tensor<64x1024xbf16> + %10 = linalg.copy ins(%collapsed : tensor<64x1024xbf16>) outs(%9 : tensor<64x1024xbf16>) -> tensor<64x1024xbf16> + return %10 : tensor<64x1024xbf16> + } + func.func @runtime_fold(%arg0: tensor<512x256xbf16>, %arg1: tensor<256xbf16>, %arg2: tensor<256x1024xbf16>, %arg3: tensor<1024xbf16>) -> (tensor<8x16x16x32x2xbf16>, tensor<8x32xbf16>, tensor<32x8x16x32x2xbf16>, tensor<32x32xbf16>) attributes {llvm.emit_c_interface} { + %0 = tensor.empty() : tensor<8x16x32x32xbf16> + %expanded = tensor.expand_shape %arg0 [[0, 1], [2, 3]] output_shape [16, 32, 8, 32] : tensor<512x256xbf16> into tensor<16x32x8x32xbf16> + %transposed = linalg.transpose ins(%expanded : tensor<16x32x8x32xbf16>) outs(%0 : tensor<8x16x32x32xbf16>) permutation = [2, 0, 1, 3] + %1 = tensor.empty() : tensor<8x16x16x32x2xbf16> + %expanded_0 = tensor.expand_shape %transposed [[0], [1], [2, 3], [4]] output_shape [8, 16, 16, 2, 32] : tensor<8x16x32x32xbf16> into tensor<8x16x16x2x32xbf16> + %transposed_1 = linalg.transpose ins(%expanded_0 : tensor<8x16x16x2x32xbf16>) outs(%1 : tensor<8x16x16x32x2xbf16>) permutation = [0, 1, 2, 4, 3] + %expanded_2 = tensor.expand_shape %arg1 [[0, 1]] output_shape [8, 32] : tensor<256xbf16> into tensor<8x32xbf16> + %2 = tensor.empty() : tensor<32x8x32x32xbf16> + %expanded_3 = tensor.expand_shape %arg2 [[0, 1], [2, 3]] output_shape [8, 32, 32, 32] : tensor<256x1024xbf16> into tensor<8x32x32x32xbf16> + %transposed_4 = linalg.transpose ins(%expanded_3 : tensor<8x32x32x32xbf16>) outs(%2 : tensor<32x8x32x32xbf16>) permutation = [2, 0, 1, 3] + %3 = 
tensor.empty() : tensor<32x8x16x32x2xbf16> + %expanded_5 = tensor.expand_shape %transposed_4 [[0], [1], [2, 3], [4]] output_shape [32, 8, 16, 2, 32] : tensor<32x8x32x32xbf16> into tensor<32x8x16x2x32xbf16> + %transposed_6 = linalg.transpose ins(%expanded_5 : tensor<32x8x16x2x32xbf16>) outs(%3 : tensor<32x8x16x32x2xbf16>) permutation = [0, 1, 2, 4, 3] + %expanded_7 = tensor.expand_shape %arg3 [[0, 1]] output_shape [32, 32] : tensor<1024xbf16> into tensor<32x32xbf16> + return %transposed_1, %expanded_2, %transposed_6, %expanded_7 : tensor<8x16x16x32x2xbf16>, tensor<8x32xbf16>, tensor<32x8x16x32x2xbf16>, tensor<32x32xbf16> + } +} + """ + ) + + # module_in entry(%arg0: tensor<64x512xbf16>, %arg1: tensor<512x256xbf16>, %arg2: tensor<256xbf16>, %arg3: tensor<256x1024xbf16>, %arg4: tensor<1024xbf16>) -> tensor<64x1024xbf16> + torch_arg0 = torch.rand((64, 512), dtype=torch.bfloat16) + torch_arg1 = torch.rand((512, 256), dtype=torch.bfloat16) + torch_arg2 = torch.rand((256), dtype=torch.bfloat16) + torch_arg3 = torch.rand((256, 1024), dtype=torch.bfloat16) + torch_arg4 = torch.rand((1024), dtype=torch.bfloat16) + + ref_res = (torch_arg0 @ torch_arg1 + torch_arg2) @ torch_arg3 + torch_arg4 + + passes = "any(gc-cpu-pipeline)" + compiler = GraphCompiler(passes) + ctx.enable_multithreading(False) + + arg0 = torch_arg0.view(dtype=torch.uint16).numpy().view(ml_dtypes.bfloat16) + arg1 = torch_arg1.view(dtype=torch.uint16).numpy().view(ml_dtypes.bfloat16) + arg2 = torch_arg2.view(dtype=torch.uint16).numpy().view(ml_dtypes.bfloat16) + arg3 = torch_arg3.view(dtype=torch.uint16).numpy().view(ml_dtypes.bfloat16) + arg4 = torch_arg4.view(dtype=torch.uint16).numpy().view(ml_dtypes.bfloat16) + gc_res = np.ones((64, 1024), dtype=ml_dtypes.bfloat16) + + entry = "entry" + mlir_args = get_mlir_args(module_in, entry, [arg0, arg1, arg2, arg3, arg4, gc_res]) + engine_in = compiler.compile_and_jit(module_in, ir_printing=True) + engine_in.invoke(entry, *mlir_args) + + assert_allclose(gc_res.astype(np.float32), ref_res.to(torch.float32).numpy(), rtol=1e-5, atol=1e-5) + + + # module_out entry(%arg0: tensor<64x512xbf16>, %arg1: tensor<8x16x16x32x2xbf16>, %arg2: tensor<8x32xbf16>, %arg3: tensor<32x8x16x32x2xbf16>, %arg4: tensor<32x32xbf16>) -> tensor<64x1024xbf16> + # module_out runtime_fold(%arg0: tensor<512x256xbf16>, %arg1: tensor<256xbf16>, %arg2: tensor<256x1024xbf16>, %arg3: tensor<1024xbf16>) -> (tensor<8x16x16x32x2xbf16>, tensor<8x32xbf16>, tensor<32x8x16x32x2xbf16>, tensor<32x32xbf16>) + fold_arg0 = arg1 + fold_arg1 = arg2 + fold_arg2 = arg3 + fold_arg3 = arg4 + fold_res0 = np.zeros((8, 16, 16, 32, 2), dtype=ml_dtypes.bfloat16) + fold_res1 = np.zeros((8, 32), dtype=ml_dtypes.bfloat16) + fold_res2 = np.zeros((32, 8, 16, 32, 2), dtype=ml_dtypes.bfloat16) + fold_res3 = np.zeros((32, 32), dtype=ml_dtypes.bfloat16) + + runtime_fold = "runtime_fold" + fold_mlir_args = get_mlir_args(module_out, runtime_fold, [fold_arg0, fold_arg1, fold_arg2, fold_arg3, fold_res0, fold_res1, fold_res2, fold_res3]) + + gc_res_out = np.zeros((64, 1024), dtype=ml_dtypes.bfloat16) + entry = "entry" + mlir_args = get_mlir_args(module_out, entry, [arg0, fold_res0, fold_res1, fold_res2, fold_res3, gc_res_out]) + + engine_out = compiler.compile_and_jit(module_out, ir_printing=True) + engine_out.invoke(runtime_fold, *fold_mlir_args) + engine_out.invoke(entry, *mlir_args) + + assert_allclose(gc_res.astype(np.float32), gc_res_out.astype(np.float32), rtol=1e-5, atol=1e-5) + diff --git a/test/gc/Transforms/test_constant_tensor_folding_f32_4D4D.py 
b/test/gc/Transforms/test_constant_tensor_folding_f32_4D4D.py new file mode 100644 index 000000000..465d390fd --- /dev/null +++ b/test/gc/Transforms/test_constant_tensor_folding_f32_4D4D.py @@ -0,0 +1,96 @@ +################################################################################ +# Copyright (C) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions +# and limitations under the License. +# SPDX-License-Identifier: Apache-2.0 +################################################################################ + +from enum import Flag +import os +import sys + +import numpy as np +from gc_mlir import ir +from gc_mlir.graph_compiler import GraphCompiler +from numpy.testing import assert_allclose + +project_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +if project_dir not in sys.path: + sys.path.insert(0, project_dir) + +import torch +# from bench import py_timeit_bench +from utils import get_mlir_args + +if __name__ == "__main__": + with ir.Context() as ctx: + ctx.allow_unregistered_dialects = True + + M = 64 + N = 256 + K = 512 + MBlock = 32 + NBlock = 32 + KBlock = 32 + vnni_size = 1 + shapeA = [M // MBlock, K // KBlock, MBlock, KBlock] + shapeB = [N // NBlock, K // KBlock, KBlock, NBlock] + shapeC = [M // MBlock, N // NBlock, MBlock, NBlock] + + # 4D x 4D, inputs transposed + mlir_str = """ +#map = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d3, d5)> +#map1 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d2, d5, d4)> +#map2 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4)> +module { + func.func @main_entry(%arg0: tensor<2x16x32x32xf32>, %arg1: tensor<8x16x32x32xf32>) -> tensor<2x8x32x32xf32> attributes {llvm.emit_c_interface} { + %cst = arith.constant 0.000000e+00 : f32 + %0 = tensor.empty() : tensor<2x8x32x32xf32> + %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<2x8x32x32xf32>) -> tensor<2x8x32x32xf32> + %2 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%arg0, %arg1 : tensor<2x16x32x32xf32>, tensor<8x16x32x32xf32>) outs(%1 : tensor<2x8x32x32xf32>) { + ^bb0(%in: f32, %in_0: f32, %out: f32): + %3 = arith.mulf %in, %in_0 : f32 + %4 = arith.addf %out, %3 : f32 + linalg.yield %4 : f32 + } -> tensor<2x8x32x32xf32> + return %2 : tensor<2x8x32x32xf32> + } +} + """ + module = ir.Module.parse(mlir_str) + + torch_arg0 = torch.rand((M, K), dtype=torch.float32) + torch_arg1 = torch.rand((K, N), dtype=torch.float32) + ref_res = torch.matmul(torch_arg0, torch_arg1) + + arg0_0 = torch_arg0.view([M // MBlock, MBlock, K // KBlock, KBlock]).permute([0, 2, 1, 3]).contiguous().numpy().view(np.dtype("float32")) + arg0_1 = np.transpose(np.reshape(torch_arg0.contiguous().numpy().view(np.dtype("float32")), (M // MBlock, MBlock, K // KBlock, KBlock)), (0, 2, 1, 3)) # MK -> MKmk + print("arg0_0 arg0_1 close: ", np.allclose(arg0_0, arg0_1, rtol=1e-5, atol=1e-5)) + + arg1 = torch_arg1.view([K // KBlock, KBlock, N // NBlock, NBlock]).permute([2, 0, 1, 3]).contiguous().numpy().view(np.dtype("float32")) 
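+        # Layout note (illustrative): arg0_1 packs MK -> MKmk, i.e.
+        # (M//MBlock, K//KBlock, MBlock, KBlock), and arg1 packs KN -> NKkn,
+        # matching the blocked operand layouts expected by #map and #map1
+        # in the MLIR module above.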
+ # arg1 = np.transpose(np.reshape(torch_arg1.contiguous().numpy(), (16, 32, 8, 32)), (2, 0, 1, 3)).view(np.dtype("float32")) # KN -> NKkn, 8x16x32x32 + + gc_res = np.ones(shapeC, dtype=np.dtype("float32")) + + entry = "main_entry" + mlir_args = get_mlir_args(module, entry, [arg0_1, arg1, gc_res]) + + passes = "any(gc-cpu-pipeline)" + compiler = GraphCompiler(passes) + engine_in = compiler.compile_and_jit(module) + engine_in.invoke(entry, *mlir_args) + gc_res = np.reshape(np.transpose(gc_res, (0, 2, 1, 3)), (64, 256)) # MNmn -> MN + + print("gc_res ref_res close: ", np.allclose(gc_res, ref_res.to(torch.float32).numpy(), rtol=1e-5, atol=1e-5)) + assert_allclose(gc_res, ref_res.to(torch.float32).numpy(), rtol=1e-5, atol=1e-5) + diff --git a/test/gc/Transforms/test_constant_tensor_folding_f32_two_layers.py b/test/gc/Transforms/test_constant_tensor_folding_f32_two_layers.py new file mode 100644 index 000000000..377e28a36 --- /dev/null +++ b/test/gc/Transforms/test_constant_tensor_folding_f32_two_layers.py @@ -0,0 +1,225 @@ +################################################################################ +# Copyright (C) 2024 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions +# and limitations under the License. +# SPDX-License-Identifier: Apache-2.0 +################################################################################ + +import os +import sys + +import numpy as np +from gc_mlir import ir +from gc_mlir.graph_compiler import GraphCompiler +from numpy.testing import assert_allclose + +project_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) +if project_dir not in sys.path: + sys.path.insert(0, project_dir) + +import torch +# from bench import py_timeit_bench +from utils import get_mlir_args + +if __name__ == "__main__": + with ir.Context() as ctx: + ctx.allow_unregistered_dialects = True + + # 4D x 4D, inputs plain, two layers + mlir_str_4D4D = """ +#map = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d3, d5)> +#map1 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d2, d5, d4)> +#map2 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4)> +#map3 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> +#map4 = affine_map<(d0, d1, d2, d3) -> (d1, d3)> +module { + func.func @entry(%arg0: tensor<64x512xf32>, %arg1: tensor<512x256xf32>, %arg2: tensor<256xf32>, %arg3: tensor<256x1024xf32>, %arg4: tensor<1024xf32>) -> tensor<64x1024xf32> attributes {llvm.emit_c_interface, runtime_const_args_index = [1 : i32, 2 : i32, 3 : i32, 4 : i32]} { + %0 = tensor.empty() : tensor<2x16x32x32xf32> + %cst = arith.constant 0.000000e+00 : f32 + %padded = tensor.pad %arg0 low[0, 0] high[0, 0] { + ^bb0(%arg5: index, %arg6: index): + tensor.yield %cst : f32 + } : tensor<64x512xf32> to tensor<64x512xf32> + %expanded = tensor.expand_shape %padded [[0, 1], [2, 3]] output_shape [2, 32, 16, 32] : tensor<64x512xf32> into tensor<2x32x16x32xf32> + %transposed = linalg.transpose ins(%expanded : tensor<2x32x16x32xf32>) outs(%0 : tensor<2x16x32x32xf32>) permutation = [0, 2, 1, 3] + %1 = tensor.empty() : tensor<8x16x32x32xf32> + 
%padded_0 = tensor.pad %arg1 low[0, 0] high[0, 0] { + ^bb0(%arg5: index, %arg6: index): + tensor.yield %cst : f32 + } : tensor<512x256xf32> to tensor<512x256xf32> + %expanded_1 = tensor.expand_shape %padded_0 [[0, 1], [2, 3]] output_shape [16, 32, 8, 32] : tensor<512x256xf32> into tensor<16x32x8x32xf32> + %transposed_2 = linalg.transpose ins(%expanded_1 : tensor<16x32x8x32xf32>) outs(%1 : tensor<8x16x32x32xf32>) permutation = [2, 0, 1, 3] + %2 = tensor.empty() : tensor<2x8x32x32xf32> + %3 = linalg.fill ins(%cst : f32) outs(%2 : tensor<2x8x32x32xf32>) -> tensor<2x8x32x32xf32> + %4 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%transposed, %transposed_2 : tensor<2x16x32x32xf32>, tensor<8x16x32x32xf32>) outs(%3 : tensor<2x8x32x32xf32>) { + ^bb0(%in: f32, %in_8: f32, %out: f32): + %14 = arith.mulf %in, %in_8 : f32 + %15 = arith.addf %out, %14 : f32 + linalg.yield %15 : f32 + } -> tensor<2x8x32x32xf32> + %expanded_3 = tensor.expand_shape %arg2 [[0, 1]] output_shape [8, 32] : tensor<256xf32> into tensor<8x32xf32> + %broadcasted = linalg.broadcast ins(%expanded_3 : tensor<8x32xf32>) outs(%2 : tensor<2x8x32x32xf32>) dimensions = [0, 2] + %5 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%broadcasted : tensor<2x8x32x32xf32>) outs(%4 : tensor<2x8x32x32xf32>) { + ^bb0(%in: f32, %out: f32): + %14 = arith.addf %in, %out : f32 + linalg.yield %14 : f32 + } -> tensor<2x8x32x32xf32> + %6 = tensor.empty() : tensor<32x8x32x32xf32> + %expanded_4 = tensor.expand_shape %arg3 [[0, 1], [2, 3]] output_shape [8, 32, 32, 32] : tensor<256x1024xf32> into tensor<8x32x32x32xf32> + %transposed_5 = linalg.transpose ins(%expanded_4 : tensor<8x32x32x32xf32>) outs(%6 : tensor<32x8x32x32xf32>) permutation = [2, 0, 1, 3] + %7 = tensor.empty() : tensor<2x32x32x32xf32> + %8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<2x32x32x32xf32>) -> tensor<2x32x32x32xf32> + %9 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%5, %transposed_5 : tensor<2x8x32x32xf32>, tensor<32x8x32x32xf32>) outs(%8 : tensor<2x32x32x32xf32>) { + ^bb0(%in: f32, %in_8: f32, %out: f32): + %14 = arith.mulf %in, %in_8 : f32 + %15 = arith.addf %out, %14 : f32 + linalg.yield %15 : f32 + } -> tensor<2x32x32x32xf32> + %expanded_6 = tensor.expand_shape %arg4 [[0, 1]] output_shape [32, 32] : tensor<1024xf32> into tensor<32x32xf32> + %10 = linalg.generic {indexing_maps = [#map4, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_6 : tensor<32x32xf32>) outs(%9 : tensor<2x32x32x32xf32>) { + ^bb0(%in: f32, %out: f32): + %14 = arith.addf %in, %out : f32 + linalg.yield %14 : f32 + } -> tensor<2x32x32x32xf32> + %11 = tensor.empty() : tensor<2x32x32x32xf32> + %transposed_7 = linalg.transpose ins(%10 : tensor<2x32x32x32xf32>) outs(%11 : tensor<2x32x32x32xf32>) permutation = [0, 2, 1, 3] + %collapsed = tensor.collapse_shape %transposed_7 [[0, 1], [2, 3]] : tensor<2x32x32x32xf32> into tensor<64x1024xf32> + %12 = tensor.empty() : tensor<64x1024xf32> + %13 = linalg.copy ins(%collapsed : tensor<64x1024xf32>) outs(%12 : tensor<64x1024xf32>) -> tensor<64x1024xf32> + return %13 : tensor<64x1024xf32> + } +} + """ + + module_in = ir.Module.parse(mlir_str_4D4D) + + + mlir_str_4D4D_out = """ +#map = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d3, d5)> +#map1 = 
affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d2, d5, d4)> +#map2 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4)> +#map3 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> +#map4 = affine_map<(d0, d1, d2, d3) -> (d1, d3)> +module { + llvm.mlir.global external constant @__num_orig_num_args(5 : i32) {addr_space = 0 : i32} : i32 + llvm.mlir.global external constant @__compute_args(dense<[5, 0, 5, 6, 7, 8]> : tensor<6xi32>) {addr_space = 0 : i32} : !llvm.array<6 x i32> + llvm.mlir.global external constant @__fold_args(dense<[8, 1, 2, 3, 4, 5, 6, 7, 8]> : tensor<9xi32>) {addr_space = 0 : i32} : !llvm.array<9 x i32> + llvm.mlir.global external constant @__runtime_fold_buffer_ids_(dense<[4, 0, 1, 2, 3]> : tensor<5xi64>) {addr_space = 0 : i32} : !llvm.array<5 x i64> + func.func @entry(%arg0: tensor<64x512xf32>, %arg1: tensor<8x16x32x32xf32>, %arg2: tensor<8x32xf32>, %arg3: tensor<32x8x32x32xf32>, %arg4: tensor<32x32xf32>) -> tensor<64x1024xf32> attributes {llvm.emit_c_interface, runtime_const_args_index = [1 : i32, 2 : i32, 3 : i32, 4 : i32]} { + %cst = arith.constant 0.000000e+00 : f32 + %0 = tensor.empty() : tensor<2x16x32x32xf32> + %expanded = tensor.expand_shape %arg0 [[0, 1], [2, 3]] output_shape [2, 32, 16, 32] : tensor<64x512xf32> into tensor<2x32x16x32xf32> + %transposed = linalg.transpose ins(%expanded : tensor<2x32x16x32xf32>) outs(%0 : tensor<2x16x32x32xf32>) permutation = [0, 2, 1, 3] + %1 = tensor.empty() : tensor<2x8x32x32xf32> + %2 = linalg.fill ins(%cst : f32) outs(%1 : tensor<2x8x32x32xf32>) -> tensor<2x8x32x32xf32> + %3 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%transposed, %arg1 : tensor<2x16x32x32xf32>, tensor<8x16x32x32xf32>) outs(%2 : tensor<2x8x32x32xf32>) { + ^bb0(%in: f32, %in_1: f32, %out: f32): + %12 = arith.mulf %in, %in_1 : f32 + %13 = arith.addf %out, %12 : f32 + linalg.yield %13 : f32 + } -> tensor<2x8x32x32xf32> + %broadcasted = linalg.broadcast ins(%arg2 : tensor<8x32xf32>) outs(%1 : tensor<2x8x32x32xf32>) dimensions = [0, 2] + %4 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%broadcasted : tensor<2x8x32x32xf32>) outs(%3 : tensor<2x8x32x32xf32>) { + ^bb0(%in: f32, %out: f32): + %12 = arith.addf %in, %out : f32 + linalg.yield %12 : f32 + } -> tensor<2x8x32x32xf32> + %5 = tensor.empty() : tensor<2x32x32x32xf32> + %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x32x32x32xf32>) -> tensor<2x32x32x32xf32> + %7 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%4, %arg3 : tensor<2x8x32x32xf32>, tensor<32x8x32x32xf32>) outs(%6 : tensor<2x32x32x32xf32>) { + ^bb0(%in: f32, %in_1: f32, %out: f32): + %12 = arith.mulf %in, %in_1 : f32 + %13 = arith.addf %out, %12 : f32 + linalg.yield %13 : f32 + } -> tensor<2x32x32x32xf32> + %8 = linalg.generic {indexing_maps = [#map4, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg4 : tensor<32x32xf32>) outs(%7 : tensor<2x32x32x32xf32>) { + ^bb0(%in: f32, %out: f32): + %12 = arith.addf %in, %out : f32 + linalg.yield %12 : f32 + } -> tensor<2x32x32x32xf32> + %9 = tensor.empty() : tensor<2x32x32x32xf32> + %transposed_0 = linalg.transpose ins(%8 : tensor<2x32x32x32xf32>) outs(%9 : tensor<2x32x32x32xf32>) permutation = [0, 2, 1, 3] + %collapsed = tensor.collapse_shape %transposed_0 [[0, 1], [2, 3]] : 
tensor<2x32x32x32xf32> into tensor<64x1024xf32> + %10 = tensor.empty() : tensor<64x1024xf32> + %11 = linalg.copy ins(%collapsed : tensor<64x1024xf32>) outs(%10 : tensor<64x1024xf32>) -> tensor<64x1024xf32> + return %11 : tensor<64x1024xf32> + } + + func.func @runtime_fold(%arg0: tensor<512x256xf32>, %arg1: tensor<256xf32>, %arg2: tensor<256x1024xf32>, %arg3: tensor<1024xf32>) -> (tensor<8x16x32x32xf32>, tensor<8x32xf32>, tensor<32x8x32x32xf32>, tensor<32x32xf32>) attributes {llvm.emit_c_interface} { + %0 = tensor.empty() : tensor<8x16x32x32xf32> + %expanded = tensor.expand_shape %arg0 [[0, 1], [2, 3]] output_shape [16, 32, 8, 32] : tensor<512x256xf32> into tensor<16x32x8x32xf32> + %transposed = linalg.transpose ins(%expanded : tensor<16x32x8x32xf32>) outs(%0 : tensor<8x16x32x32xf32>) permutation = [2, 0, 1, 3] + %expanded_0 = tensor.expand_shape %arg1 [[0, 1]] output_shape [8, 32] : tensor<256xf32> into tensor<8x32xf32> + %1 = tensor.empty() : tensor<32x8x32x32xf32> + %expanded_1 = tensor.expand_shape %arg2 [[0, 1], [2, 3]] output_shape [8, 32, 32, 32] : tensor<256x1024xf32> into tensor<8x32x32x32xf32> + %transposed_2 = linalg.transpose ins(%expanded_1 : tensor<8x32x32x32xf32>) outs(%1 : tensor<32x8x32x32xf32>) permutation = [2, 0, 1, 3] + %expanded_3 = tensor.expand_shape %arg3 [[0, 1]] output_shape [32, 32] : tensor<1024xf32> into tensor<32x32xf32> + return %transposed, %expanded_0, %transposed_2, %expanded_3 : tensor<8x16x32x32xf32>, tensor<8x32xf32>, tensor<32x8x32x32xf32>, tensor<32x32xf32> + } +} + """ + module_out = ir.Module.parse(mlir_str_4D4D_out) + + # module_in entry(%arg0: tensor<64x512xf32>, %arg1: tensor<512x256xf32>, %arg2: tensor<256xf32>, %arg3: tensor<256x1024xf32>, %arg4: tensor<1024xf32>) -> tensor<64x1024xf32> + torch_arg0 = torch.rand((64, 512), dtype=torch.float32) + torch_arg1 = torch.rand((512, 256), dtype=torch.float32) + torch_arg2 = torch.rand((256), dtype=torch.float32) + torch_arg3 = torch.rand((256, 1024), dtype=torch.float32) + torch_arg4 = torch.rand((1024), dtype=torch.float32) + + ref_res = (torch_arg0 @ torch_arg1 + torch_arg2) @ torch_arg3 + torch_arg4 + + passes = "any(gc-cpu-pipeline)" + compiler = GraphCompiler(passes) + ctx.enable_multithreading(False) + + arg0 = torch_arg0.contiguous().numpy() + arg1 = torch_arg1.contiguous().numpy() + arg2 = torch_arg2.contiguous().numpy() + arg3 = torch_arg3.contiguous().numpy() + arg4 = torch_arg4.contiguous().numpy() + gc_res = np.zeros((64, 1024), dtype=np.float32) + + entry = "entry" + mlir_args = get_mlir_args(module_in, entry, [arg0, arg1, arg2, arg3, arg4, gc_res]) + engine_in = compiler.compile_and_jit(module_in, ir_printing=False) + engine_in.invoke(entry, *mlir_args) + + print("Reference vs GC input IR close: ", np.allclose(gc_res, ref_res.to(torch.float32).numpy(), rtol=1e-5, atol=1e-5)) + assert_allclose(gc_res, ref_res.to(torch.float32).numpy(), rtol=1e-5, atol=1e-5) + + + # module_out entry(%arg0: tensor<64x512xf32>, %arg1: tensor<8x16x32x32xf32>, %arg2: tensor<8x32xf32>, %arg3: tensor<32x8x32x32xf32>, %arg4: tensor<32x32xf32>) -> tensor<64x1024xf32> + # module_out runtime_fold(%arg0: tensor<512x256xf32>, %arg1: tensor<256xf32>, %arg2: tensor<256x1024xf32>, %arg3: tensor<1024xf32>) -> (tensor<8x16x32x32xf32>, tensor<8x32xf32>, tensor<32x8x32x32xf32>, tensor<32x32xf32>) + fold_arg0 = arg1 + fold_arg1 = arg2 + fold_arg2 = arg3 + fold_arg3 = arg4 + fold_res0 = np.zeros((8, 16, 32, 32), dtype=np.float32) + fold_res1 = np.zeros((8, 32), dtype=np.float32) + fold_res2 = np.zeros((32, 8, 32, 32), 
dtype=np.float32) + fold_res3 = np.zeros((32, 32), dtype=np.float32) + + runtime_fold = "runtime_fold" + fold_mlir_args = get_mlir_args(module_out, runtime_fold, [fold_arg0, fold_arg1, fold_arg2, fold_arg3, fold_res0, fold_res1, fold_res2, fold_res3]) + + gc_res_out = np.zeros((64, 1024), dtype=np.float32) + entry = "entry" + entry_mlir_args = get_mlir_args(module_out, entry, [arg0, fold_res0, fold_res1, fold_res2, fold_res3, gc_res_out]) + + engine_out = compiler.compile_and_jit(module_out, ir_printing=False) + engine_out.invoke(runtime_fold, *fold_mlir_args) + engine_out.invoke(entry, *entry_mlir_args) + + print("GC input IR vs GC output IR close: ", np.allclose(gc_res, gc_res_out, rtol=1e-5, atol=1e-5)) + assert_allclose(gc_res, gc_res_out, rtol=1e-5, atol=1e-5) From f9c24256b1605dcf8b734f2f3976c3929e00f1cd Mon Sep 17 00:00:00 2001 From: "Niu, Xiaoguang" Date: Mon, 19 Aug 2024 18:23:38 -0700 Subject: [PATCH 22/29] Support MemRef args --- lib/gc/Transforms/ConstantTensorFolding.cpp | 31 +++++++++++++-------- 1 file changed, 20 insertions(+), 11 deletions(-) diff --git a/lib/gc/Transforms/ConstantTensorFolding.cpp b/lib/gc/Transforms/ConstantTensorFolding.cpp index d7174ec6e..9b1aa27cb 100644 --- a/lib/gc/Transforms/ConstantTensorFolding.cpp +++ b/lib/gc/Transforms/ConstantTensorFolding.cpp @@ -17,6 +17,7 @@ #include "mlir/Dialect/Arith/IR/Arith.h" #include "mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h" +#include "mlir/Dialect/Bufferization/IR/Bufferization.h" #include "mlir/Dialect/Bufferization/Transforms/Bufferize.h" #include "mlir/Dialect/ControlFlow/IR/ControlFlowOps.h" #include "mlir/Dialect/Func/IR/FuncOps.h" @@ -53,6 +54,8 @@ bool isInConstantSubgraph(Operation *op) { auto opNamespace = op->getDialect()->getNamespace(); if (opNamespace == linalg::LinalgDialect::getDialectNamespace() || opNamespace == tensor::TensorDialect::getDialectNamespace() || + opNamespace == + bufferization::BufferizationDialect::getDialectNamespace() || opNamespace == arith::ArithDialect::getDialectNamespace()) { if (op->getAttr("onednn_graph.in_const_subgraph")) { return true; @@ -61,7 +64,7 @@ bool isInConstantSubgraph(Operation *op) { return false; } -int64_t getTensorSize(TensorType t) { +template int64_t getDataSize(T t) { Type eleType = t.getElementType(); unsigned bitWidth = eleType.getIntOrFloatBitWidth() / 8; // bytes ArrayRef shape = t.getShape(); @@ -72,6 +75,16 @@ int64_t getTensorSize(TensorType t) { return size; } +int64_t getValueSize(Value v) { + if (isa(v.getType())) { + auto t = dyn_cast(v.getType()); + return getDataSize(t); + } else { + auto t = dyn_cast(v.getType()); + return getDataSize(t); + } +} + /// @brief op has only one operand, or operands of op are one same value, or /// operands of op are one same value or from tensor.EmptyOp. /// @param op @@ -465,7 +478,7 @@ void getInputsAndOutputs(Block &block, // The constant ops are all single-input single-output. bool simpleTopo = true; auto arg = block.getArgument(id); - if (!isa(arg.getType())) { + if (!isa(arg.getType()) && !isa(arg.getType())) { continue; } inputTypes.push_back(arg.getType()); @@ -511,15 +524,12 @@ void getInputsAndOutputs(Block &block, // not fold it. Compare data size changes during traverse to find the last // op that satisfies this condition. 
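  // For instance, postponing a linalg.broadcast from tensor<8x32> to
  // tensor<2x8x32x32> avoids caching a 64x larger folded buffer; only the
  // ops up to the last size-preserving value are kept in the fold function.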
if (simpleTopo) { - int64_t initSize = - getTensorSize(dyn_cast(valuesOnTheWay[0].getType())); - if (!isa(outputTypes.back()) || - initSize * DATA_SIZE_EXPANDING_THRESHOLD < - getTensorSize(dyn_cast(outputTypes.back()))) { + int64_t initSize = getValueSize(valuesOnTheWay[0]); + if (initSize * DATA_SIZE_EXPANDING_THRESHOLD < + getValueSize(valuesOnTheWay.back())) { size_t lastIdx = 0; for (size_t i = 1; i < valuesOnTheWay.size(); ++i) { - int64_t size = getTensorSize( - dyn_cast(valuesOnTheWay[i].getType())); + int64_t size = getValueSize(valuesOnTheWay[i]); if (initSize * DATA_SIZE_EXPANDING_THRESHOLD > size) { lastIdx = i; } @@ -574,8 +584,7 @@ func::FuncOp buildFoldFunc(MLIRContext *context, OpBuilder &builder, for (Value &tensor : outputValuesInFold) { LLVM_DEBUG(llvm::dbgs() << "Allocate buffer for tensor: " << tensor << "\n"); - buffersSize.push_back( - getTensorSize(dyn_cast(tensor.getType()))); + buffersSize.push_back(getValueSize(tensor)); } auto manager = ConstGraphTensorCacheManager::get(); SmallVector globalIndexes; From d8d2d7998dcc71e29db3a414c953bd87cd847f92 Mon Sep 17 00:00:00 2001 From: "Niu, Xiaoguang" Date: Mon, 19 Aug 2024 18:24:10 -0700 Subject: [PATCH 23/29] Add to pipeline --- lib/gc/Transforms/Pipeline.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lib/gc/Transforms/Pipeline.cpp b/lib/gc/Transforms/Pipeline.cpp index 74da09bf4..c0ebfb175 100644 --- a/lib/gc/Transforms/Pipeline.cpp +++ b/lib/gc/Transforms/Pipeline.cpp @@ -51,6 +51,8 @@ void populateTensorPasses(mlir::OpPassManager &pm) { // todo: padding propagation pass // todo: layout propagation pass // todo: tensor constant propagation pass + pm.addPass(createConstantSubgraphAnalysisPass()); + pm.addPass(createConstantTensorFoldingPass()); // linalg.matmul lowering to (scf.loop + linalg.brgemm) pass pm.addNestedPass(createDeepTileContractionNamedOp()); From 22c4474dac1302e0c2696f1d16fb54cb4e36d817 Mon Sep 17 00:00:00 2001 From: "Niu, Xiaoguang" Date: Mon, 26 Aug 2024 00:03:07 -0700 Subject: [PATCH 24/29] Forbid buffer_to_tensor case --- lib/gc/Transforms/ConstantTensorFolding.cpp | 72 ++++++++++++------- .../unittests/ExecutionEngine/JitWrapper.cpp | 1 - 2 files changed, 47 insertions(+), 26 deletions(-) diff --git a/lib/gc/Transforms/ConstantTensorFolding.cpp b/lib/gc/Transforms/ConstantTensorFolding.cpp index 9b1aa27cb..f1d85e62e 100644 --- a/lib/gc/Transforms/ConstantTensorFolding.cpp +++ b/lib/gc/Transforms/ConstantTensorFolding.cpp @@ -9,8 +9,8 @@ // This transformation pass performs a constant subgraph transform in MLIR. // //===----------------------------------------------------------------------===// - #include +#include #include #include "mlir/Transforms/Passes.h" @@ -496,6 +496,14 @@ void getInputsAndOutputs(Block &block, [](Operation *child) { return !isInConstantSubgraph(child); })) { + // skip case: memref v -> bufferization.to_tensor -> tensor t. 
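+      // Here the whole chain is just the memref-to-tensor conversion, so
+      // folding would merely cache a copy of the buffer; undo the
+      // bookkeeping above and leave this argument unfolded.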
+ if (valuesOnTheWay.size() == 2 && v.hasOneUse() && + isa(v.getDefiningOp())) { + inputTypes.pop_back(); + inputValues.pop_back(); + constArgsIndexes.erase(id); + continue; + } if (std::find(outputValues.begin(), outputValues.end(), v) == outputValues.end()) { outputTypes.push_back(v.getType()); @@ -777,13 +785,17 @@ void ConstantTensorFolding::runOnOperation() { getInputsAndOutputs(block, compiletimeConstArgsIndexes, compiletimeInputTypes, compiletimeInputValues, compiletimeOutputTypes, compiletimeOutputValues); + assert(compiletimeInputTypes.size() == compiletimeInputValues.size()); + assert(compiletimeOutputTypes.size() == compiletimeOutputValues.size()); - func::FuncOp compiletimeFoldFunc = - buildFoldFunc(context, builder, topOp, "compiletime_fold", constOps, - compiletimeInputTypes, compiletimeInputValues, - compiletimeOutputTypes, compiletimeOutputValues); - (void)compiletimeFoldFunc; - canonicalizeAndClean(context, compiletimeFoldFunc.getOperation()); + if (!compiletimeOutputTypes.empty()) { + func::FuncOp compiletimeFoldFunc = + buildFoldFunc(context, builder, topOp, "compiletime_fold", constOps, + compiletimeInputTypes, compiletimeInputValues, + compiletimeOutputTypes, compiletimeOutputValues); + (void)compiletimeFoldFunc; + canonicalizeAndClean(context, compiletimeFoldFunc.getOperation()); + } // ===== build runtime folding function ===== SmallVector runtimeInputTypes; // types of constant tensors @@ -795,12 +807,16 @@ void ConstantTensorFolding::runOnOperation() { getInputsAndOutputs(block, runtimeConstArgsIndexes, runtimeInputTypes, runtimeInputValues, runtimeOutputTypes, runtimeOutputValues); - - func::FuncOp runtimeFoldFunc = buildFoldFunc( - context, builder, topOp, "runtime_fold", constOps, runtimeInputTypes, - runtimeInputValues, runtimeOutputTypes, runtimeOutputValues); - (void)runtimeFoldFunc; - canonicalizeAndClean(context, runtimeFoldFunc.getOperation()); + assert(runtimeInputTypes.size() == runtimeInputValues.size()); + assert(runtimeOutputTypes.size() == runtimeOutputValues.size()); + + if (!runtimeOutputTypes.empty()) { + func::FuncOp runtimeFoldFunc = buildFoldFunc( + context, builder, topOp, "runtime_fold", constOps, runtimeInputTypes, + runtimeInputValues, runtimeOutputTypes, runtimeOutputValues); + (void)runtimeFoldFunc; + canonicalizeAndClean(context, runtimeFoldFunc.getOperation()); + } // ===== build computing function ===== std::unordered_set constArgsIndexes = compiletimeConstArgsIndexes; @@ -811,8 +827,10 @@ void ConstantTensorFolding::runOnOperation() { SmallVector outputValues = compiletimeOutputValues; outputValues.insert(outputValues.end(), runtimeOutputValues.begin(), runtimeOutputValues.end()); - modifyComputeFunc(context, builder, topOp, topFunc, block, constArgsIndexes, - outputTypes, outputValues); + if (!outputTypes.empty()) { + modifyComputeFunc(context, builder, topOp, topFunc, block, + constArgsIndexes, outputTypes, outputValues); + } } else { std::unordered_set constArgsIndexes = compiletimeConstArgsIndexes; constArgsIndexes.merge(runtimeConstArgsIndexes); @@ -827,16 +845,20 @@ void ConstantTensorFolding::runOnOperation() { getArithConstantOutputs(block, outputTypes, outputValues); getInputsAndOutputs(block, constArgsIndexes, inputTypes, inputValues, outputTypes, outputValues); - - func::FuncOp foldFunc = - buildFoldFunc(context, builder, topOp, "runtime_fold", constOps, - inputTypes, inputValues, outputTypes, outputValues); - (void)foldFunc; - canonicalizeAndClean(context, foldFunc.getOperation()); - - // ===== build computing function 
===== - modifyComputeFunc(context, builder, topOp, topFunc, block, constArgsIndexes, - outputTypes, outputValues); + assert(inputTypes.size() == inputValues.size()); + assert(outputTypes.size() == outputValues.size()); + + if (!outputTypes.empty()) { + func::FuncOp foldFunc = + buildFoldFunc(context, builder, topOp, "runtime_fold", constOps, + inputTypes, inputValues, outputTypes, outputValues); + (void)foldFunc; + canonicalizeAndClean(context, foldFunc.getOperation()); + + // ===== build computing function ===== + modifyComputeFunc(context, builder, topOp, topFunc, block, + constArgsIndexes, outputTypes, outputValues); + } } canonicalizeAndClean(context, topOp); diff --git a/test/mlir/unittests/ExecutionEngine/JitWrapper.cpp b/test/mlir/unittests/ExecutionEngine/JitWrapper.cpp index f7b93eaa6..48b27975e 100644 --- a/test/mlir/unittests/ExecutionEngine/JitWrapper.cpp +++ b/test/mlir/unittests/ExecutionEngine/JitWrapper.cpp @@ -25,7 +25,6 @@ using namespace mlir; static const char code1[] = R"mlir( module { -llvm.mlir.global constant @__num_orig_num_args(3 : i32) : i32 func.func @compute(%a: tensor<128xf32>, %b: tensor<128xf32>) -> tensor<128xf32> attributes { llvm.emit_c_interface } { %out = tensor.empty() : tensor<128xf32> %2 = linalg.add ins(%a, %b : tensor<128xf32>,tensor<128xf32>) outs(%out : tensor<128xf32>) -> tensor<128xf32> From e20d059ef539a8e256990fe0396a45da27e78a45 Mon Sep 17 00:00:00 2001 From: "Niu, Xiaoguang" Date: Fri, 6 Sep 2024 11:14:31 +0800 Subject: [PATCH 25/29] Add shape info to global --- lib/gc/Transforms/ConstantTensorFolding.cpp | 38 ++++++++++++++++--- .../test_constant_tensor_folding-1.mlir | 2 +- .../test_constant_tensor_folding.mlir | 2 +- ...constant_tensor_folding_bf16_two_layers.py | 2 +- ..._constant_tensor_folding_f32_two_layers.py | 2 +- 5 files changed, 37 insertions(+), 9 deletions(-) diff --git a/lib/gc/Transforms/ConstantTensorFolding.cpp b/lib/gc/Transforms/ConstantTensorFolding.cpp index f1d85e62e..17270d54f 100644 --- a/lib/gc/Transforms/ConstantTensorFolding.cpp +++ b/lib/gc/Transforms/ConstantTensorFolding.cpp @@ -602,7 +602,7 @@ func::FuncOp buildFoldFunc(MLIRContext *context, OpBuilder &builder, globalIndexes.insert(globalIndexes.begin(), globalIndexes.size()); auto moduleOp = dyn_cast(topOp); addGlobalI64Array(moduleOp, moduleOp.getLoc(), builder, - "__" + name + "_buffer_ids_", globalIndexes); + "__" + name + "_buffer_ids", globalIndexes); auto returnOp = builder.create(topOp->getLoc(), outputValuesInFold); @@ -615,6 +615,24 @@ func::FuncOp buildFoldFunc(MLIRContext *context, OpBuilder &builder, }); } + // the ranks of folded results. + SmallVector foldRanks; + // the shapes of folded results. 
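+  // Encoding of __folded_shapes: for each folded result, its dimension
+  // sizes are appended in order, followed by its element size in bytes.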
+ SmallVector foldShapes; + for (Value &tensor : outputValuesInFold) { + auto t = dyn_cast(tensor.getType()); + Type eleType = t.getElementType(); + int64_t bitWidth = eleType.getIntOrFloatBitWidth() / 8; // bytes + ArrayRef shape = t.getShape(); + foldRanks.push_back(shape.size()); + foldShapes.insert(foldShapes.end(), shape.begin(), shape.end()); + foldShapes.push_back(bitWidth); + } + addGlobalI32Array(moduleOp, moduleOp.getLoc(), builder, "__folded_ranks", + foldRanks); + addGlobalI64Array(moduleOp, moduleOp.getLoc(), builder, "__folded_shapes", + foldShapes); + foldFunc.setVisibility(SymbolTable::Visibility::Public); foldFunc->setAttr(LLVM::LLVMDialect::getEmitCWrapperAttrName(), UnitAttr::get(context)); @@ -631,11 +649,13 @@ void modifyComputeFunc(MLIRContext *context, OpBuilder &builder, std::unordered_set &constArgsIndexes, SmallVector &outputTypes, SmallVector &outputValues) { - // the indexes of args to the folding func. + // the indexes of args to the folding func, including to-fold tensors and + // folded results. SmallVector foldArgs; - // the indexes of folded args. + // the indexes of folded results. SmallVector foldIds; - // the indexes of args to the computing func. + // the indexes of args to the computing func, including non-fold tensors and + // folded results. SmallVector computeArgs; // modify the BlockArguments of block @@ -715,7 +735,7 @@ void modifyComputeFunc(MLIRContext *context, OpBuilder &builder, addGlobalI32Array(moduleOp, moduleOp.getLoc(), builder, "__compute_args", computeArgs); - addGlobalI32(moduleOp, moduleOp.getLoc(), builder, "__num_orig_num_args", + addGlobalI32(moduleOp, moduleOp.getLoc(), builder, "__num_orig_args", oriNumArgs); } @@ -740,6 +760,14 @@ void canonicalizeAndClean(MLIRContext *context, Operation *topOp) { op->removeAttr("onednn_graph.in_const_subgraph"); } }); + topOp->walk([&](func::FuncOp op) { + if (op.getOperation()->getAttr("compiletime_const_args_index")) { + op.getOperation()->removeAttr("compiletime_const_args_index"); + } + if (op.getOperation()->getAttr("runtime_const_args_index")) { + op.getOperation()->removeAttr("runtime_const_args_index"); + } + }); } // Operate on tensors. Create fold() and compute() on module. 
The diff --git a/test/gc/Transforms/test_constant_tensor_folding-1.mlir b/test/gc/Transforms/test_constant_tensor_folding-1.mlir index 0664edafb..cdb5d1397 100644 --- a/test/gc/Transforms/test_constant_tensor_folding-1.mlir +++ b/test/gc/Transforms/test_constant_tensor_folding-1.mlir @@ -32,7 +32,7 @@ module { // COM: expected output: // COM: module { -// COM: llvm.mlir.global external constant @__num_orig_num_args(3 : i32) {addr_space = 0 : i32} : i32 +// COM: llvm.mlir.global external constant @__num_orig_args(3 : i32) {addr_space = 0 : i32} : i32 // COM: llvm.mlir.global external constant @__compute_args(dense<[3, 2, 3, 4]> : tensor<4xi32>) {addr_space = 0 : i32} : !llvm.array<4 x i32> // COM: llvm.mlir.global external constant @__fold_args(dense<[4, 0, 1, 3, 4]> : tensor<5xi32>) {addr_space = 0 : i32} : !llvm.array<5 x i32> // COM: llvm.mlir.global external constant @__fold_buffer_ids(dense<[2, 0, 1]> : tensor<3xi64>) {addr_space = 0 : i32} : !llvm.array<3 x i64> diff --git a/test/gc/Transforms/test_constant_tensor_folding.mlir b/test/gc/Transforms/test_constant_tensor_folding.mlir index 71f475c00..59fe90236 100644 --- a/test/gc/Transforms/test_constant_tensor_folding.mlir +++ b/test/gc/Transforms/test_constant_tensor_folding.mlir @@ -74,7 +74,7 @@ module { // COM: expected output: // COM: module { -// COM: llvm.mlir.global external constant @__num_orig_num_args(5 : i32) {addr_space = 0 : i32} : i32 +// COM: llvm.mlir.global external constant @__num_orig_args(5 : i32) {addr_space = 0 : i32} : i32 // COM: llvm.mlir.global external constant @__compute_args(dense<[5, 0, 5, 6, 7, 8]> : tensor<6xi32>) {addr_space = 0 : i32} : !llvm.array<6 x i32> // COM: llvm.mlir.global external constant @__fold_args(dense<[8, 1, 2, 3, 4, 5, 6, 7, 8]> : tensor<9xi32>) {addr_space = 0 : i32} : !llvm.array<9 x i32> // COM: llvm.mlir.global external constant @__fold_buffer_ids(dense<[4, 0, 1, 2, 3]> : tensor<5xi64>) {addr_space = 0 : i32} : !llvm.array<5 x i64> diff --git a/test/gc/Transforms/test_constant_tensor_folding_bf16_two_layers.py b/test/gc/Transforms/test_constant_tensor_folding_bf16_two_layers.py index d444416e7..4e66b1ebf 100644 --- a/test/gc/Transforms/test_constant_tensor_folding_bf16_two_layers.py +++ b/test/gc/Transforms/test_constant_tensor_folding_bf16_two_layers.py @@ -141,7 +141,7 @@ #map3 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> #map4 = affine_map<(d0, d1, d2, d3) -> (d1, d3)> module { - llvm.mlir.global external constant @__num_orig_num_args(5 : i32) {addr_space = 0 : i32} : i32 + llvm.mlir.global external constant @__num_orig_args(5 : i32) {addr_space = 0 : i32} : i32 llvm.mlir.global external constant @__compute_args(dense<[5, 0, 5, 6, 7, 8]> : tensor<6xi32>) {addr_space = 0 : i32} : !llvm.array<6 x i32> llvm.mlir.global external constant @__fold_args(dense<[8, 1, 2, 3, 4, 5, 6, 7, 8]> : tensor<9xi32>) {addr_space = 0 : i32} : !llvm.array<9 x i32> llvm.mlir.global external constant @__runtime_fold_buffer_ids_(dense<[4, 0, 1, 2, 3]> : tensor<5xi64>) {addr_space = 0 : i32} : !llvm.array<5 x i64> diff --git a/test/gc/Transforms/test_constant_tensor_folding_f32_two_layers.py b/test/gc/Transforms/test_constant_tensor_folding_f32_two_layers.py index 377e28a36..e05e2ac15 100644 --- a/test/gc/Transforms/test_constant_tensor_folding_f32_two_layers.py +++ b/test/gc/Transforms/test_constant_tensor_folding_f32_two_layers.py @@ -111,7 +111,7 @@ #map3 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> #map4 = affine_map<(d0, d1, d2, d3) -> (d1, d3)> module { - llvm.mlir.global external 
constant @__num_orig_num_args(5 : i32) {addr_space = 0 : i32} : i32 + llvm.mlir.global external constant @__num_orig_args(5 : i32) {addr_space = 0 : i32} : i32 llvm.mlir.global external constant @__compute_args(dense<[5, 0, 5, 6, 7, 8]> : tensor<6xi32>) {addr_space = 0 : i32} : !llvm.array<6 x i32> llvm.mlir.global external constant @__fold_args(dense<[8, 1, 2, 3, 4, 5, 6, 7, 8]> : tensor<9xi32>) {addr_space = 0 : i32} : !llvm.array<9 x i32> llvm.mlir.global external constant @__runtime_fold_buffer_ids_(dense<[4, 0, 1, 2, 3]> : tensor<5xi64>) {addr_space = 0 : i32} : !llvm.array<5 x i64> From edbb708155112a9e3e42577789dc2ee9351434ad Mon Sep 17 00:00:00 2001 From: "Niu, Xiaoguang" Date: Sat, 14 Sep 2024 10:42:33 +0800 Subject: [PATCH 26/29] Clean tests. --- ...ir => test_constant_tensor_folding-0.mlir} | 2 + .../test_constant_tensor_folding-1.mlir | 130 +++++---- .../test_constant_tensor_folding.mlir | 82 ------ .../test_constant_tensor_folding_bf16_4D5D.py | 101 ------- ...constant_tensor_folding_bf16_two_layers.py | 258 ------------------ .../test_constant_tensor_folding_f32_4D4D.py | 96 ------- ..._constant_tensor_folding_f32_two_layers.py | 225 --------------- 7 files changed, 83 insertions(+), 811 deletions(-) rename test/gc/Transforms/{test_constant_tensor_folding-2.mlir => test_constant_tensor_folding-0.mlir} (99%) delete mode 100644 test/gc/Transforms/test_constant_tensor_folding.mlir delete mode 100644 test/gc/Transforms/test_constant_tensor_folding_bf16_4D5D.py delete mode 100644 test/gc/Transforms/test_constant_tensor_folding_bf16_two_layers.py delete mode 100644 test/gc/Transforms/test_constant_tensor_folding_f32_4D4D.py delete mode 100644 test/gc/Transforms/test_constant_tensor_folding_f32_two_layers.py diff --git a/test/gc/Transforms/test_constant_tensor_folding-2.mlir b/test/gc/Transforms/test_constant_tensor_folding-0.mlir similarity index 99% rename from test/gc/Transforms/test_constant_tensor_folding-2.mlir rename to test/gc/Transforms/test_constant_tensor_folding-0.mlir index a5e123085..eabdacc93 100644 --- a/test/gc/Transforms/test_constant_tensor_folding-2.mlir +++ b/test/gc/Transforms/test_constant_tensor_folding-0.mlir @@ -1,5 +1,7 @@ // RUN: gc-opt --split-input-file -pass-pipeline="builtin.module(constant-subgraph-analysis,constant-tensor-folding)" %s | FileCheck %s +// COM:A complete example of compile-time and runtime folding. + // CHECK-LABEL: func.func @entry #map = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d2, d4, d6)> #map1 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d1, d2, d6 floordiv 2, d5, d3)> diff --git a/test/gc/Transforms/test_constant_tensor_folding-1.mlir b/test/gc/Transforms/test_constant_tensor_folding-1.mlir index cdb5d1397..92231703d 100644 --- a/test/gc/Transforms/test_constant_tensor_folding-1.mlir +++ b/test/gc/Transforms/test_constant_tensor_folding-1.mlir @@ -1,59 +1,91 @@ // RUN: gc-opt --split-input-file -pass-pipeline="builtin.module(constant-subgraph-analysis,constant-tensor-folding)" %s | FileCheck %s +// COM: Test the 'postponeBroadcast' feature of constant tensor folding. 
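+// COM: 'postponeBroadcast' keeps the broadcast (and the ops that depend on
+// COM: its expanded shape) in entry() rather than in the folding function,
+// COM: so the cached folded tensor stays at its pre-broadcast size
+// COM: (8x32 here instead of 2x8x32x32).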
+ // CHECK-LABEL: func.func @entry +#map = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d2, d4, d6)> +#map1 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d1, d2, d6 floordiv 2, d5, d3)> +#map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d4, d5)> +#map3 = affine_map<(d0, d1, d2, d3) -> (d1, d3)> +#map4 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> module { - func.func @entry(%a: tensor<128xf32>, %b: tensor<128xf32>, %c: tensor<128xf32>) -> (tensor<128xf32>, tensor<128xf32>) attributes { llvm.emit_c_interface, runtime_const_args_index = [0 : i32, 1 : i32] } { - %c0 = arith.constant 0 : index - cpuruntime.printf "HI%zu\n" %c0 : index - %ax2 = tensor.empty() : tensor<128xf32> - %2 = linalg.add ins(%a, %a : tensor<128xf32>,tensor<128xf32>) outs(%ax2 : tensor<128xf32>) -> tensor<128xf32> - %bx2 = tensor.empty() : tensor<128xf32> - %3 = linalg.add ins(%b, %b : tensor<128xf32>,tensor<128xf32>) outs(%bx2 : tensor<128xf32>) -> tensor<128xf32> - %ax2pbx2 = tensor.empty() : tensor<128xf32> - %4 = linalg.add ins(%2, %3 : tensor<128xf32>,tensor<128xf32>) outs(%ax2pbx2 : tensor<128xf32>) -> tensor<128xf32> - %ax2mbx2 = tensor.empty() : tensor<128xf32> - %5 = linalg.mul ins(%2, %3 : tensor<128xf32>,tensor<128xf32>) outs(%ax2mbx2 : tensor<128xf32>) -> tensor<128xf32> - %ax2pbx2pc = tensor.empty() : tensor<128xf32> - %6 = linalg.add ins(%4, %c : tensor<128xf32>,tensor<128xf32>) outs(%ax2pbx2pc : tensor<128xf32>) -> tensor<128xf32> - %ax2mbx2mc = tensor.empty() : tensor<128xf32> - %7 = linalg.mul ins(%5, %c : tensor<128xf32>,tensor<128xf32>) outs(%ax2mbx2mc : tensor<128xf32>) -> tensor<128xf32> - return %6, %7 : tensor<128xf32>, tensor<128xf32> - } + // COM: A two-layer mlp. arg0: input feature. + // COM: arg1: weight of #1 linear. arg2: bias of #1 linear. + // COM: arg3: weight of #2 linear. arg4: bias of #2 linear. + func.func @entry(%arg0: tensor<64x512xbf16>, %arg1: tensor<512x256xbf16>, %arg2: tensor<256xbf16>, %arg3: tensor<256x1024xbf16>, %arg4: tensor<1024xbf16>) -> tensor<64x1024xbf16> attributes {llvm.emit_c_interface, runtime_const_args_index = [1 : i32, 2 : i32, 3 : i32, 4 : i32]} { + %1 = tensor.empty() : tensor<2x16x32x32xbf16> + %packed_arg0 = tensor.pack %arg0 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %1 : tensor<64x512xbf16> -> tensor<2x16x32x32xbf16> + %2 = tensor.empty() : tensor<8x16x32x32xbf16> + %packed_arg1 = tensor.pack %arg1 outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %2 : tensor<512x256xbf16> -> tensor<8x16x32x32xbf16> + %3 = tensor.empty() : tensor<8x16x16x32x2xbf16> + %packed_packed_arg1 = tensor.pack %packed_arg1 inner_dims_pos = [2] inner_tiles = [2] into %3 : tensor<8x16x32x32xbf16> -> tensor<8x16x16x32x2xbf16> + %4 = tensor.empty() : tensor<2x8x32x32xbf16> + %cst_0 = arith.constant 0.000000e+00 : bf16 + %5 = linalg.fill ins(%cst_0 : bf16) outs(%4 : tensor<2x8x32x32xbf16>) -> tensor<2x8x32x32xbf16> + %6 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction", "reduction", "parallel", "parallel", "reduction"]} ins(%packed_arg0, %packed_packed_arg1 : tensor<2x16x32x32xbf16>, tensor<8x16x16x32x2xbf16>) outs(%5 : tensor<2x8x32x32xbf16>) { + ^bb0(%in: bf16, %in_0: bf16, %out: bf16): + %44 = arith.mulf %in, %in_0 : bf16 + %55 = arith.addf %out, %44 : bf16 + linalg.yield %55 : bf16 + } -> tensor<2x8x32x32xbf16> + + // COM: Operations on %arg2: {pack, broadcast, extf, mul, truncf, bias_add} in entry(). 
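+    // COM: extf, the multiply-by-scalar and truncf are elementwise, so they
+    // COM: commute with the broadcast; the pass can therefore fold them on
+    // COM: the small tensor<8x32xbf16> and postpone the broadcast into
+    // COM: entry().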
+ %15 = tensor.empty() : tensor<8x32xbf16> + %packed_arg2 = tensor.pack %arg2 outer_dims_perm = [0] inner_dims_pos = [0] inner_tiles = [32] into %15 : tensor<256xbf16> -> tensor<8x32xbf16> + %bc_arg2_init = tensor.empty() : tensor<2x8x32x32xbf16> + %bc_arg2 = linalg.broadcast ins(%packed_arg2 : tensor<8x32xbf16>) outs(%bc_arg2_init : tensor<2x8x32x32xbf16>) dimensions = [0, 2] + %extf32 = arith.extf %bc_arg2 : tensor<2x8x32x32xbf16> to tensor<2x8x32x32xf32> + %cst_2 = arith.constant 2.000000e+00 : f32 + %extf32_mul2_init = tensor.empty() : tensor<2x8x32x32xf32> + %extf32_mul2 = linalg.generic {indexing_maps = [#map4, #map4], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extf32 : tensor<2x8x32x32xf32>) outs(%extf32_mul2_init : tensor<2x8x32x32xf32>) { + ^bb0(%in: f32, %out: f32): + %8 = arith.mulf %in, %cst_2 : f32 + linalg.yield %8 : f32 + } -> tensor<2x8x32x32xf32> + %truncbf16 = arith.truncf %extf32_mul2 : tensor<2x8x32x32xf32> to tensor<2x8x32x32xbf16> + + %7 = linalg.generic {indexing_maps = [#map4, #map4], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%truncbf16 : tensor<2x8x32x32xbf16>) outs(%6 : tensor<2x8x32x32xbf16>) { + ^bb0(%in: bf16, %out: bf16): + %45 = arith.addf %in, %out : bf16 + linalg.yield %45 : bf16 + } -> tensor<2x8x32x32xbf16> + + %8 = tensor.empty() : tensor<32x8x32x32xbf16> + %packed_arg3 = tensor.pack %arg3 outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %8 : tensor<256x1024xbf16> -> tensor<32x8x32x32xbf16> + %9 = tensor.empty() : tensor<32x8x16x32x2xbf16> + %packed_packed_arg3 = tensor.pack %packed_arg3 inner_dims_pos = [2] inner_tiles = [2] into %9 : tensor<32x8x32x32xbf16> -> tensor<32x8x16x32x2xbf16> + %10 = tensor.empty() : tensor<2x32x32x32xbf16> + %11 = linalg.fill ins(%cst_0 : bf16) outs(%10 : tensor<2x32x32x32xbf16>) -> tensor<2x32x32x32xbf16> + %12 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction", "reduction", "parallel", "parallel", "reduction"]} ins(%7, %packed_packed_arg3 : tensor<2x8x32x32xbf16>, tensor<32x8x16x32x2xbf16>) outs(%11 : tensor<2x32x32x32xbf16>) { + ^bb0(%in: bf16, %in_0: bf16, %out: bf16): + %46 = arith.mulf %in, %in_0 : bf16 + %56 = arith.addf %out, %46 : bf16 + linalg.yield %56 : bf16 + } -> tensor<2x32x32x32xbf16> + %16 = tensor.empty() : tensor<32x32xbf16> + %packed_arg4 = tensor.pack %arg4 outer_dims_perm = [0] inner_dims_pos = [0] inner_tiles = [32] into %16 : tensor<1024xbf16> -> tensor<32x32xbf16> + %13 = linalg.generic {indexing_maps = [#map3, #map4], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%packed_arg4 : tensor<32x32xbf16>) outs(%12 : tensor<2x32x32x32xbf16>) { + ^bb0(%in: bf16, %out: bf16): + %47 = arith.addf %in, %out : bf16 + linalg.yield %47 : bf16 + } -> tensor<2x32x32x32xbf16> + %14 = tensor.empty() : tensor<64x1024xbf16> + %unpack = tensor.unpack %13 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %14 : tensor<2x32x32x32xbf16> -> tensor<64x1024xbf16> + return %unpack : tensor<64x1024xbf16> + } } -// CHECK: cpuruntime.printf -// CHECK: linalg.add -// CHECK: linalg.mul +// COM: After transform, operations on %arg2: {pack, extf, mul, truncf} in fold(), {broadcast, bias_add} in entry(). 
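+// COM: The CHECK lines below verify the split: entry() still contains the
+// COM: broadcast, while runtime_fold() contains the folded extf and truncf.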
+// CHECK: linalg.broadcast // CHECK: func.func @runtime_fold -// CHECK: linalg.add -// CHECK: linalg.add -// CHECK: linalg.add -// CHECK: linalg.mul +// CHECK: arith.extf +// CHECK: arith.truncf // COM: expected output: // COM: module { -// COM: llvm.mlir.global external constant @__num_orig_args(3 : i32) {addr_space = 0 : i32} : i32 -// COM: llvm.mlir.global external constant @__compute_args(dense<[3, 2, 3, 4]> : tensor<4xi32>) {addr_space = 0 : i32} : !llvm.array<4 x i32> -// COM: llvm.mlir.global external constant @__fold_args(dense<[4, 0, 1, 3, 4]> : tensor<5xi32>) {addr_space = 0 : i32} : !llvm.array<5 x i32> -// COM: llvm.mlir.global external constant @__fold_buffer_ids(dense<[2, 0, 1]> : tensor<3xi64>) {addr_space = 0 : i32} : !llvm.array<3 x i64> -// COM: func.func @entry(%arg0: tensor<128xf32>, %arg1: tensor<128xf32>, %arg2: tensor<128xf32>) -> (tensor<128xf32>, tensor<128xf32>) attributes {llvm.emit_c_interface, runtime_const_args_index = [0 : i32, 1 : i32]} { -// COM: %c0 = arith.constant 0 : index -// COM: cpuruntime.printf "HI%zu\0A" %c0 : index -// COM: %0 = tensor.empty() : tensor<128xf32> -// COM: %1 = linalg.add ins(%arg2, %arg0 : tensor<128xf32>, tensor<128xf32>) outs(%0 : tensor<128xf32>) -> tensor<128xf32> -// COM: %2 = tensor.empty() : tensor<128xf32> -// COM: %3 = linalg.mul ins(%arg1, %arg0 : tensor<128xf32>, tensor<128xf32>) outs(%2 : tensor<128xf32>) -> tensor<128xf32> -// COM: return %1, %3 : tensor<128xf32>, tensor<128xf32> -// COM: } -// COM: func.func @fold(%arg0: tensor<128xf32>, %arg1: tensor<128xf32>) -> (tensor<128xf32>, tensor<128xf32>) attributes {llvm.emit_c_interface} { -// COM: %0 = tensor.empty() : tensor<128xf32> -// COM: %1 = linalg.add ins(%arg0, %arg0 : tensor<128xf32>, tensor<128xf32>) outs(%0 : tensor<128xf32>) -> tensor<128xf32> -// COM: %2 = tensor.empty() : tensor<128xf32> -// COM: %3 = linalg.add ins(%arg1, %arg1 : tensor<128xf32>, tensor<128xf32>) outs(%2 : tensor<128xf32>) -> tensor<128xf32> -// COM: %4 = tensor.empty() : tensor<128xf32> -// COM: %5 = linalg.add ins(%1, %3 : tensor<128xf32>, tensor<128xf32>) outs(%4 : tensor<128xf32>) -> tensor<128xf32> -// COM: %6 = tensor.empty() : tensor<128xf32> -// COM: %7 = linalg.mul ins(%1, %3 : tensor<128xf32>, tensor<128xf32>) outs(%6 : tensor<128xf32>) -> tensor<128xf32> -// COM: return %7, %5 : tensor<128xf32>, tensor<128xf32> -// COM: } -// COM: } \ No newline at end of file +// COM: llvm.mlir.global external constant @__num_orig_args(5 : i32) {addr_space = 0 : i32} : i32 +// COM: llvm.mlir.global external constant @__compute_args(dense<[5, 0, 5, 6, 7, 8]> : tensor<6xi32>) {addr_space = 0 : i32} : !llvm.array<6 x i32> +// COM: llvm.mlir.global external constant @__fold_args(dense<[8, 1, 2, 3, 4, 5, 6, 7, 8]> : tensor<9xi32>) {addr_space = 0 : i32} : !llvm.array<9 x i32> +// COM: llvm.mlir.global external constant @__fold_buffer_ids(dense<[4, 0, 1, 2, 3]> : tensor<5xi64>) {addr_space = 0 : i32} : !llvm.array<5 x i64> +// COM: func.func @entry(%arg0: tensor<64x512xbf16>, %arg1: tensor<8x16x16x32x2xbf16>, %arg2: tensor<8x32xbf16>, %arg3: tensor<32x8x16x32x2xbf16>, %arg4: tensor<32x32xbf16>) -> tensor<64x1024xbf16> attributes {llvm.emit_c_interface, runtime_const_args_index = [1 : i32, 2 : i32, 3 : i32, 4 : i32]} +// COM: func.func @fold(%arg0: tensor<512x256xbf16>, %arg1: tensor<256xbf16>, %arg2: tensor<256x1024xbf16>, %arg3: tensor<1024xbf16>) -> (tensor<8x16x16x32x2xbf16>, tensor<8x32xbf16>, tensor<32x8x16x32x2xbf16>, tensor<32x32xbf16>) attributes {llvm.emit_c_interface} diff --git 
a/test/gc/Transforms/test_constant_tensor_folding.mlir b/test/gc/Transforms/test_constant_tensor_folding.mlir deleted file mode 100644 index 59fe90236..000000000 --- a/test/gc/Transforms/test_constant_tensor_folding.mlir +++ /dev/null @@ -1,82 +0,0 @@ -// RUN: gc-opt --split-input-file -pass-pipeline="builtin.module(constant-subgraph-analysis,constant-tensor-folding)" %s | FileCheck %s - -// CHECK-LABEL: func.func @entry -#map = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d2, d4, d6)> -#map1 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d1, d2, d6 floordiv 2, d5, d3)> -#map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d4, d5)> -#map3 = affine_map<(d0, d1, d2, d3) -> (d1, d3)> -#map4 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> -module { - // COM: A two-layer mlp. arg0: input feature. arg1: weight of #1 linear. arg2: bias of #1 linear. - // COM: arg3: weight of #2 linear. arg4: bias of #2 linear. - func.func @entry(%arg0: tensor<64x512xbf16>, %arg1: tensor<512x256xbf16>, %arg2: tensor<256xbf16>, %arg3: tensor<256x1024xbf16>, %arg4: tensor<1024xbf16>) -> tensor<64x1024xbf16> attributes {llvm.emit_c_interface, runtime_const_args_index = [1 : i32, 2 : i32, 3 : i32, 4 : i32]} { - %1 = tensor.empty() : tensor<2x16x32x32xbf16> - %packed_arg0 = tensor.pack %arg0 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %1 : tensor<64x512xbf16> -> tensor<2x16x32x32xbf16> - %2 = tensor.empty() : tensor<8x16x32x32xbf16> - %packed_arg1 = tensor.pack %arg1 outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %2 : tensor<512x256xbf16> -> tensor<8x16x32x32xbf16> - %3 = tensor.empty() : tensor<8x16x16x32x2xbf16> - %packed_packed_arg1 = tensor.pack %packed_arg1 inner_dims_pos = [2] inner_tiles = [2] into %3 : tensor<8x16x32x32xbf16> -> tensor<8x16x16x32x2xbf16> - %4 = tensor.empty() : tensor<2x8x32x32xbf16> - %cst_0 = arith.constant 0.000000e+00 : bf16 - %5 = linalg.fill ins(%cst_0 : bf16) outs(%4 : tensor<2x8x32x32xbf16>) -> tensor<2x8x32x32xbf16> - %6 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction", "reduction", "parallel", "parallel", "reduction"]} ins(%packed_arg0, %packed_packed_arg1 : tensor<2x16x32x32xbf16>, tensor<8x16x16x32x2xbf16>) outs(%5 : tensor<2x8x32x32xbf16>) { - ^bb0(%in: bf16, %in_0: bf16, %out: bf16): - %44 = arith.mulf %in, %in_0 : bf16 - %55 = arith.addf %out, %44 : bf16 - linalg.yield %55 : bf16 - } -> tensor<2x8x32x32xbf16> - %15 = tensor.empty() : tensor<8x32xbf16> - %packed_arg2 = tensor.pack %arg2 outer_dims_perm = [0] inner_dims_pos = [0] inner_tiles = [32] into %15 : tensor<256xbf16> -> tensor<8x32xbf16> - %bc_arg2_init = tensor.empty() : tensor<2x8x32x32xbf16> - %bc_arg2 = linalg.broadcast ins(%packed_arg2 : tensor<8x32xbf16>) outs(%bc_arg2_init : tensor<2x8x32x32xbf16>) dimensions = [0, 2] - %extf32 = arith.extf %bc_arg2 : tensor<2x8x32x32xbf16> to tensor<2x8x32x32xf32> - %cst_2 = arith.constant 2.000000e+00 : f32 - %extf32_mul2_init = tensor.empty() : tensor<2x8x32x32xf32> - %extf32_mul2 = linalg.generic {indexing_maps = [#map4, #map4], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extf32 : tensor<2x8x32x32xf32>) outs(%extf32_mul2_init : tensor<2x8x32x32xf32>) { - ^bb0(%in: f32, %out: f32): - %8 = arith.mulf %in, %cst_2 : f32 - linalg.yield %8 : f32 - } -> tensor<2x8x32x32xf32> - %truncbf16 = arith.truncf %extf32_mul2 : tensor<2x8x32x32xf32> to tensor<2x8x32x32xbf16> - %7 = linalg.generic {indexing_maps = [#map4, #map4], iterator_types = ["parallel", 
"parallel", "parallel", "parallel"]} ins(%truncbf16 : tensor<2x8x32x32xbf16>) outs(%6 : tensor<2x8x32x32xbf16>) { - ^bb0(%in: bf16, %out: bf16): - %45 = arith.addf %in, %out : bf16 - linalg.yield %45 : bf16 - } -> tensor<2x8x32x32xbf16> - %8 = tensor.empty() : tensor<32x8x32x32xbf16> - %packed_arg3 = tensor.pack %arg3 outer_dims_perm = [1, 0] inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %8 : tensor<256x1024xbf16> -> tensor<32x8x32x32xbf16> - %9 = tensor.empty() : tensor<32x8x16x32x2xbf16> - %packed_packed_arg3 = tensor.pack %packed_arg3 inner_dims_pos = [2] inner_tiles = [2] into %9 : tensor<32x8x32x32xbf16> -> tensor<32x8x16x32x2xbf16> - %10 = tensor.empty() : tensor<2x32x32x32xbf16> - %11 = linalg.fill ins(%cst_0 : bf16) outs(%10 : tensor<2x32x32x32xbf16>) -> tensor<2x32x32x32xbf16> - %12 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction", "reduction", "parallel", "parallel", "reduction"]} ins(%7, %packed_packed_arg3 : tensor<2x8x32x32xbf16>, tensor<32x8x16x32x2xbf16>) outs(%11 : tensor<2x32x32x32xbf16>) { - ^bb0(%in: bf16, %in_0: bf16, %out: bf16): - %46 = arith.mulf %in, %in_0 : bf16 - %56 = arith.addf %out, %46 : bf16 - linalg.yield %56 : bf16 - } -> tensor<2x32x32x32xbf16> - %16 = tensor.empty() : tensor<32x32xbf16> - %packed_arg4 = tensor.pack %arg4 outer_dims_perm = [0] inner_dims_pos = [0] inner_tiles = [32] into %16 : tensor<1024xbf16> -> tensor<32x32xbf16> - %13 = linalg.generic {indexing_maps = [#map3, #map4], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%packed_arg4 : tensor<32x32xbf16>) outs(%12 : tensor<2x32x32x32xbf16>) { - ^bb0(%in: bf16, %out: bf16): - %47 = arith.addf %in, %out : bf16 - linalg.yield %47 : bf16 - } -> tensor<2x32x32x32xbf16> - %14 = tensor.empty() : tensor<64x1024xbf16> - %unpack = tensor.unpack %13 inner_dims_pos = [0, 1] inner_tiles = [32, 32] into %14 : tensor<2x32x32x32xbf16> -> tensor<64x1024xbf16> - return %unpack : tensor<64x1024xbf16> - } -} -// CHECK: linalg.broadcast -// CHECK: func.func @runtime_fold -// CHECK: arith.extf -// CHECK: arith.truncf - -// COM: expected output: -// COM: module { -// COM: llvm.mlir.global external constant @__num_orig_args(5 : i32) {addr_space = 0 : i32} : i32 -// COM: llvm.mlir.global external constant @__compute_args(dense<[5, 0, 5, 6, 7, 8]> : tensor<6xi32>) {addr_space = 0 : i32} : !llvm.array<6 x i32> -// COM: llvm.mlir.global external constant @__fold_args(dense<[8, 1, 2, 3, 4, 5, 6, 7, 8]> : tensor<9xi32>) {addr_space = 0 : i32} : !llvm.array<9 x i32> -// COM: llvm.mlir.global external constant @__fold_buffer_ids(dense<[4, 0, 1, 2, 3]> : tensor<5xi64>) {addr_space = 0 : i32} : !llvm.array<5 x i64> -// COM: func.func @entry(%arg0: tensor<64x512xbf16>, %arg1: tensor<8x16x16x32x2xbf16>, %arg2: tensor<8x32xbf16>, %arg3: tensor<32x8x16x32x2xbf16>, %arg4: tensor<32x32xbf16>) -> tensor<64x1024xbf16> attributes {llvm.emit_c_interface, runtime_const_args_index = [1 : i32, 2 : i32, 3 : i32, 4 : i32]} -// COM: func.func @fold(%arg0: tensor<512x256xbf16>, %arg1: tensor<256xbf16>, %arg2: tensor<256x1024xbf16>, %arg3: tensor<1024xbf16>) -> (tensor<8x16x16x32x2xbf16>, tensor<8x32xbf16>, tensor<32x8x16x32x2xbf16>, tensor<32x32xbf16>) attributes {llvm.emit_c_interface} diff --git a/test/gc/Transforms/test_constant_tensor_folding_bf16_4D5D.py b/test/gc/Transforms/test_constant_tensor_folding_bf16_4D5D.py deleted file mode 100644 index 0fafbd080..000000000 --- a/test/gc/Transforms/test_constant_tensor_folding_bf16_4D5D.py +++ /dev/null @@ 
-1,101 +0,0 @@ -################################################################################ -# Copyright (C) 2024 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions -# and limitations under the License. -# SPDX-License-Identifier: Apache-2.0 -################################################################################ - -from enum import Flag -import os -import sys -import ml_dtypes -import numpy as np -from gc_mlir import ir -from gc_mlir.graph_compiler import GraphCompiler -from numpy.testing import assert_allclose - -project_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) -if project_dir not in sys.path: - sys.path.insert(0, project_dir) - -import torch -# from bench import py_timeit_bench -from utils import get_mlir_args - -if __name__ == "__main__": - with ir.Context() as ctx: - ctx.allow_unregistered_dialects = True - - M = 64 - N = 256 - K = 512 - MBlock = 32 - NBlock = 32 - KBlock = 32 - vnni_size = 2 - shapeA = [M // MBlock, K // KBlock, MBlock, KBlock] - shapeB = [N // NBlock, K // KBlock, KBlock // vnni_size, NBlock, vnni_size] - shapeC = [M // MBlock, N // NBlock, MBlock, NBlock] - - block_start = "{" - block_end = "}" - mlir_str = f''' -#map = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d2, d4, d6)> -#map1 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d1, d2, d6 floordiv 2, d5, d3)> -#map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d4, d5)> -module {block_start} - func.func @entry(%arg0: tensor<{M // MBlock}x{K // KBlock}x{MBlock}x{KBlock}xbf16>, %cst: tensor<{N // NBlock}x{K // KBlock}x{KBlock // vnni_size}x{NBlock}x{vnni_size}xbf16>) -> tensor<{M // MBlock}x{N // NBlock}x{MBlock}x{NBlock}xbf16> attributes {block_start}llvm.emit_c_interface{block_end} {block_start} - %cst_0 = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<{M // MBlock}x{N // NBlock}x{MBlock}x{NBlock}xbf16> - %1 = linalg.fill ins(%cst_0 : bf16) outs(%0 : tensor<{M // MBlock}x{N // NBlock}x{MBlock}x{NBlock}xbf16>) -> tensor<{M // MBlock}x{N // NBlock}x{MBlock}x{NBlock}xbf16> - %2 = linalg.generic {block_start}indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction", "reduction", "parallel", "parallel", "reduction"]{block_end} ins(%arg0, %cst : tensor<{M // MBlock}x{K // KBlock}x{MBlock}x{KBlock}xbf16>, tensor<{N // NBlock}x{K // KBlock}x{KBlock // vnni_size}x{NBlock}x{vnni_size}xbf16>) outs(%1 : tensor<{M // MBlock}x{N // NBlock}x{MBlock}x{NBlock}xbf16>) {block_start} - ^bb0(%in: bf16, %in_1: bf16, %out: bf16): - %3 = arith.mulf %in, %in_1 : bf16 - %4 = arith.addf %out, %3 : bf16 - linalg.yield %4 : bf16 - {block_end} -> tensor<{M // MBlock}x{N // NBlock}x{MBlock}x{NBlock}xbf16> - return %2 : tensor<{M // MBlock}x{N // NBlock}x{MBlock}x{NBlock}xbf16> - {block_end} -{block_end} - ''' - print(mlir_str) - - # 4D x 5D, inputs transposed - module_in = ir.Module.parse(mlir_str) - - # entry(%transposed: tensor<2x16x32x32xbf16>, %transposed_5: tensor<8x16x16x32x2xbf16>) -> tensor<2x8x32x32xbf16> - torch_arg0 = torch.rand((M, K), dtype=torch.bfloat16) 
- torch_arg1 = torch.rand((K, N), dtype=torch.bfloat16) - ref_res = torch_arg0 @ torch_arg1 - - passes = "any(gc-cpu-pipeline)" - shared_libs = [ - os.environ["MLIR_C_RUNNER_UTILS"], - os.environ["MLIR_RUNNER_UTILS"], - ] - compiler = GraphCompiler(passes) - ctx.enable_multithreading(False) - - arg0 = torch_arg0.view(shapeA).permute([0, 2, 1, 3]).contiguous() # MK -> MKmk - np_arg0 = arg0.view(dtype=torch.uint16).numpy().view(ml_dtypes.bfloat16) - arg1 = torch_arg1.view(shapeB).permute([3, 0, 1, 4, 2]).contiguous() # KN -> NKkn2k - np_arg1 = arg1.view(dtype=torch.uint16).numpy().view(ml_dtypes.bfloat16) - gc_res = np.ones(shapeC, dtype=ml_dtypes.bfloat16) - - entry = "entry" - mlir_args = get_mlir_args(module_in, entry, [np_arg0, np_arg1, gc_res]) - engine_in = compiler.compile_and_jit(module_in, ir_printing=False) - engine_in.invoke(entry, *mlir_args) - gc_res = np.reshape(np.transpose(gc_res, (0, 2, 1, 3)), (M, N)) # MNmn -> MN - - assert_allclose(gc_res.astype(np.float32), ref_res.to(torch.float32).numpy(), rtol=1e-5, atol=1e-5) diff --git a/test/gc/Transforms/test_constant_tensor_folding_bf16_two_layers.py b/test/gc/Transforms/test_constant_tensor_folding_bf16_two_layers.py deleted file mode 100644 index 4e66b1ebf..000000000 --- a/test/gc/Transforms/test_constant_tensor_folding_bf16_two_layers.py +++ /dev/null @@ -1,258 +0,0 @@ -################################################################################ -# Copyright (C) 2024 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions -# and limitations under the License. 
-# SPDX-License-Identifier: Apache-2.0 -################################################################################ - -import os -import sys - -import numpy as np -import ml_dtypes - -from gc_mlir import ir -from gc_mlir.graph_compiler import GraphCompiler -from numpy.testing import assert_allclose - -project_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) -if project_dir not in sys.path: - sys.path.insert(0, project_dir) - -import torch -# from bench import py_timeit_bench -from utils import get_mlir_args - -if __name__ == "__main__": - with ir.Context() as ctx: - ctx.allow_unregistered_dialects = True - # ctx.enable_multithreading = False - module_in = ir.Module.parse( - """ -#map = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d2, d4, d6)> -#map1 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d1, d2, d6 floordiv 2, d5, d3)> -#map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d4, d5)> -#map3 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> -#map4 = affine_map<(d0, d1, d2, d3) -> (d1, d3)> -module { - func.func @entry(%arg0: tensor<64x512xbf16>, %arg1: tensor<512x256xbf16>, %arg2: tensor<256xbf16>, %arg3: tensor<256x1024xbf16>, %arg4: tensor<1024xbf16>) -> tensor<64x1024xbf16> attributes {llvm.emit_c_interface, runtime_const_args_index = [1 : i32, 2 : i32, 3 : i32, 4 : i32]} { - %0 = tensor.empty() : tensor<2x16x32x32xbf16> - %cst = arith.constant 0.000000e+00 : bf16 - %padded = tensor.pad %arg0 low[0, 0] high[0, 0] { - ^bb0(%arg5: index, %arg6: index): - tensor.yield %cst : bf16 - } : tensor<64x512xbf16> to tensor<64x512xbf16> - %expanded = tensor.expand_shape %padded [[0, 1], [2, 3]] output_shape [2, 32, 16, 32] : tensor<64x512xbf16> into tensor<2x32x16x32xbf16> - %transposed = linalg.transpose ins(%expanded : tensor<2x32x16x32xbf16>) outs(%0 : tensor<2x16x32x32xbf16>) permutation = [0, 2, 1, 3] - %1 = tensor.empty() : tensor<8x16x32x32xbf16> - %padded_0 = tensor.pad %arg1 low[0, 0] high[0, 0] { - ^bb0(%arg5: index, %arg6: index): - tensor.yield %cst : bf16 - } : tensor<512x256xbf16> to tensor<512x256xbf16> - %expanded_1 = tensor.expand_shape %padded_0 [[0, 1], [2, 3]] output_shape [16, 32, 8, 32] : tensor<512x256xbf16> into tensor<16x32x8x32xbf16> - %transposed_2 = linalg.transpose ins(%expanded_1 : tensor<16x32x8x32xbf16>) outs(%1 : tensor<8x16x32x32xbf16>) permutation = [2, 0, 1, 3] - %2 = tensor.empty() : tensor<8x16x16x32x2xbf16> - %padded_3 = tensor.pad %transposed_2 low[0, 0, 0, 0] high[0, 0, 0, 0] { - ^bb0(%arg5: index, %arg6: index, %arg7: index, %arg8: index): - tensor.yield %cst : bf16 - } : tensor<8x16x32x32xbf16> to tensor<8x16x32x32xbf16> - %expanded_4 = tensor.expand_shape %padded_3 [[0], [1], [2, 3], [4]] output_shape [8, 16, 16, 2, 32] : tensor<8x16x32x32xbf16> into tensor<8x16x16x2x32xbf16> - %transposed_5 = linalg.transpose ins(%expanded_4 : tensor<8x16x16x2x32xbf16>) outs(%2 : tensor<8x16x16x32x2xbf16>) permutation = [0, 1, 2, 4, 3] - %3 = tensor.empty() : tensor<2x8x32x32xbf16> - %4 = linalg.fill ins(%cst : bf16) outs(%3 : tensor<2x8x32x32xbf16>) -> tensor<2x8x32x32xbf16> - %5 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction", "reduction", "parallel", "parallel", "reduction"]} ins(%transposed, %transposed_5 : tensor<2x16x32x32xbf16>, tensor<8x16x16x32x2xbf16>) outs(%4 : tensor<2x8x32x32xbf16>) { - ^bb0(%in: bf16, %in_19: bf16, %out: bf16): - %17 = arith.mulf %in, %in_19 : bf16 - %18 = arith.addf %out, %17 : bf16 - linalg.yield %18 : bf16 - } -> tensor<2x8x32x32xbf16> - %6 
= tensor.empty() : tensor<8x32xbf16> - %padded_6 = tensor.pad %arg2 low[0] high[0] { - ^bb0(%arg5: index): - tensor.yield %cst : bf16 - } : tensor<256xbf16> to tensor<256xbf16> - %expanded_7 = tensor.expand_shape %padded_6 [[0, 1]] output_shape [8, 32] : tensor<256xbf16> into tensor<8x32xbf16> - %transposed_8 = linalg.transpose ins(%expanded_7 : tensor<8x32xbf16>) outs(%6 : tensor<8x32xbf16>) permutation = [0, 1] - %broadcasted = linalg.broadcast ins(%transposed_8 : tensor<8x32xbf16>) outs(%3 : tensor<2x8x32x32xbf16>) dimensions = [0, 2] - %7 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%broadcasted : tensor<2x8x32x32xbf16>) outs(%5 : tensor<2x8x32x32xbf16>) { - ^bb0(%in: bf16, %out: bf16): - %17 = arith.addf %in, %out : bf16 - linalg.yield %17 : bf16 - } -> tensor<2x8x32x32xbf16> - %8 = tensor.empty() : tensor<32x8x32x32xbf16> - %padded_9 = tensor.pad %arg3 low[0, 0] high[0, 0] { - ^bb0(%arg5: index, %arg6: index): - tensor.yield %cst : bf16 - } : tensor<256x1024xbf16> to tensor<256x1024xbf16> - %expanded_10 = tensor.expand_shape %padded_9 [[0, 1], [2, 3]] output_shape [8, 32, 32, 32] : tensor<256x1024xbf16> into tensor<8x32x32x32xbf16> - %transposed_11 = linalg.transpose ins(%expanded_10 : tensor<8x32x32x32xbf16>) outs(%8 : tensor<32x8x32x32xbf16>) permutation = [2, 0, 1, 3] - %9 = tensor.empty() : tensor<32x8x16x32x2xbf16> - %padded_12 = tensor.pad %transposed_11 low[0, 0, 0, 0] high[0, 0, 0, 0] { - ^bb0(%arg5: index, %arg6: index, %arg7: index, %arg8: index): - tensor.yield %cst : bf16 - } : tensor<32x8x32x32xbf16> to tensor<32x8x32x32xbf16> - %expanded_13 = tensor.expand_shape %padded_12 [[0], [1], [2, 3], [4]] output_shape [32, 8, 16, 2, 32] : tensor<32x8x32x32xbf16> into tensor<32x8x16x2x32xbf16> - %transposed_14 = linalg.transpose ins(%expanded_13 : tensor<32x8x16x2x32xbf16>) outs(%9 : tensor<32x8x16x32x2xbf16>) permutation = [0, 1, 2, 4, 3] - %10 = tensor.empty() : tensor<2x32x32x32xbf16> - %11 = linalg.fill ins(%cst : bf16) outs(%10 : tensor<2x32x32x32xbf16>) -> tensor<2x32x32x32xbf16> - %12 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction", "reduction", "parallel", "parallel", "reduction"]} ins(%7, %transposed_14 : tensor<2x8x32x32xbf16>, tensor<32x8x16x32x2xbf16>) outs(%11 : tensor<2x32x32x32xbf16>) { - ^bb0(%in: bf16, %in_19: bf16, %out: bf16): - %17 = arith.mulf %in, %in_19 : bf16 - %18 = arith.addf %out, %17 : bf16 - linalg.yield %18 : bf16 - } -> tensor<2x32x32x32xbf16> - %13 = tensor.empty() : tensor<32x32xbf16> - %padded_15 = tensor.pad %arg4 low[0] high[0] { - ^bb0(%arg5: index): - tensor.yield %cst : bf16 - } : tensor<1024xbf16> to tensor<1024xbf16> - %expanded_16 = tensor.expand_shape %padded_15 [[0, 1]] output_shape [32, 32] : tensor<1024xbf16> into tensor<32x32xbf16> - %transposed_17 = linalg.transpose ins(%expanded_16 : tensor<32x32xbf16>) outs(%13 : tensor<32x32xbf16>) permutation = [0, 1] - %14 = linalg.generic {indexing_maps = [#map4, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%transposed_17 : tensor<32x32xbf16>) outs(%12 : tensor<2x32x32x32xbf16>) { - ^bb0(%in: bf16, %out: bf16): - %17 = arith.addf %in, %out : bf16 - linalg.yield %17 : bf16 - } -> tensor<2x32x32x32xbf16> - %15 = tensor.empty() : tensor<64x1024xbf16> - %transposed_18 = linalg.transpose ins(%14 : tensor<2x32x32x32xbf16>) outs(%10 : tensor<2x32x32x32xbf16>) permutation = [0, 2, 1, 3] - %collapsed = tensor.collapse_shape 
%transposed_18 [[0, 1], [2, 3]] : tensor<2x32x32x32xbf16> into tensor<64x1024xbf16> - %extracted_slice = tensor.extract_slice %collapsed[0, 0] [64, 1024] [1, 1] : tensor<64x1024xbf16> to tensor<64x1024xbf16> - %16 = linalg.copy ins(%extracted_slice : tensor<64x1024xbf16>) outs(%15 : tensor<64x1024xbf16>) -> tensor<64x1024xbf16> - return %16 : tensor<64x1024xbf16> - } -} - """ - ) - module_out = ir.Module.parse( - """ -#map = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d2, d4, d6)> -#map1 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d1, d2, d6 floordiv 2, d5, d3)> -#map2 = affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d4, d5)> -#map3 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> -#map4 = affine_map<(d0, d1, d2, d3) -> (d1, d3)> -module { - llvm.mlir.global external constant @__num_orig_args(5 : i32) {addr_space = 0 : i32} : i32 - llvm.mlir.global external constant @__compute_args(dense<[5, 0, 5, 6, 7, 8]> : tensor<6xi32>) {addr_space = 0 : i32} : !llvm.array<6 x i32> - llvm.mlir.global external constant @__fold_args(dense<[8, 1, 2, 3, 4, 5, 6, 7, 8]> : tensor<9xi32>) {addr_space = 0 : i32} : !llvm.array<9 x i32> - llvm.mlir.global external constant @__runtime_fold_buffer_ids_(dense<[4, 0, 1, 2, 3]> : tensor<5xi64>) {addr_space = 0 : i32} : !llvm.array<5 x i64> - func.func @entry(%arg0: tensor<64x512xbf16>, %arg1: tensor<8x16x16x32x2xbf16>, %arg2: tensor<8x32xbf16>, %arg3: tensor<32x8x16x32x2xbf16>, %arg4: tensor<32x32xbf16>) -> tensor<64x1024xbf16> attributes {llvm.emit_c_interface, runtime_const_args_index = [1 : i32, 2 : i32, 3 : i32, 4 : i32]} { - %cst = arith.constant 0.000000e+00 : bf16 - %0 = tensor.empty() : tensor<2x16x32x32xbf16> - %expanded = tensor.expand_shape %arg0 [[0, 1], [2, 3]] output_shape [2, 32, 16, 32] : tensor<64x512xbf16> into tensor<2x32x16x32xbf16> - %transposed = linalg.transpose ins(%expanded : tensor<2x32x16x32xbf16>) outs(%0 : tensor<2x16x32x32xbf16>) permutation = [0, 2, 1, 3] - %1 = tensor.empty() : tensor<2x8x32x32xbf16> - %2 = linalg.fill ins(%cst : bf16) outs(%1 : tensor<2x8x32x32xbf16>) -> tensor<2x8x32x32xbf16> - %3 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction", "reduction", "parallel", "parallel", "reduction"]} ins(%transposed, %arg1 : tensor<2x16x32x32xbf16>, tensor<8x16x16x32x2xbf16>) outs(%2 : tensor<2x8x32x32xbf16>) { - ^bb0(%in: bf16, %in_1: bf16, %out: bf16): - %11 = arith.mulf %in, %in_1 : bf16 - %12 = arith.addf %out, %11 : bf16 - linalg.yield %12 : bf16 - } -> tensor<2x8x32x32xbf16> - %broadcasted = linalg.broadcast ins(%arg2 : tensor<8x32xbf16>) outs(%1 : tensor<2x8x32x32xbf16>) dimensions = [0, 2] - %4 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%broadcasted : tensor<2x8x32x32xbf16>) outs(%3 : tensor<2x8x32x32xbf16>) { - ^bb0(%in: bf16, %out: bf16): - %11 = arith.addf %in, %out : bf16 - linalg.yield %11 : bf16 - } -> tensor<2x8x32x32xbf16> - %5 = tensor.empty() : tensor<2x32x32x32xbf16> - %6 = linalg.fill ins(%cst : bf16) outs(%5 : tensor<2x32x32x32xbf16>) -> tensor<2x32x32x32xbf16> - %7 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction", "reduction", "parallel", "parallel", "reduction"]} ins(%4, %arg3 : tensor<2x8x32x32xbf16>, tensor<32x8x16x32x2xbf16>) outs(%6 : tensor<2x32x32x32xbf16>) { - ^bb0(%in: bf16, %in_1: bf16, %out: bf16): - %11 = arith.mulf %in, %in_1 : bf16 - %12 = arith.addf %out, %11 : bf16 - linalg.yield %12 : bf16 
- } -> tensor<2x32x32x32xbf16> - %8 = linalg.generic {indexing_maps = [#map4, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg4 : tensor<32x32xbf16>) outs(%7 : tensor<2x32x32x32xbf16>) { - ^bb0(%in: bf16, %out: bf16): - %11 = arith.addf %in, %out : bf16 - linalg.yield %11 : bf16 - } -> tensor<2x32x32x32xbf16> - %9 = tensor.empty() : tensor<64x1024xbf16> - %transposed_0 = linalg.transpose ins(%8 : tensor<2x32x32x32xbf16>) outs(%5 : tensor<2x32x32x32xbf16>) permutation = [0, 2, 1, 3] - %collapsed = tensor.collapse_shape %transposed_0 [[0, 1], [2, 3]] : tensor<2x32x32x32xbf16> into tensor<64x1024xbf16> - %10 = linalg.copy ins(%collapsed : tensor<64x1024xbf16>) outs(%9 : tensor<64x1024xbf16>) -> tensor<64x1024xbf16> - return %10 : tensor<64x1024xbf16> - } - func.func @runtime_fold(%arg0: tensor<512x256xbf16>, %arg1: tensor<256xbf16>, %arg2: tensor<256x1024xbf16>, %arg3: tensor<1024xbf16>) -> (tensor<8x16x16x32x2xbf16>, tensor<8x32xbf16>, tensor<32x8x16x32x2xbf16>, tensor<32x32xbf16>) attributes {llvm.emit_c_interface} { - %0 = tensor.empty() : tensor<8x16x32x32xbf16> - %expanded = tensor.expand_shape %arg0 [[0, 1], [2, 3]] output_shape [16, 32, 8, 32] : tensor<512x256xbf16> into tensor<16x32x8x32xbf16> - %transposed = linalg.transpose ins(%expanded : tensor<16x32x8x32xbf16>) outs(%0 : tensor<8x16x32x32xbf16>) permutation = [2, 0, 1, 3] - %1 = tensor.empty() : tensor<8x16x16x32x2xbf16> - %expanded_0 = tensor.expand_shape %transposed [[0], [1], [2, 3], [4]] output_shape [8, 16, 16, 2, 32] : tensor<8x16x32x32xbf16> into tensor<8x16x16x2x32xbf16> - %transposed_1 = linalg.transpose ins(%expanded_0 : tensor<8x16x16x2x32xbf16>) outs(%1 : tensor<8x16x16x32x2xbf16>) permutation = [0, 1, 2, 4, 3] - %expanded_2 = tensor.expand_shape %arg1 [[0, 1]] output_shape [8, 32] : tensor<256xbf16> into tensor<8x32xbf16> - %2 = tensor.empty() : tensor<32x8x32x32xbf16> - %expanded_3 = tensor.expand_shape %arg2 [[0, 1], [2, 3]] output_shape [8, 32, 32, 32] : tensor<256x1024xbf16> into tensor<8x32x32x32xbf16> - %transposed_4 = linalg.transpose ins(%expanded_3 : tensor<8x32x32x32xbf16>) outs(%2 : tensor<32x8x32x32xbf16>) permutation = [2, 0, 1, 3] - %3 = tensor.empty() : tensor<32x8x16x32x2xbf16> - %expanded_5 = tensor.expand_shape %transposed_4 [[0], [1], [2, 3], [4]] output_shape [32, 8, 16, 2, 32] : tensor<32x8x32x32xbf16> into tensor<32x8x16x2x32xbf16> - %transposed_6 = linalg.transpose ins(%expanded_5 : tensor<32x8x16x2x32xbf16>) outs(%3 : tensor<32x8x16x32x2xbf16>) permutation = [0, 1, 2, 4, 3] - %expanded_7 = tensor.expand_shape %arg3 [[0, 1]] output_shape [32, 32] : tensor<1024xbf16> into tensor<32x32xbf16> - return %transposed_1, %expanded_2, %transposed_6, %expanded_7 : tensor<8x16x16x32x2xbf16>, tensor<8x32xbf16>, tensor<32x8x16x32x2xbf16>, tensor<32x32xbf16> - } -} - """ - ) - - # module_in entry(%arg0: tensor<64x512xbf16>, %arg1: tensor<512x256xbf16>, %arg2: tensor<256xbf16>, %arg3: tensor<256x1024xbf16>, %arg4: tensor<1024xbf16>) -> tensor<64x1024xbf16> - torch_arg0 = torch.rand((64, 512), dtype=torch.bfloat16) - torch_arg1 = torch.rand((512, 256), dtype=torch.bfloat16) - torch_arg2 = torch.rand((256), dtype=torch.bfloat16) - torch_arg3 = torch.rand((256, 1024), dtype=torch.bfloat16) - torch_arg4 = torch.rand((1024), dtype=torch.bfloat16) - - ref_res = (torch_arg0 @ torch_arg1 + torch_arg2) @ torch_arg3 + torch_arg4 - - passes = "any(gc-cpu-pipeline)" - compiler = GraphCompiler(passes) - ctx.enable_multithreading(False) - - arg0 = 
torch_arg0.view(dtype=torch.uint16).numpy().view(ml_dtypes.bfloat16) - arg1 = torch_arg1.view(dtype=torch.uint16).numpy().view(ml_dtypes.bfloat16) - arg2 = torch_arg2.view(dtype=torch.uint16).numpy().view(ml_dtypes.bfloat16) - arg3 = torch_arg3.view(dtype=torch.uint16).numpy().view(ml_dtypes.bfloat16) - arg4 = torch_arg4.view(dtype=torch.uint16).numpy().view(ml_dtypes.bfloat16) - gc_res = np.ones((64, 1024), dtype=ml_dtypes.bfloat16) - - entry = "entry" - mlir_args = get_mlir_args(module_in, entry, [arg0, arg1, arg2, arg3, arg4, gc_res]) - engine_in = compiler.compile_and_jit(module_in, ir_printing=True) - engine_in.invoke(entry, *mlir_args) - - assert_allclose(gc_res.astype(np.float32), ref_res.to(torch.float32).numpy(), rtol=1e-5, atol=1e-5) - - - # module_out entry(%arg0: tensor<64x512xbf16>, %arg1: tensor<8x16x16x32x2xbf16>, %arg2: tensor<8x32xbf16>, %arg3: tensor<32x8x16x32x2xbf16>, %arg4: tensor<32x32xbf16>) -> tensor<64x1024xbf16> - # module_out runtime_fold(%arg0: tensor<512x256xbf16>, %arg1: tensor<256xbf16>, %arg2: tensor<256x1024xbf16>, %arg3: tensor<1024xbf16>) -> (tensor<8x16x16x32x2xbf16>, tensor<8x32xbf16>, tensor<32x8x16x32x2xbf16>, tensor<32x32xbf16>) - fold_arg0 = arg1 - fold_arg1 = arg2 - fold_arg2 = arg3 - fold_arg3 = arg4 - fold_res0 = np.zeros((8, 16, 16, 32, 2), dtype=ml_dtypes.bfloat16) - fold_res1 = np.zeros((8, 32), dtype=ml_dtypes.bfloat16) - fold_res2 = np.zeros((32, 8, 16, 32, 2), dtype=ml_dtypes.bfloat16) - fold_res3 = np.zeros((32, 32), dtype=ml_dtypes.bfloat16) - - runtime_fold = "runtime_fold" - fold_mlir_args = get_mlir_args(module_out, runtime_fold, [fold_arg0, fold_arg1, fold_arg2, fold_arg3, fold_res0, fold_res1, fold_res2, fold_res3]) - - gc_res_out = np.zeros((64, 1024), dtype=ml_dtypes.bfloat16) - entry = "entry" - mlir_args = get_mlir_args(module_out, entry, [arg0, fold_res0, fold_res1, fold_res2, fold_res3, gc_res_out]) - - engine_out = compiler.compile_and_jit(module_out, ir_printing=True) - engine_out.invoke(runtime_fold, *fold_mlir_args) - engine_out.invoke(entry, *mlir_args) - - assert_allclose(gc_res.astype(np.float32), gc_res_out.astype(np.float32), rtol=1e-5, atol=1e-5) - diff --git a/test/gc/Transforms/test_constant_tensor_folding_f32_4D4D.py b/test/gc/Transforms/test_constant_tensor_folding_f32_4D4D.py deleted file mode 100644 index 465d390fd..000000000 --- a/test/gc/Transforms/test_constant_tensor_folding_f32_4D4D.py +++ /dev/null @@ -1,96 +0,0 @@ -################################################################################ -# Copyright (C) 2024 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions -# and limitations under the License. 
-# SPDX-License-Identifier: Apache-2.0 -################################################################################ - -from enum import Flag -import os -import sys - -import numpy as np -from gc_mlir import ir -from gc_mlir.graph_compiler import GraphCompiler -from numpy.testing import assert_allclose - -project_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) -if project_dir not in sys.path: - sys.path.insert(0, project_dir) - -import torch -# from bench import py_timeit_bench -from utils import get_mlir_args - -if __name__ == "__main__": - with ir.Context() as ctx: - ctx.allow_unregistered_dialects = True - - M = 64 - N = 256 - K = 512 - MBlock = 32 - NBlock = 32 - KBlock = 32 - vnni_size = 1 - shapeA = [M // MBlock, K // KBlock, MBlock, KBlock] - shapeB = [N // NBlock, K // KBlock, KBlock, NBlock] - shapeC = [M // MBlock, N // NBlock, MBlock, NBlock] - - # 4D x 4D, inputs transposed - mlir_str = """ -#map = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d3, d5)> -#map1 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d2, d5, d4)> -#map2 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4)> -module { - func.func @main_entry(%arg0: tensor<2x16x32x32xf32>, %arg1: tensor<8x16x32x32xf32>) -> tensor<2x8x32x32xf32> attributes {llvm.emit_c_interface} { - %cst = arith.constant 0.000000e+00 : f32 - %0 = tensor.empty() : tensor<2x8x32x32xf32> - %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<2x8x32x32xf32>) -> tensor<2x8x32x32xf32> - %2 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%arg0, %arg1 : tensor<2x16x32x32xf32>, tensor<8x16x32x32xf32>) outs(%1 : tensor<2x8x32x32xf32>) { - ^bb0(%in: f32, %in_0: f32, %out: f32): - %3 = arith.mulf %in, %in_0 : f32 - %4 = arith.addf %out, %3 : f32 - linalg.yield %4 : f32 - } -> tensor<2x8x32x32xf32> - return %2 : tensor<2x8x32x32xf32> - } -} - """ - module = ir.Module.parse(mlir_str) - - torch_arg0 = torch.rand((M, K), dtype=torch.float32) - torch_arg1 = torch.rand((K, N), dtype=torch.float32) - ref_res = torch.matmul(torch_arg0, torch_arg1) - - arg0_0 = torch_arg0.view([M // MBlock, MBlock, K // KBlock, KBlock]).permute([0, 2, 1, 3]).contiguous().numpy().view(np.dtype("float32")) - arg0_1 = np.transpose(np.reshape(torch_arg0.contiguous().numpy().view(np.dtype("float32")), (M // MBlock, MBlock, K // KBlock, KBlock)), (0, 2, 1, 3)) # MK -> MKmk - print("arg0_0 arg0_1 close: ", np.allclose(arg0_0, arg0_1, rtol=1e-5, atol=1e-5)) - - arg1 = torch_arg1.view([K // KBlock, KBlock, N // NBlock, NBlock]).permute([2, 0, 1, 3]).contiguous().numpy().view(np.dtype("float32")) - # arg1 = np.transpose(np.reshape(torch_arg1.contiguous().numpy(), (16, 32, 8, 32)), (2, 0, 1, 3)).view(np.dtype("float32")) # KN -> NKkn, 8x16x32x32 - - gc_res = np.ones(shapeC, dtype=np.dtype("float32")) - - entry = "main_entry" - mlir_args = get_mlir_args(module, entry, [arg0_1, arg1, gc_res]) - - passes = "any(gc-cpu-pipeline)" - compiler = GraphCompiler(passes) - engine_in = compiler.compile_and_jit(module) - engine_in.invoke(entry, *mlir_args) - gc_res = np.reshape(np.transpose(gc_res, (0, 2, 1, 3)), (64, 256)) # MNmn -> MN - - print("gc_res ref_res close: ", np.allclose(gc_res, ref_res.to(torch.float32).numpy(), rtol=1e-5, atol=1e-5)) - assert_allclose(gc_res, ref_res.to(torch.float32).numpy(), rtol=1e-5, atol=1e-5) - diff --git a/test/gc/Transforms/test_constant_tensor_folding_f32_two_layers.py b/test/gc/Transforms/test_constant_tensor_folding_f32_two_layers.py 
deleted file mode 100644 index e05e2ac15..000000000 --- a/test/gc/Transforms/test_constant_tensor_folding_f32_two_layers.py +++ /dev/null @@ -1,225 +0,0 @@ -################################################################################ -# Copyright (C) 2024 Intel Corporation -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions -# and limitations under the License. -# SPDX-License-Identifier: Apache-2.0 -################################################################################ - -import os -import sys - -import numpy as np -from gc_mlir import ir -from gc_mlir.graph_compiler import GraphCompiler -from numpy.testing import assert_allclose - -project_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) -if project_dir not in sys.path: - sys.path.insert(0, project_dir) - -import torch -# from bench import py_timeit_bench -from utils import get_mlir_args - -if __name__ == "__main__": - with ir.Context() as ctx: - ctx.allow_unregistered_dialects = True - - # 4D x 4D, inputs plain, two layers - mlir_str_4D4D = """ -#map = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d3, d5)> -#map1 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d2, d5, d4)> -#map2 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4)> -#map3 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> -#map4 = affine_map<(d0, d1, d2, d3) -> (d1, d3)> -module { - func.func @entry(%arg0: tensor<64x512xf32>, %arg1: tensor<512x256xf32>, %arg2: tensor<256xf32>, %arg3: tensor<256x1024xf32>, %arg4: tensor<1024xf32>) -> tensor<64x1024xf32> attributes {llvm.emit_c_interface, runtime_const_args_index = [1 : i32, 2 : i32, 3 : i32, 4 : i32]} { - %0 = tensor.empty() : tensor<2x16x32x32xf32> - %cst = arith.constant 0.000000e+00 : f32 - %padded = tensor.pad %arg0 low[0, 0] high[0, 0] { - ^bb0(%arg5: index, %arg6: index): - tensor.yield %cst : f32 - } : tensor<64x512xf32> to tensor<64x512xf32> - %expanded = tensor.expand_shape %padded [[0, 1], [2, 3]] output_shape [2, 32, 16, 32] : tensor<64x512xf32> into tensor<2x32x16x32xf32> - %transposed = linalg.transpose ins(%expanded : tensor<2x32x16x32xf32>) outs(%0 : tensor<2x16x32x32xf32>) permutation = [0, 2, 1, 3] - %1 = tensor.empty() : tensor<8x16x32x32xf32> - %padded_0 = tensor.pad %arg1 low[0, 0] high[0, 0] { - ^bb0(%arg5: index, %arg6: index): - tensor.yield %cst : f32 - } : tensor<512x256xf32> to tensor<512x256xf32> - %expanded_1 = tensor.expand_shape %padded_0 [[0, 1], [2, 3]] output_shape [16, 32, 8, 32] : tensor<512x256xf32> into tensor<16x32x8x32xf32> - %transposed_2 = linalg.transpose ins(%expanded_1 : tensor<16x32x8x32xf32>) outs(%1 : tensor<8x16x32x32xf32>) permutation = [2, 0, 1, 3] - %2 = tensor.empty() : tensor<2x8x32x32xf32> - %3 = linalg.fill ins(%cst : f32) outs(%2 : tensor<2x8x32x32xf32>) -> tensor<2x8x32x32xf32> - %4 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%transposed, %transposed_2 : tensor<2x16x32x32xf32>, tensor<8x16x32x32xf32>) outs(%3 : tensor<2x8x32x32xf32>) { - ^bb0(%in: f32, %in_8: f32, 
%out: f32): - %14 = arith.mulf %in, %in_8 : f32 - %15 = arith.addf %out, %14 : f32 - linalg.yield %15 : f32 - } -> tensor<2x8x32x32xf32> - %expanded_3 = tensor.expand_shape %arg2 [[0, 1]] output_shape [8, 32] : tensor<256xf32> into tensor<8x32xf32> - %broadcasted = linalg.broadcast ins(%expanded_3 : tensor<8x32xf32>) outs(%2 : tensor<2x8x32x32xf32>) dimensions = [0, 2] - %5 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%broadcasted : tensor<2x8x32x32xf32>) outs(%4 : tensor<2x8x32x32xf32>) { - ^bb0(%in: f32, %out: f32): - %14 = arith.addf %in, %out : f32 - linalg.yield %14 : f32 - } -> tensor<2x8x32x32xf32> - %6 = tensor.empty() : tensor<32x8x32x32xf32> - %expanded_4 = tensor.expand_shape %arg3 [[0, 1], [2, 3]] output_shape [8, 32, 32, 32] : tensor<256x1024xf32> into tensor<8x32x32x32xf32> - %transposed_5 = linalg.transpose ins(%expanded_4 : tensor<8x32x32x32xf32>) outs(%6 : tensor<32x8x32x32xf32>) permutation = [2, 0, 1, 3] - %7 = tensor.empty() : tensor<2x32x32x32xf32> - %8 = linalg.fill ins(%cst : f32) outs(%7 : tensor<2x32x32x32xf32>) -> tensor<2x32x32x32xf32> - %9 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%5, %transposed_5 : tensor<2x8x32x32xf32>, tensor<32x8x32x32xf32>) outs(%8 : tensor<2x32x32x32xf32>) { - ^bb0(%in: f32, %in_8: f32, %out: f32): - %14 = arith.mulf %in, %in_8 : f32 - %15 = arith.addf %out, %14 : f32 - linalg.yield %15 : f32 - } -> tensor<2x32x32x32xf32> - %expanded_6 = tensor.expand_shape %arg4 [[0, 1]] output_shape [32, 32] : tensor<1024xf32> into tensor<32x32xf32> - %10 = linalg.generic {indexing_maps = [#map4, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%expanded_6 : tensor<32x32xf32>) outs(%9 : tensor<2x32x32x32xf32>) { - ^bb0(%in: f32, %out: f32): - %14 = arith.addf %in, %out : f32 - linalg.yield %14 : f32 - } -> tensor<2x32x32x32xf32> - %11 = tensor.empty() : tensor<2x32x32x32xf32> - %transposed_7 = linalg.transpose ins(%10 : tensor<2x32x32x32xf32>) outs(%11 : tensor<2x32x32x32xf32>) permutation = [0, 2, 1, 3] - %collapsed = tensor.collapse_shape %transposed_7 [[0, 1], [2, 3]] : tensor<2x32x32x32xf32> into tensor<64x1024xf32> - %12 = tensor.empty() : tensor<64x1024xf32> - %13 = linalg.copy ins(%collapsed : tensor<64x1024xf32>) outs(%12 : tensor<64x1024xf32>) -> tensor<64x1024xf32> - return %13 : tensor<64x1024xf32> - } -} - """ - - module_in = ir.Module.parse(mlir_str_4D4D) - - - mlir_str_4D4D_out = """ -#map = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d3, d5)> -#map1 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d2, d5, d4)> -#map2 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4)> -#map3 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> -#map4 = affine_map<(d0, d1, d2, d3) -> (d1, d3)> -module { - llvm.mlir.global external constant @__num_orig_args(5 : i32) {addr_space = 0 : i32} : i32 - llvm.mlir.global external constant @__compute_args(dense<[5, 0, 5, 6, 7, 8]> : tensor<6xi32>) {addr_space = 0 : i32} : !llvm.array<6 x i32> - llvm.mlir.global external constant @__fold_args(dense<[8, 1, 2, 3, 4, 5, 6, 7, 8]> : tensor<9xi32>) {addr_space = 0 : i32} : !llvm.array<9 x i32> - llvm.mlir.global external constant @__runtime_fold_buffer_ids_(dense<[4, 0, 1, 2, 3]> : tensor<5xi64>) {addr_space = 0 : i32} : !llvm.array<5 x i64> - func.func @entry(%arg0: tensor<64x512xf32>, %arg1: tensor<8x16x32x32xf32>, %arg2: tensor<8x32xf32>, %arg3: 
tensor<32x8x32x32xf32>, %arg4: tensor<32x32xf32>) -> tensor<64x1024xf32> attributes {llvm.emit_c_interface, runtime_const_args_index = [1 : i32, 2 : i32, 3 : i32, 4 : i32]} { - %cst = arith.constant 0.000000e+00 : f32 - %0 = tensor.empty() : tensor<2x16x32x32xf32> - %expanded = tensor.expand_shape %arg0 [[0, 1], [2, 3]] output_shape [2, 32, 16, 32] : tensor<64x512xf32> into tensor<2x32x16x32xf32> - %transposed = linalg.transpose ins(%expanded : tensor<2x32x16x32xf32>) outs(%0 : tensor<2x16x32x32xf32>) permutation = [0, 2, 1, 3] - %1 = tensor.empty() : tensor<2x8x32x32xf32> - %2 = linalg.fill ins(%cst : f32) outs(%1 : tensor<2x8x32x32xf32>) -> tensor<2x8x32x32xf32> - %3 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%transposed, %arg1 : tensor<2x16x32x32xf32>, tensor<8x16x32x32xf32>) outs(%2 : tensor<2x8x32x32xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %12 = arith.mulf %in, %in_1 : f32 - %13 = arith.addf %out, %12 : f32 - linalg.yield %13 : f32 - } -> tensor<2x8x32x32xf32> - %broadcasted = linalg.broadcast ins(%arg2 : tensor<8x32xf32>) outs(%1 : tensor<2x8x32x32xf32>) dimensions = [0, 2] - %4 = linalg.generic {indexing_maps = [#map3, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%broadcasted : tensor<2x8x32x32xf32>) outs(%3 : tensor<2x8x32x32xf32>) { - ^bb0(%in: f32, %out: f32): - %12 = arith.addf %in, %out : f32 - linalg.yield %12 : f32 - } -> tensor<2x8x32x32xf32> - %5 = tensor.empty() : tensor<2x32x32x32xf32> - %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x32x32x32xf32>) -> tensor<2x32x32x32xf32> - %7 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%4, %arg3 : tensor<2x8x32x32xf32>, tensor<32x8x32x32xf32>) outs(%6 : tensor<2x32x32x32xf32>) { - ^bb0(%in: f32, %in_1: f32, %out: f32): - %12 = arith.mulf %in, %in_1 : f32 - %13 = arith.addf %out, %12 : f32 - linalg.yield %13 : f32 - } -> tensor<2x32x32x32xf32> - %8 = linalg.generic {indexing_maps = [#map4, #map3], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%arg4 : tensor<32x32xf32>) outs(%7 : tensor<2x32x32x32xf32>) { - ^bb0(%in: f32, %out: f32): - %12 = arith.addf %in, %out : f32 - linalg.yield %12 : f32 - } -> tensor<2x32x32x32xf32> - %9 = tensor.empty() : tensor<2x32x32x32xf32> - %transposed_0 = linalg.transpose ins(%8 : tensor<2x32x32x32xf32>) outs(%9 : tensor<2x32x32x32xf32>) permutation = [0, 2, 1, 3] - %collapsed = tensor.collapse_shape %transposed_0 [[0, 1], [2, 3]] : tensor<2x32x32x32xf32> into tensor<64x1024xf32> - %10 = tensor.empty() : tensor<64x1024xf32> - %11 = linalg.copy ins(%collapsed : tensor<64x1024xf32>) outs(%10 : tensor<64x1024xf32>) -> tensor<64x1024xf32> - return %11 : tensor<64x1024xf32> - } - - func.func @runtime_fold(%arg0: tensor<512x256xf32>, %arg1: tensor<256xf32>, %arg2: tensor<256x1024xf32>, %arg3: tensor<1024xf32>) -> (tensor<8x16x32x32xf32>, tensor<8x32xf32>, tensor<32x8x32x32xf32>, tensor<32x32xf32>) attributes {llvm.emit_c_interface} { - %0 = tensor.empty() : tensor<8x16x32x32xf32> - %expanded = tensor.expand_shape %arg0 [[0, 1], [2, 3]] output_shape [16, 32, 8, 32] : tensor<512x256xf32> into tensor<16x32x8x32xf32> - %transposed = linalg.transpose ins(%expanded : tensor<16x32x8x32xf32>) outs(%0 : tensor<8x16x32x32xf32>) permutation = [2, 0, 1, 3] - %expanded_0 = tensor.expand_shape %arg1 [[0, 1]] output_shape [8, 32] : 
tensor<256xf32> into tensor<8x32xf32> - %1 = tensor.empty() : tensor<32x8x32x32xf32> - %expanded_1 = tensor.expand_shape %arg2 [[0, 1], [2, 3]] output_shape [8, 32, 32, 32] : tensor<256x1024xf32> into tensor<8x32x32x32xf32> - %transposed_2 = linalg.transpose ins(%expanded_1 : tensor<8x32x32x32xf32>) outs(%1 : tensor<32x8x32x32xf32>) permutation = [2, 0, 1, 3] - %expanded_3 = tensor.expand_shape %arg3 [[0, 1]] output_shape [32, 32] : tensor<1024xf32> into tensor<32x32xf32> - return %transposed, %expanded_0, %transposed_2, %expanded_3 : tensor<8x16x32x32xf32>, tensor<8x32xf32>, tensor<32x8x32x32xf32>, tensor<32x32xf32> - } -} - """ - module_out = ir.Module.parse(mlir_str_4D4D_out) - - # module_in entry(%arg0: tensor<64x512xf32>, %arg1: tensor<512x256xf32>, %arg2: tensor<256xf32>, %arg3: tensor<256x1024xf32>, %arg4: tensor<1024xf32>) -> tensor<64x1024xf32> - torch_arg0 = torch.rand((64, 512), dtype=torch.float32) - torch_arg1 = torch.rand((512, 256), dtype=torch.float32) - torch_arg2 = torch.rand((256), dtype=torch.float32) - torch_arg3 = torch.rand((256, 1024), dtype=torch.float32) - torch_arg4 = torch.rand((1024), dtype=torch.float32) - - ref_res = (torch_arg0 @ torch_arg1 + torch_arg2) @ torch_arg3 + torch_arg4 - - passes = "any(gc-cpu-pipeline)" - compiler = GraphCompiler(passes) - ctx.enable_multithreading(False) - - arg0 = torch_arg0.contiguous().numpy() - arg1 = torch_arg1.contiguous().numpy() - arg2 = torch_arg2.contiguous().numpy() - arg3 = torch_arg3.contiguous().numpy() - arg4 = torch_arg4.contiguous().numpy() - gc_res = np.zeros((64, 1024), dtype=np.float32) - - entry = "entry" - mlir_args = get_mlir_args(module_in, entry, [arg0, arg1, arg2, arg3, arg4, gc_res]) - engine_in = compiler.compile_and_jit(module_in, ir_printing=False) - engine_in.invoke(entry, *mlir_args) - - print("Reference vs GC input IR close: ", np.allclose(gc_res, ref_res.to(torch.float32).numpy(), rtol=1e-5, atol=1e-5)) - assert_allclose(gc_res, ref_res.to(torch.float32).numpy(), rtol=1e-5, atol=1e-5) - - - # module_out entry(%arg0: tensor<64x512xf32>, %arg1: tensor<8x16x32x32xf32>, %arg2: tensor<8x32xf32>, %arg3: tensor<32x8x32x32xf32>, %arg4: tensor<32x32xf32>) -> tensor<64x1024xf32> - # module_out runtime_fold(%arg0: tensor<512x256xf32>, %arg1: tensor<256xf32>, %arg2: tensor<256x1024xf32>, %arg3: tensor<1024xf32>) -> (tensor<8x16x32x32xf32>, tensor<8x32xf32>, tensor<32x8x32x32xf32>, tensor<32x32xf32>) - fold_arg0 = arg1 - fold_arg1 = arg2 - fold_arg2 = arg3 - fold_arg3 = arg4 - fold_res0 = np.zeros((8, 16, 32, 32), dtype=np.float32) - fold_res1 = np.zeros((8, 32), dtype=np.float32) - fold_res2 = np.zeros((32, 8, 32, 32), dtype=np.float32) - fold_res3 = np.zeros((32, 32), dtype=np.float32) - - runtime_fold = "runtime_fold" - fold_mlir_args = get_mlir_args(module_out, runtime_fold, [fold_arg0, fold_arg1, fold_arg2, fold_arg3, fold_res0, fold_res1, fold_res2, fold_res3]) - - gc_res_out = np.zeros((64, 1024), dtype=np.float32) - entry = "entry" - entry_mlir_args = get_mlir_args(module_out, entry, [arg0, fold_res0, fold_res1, fold_res2, fold_res3, gc_res_out]) - - engine_out = compiler.compile_and_jit(module_out, ir_printing=False) - engine_out.invoke(runtime_fold, *fold_mlir_args) - engine_out.invoke(entry, *entry_mlir_args) - - print("GC input IR vs GC output IR close: ", np.allclose(gc_res, gc_res_out, rtol=1e-5, atol=1e-5)) - assert_allclose(gc_res, gc_res_out, rtol=1e-5, atol=1e-5) From fa30e4a5f5d9546a2fe2c29fc1e409e28c3ac782 Mon Sep 17 00:00:00 2001 From: "Niu, Xiaoguang" Date: Sat, 14 Sep 2024 11:33:00 
+0800 Subject: [PATCH 27/29] Updates --- .../DataFlow/ConstantSubgraphAnalyser.cpp | 11 ++- lib/gc/Transforms/ConstantTensorFolding.cpp | 73 ++++++++----------- 2 files changed, 36 insertions(+), 48 deletions(-) diff --git a/lib/gc/Analysis/DataFlow/ConstantSubgraphAnalyser.cpp b/lib/gc/Analysis/DataFlow/ConstantSubgraphAnalyser.cpp index e4a2130f3..b3c6b51ba 100644 --- a/lib/gc/Analysis/DataFlow/ConstantSubgraphAnalyser.cpp +++ b/lib/gc/Analysis/DataFlow/ConstantSubgraphAnalyser.cpp @@ -143,9 +143,9 @@ void RunConstantSubgraphAnalyser::getConstantSubgraph(DataFlowSolver &solver, for (Operation &op : llvm::make_early_inc_range(block)) { // If all the result values of an op are const, we mark this op as const. bool resultsAllConstant = true; - if (op.getNumResults() == 0) { + if (op.getNumResults() == 0) continue; - } + for (Value res : op.getResults()) { auto *lattice = solver.lookupState<Lattice<InConstantSubgraph>>(res); if (!lattice || lattice->getValue().isUninitialized()) { @@ -164,9 +164,8 @@ void RunConstantSubgraphAnalyser::getConstantSubgraph(DataFlowSolver &solver, } } - if (constantOperations.empty()) { + if (constantOperations.empty()) return; - } } RunConstantSubgraphAnalyser::RunConstantSubgraphAnalyser() { @@ -175,9 +174,9 @@ RunConstantSubgraphAnalyser::RunConstantSubgraphAnalyser() { } void RunConstantSubgraphAnalyser::run(Operation *op) { - if (failed(solver.initializeAndRun(op))) { + if (failed(solver.initializeAndRun(op))) return; - } + getConstantSubgraph(solver, op); } diff --git a/lib/gc/Transforms/ConstantTensorFolding.cpp b/lib/gc/Transforms/ConstantTensorFolding.cpp index 17270d54f..b093ffd2c 100644 --- a/lib/gc/Transforms/ConstantTensorFolding.cpp +++ b/lib/gc/Transforms/ConstantTensorFolding.cpp @@ -69,9 +69,9 @@ template <typename T> int64_t getDataSize(T t) { unsigned bitWidth = eleType.getIntOrFloatBitWidth() / 8; // bytes ArrayRef<int64_t> shape = t.getShape(); int64_t size = bitWidth; - for (auto s : shape) { + for (auto s : shape) size *= s; - } + return size; } @@ -94,13 +94,12 @@ bool singleOperand(Operation *op) { Value firstOperand = op->getOperand(0); for (int64_t i = 1; i < op->getNumOperands(); ++i) { Value operand = op->getOperand(i); - if (firstOperand == operand) { + if (firstOperand == operand) continue; - } + auto parentOp = operand.getDefiningOp(); - if (parentOp && !isa(parentOp)) { + if (parentOp && !isa(parentOp)) return false; - } } } return true; @@ -121,16 +120,14 @@ bool canMoveBefore(Operation *op) { SmallVector<AffineMap> indexingMaps = linalgOp.getIndexingMapsArray(); for (auto &affineMap : indexingMaps) { - if (!affineMap.isIdentity()) { + if (!affineMap.isIdentity()) return false; - } } SmallVector<utils::IteratorType> iterTypes = linalgOp.getIteratorTypesArray(); for (auto &iterType : iterTypes) { - if (iterType != utils::IteratorType::parallel) { + if (iterType != utils::IteratorType::parallel) return false; - } } if (op->getNumOperands() > 1) { @@ -140,9 +137,8 @@ for (int64_t i = 0; i < numInits; ++i) { OpOperand *outOperand = linalgOp.getDpsInitOperand(i); auto parentOp = outOperand->get().getDefiningOp(); - if (!isa(parentOp)) { + if (!isa(parentOp)) return false; - } } } @@ -156,9 +152,8 @@ void postponeBroadcast(Block &block) { for (Operation &op : block.getOperations()) { if (isa(&op)) { Operation *bcOp = &op; - if (isInConstantSubgraph(bcOp)) { + if (isInConstantSubgraph(bcOp)) constBcOps.push_back(bcOp); - } } } @@ -172,9 +167,9 @@ SmallVector<Operation *> prevOps; Operation *currOp = bcOp; while (true) { - if (currOp->getNumOperands() != 1) 
{ + if (currOp->getNumOperands() != 1) break; - } + Value operand = currOp->getOperand(0); if (isa(operand)) { break; } currOp = operand.getDefiningOp(); prevOps.push_back(currOp); } @@ -188,9 +183,9 @@ SmallVector<Operation *> postOps; currOp = bcOp; while (true) { - if (currOp->getNumResults() != 1 || !currOp->hasOneUse()) { + if (currOp->getNumResults() != 1 || !currOp->hasOneUse()) break; - } + Value input = currOp->getResult(0); currOp = *(input.getUsers().begin()); Value output = currOp->getResult(0); @@ -212,9 +207,8 @@ postOps.push_back(currOp); } } - if (postOps.empty()) { + if (postOps.empty()) continue; - } // move bcOp after the last constant op SmallVector<Operation *> newPostOps; @@ -308,17 +302,12 @@ return op == bcOp; }); - for (auto it = postOps.rbegin(); it != postOps.rend(); ++it) { + for (auto it = postOps.rbegin(); it != postOps.rend(); ++it) (*it)->erase(); - } } } -static constexpr int DATA_SIZE_EXPANDING_THRESHOLD = 8; - -// get from dnnl_graph_compiler_context -// void *allocator(size_t size) { return std::aligned_alloc(64, size); } -// void deallocator(void *ptr) { std::free(ptr); } +// TODO: The following manager will be moved to an appropriate place later. // std::shared_ptr<ConstCacheProxy> createConstCacheProxy(size_t size) { // // simply allocate buffer and return @@ -331,9 +320,7 @@ size_t divideAndCeil(size_t x, size_t y) { return (x + y - 1) / y; } // Manager struct ConstGraphTensorCacheManager { - // dnnl_graph_compiler_context *ctx; - - uint64_t cachedTensorGlobalId = 0; + int64_t cachedTensorGlobalId = 0; // singleton static std::shared_ptr<ConstGraphTensorCacheManager> get() { @@ -343,14 +330,14 @@ } // alloc and set the buf_base_ and offset_ attributes of cache - std::vector<uint64_t> alloc(std::vector<size_t> buffersSize) { + std::vector<int64_t> alloc(std::vector<size_t> buffersSize) { size_t totalSize = 0; - for (size_t size : buffersSize) { + for (size_t size : buffersSize) totalSize += divideAndCeil(size, 64) * 64; - } + LLVM_DEBUG(llvm::dbgs() << "Alloc total size: " << totalSize << '\n'); // auto base = createConstCacheProxy(totalSize); - std::vector<uint64_t> globalIds(buffersSize.size()); + std::vector<int64_t> globalIds(buffersSize.size()); size_t offset = 0; for (size_t i = 0; i < buffersSize.size(); i++) { LLVM_DEBUG(llvm::dbgs() << "Alloc offset: " << offset << '\n'); @@ -427,9 +414,9 @@ void getArithConstantOutputs(Block &block, SmallVector<Type> &outputTypes, if (isa(&op)) { Operation *constOp = &op; auto constTensor = constOp->getResults().front(); - if (!isa(constTensor.getType())) { + if (!isa(constTensor.getType())) continue; - } + auto v = dyn_cast(constTensor); SmallVector valuesOnTheWay = {v}; // the constant tensors std::deque dq; @@ -465,6 +452,8 @@ } } +static constexpr int DATA_SIZE_EXPANDING_THRESHOLD = 8; + void getInputsAndOutputs(Block &block, std::unordered_set &constArgsIndexes, SmallVector<Type> &inputTypes, @@ -511,15 +500,15 @@ } continue; } - if (!v.hasOneUse()) { + if (!v.hasOneUse()) simpleTopo = false; - } + // the children ops of v are all constant, we push their results to // queue for (Operation *child : v.getUsers()) { - if (!singleOperand(child) || child->getResults().size() > 1) { + if (!singleOperand(child) || child->getResults().size() > 1) simpleTopo = false; - } + for (OpResult result : child->getResults()) { auto r = dyn_cast(result); dq.push_back(r); @@ -596,9 +585,9 @@ func::FuncOp buildFoldFunc(MLIRContext *context, OpBuilder &builder, } auto 
manager = ConstGraphTensorCacheManager::get(); SmallVector<int64_t> globalIndexes; - for (auto id : manager->alloc(buffersSize)) { + for (auto id : manager->alloc(buffersSize)) globalIndexes.push_back(id); - } + globalIndexes.insert(globalIndexes.begin(), globalIndexes.size()); auto moduleOp = dyn_cast<ModuleOp>(topOp); addGlobalI64Array(moduleOp, moduleOp.getLoc(), builder, From 77e0f0258f4ccd0d87b8824132dca5718474e6fc Mon Sep 17 00:00:00 2001 From: "Niu, Xiaoguang" Date: Wed, 18 Sep 2024 15:12:18 +0800 Subject: [PATCH 28/29] Merge into one pass --- include/gc/Transforms/Passes.h | 1 - include/gc/Transforms/Passes.td | 8 --- lib/gc/Transforms/CMakeLists.txt | 1 - .../Transforms/ConstantSubgraphAnalysis.cpp | 54 ------------------- lib/gc/Transforms/ConstantTensorFolding.cpp | 5 ++ lib/gc/Transforms/Pipeline.cpp | 2 - .../test_constant_tensor_folding-0.mlir | 2 +- .../test_constant_tensor_folding-1.mlir | 2 +- 8 files changed, 7 insertions(+), 68 deletions(-) delete mode 100644 lib/gc/Transforms/ConstantSubgraphAnalysis.cpp diff --git a/include/gc/Transforms/Passes.h b/include/gc/Transforms/Passes.h index a42dba87b..06a3ee83d 100644 --- a/include/gc/Transforms/Passes.h +++ b/include/gc/Transforms/Passes.h @@ -124,7 +124,6 @@ void populateGPUPipeline(mlir::OpPassManager &); #define GEN_PASS_DECL_CONSTANTTENSORFOLDING #include "gc/Transforms/Passes.h.inc" -std::unique_ptr<Pass> createConstantSubgraphAnalysisPass(); std::unique_ptr<Pass> createConstantTensorFoldingPass(); #define GEN_PASS_REGISTRATION diff --git a/include/gc/Transforms/Passes.td b/include/gc/Transforms/Passes.td index 034380323..9a968c3bd 100644 --- a/include/gc/Transforms/Passes.td +++ b/include/gc/Transforms/Passes.td @@ -169,14 +169,6 @@ def MergeNestedForall : Pass<"merge-nested-forall"> { let dependentDialects = ["scf::SCFDialect"]; } -def ConstantSubgraphAnalysis : Pass<"constant-subgraph-analysis"> { - let summary = "Constant Subgraph Analysis"; - let description = [{ - This pass implements a constant subgraph analysis. - }]; - let constructor = "mlir::gc::createConstantSubgraphAnalysisPass()"; -} - def ConstantTensorFolding : Pass<"constant-tensor-folding"> { let summary = "Constant Tensor Folding Transform"; let description = [{ diff --git a/lib/gc/Transforms/CMakeLists.txt b/lib/gc/Transforms/CMakeLists.txt index 44415fece..08d60e513 100644 --- a/lib/gc/Transforms/CMakeLists.txt +++ b/lib/gc/Transforms/CMakeLists.txt @@ -16,7 +16,6 @@ gc_add_mlir_library(GcPasses IterativeTilingAndFusion.cpp TilingUsingInterfaceX.cpp VerifyTargetDescription.cpp - ConstantSubgraphAnalysis.cpp ConstantTensorFolding.cpp DecomposeAggregatedOps.cpp DeepTileContractionOp.cpp diff --git a/lib/gc/Transforms/ConstantSubgraphAnalysis.cpp b/lib/gc/Transforms/ConstantSubgraphAnalysis.cpp deleted file mode 100644 index 511d76f21..000000000 --- a/lib/gc/Transforms/ConstantSubgraphAnalysis.cpp +++ /dev/null @@ -1,54 +0,0 @@ -//===-- ConstantSubgraphAnalysis.cpp - Constant Subgraph --------*- C++ -*-===// -// -// This file is licensed under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This transformation pass performs a constant subgraph analysis -// in MLIR. 
-// -//===----------------------------------------------------------------------===// -#include "gc/Analysis/DataFlow/ConstantSubgraphAnalyser.h" -#include "mlir/IR/Builders.h" -#include "mlir/IR/Dialect.h" -#include "mlir/Pass/Pass.h" -#include "mlir/Transforms/Passes.h" - -namespace mlir { -namespace gc { -#define GEN_PASS_DEF_CONSTANTSUBGRAPHANALYSIS -#include "gc/Transforms/Passes.h.inc" -} // namespace gc - -using namespace mlir; -using namespace mlir::dataflow; - -namespace gc { - -struct ConstantSubgraphAnalysis - : public impl::ConstantSubgraphAnalysisBase<ConstantSubgraphAnalysis> { - void runOnOperation() override; -}; - -void ConstantSubgraphAnalysis::runOnOperation() { - Operation *op = getOperation(); - auto &func = - op->getRegions().front().getBlocks().front().getOperations().front(); - - // Hard-code example: set some arguments to be constant. - // OpBuilder builder(op->getContext()); - // func.setAttr("runtime_const_args_index", - // builder.getI32ArrayAttr({1,2,3,4})); - - RunConstantSubgraphAnalyser runAnalyser; - (void)runAnalyser.run(&func); -} - -std::unique_ptr<Pass> createConstantSubgraphAnalysisPass() { - return std::make_unique<ConstantSubgraphAnalysis>(); -} - -} // namespace gc -} // namespace mlir \ No newline at end of file diff --git a/lib/gc/Transforms/ConstantTensorFolding.cpp b/lib/gc/Transforms/ConstantTensorFolding.cpp index b093ffd2c..6000ec844 100644 --- a/lib/gc/Transforms/ConstantTensorFolding.cpp +++ b/lib/gc/Transforms/ConstantTensorFolding.cpp @@ -13,6 +13,7 @@ #include #include +#include "gc/Analysis/DataFlow/ConstantSubgraphAnalyser.h" #include "mlir/Transforms/Passes.h" #include "mlir/Dialect/Arith/IR/Arith.h" @@ -766,6 +767,10 @@ void ConstantTensorFolding::runOnOperation() { MLIRContext *context = topOp->getContext(); auto &topFunc = topOp->getRegions().front().getBlocks().front().getOperations().front(); + + dataflow::RunConstantSubgraphAnalyser runAnalyser; + (void)runAnalyser.run(&topFunc); + OpBuilder builder(context); Region &region = topFunc.getRegions().front(); Block &block = region.getBlocks().front(); diff --git a/lib/gc/Transforms/Pipeline.cpp b/lib/gc/Transforms/Pipeline.cpp index 4cd1e9272..40527f644 100644 --- a/lib/gc/Transforms/Pipeline.cpp +++ b/lib/gc/Transforms/Pipeline.cpp @@ -52,8 +52,6 @@ void populateFrontendPasses(mlir::OpPassManager &pm) { void populateTensorPasses(mlir::OpPassManager &pm) { // todo: padding propagation pass // todo: layout propagation pass - // todo: tensor constant propagation pass - pm.addPass(createConstantSubgraphAnalysisPass()); pm.addPass(createConstantTensorFoldingPass()); // linalg.matmul lowering to (scf.loop + linalg.brgemm) pass pm.addNestedPass<func::FuncOp>(createDeepTileContractionOp()); diff --git a/test/gc/Transforms/test_constant_tensor_folding-0.mlir b/test/gc/Transforms/test_constant_tensor_folding-0.mlir index eabdacc93..155e0875e 100644 --- a/test/gc/Transforms/test_constant_tensor_folding-0.mlir +++ b/test/gc/Transforms/test_constant_tensor_folding-0.mlir @@ -1,4 +1,4 @@ -// RUN: gc-opt --split-input-file -pass-pipeline="builtin.module(constant-subgraph-analysis,constant-tensor-folding)" %s | FileCheck %s +// RUN: gc-opt --split-input-file -pass-pipeline="builtin.module(constant-tensor-folding)" %s | FileCheck %s // COM: A complete example of compile-time and runtime folding. 
diff --git a/test/gc/Transforms/test_constant_tensor_folding-1.mlir b/test/gc/Transforms/test_constant_tensor_folding-1.mlir index 92231703d..ca70f8d6a 100644 --- a/test/gc/Transforms/test_constant_tensor_folding-1.mlir +++ b/test/gc/Transforms/test_constant_tensor_folding-1.mlir @@ -1,4 +1,4 @@ -// RUN: gc-opt --split-input-file -pass-pipeline="builtin.module(constant-subgraph-analysis,constant-tensor-folding)" %s | FileCheck %s +// RUN: gc-opt --split-input-file -pass-pipeline="builtin.module(constant-tensor-folding)" %s | FileCheck %s // COM: Test the 'postponeBroadcast' feature of constant tensor folding. From 2df16c29b8dbadb4b31d812475599d166872b51e Mon Sep 17 00:00:00 2001 From: "Niu, Xiaoguang" Date: Wed, 18 Sep 2024 15:46:18 +0800 Subject: [PATCH 29/29] Skip case --- lib/gc/Transforms/ConstantTensorFolding.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/lib/gc/Transforms/ConstantTensorFolding.cpp b/lib/gc/Transforms/ConstantTensorFolding.cpp index 6000ec844..3fa85a496 100644 --- a/lib/gc/Transforms/ConstantTensorFolding.cpp +++ b/lib/gc/Transforms/ConstantTensorFolding.cpp @@ -431,6 +431,9 @@ void getArithConstantOutputs(Block &block, SmallVector<Type> &outputTypes, [](Operation *child) { return !isInConstantSubgraph(child); })) { + if (valuesOnTheWay.size() == 1) { + continue; + } if (std::find(outputValues.begin(), outputValues.end(), v) == outputValues.end()) { outputTypes.push_back(v.getType());
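
For reference, the two-step execution protocol exercised by the Python tests removed earlier in this series can be condensed as follows. This is a minimal illustrative sketch, not repository code: the run_folded wrapper is hypothetical, while GraphCompiler.compile_and_jit, get_mlir_args, and engine.invoke are the utilities those tests already use.

def run_folded(compiler, module_out, arg0, raw_weights, fold_buffers, out_buf):
    # Compile once; module_out contains both @entry and @runtime_fold.
    engine = compiler.compile_and_jit(module_out)
    # Step 1: fold the raw constant weights into their packed layouts (run once
    # per weight set; raw_weights are the original constants, fold_buffers the
    # pre-allocated packed outputs).
    fold_args = get_mlir_args(module_out, "runtime_fold", raw_weights + fold_buffers)
    engine.invoke("runtime_fold", *fold_args)
    # Step 2: run the compute entry with the non-constant input and the packed
    # buffers produced by the fold step.
    entry_args = get_mlir_args(module_out, "entry", [arg0] + fold_buffers + [out_buf])
    engine.invoke("entry", *entry_args)
    return out_buf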