
Commit 0e7794c

use CRTP and a type trait to avoid virtual functions and improve compile performance

1 parent 919dd11 commit 0e7794c
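
The pattern named in the commit message, sketched as a minimal standalone C++ example (names such as OpA and CanonicalizerA are illustrative, not from this repo): a type trait maps each operation type to its concrete canonicalizer, and the base class casts `this` to that type, so the call is resolved at compile time instead of through a vtable.

    // Forward declarations, mirroring the ones added in TilingVector.h.
    class CanonicalizerA;
    class CanonicalizerB;

    struct OpA {};
    struct OpB {};

    // Type trait: maps each op type to its concrete canonicalizer.
    template <typename T> struct OpTraits;
    template <> struct OpTraits<OpA> { using DerivedT = CanonicalizerA; };
    template <> struct OpTraits<OpB> { using DerivedT = CanonicalizerB; };

    template <class T> class CanonicalizerBase {
      using DerivedT = typename OpTraits<T>::DerivedT;

    public:
      // Non-virtual: statically dispatches to the derived implementation.
      void prepareInfo() { static_cast<DerivedT *>(this)->prepareInfoImpl(); }
    };

    class CanonicalizerA : public CanonicalizerBase<OpA> {
    public:
      void prepareInfoImpl() { /* A-specific initialization */ }
    };

    class CanonicalizerB : public CanonicalizerBase<OpB> {
    public:
      void prepareInfoImpl() { /* B-specific initialization */ }
    };

Unlike classic CRTP (CanonicalizerBase<CanonicalizerA>), routing through the trait keeps the base parameterized on the operation type T, which the real SpecialOperationCanonicalizer already uses for its candidate-op storage.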

File tree: 7 files changed, +106 -47 lines

include/gc/Analysis/VectorBasedFusionAnalysis.h

Lines changed: 12 additions & 8 deletions
@@ -10,6 +10,7 @@
 #define MLIR_ANALYSIS_VECTORBASEDFUSIONANALYSIS_H

 #include "gc/Dialect/Linalgx/LinalgxOps.h"
+#include "gc/Dialect/Linalgx/Utils.h"
 #include "gc/Dialect/Microkernel/MicrokernelOps.h"
 #include "gc/Transforms/Passes.h"
 #include "gc/Transforms/Utils/VectorUtils.h"
@@ -28,8 +29,7 @@ namespace gc {

 /// record hardware information
 struct HardWareInfo {
-  bool favx512f = true;
-  bool favx2 = true;
+  size_t vectorWidth = 0;
 };

 /// Vector type conversion helper class
@@ -66,6 +66,7 @@ enum class ReturnTypeKind {
   RT_InGroup,
 };

+/// Base class of vector-based fusion.
 class VectorFusionBase {

 private:
@@ -257,16 +258,19 @@ Operation *GroupOperationFusion::getNextTargetOperationInCurrentGroup(

   while (!tmpOpQueue.empty()) {
     auto frontOp = tmpOpQueue.front();
-    if (isa<Target>(frontOp)) {
-      for (auto x : frontOp->getOperands())
-        if (x.getDefiningOp() == curOp)
-          return frontOp;
-    }
     tmpOpQueue.pop();
+    if (not isa<Target>(frontOp))
+      continue;
+    for (auto x : frontOp->getOperands())
+      if (x.getDefiningOp() == curOp)
+        return frontOp;
   }
   return nullptr;
 }

+/// Analyzes each operation group.
+/// Currently it runs vector-based fusion, detects empty groups, and computes
+/// each operation group's max vectorized step.
 class GroupOperationAnalysis {
 private:
   /// vector-based fusion related data
@@ -282,7 +286,7 @@ class GroupOperationAnalysis {
   void analysisGroupMaxSteps();
   /// get fusion strategy
   GroupOperationFusion &getGroupOperationFusion() { return fusionStrategy; }
-
+  /// run the vector-based fusion
   void run() { fusionStrategy.run(); }
 };
 } // namespace gc

lib/gc/Transforms/TilingVector.hpp renamed to include/gc/Transforms/TilingVector.h

Lines changed: 31 additions & 6 deletions
@@ -18,6 +18,7 @@
 #include "mlir/Dialect/Math/IR/Math.h"
 #include "mlir/Dialect/SCF/IR/SCF.h"
 #include "mlir/Dialect/Tensor/Transforms/Transforms.h"
+#include "mlir/Dialect/Vector/IR/VectorOps.h"
 #include "mlir/Dialect/Vector/Transforms/LoweringPatterns.h"
 #include "mlir/Dialect/Vector/Transforms/Passes.h"
 #include "mlir/IR/Visitors.h"
@@ -120,9 +121,30 @@ struct GenerateLoopHelper {
 //===----------------------------------------------------------------------===//
 // vectorize operation class
 //===----------------------------------------------------------------------===//
+class MultiReductionCanonicalizer;
+class BroadcastCanonicalizer;
+class TransposeCanonicalizer;
+class ShapeCastCanonicalizer;
+
+// type trait mapping each special operation to its concrete canonicalizer
+template <typename T> struct SpecialOpTraits;
+template <> struct SpecialOpTraits<vector::MultiDimReductionOp> {
+  using DerivedSpecialT = MultiReductionCanonicalizer;
+};
+template <> struct SpecialOpTraits<vector::BroadcastOp> {
+  using DerivedSpecialT = BroadcastCanonicalizer;
+};
+template <> struct SpecialOpTraits<vector::TransposeOp> {
+  using DerivedSpecialT = TransposeCanonicalizer;
+};
+template <> struct SpecialOpTraits<vector::ShapeCastOp> {
+  using DerivedSpecialT = ShapeCastCanonicalizer;
+};

 /// base class of special operation
 template <class T> class SpecialOperationCanonicalizer {
+  using DerivedT = typename SpecialOpTraits<T>::DerivedSpecialT;
+
 private:
   /// store current special operation
   SmallVector<T, 4> candidateRdOps;
@@ -148,9 +170,12 @@ template <class T> class SpecialOperationCanonicalizer {
   SpecialOperationCanonicalizer(const SmallVector<T, 4> &candidateRdOps,
                                 SpecialOperationKind kind, size_t step)
       : candidateRdOps(candidateRdOps), vectorStep(step), kind(kind) {}
-  llvm::SmallVector<T, 4> &getCandidateOps();
+  SmallVector<T, 4> &getCandidateOps();
   virtual ~SpecialOperationCanonicalizer() {}
-  virtual void prepareSpecialOperationInfo() = 0;
+  /// call the derived canonicalizer's init method (static CRTP dispatch)
+  void prepareSpecialOperationInfo() {
+    static_cast<DerivedT *>(this)->prepareSpecialInfo();
+  }
   /// get kind of special operation
   SpecialOperationKind getKind() noexcept { return kind; }
   /// set current operation group vectorize step
@@ -241,7 +266,7 @@ class MultiReductionCanonicalizer

   /// initialize parallel axes, reduction axes, the reduction operation type,
   /// and whether the last dim is a reduction axis
-  void prepareSpecialOperationInfo() override;
+  void prepareSpecialInfo();

   static bool classof(SpecialOperationCanonicalizer *canonicalizer) {
     return canonicalizer->getKind() ==
@@ -259,7 +284,7 @@ class BroadcastCanonicalizer
       : SpecialOperationCanonicalizer<vector::BroadcastOp>(
             candidateBcOps, SpecialOperationKind::OP_Broadcast, steps){};
   virtual ~BroadcastCanonicalizer() noexcept {}
-  void prepareSpecialOperationInfo() override {}
+  void prepareSpecialInfo() {}
   static bool classof(SpecialOperationCanonicalizer *canonicalizer) {
     return canonicalizer->getKind() == SpecialOperationKind::OP_Broadcast;
   }
@@ -278,7 +303,7 @@ class TransposeCanonicalizer
       : SpecialOperationCanonicalizer<vector::TransposeOp>(
             candidateTpOps, SpecialOperationKind::OP_Transpose, steps){};
   virtual ~TransposeCanonicalizer() noexcept {}
-  void prepareSpecialOperationInfo() override{};
+  void prepareSpecialInfo() {}
   static bool classof(SpecialOperationCanonicalizer *canonicalizer) {
     return canonicalizer->getKind() == SpecialOperationKind::OP_Transpose;
   }
@@ -306,7 +331,7 @@ class ShapeCastCanonicalizer
       : SpecialOperationCanonicalizer<vector::ShapeCastOp>(
             candidateScOps, SpecialOperationKind::OP_ShapeCast, steps){};
   virtual ~ShapeCastCanonicalizer() {}
-  void prepareSpecialOperationInfo() override {}
+  void prepareSpecialInfo() {}
   static bool classof(SpecialOperationCanonicalizer *canonicalizer) {
     return canonicalizer->getKind() == SpecialOperationKind::OP_ShapeCast;
   }
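
One design note on the hunks above: dropping the virtual prepareSpecialOperationInfo does not affect the classof hooks, because LLVM-style RTTI (isa<>, dyn_cast<>) never relied on a vtable; it goes through the non-virtual getKind()/classof pair. A reduced sketch of that mechanism (hypothetical Kind/Base/BroadcastLike names, assuming the usual llvm/Support/Casting.h machinery):

    #include "llvm/Support/Casting.h"

    enum class Kind { Broadcast, Transpose };

    struct Base {
      Kind kind;
      explicit Base(Kind k) : kind(k) {}
      Kind getKind() const { return kind; }
    };

    struct BroadcastLike : Base {
      BroadcastLike() : Base(Kind::Broadcast) {}
      // llvm::isa<BroadcastLike>(b) calls this; no virtual function involved.
      static bool classof(const Base *b) {
        return b->getKind() == Kind::Broadcast;
      }
    };

    bool isBroadcast(const Base *b) { return llvm::isa<BroadcastLike>(b); }

Note that the real base class keeps its virtual destructor, so the hierarchy is not entirely vtable-free; the gain is that the per-op prepareSpecialOperationInfo call no longer dispatches dynamically.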

include/gc/Transforms/Utils/VectorUtils.h

Lines changed: 2 additions & 3 deletions
@@ -54,9 +54,8 @@ namespace gc {
 /// block.
 /// insert_slice ops are just moved before the first operation that
 /// uses them.
-void moveSomeInterferenceOperation(
-    func::FuncOp *func, MLIRContext *ctx,
-    std::function<bool(Operation *)> &conditionalFunc);
+void moveOpsFrontOrBack(func::FuncOp *func, MLIRContext *ctx,
+                        std::function<bool(Operation *)> &conditionalFunc);

 /// build a constant operation of index type
 Value makeIndexArithConstantOp(OpBuilder &opBuilder, const Location &loc,

lib/gc/Analysis/VectorBasedFusionAnalysis.cpp

Lines changed: 1 addition & 12 deletions
@@ -6,7 +6,6 @@
 //
 //===----------------------------------------------------------------------===//
 #include "gc/Analysis/VectorBasedFusionAnalysis.h"
-#include "gc/Dialect/Linalgx/Utils.h"

 namespace mlir {
 namespace gc {
@@ -397,17 +396,7 @@ int TypeHelper::generateValidSteps(int steps, VectorType type) {
 // Get the maximum number of elements of the current data type a register can hold
 [[nodiscard]] int TypeHelper::getDataTypeMAXSIMDLength(VectorType type) {
   auto typebits = type.getElementTypeBitWidth();
-  const int favx512bits = 512;
-  const int favx2bits = 256;
-  if (info.favx512f)
-    return favx512bits / typebits;
-
-  if (info.favx2)
-    return favx2bits / typebits;
-
-  // invalid hardware
-  llvm_unreachable("Invalid hardware.");
-  return -1;
+  return info.vectorWidth / typebits;
 }

 /// Get an appropriate for-loop step for the current vector type
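
The effect of the new vectorWidth field is easy to check with numbers: a 512-bit register holds 512 / 32 = 16 f32 lanes, a 256-bit register holds 8. A minimal standalone sketch of the simplified computation (HardWareInfo matches the struct above; maxSIMDLength is a hypothetical stand-in for getDataTypeMAXSIMDLength):

    #include <cassert>
    #include <cstddef>
    #include <cstdio>

    // One register width in bits replaces the per-ISA feature flags.
    struct HardWareInfo {
      size_t vectorWidth = 0; // e.g. 512 for AVX-512, 256 for AVX2
    };

    // Maximum number of lanes a register holds for a given element width.
    int maxSIMDLength(const HardWareInfo &info, int elementBits) {
      assert(elementBits > 0 && info.vectorWidth % elementBits == 0);
      return static_cast<int>(info.vectorWidth) / elementBits;
    }

    int main() {
      HardWareInfo avx512{512}, avx2{256};
      std::printf("%d\n", maxSIMDLength(avx512, 32)); // 16 f32 lanes
      std::printf("%d\n", maxSIMDLength(avx2, 16));   // 16 bf16 lanes
    }

Besides being shorter, this removes the llvm_unreachable fallback: any width reported by CPUTargetDescriptionAnalysis is handled, not just the two hard-coded ISAs.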

lib/gc/Transforms/CPUPhysicalRegisterPass.cpp

Lines changed: 14 additions & 15 deletions
@@ -5,7 +5,7 @@
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
-#include "TilingVector.hpp"
+#include "gc/Transforms/TilingVector.h"

 namespace mlir {
 namespace gc {
@@ -1802,7 +1802,7 @@ bool MultiReductionCanonicalizer::hasLastDimReduction() {
   return res;
 }

-void MultiReductionCanonicalizer::prepareSpecialOperationInfo() {
+void MultiReductionCanonicalizer::prepareSpecialInfo() {
   if (getCandidateOps().empty())
     return;

@@ -2110,9 +2110,8 @@ void ForLoopGenerator::setOperationCorrectOperand(
         loopHelperParam
             .loopIterArgs[loopHelperParam.currentLoopStateIdxMap.at(loopArg)]);
   }
-  int offset = isa<vector::TransferWriteOp>(op) ? 2 : 1;
-  if (dyn_cast<vector::TransferWriteOp>(op) ||
-      dyn_cast<vector::TransferReadOp>(op)) {
+  int operandOffset = isa<vector::TransferWriteOp>(op) ? 2 : 1;
+  if (isReadOrWriteOperation(op)) {
     if (not opPermuationMap.contains(op))
       llvm_unreachable("Map must contains operation.");

@@ -2133,7 +2132,7 @@ void ForLoopGenerator::setOperationCorrectOperand(
     }

     ShapedType tensorType =
-        cast<ShapedType>(op->getOperandTypes()[offset - 1]);
+        cast<ShapedType>(op->getOperandTypes()[operandOffset - 1]);
     int64_t varIdx = dim;
     if (tensorType.getRank() >
         (int64_t)loopHelperParam.inductionVars.size()) {
@@ -2146,11 +2145,12 @@ void ForLoopGenerator::setOperationCorrectOperand(
     }
     if (loopHelperParam.indiceLoopMap.contains(op))
       op->setOperand(
-          dim + offset,
+          dim + operandOffset,
          loopHelperParam
              .inductionVars[loopHelperParam.indiceLoopMap[op][varIdx]]);
     else
-      op->setOperand(dim + offset, loopHelperParam.inductionVars[varIdx]);
+      op->setOperand(dim + operandOffset,
+                     loopHelperParam.inductionVars[varIdx]);
   }
   if (auto readOp = dyn_cast<vector::TransferReadOp>(op)) {
     size_t grpIdx = getVectorBasedFusion().getOpGroupIndexMap()[op];
@@ -2780,8 +2780,8 @@ void GroupOperationFusionImpl::broadcastFromElements(Operation *op,
           op->getLoc(), newOperandType, op->getOperands()[0]);
       removeOpInCurrentGroups(grpIdx, op, bcastOp);
       std::function<bool(Operation *)> candidateFunc = isBroadcastOp;
-      moveSomeInterferenceOperation(&getGroupOperationFusion().getFunction(),
-                                    op->getContext(), candidateFunc);
+      moveOpsFrontOrBack(&getGroupOperationFusion().getFunction(),
+                         op->getContext(), candidateFunc);
     }
   }
 }
@@ -2946,22 +2946,21 @@ struct CPUPhysicalRegisterPass
     }
     // affineApply operation is always used by other operations.
     std::function<bool(Operation *)> candidateFunc = isProducerOp;
-    moveSomeInterferenceOperation(&func, ctx, candidateFunc);
+    moveOpsFrontOrBack(&func, ctx, candidateFunc);
     candidateFunc = isCandidateMoveOperations;
-    moveSomeInterferenceOperation(&func, ctx, candidateFunc);
+    moveOpsFrontOrBack(&func, ctx, candidateFunc);
     // canonicalize vector operations; by default, use the vector-based
     // fusion strategy.
     HardWareInfo hwInfo;
     CPUTargetDescriptionAnalysis sysDesc =
         getAnalysis<CPUTargetDescriptionAnalysis>();
-    hwInfo.favx512f = sysDesc.getMaxVectorWidth() >= 512;
-    hwInfo.favx2 = sysDesc.getMaxVectorWidth() >= 256;
+    hwInfo.vectorWidth = sysDesc.getMaxVectorWidth();
     VectorOperationCanonicalizer canonicalizer(
         func, hwInfo, CanonicalizerKind::GroupOperations);
     canonicalizer.run();

     candidateFunc = isReadOrWriteOperation;
-    moveSomeInterferenceOperation(&func, ctx, candidateFunc);
+    moveOpsFrontOrBack(&func, ctx, candidateFunc);

     // transpose kernel
     vector::VectorTransformsOptions transposeOptions =
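
(Background for the operandOffset logic in the setOperationCorrectOperand hunks above: vector.transfer_write's indices start at operand 2, after the stored vector and the destination tensor, while vector.transfer_read's indices start at operand 1, right after the source; hence the 2-versus-1 offset.)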

lib/gc/Transforms/Utils/VectorUtils.cpp

Lines changed: 2 additions & 3 deletions
@@ -154,9 +154,8 @@ void moveCandidateOperation(
 // block.
 // insert_slice ops are just moved before the first operation that
 // uses them.
-void moveSomeInterferenceOperation(
-    func::FuncOp *func, MLIRContext *ctx,
-    std::function<bool(Operation *)> &conditionalFunc) {
+void moveOpsFrontOrBack(func::FuncOp *func, MLIRContext *ctx,
+                        std::function<bool(Operation *)> &conditionalFunc) {
   // Pre-order traversal of each op.
   // Record each operation's position so we know which operation the
   // current one should be moved after.

test/mlir/test/gc/Transforms/cpu-phyaical-register.mlir renamed to test/mlir/test/gc/Transforms/cpu-physical-register.mlir

Lines changed: 44 additions & 0 deletions
@@ -664,3 +664,47 @@ func.func @add_small_tensor_test14(%arg0: tensor<2xf32>, %arg1: tensor<2xf32>) -
   %2 = linalg.max ins(%1, %cst : tensor<2xf32>, tensor<2xf32>) outs(%0: tensor<2xf32>) -> tensor<2xf32>
   return %2 : tensor<2xf32>
 }
+
+// CHECK-LABEL: func @broadcast_add_test15
+// CHECK: %[[CST:.*]] = arith.constant 0.000000e+00 : f32
+// CHECK: %[[C0:.*]] = arith.constant 0 : index
+// CHECK: %[[C1:.*]] = arith.constant 1 : index
+// CHECK: %[[C64:.*]] = arith.constant 64 : index
+// CHECK: %[[C16:.*]] = arith.constant 16 : index
+// CHECK: scf.for %[[arg2:.*]] = %[[C0]] to %[[C64]] step %[[C1]] iter_args(%[[arg3:.*]] = {{.*}}) -> (tensor<64x64xf32>)
+// CHECK: scf.for %[[arg4:.*]] = %[[C0]] to %[[C64]] step %[[C16]] iter_args(%[[arg5:.*]] = %[[arg3]]) -> (tensor<64x64xf32>)
+// CHECK: %[[READ0:.*]] = vector.transfer_read %[[arg5]][%[[arg2]], %[[arg4]]], %[[CST]] {in_bounds = [true]} : tensor<64x64xf32>, vector<16xf32>
+// CHECK: %[[READ1:.*]] = vector.transfer_read %arg0[%[[arg4]]], %[[CST]] {in_bounds = [true]} : tensor<64xf32>, vector<16xf32>
+// CHECK: %[[ADD0:.*]] = arith.addf %[[READ1]], %[[READ0]] : vector<16xf32>
+// CHECK: %[[WRITE:.*]] = vector.transfer_write %[[ADD0]], %[[arg5]][%[[arg2]], %[[arg4]]] {in_bounds = [true]} : vector<16xf32>, tensor<64x64xf32>
+func.func @broadcast_add_test15(%arg0: tensor<64xf32>, %arg1: tensor<64x64xf32>) -> tensor<64x64xf32> {
+  %0 = tensor.empty() : tensor<64x64xf32>
+  %bcast = linalg.broadcast
+      ins(%arg0: tensor<64xf32>)
+      outs(%0: tensor<64x64xf32>)
+      dimensions = [0]
+  %out3 = linalg.add ins(%bcast, %arg1: tensor<64x64xf32>, tensor<64x64xf32>)
+      outs(%arg1: tensor<64x64xf32>) -> tensor<64x64xf32>
+  return %out3: tensor<64x64xf32>
+}
+
+// CHECK-LABEL: func @broadcast_single_test16
+// CHECK: %[[C16:.*]] = arith.constant 16 : index
+// CHECK: %[[C64:.*]] = arith.constant 64 : index
+// CHECK: %[[C1:.*]] = arith.constant 1 : index
+// CHECK: %[[C0:.*]] = arith.constant 0 : index
+// CHECK: %[[CST:.*]] = arith.constant 0.000000e+00 : f32
+// CHECK: %[[EMPTY0:.*]] = tensor.empty() : tensor<64x64xf32>
+// CHECK: scf.for %[[arg1:.*]] = %[[C0]] to %[[C64]] step %[[C1]] iter_args(%[[arg2:.*]] = %[[EMPTY0]]) -> (tensor<64x64xf32>)
+// CHECK: scf.for %[[arg3:.*]] = %[[C0]] to %[[C64]] step %[[C16]] iter_args(%[[arg4:.*]] = %[[arg2]]) -> (tensor<64x64xf32>)
+// CHECK: %[[READ0:.*]] = vector.transfer_read %arg0[%[[arg3]]], %[[CST]] {in_bounds = [true]} : tensor<64xf32>, vector<16xf32>
+// CHECK: %[[WRITE0:.*]] = vector.transfer_write %[[READ0]], %[[arg4]][%[[arg1]], %[[arg3]]] {in_bounds = [true]} : vector<16xf32>, tensor<64x64xf32>
+func.func @broadcast_single_test16(%arg0: tensor<64xf32>) -> tensor<64x64xf32> {
+  %0 = tensor.empty() : tensor<64x64xf32>
+  %bcast = linalg.broadcast
+      ins(%arg0: tensor<64xf32>)
+      outs(%0: tensor<64x64xf32>)
+      dimensions = [0]
+  return %bcast: tensor<64x64xf32>
+}
+
