11 // RUN: gc-opt --split-input-file --deep-tile-contraction-named-op %s
22
3- // -----
3+ // // -----
44
5- /// CHECK-LABEL: @blocked_matmul_f32
6- func.func @blocked_matmul_f32(%arg0: tensor<128x128x32x32xf32>) -> tensor<128x128x32x32xf32> {
7-   %cst = arith.constant dense<1.000000e+00> : tensor<128x128x32x32xf32>
8-   %cst_0 = arith.constant 0.000000e+00 : f32
9-   %0 = tensor.empty() : tensor<128x128x32x32xf32>
10-   %1 = linalg.fill ins(%cst_0 : f32) outs(%0 : tensor<128x128x32x32xf32>) -> tensor<128x128x32x32xf32>
11-   %2 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d2, d3, d5)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d2, d5, d4)>, affine_map<(d0, d1, d2, d3, d4, d5) -> (d0, d1, d3, d4)>], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%arg0, %cst : tensor<128x128x32x32xf32>, tensor<128x128x32x32xf32>) outs(%1 : tensor<128x128x32x32xf32>) {
12-   ^bb0(%in: f32, %in_1: f32, %out: f32):
13-     %3 = arith.mulf %in, %in_1 : f32
14-     %4 = arith.addf %out, %3 : f32
15-     linalg.yield %4 : f32
16-   } -> tensor<128x128x32x32xf32>
17-   return %2 : tensor<128x128x32x32xf32>
18- }
5+ // /// CHECK-LABEL: @matmul_4Dx4D_f32
6+ // func.func @matmul_4Dx4D_f32(%arg0: tensor<128x128x32x32xf32>) -> tensor<128x128x32x32xf32> {
7+ // %cst = arith.constant dense<1.000000e+00> : tensor<128x128x32x32x1xf32>
8+ // %cst_0 = arith.constant 0.000000e+00 : f32
9+ // %0 = tensor.empty() : tensor<128x128x32x32xf32>
10+ // %1 = linalg.fill ins(%cst_0 : f32) outs(%0 : tensor<128x128x32x32xf32>) -> tensor<128x128x32x32xf32>
11+ // %2 = linalgx.mm4d_vnni ins(%arg0, %cst : tensor<128x128x32x32xf32>, tensor<128x128x32x32x1xf32>) outs(%1 : tensor<128x128x32x32xf32>) -> tensor<128x128x32x32xf32>
12+ // return %2 : tensor<128x128x32x32xf32>
13+ // }
1914
2015// -----
2116
22- /// CHECK-LABEL: @plain_matmul_f32
23- func.func @plain_matmul_f32(%arg0: tensor<4096x4096xf32>) -> tensor<4096x4096xf32> {
17+ /// CHECK-LABEL: @matmul_2Dx2D_f32
18+ func.func @matmul_2Dx2D_f32(%arg0: tensor<4096x4096xf32>) -> tensor<4096x4096xf32> {
2419   %cst = arith.constant dense<1.000000e+00> : tensor<4096x4096xf32>
2520   %cst_0 = arith.constant 0.000000e+00 : f32
2621   %0 = tensor.empty() : tensor<4096x4096xf32>
@@ -29,20 +24,39 @@ func.func @plain_matmul_f32(%arg0: tensor<4096x4096xf32>) -> tensor<4096x4096xf3
2924   return %2 : tensor<4096x4096xf32>
3025 }
3126
27+ // // -----
28+
29+ // /// CHECK-LABEL: @matmul_2Dx4D_f32
30+ // func.func @matmul_2Dx4D_f32(%arg0: tensor<4096x4096xf32>) -> tensor<4096x4096xf32> {
31+ // %cst = arith.constant dense<1.000000e+00> : tensor<128x128x32x32x1xf32>
32+ // %cst_0 = arith.constant 0.000000e+00 : f32
33+ // %0 = tensor.empty() : tensor<4096x4096xf32>
34+ // %1 = linalg.fill ins(%cst_0 : f32) outs(%0 : tensor<4096x4096xf32>) -> tensor<4096x4096xf32>
35+ // %2 = linalgx.mm2d_vnni ins(%arg0, %cst : tensor<4096x4096xf32>, tensor<128x128x32x32x1xf32>) outs(%1 : tensor<4096x4096xf32>) -> tensor<4096x4096xf32>
36+ // return %2 : tensor<4096x4096xf32>
37+ // }
38+
3239// -----
3340
34- /// CHECK-LABEL: @blocked_matmul_bf16
35- func.func @blocked_matmul_bf16(%arg0: tensor<128x128x32x32xbf16>) -> tensor<128x128x32x32xbf16> {
41+ /// CHECK-LABEL: @matmul_4Dx4D_bf16
42+ func.func @matmul_4Dx4D_bf16(%arg0: tensor<128x128x32x32xbf16>) -> tensor<128x128x32x32xbf16> {
3643   %cst = arith.constant dense<1.000000e+00> : tensor<128x128x16x32x2xbf16>
3744   %cst_0 = arith.constant 0.000000e+00 : bf16
3845   %0 = tensor.empty() : tensor<128x128x32x32xbf16>
3946   %1 = linalg.fill ins(%cst_0 : bf16) outs(%0 : tensor<128x128x32x32xbf16>) -> tensor<128x128x32x32xbf16>
40-   %2 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d2, d4, d6)>, affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d1, d2, d6 floordiv 2, d5, d3)>, affine_map<(d0, d1, d2, d3, d4, d5, d6) -> (d0, d1, d4, d5)>], iterator_types = ["parallel", "parallel", "reduction", "reduction", "parallel", "parallel", "reduction"]} ins(%arg0, %cst : tensor<128x128x32x32xbf16>, tensor<128x128x16x32x2xbf16>) outs(%1 : tensor<128x128x32x32xbf16>) {
41-   ^bb0(%in: bf16, %in_1: bf16, %out: bf16):
42-     %3 = arith.mulf %in, %in_1 : bf16
43-     %4 = arith.addf %out, %3 : bf16
44-     linalg.yield %4 : bf16
45-   } -> tensor<128x128x32x32xbf16>
47+   %2 = linalgx.mm4d_vnni ins(%arg0, %cst : tensor<128x128x32x32xbf16>, tensor<128x128x16x32x2xbf16>) outs(%1 : tensor<128x128x32x32xbf16>) -> tensor<128x128x32x32xbf16>
4648   return %2 : tensor<128x128x32x32xbf16>
4749 }
4850
51+ // // -----
52+
53+ // /// CHECK-LABEL: @matmul_2Dx4D_bf16
54+ // func.func @matmul_2Dx4D_bf16(%arg0: tensor<4096x4096xbf16>) -> tensor<4096x4096xbf16> {
55+ // %cst = arith.constant dense<1.000000e+00> : tensor<128x128x16x32x2xbf16>
56+ // %cst_0 = arith.constant 0.000000e+00 : bf16
57+ // %0 = tensor.empty() : tensor<4096x4096xbf16>
58+ // %1 = linalg.fill ins(%cst_0 : bf16) outs(%0 : tensor<4096x4096xbf16>) -> tensor<4096x4096xbf16>
59+ // %2 = linalgx.mm2d_vnni ins(%arg0, %cst : tensor<4096x4096xbf16>, tensor<128x128x16x32x2xbf16>) outs(%1 : tensor<4096x4096xbf16>) -> tensor<4096x4096xbf16>
60+ // return %2 : tensor<4096x4096xbf16>
61+ // }
62+
0 commit comments