Replace deprecated linalg::tileToForallOpUsingTileSizes with scf::tileUsingSCF

zhczhong · zhczhong · commit a205731ac236 · 2024-08-07T18:43:46.000-07:00
diff --git a/lib/gc/Transforms/DeepTileContractionNamedOp.cpp b/lib/gc/Transforms/DeepTileContractionNamedOp.cpp
@@ -168,11 +168,12 @@ static FailureOr<DtypeLegalizeResult>
 matmulDtypeLegalize(RewriterBase &rewriter, Operation *op,
                     bool needCopyInit = true, bool needFurtherFuse = false) {
   linalg::LinalgOp linalgOp = dyn_cast<linalg::LinalgOp>(op);
-  Location loc = linalgOp->getLoc();
-  DtypeLegalizeResult result;
   if (!linalgOp)
     return failure();
 
+  Location loc = linalgOp->getLoc();
+  DtypeLegalizeResult result;
+
   if (needToLegalizeDtype(linalgOp)) {
     rewriter.setInsertionPoint(linalgOp);
     IRMapping mapping;
@@ -449,15 +450,15 @@ generateOuterLoop(RewriterBase &b, linalg::LinalgOp linalgOp,
           }
         }
       } else {
-        TilingInterface tilingInterface =
-            cast<TilingInterface>(currentOp.getOperation());
-        FailureOr<linalg::ForallTilingResult> tilingResult =
-            linalg::tileToForallOpUsingTileSizes(b, tilingInterface, tileSizes,
-                                                 std::nullopt);
+        scf::SCFTilingOptions tileOption;
+        tileOption.setTileSizes(tileSizes);
+        tileOption.setLoopType(scf::SCFTilingOptions::LoopType::ForallOp);
+        FailureOr<scf::SCFTilingResult> tilingResult = scf::tileUsingSCF(
+            b, cast<TilingInterface>(currentOp.getOperation()), tileOption);
         if (failed(tilingResult))
           return failure();
-        b.replaceOp(currentOp, tilingResult->tileOp);
-        currentOp = dyn_cast<linalg::LinalgOp>(tilingResult->tiledOp);
+        b.replaceOp(currentOp, tilingResult->replacements);
+        currentOp = dyn_cast<linalg::LinalgOp>(tilingResult->tiledOps.back());
       }
     }
   }
@@ -499,8 +500,8 @@ NOuterBlock: (PN + 1) * NOuterBlock] CSlice2 = CSlice[PK, PM * MOuterBlock: (PM
     for([om, on, ok]: [MNumBlock, NNumBlock, KNumBlock]) {
       ASlice2 = ASlice[om * MBlock: (om + 1) * MBlock, ok * KBlock: (ok + 1) *
 KBlock]
-      BSlice2 = BSlice[0, om * MBlock: (om + 1) * MBlock, ok * KBlock: (ok +
-1) * KBlock]
+      BSlice2 = BSlice[0, ok * KBlock: (ok + 1) * KBlock, on * NBlock: (on +
+1) * NBlock]
       CSlice3 = CSlice2[0, om * MBlock: (om + 1) * MBlock, on * NBlock:
 (on + 1) * NBlock] (init with 0 when ok == 0)
       MNumInnerBlock = MBlock / iim_block_
@@ -539,11 +540,13 @@ struct DeepTileMatmul : public OpInterfaceRewritePattern<linalg::LinalgOp> {
     size_t NFirstDim = *getConstantIntValue(loopRange[NDimPos[0]].size);
 
     size_t KParallelBlockSize =
-        KDimPos.size() > 1
-            ? llvm::divideCeil(KFirstDim, cfg.KThreads)
-            : llvm::divideCeil(llvm::divideCeil(KFirstDim, cfg.KBlock),
-                               cfg.KThreads) *
-                  cfg.KBlock;
+        cfg.KThreads == 1
+            ? 0
+            : (KDimPos.size() > 1
+                   ? llvm::divideCeil(KFirstDim, cfg.KThreads)
+                   : llvm::divideCeil(llvm::divideCeil(KFirstDim, cfg.KBlock),
+                                      cfg.KThreads) *
+                         cfg.KBlock);
     size_t MParallelBlockSize =
         MDimPos.size() > 1
             ? llvm::divideCeil(MFirstDim, cfg.MThreads)
diff --git a/lib/gc/Transforms/Pipeline.cpp b/lib/gc/Transforms/Pipeline.cpp
@@ -52,9 +52,10 @@ void populateTensorPasses(mlir::OpPassManager &pm) {
   // todo: layout propagation pass
   // todo: tensor constant propagation pass
   // linalg.matmul lowering to (scf.loop + linalg.brgemm) pass
-  pm.addNestedPass<func::FuncOp>(createIterativeTilingAndFusion());
-  // Fine-grain fusion pass
   pm.addNestedPass<func::FuncOp>(createDeepTileContractionNamedOp());
+
+  // Fine-grain fusion pass
+  pm.addNestedPass<func::FuncOp>(createIterativeTilingAndFusion());
   // todo: fine-grain fusion pass
   // todo: lower linalg to arith/math on virtual vector pass
 
diff --git a/test/mlir/test/gc/Transforms/deepTileContractionNamedOp.mlir b/test/mlir/test/gc/Transforms/deepTileContractionNamedOp.mlir
@@ -7,9 +7,9 @@ func.func @matmul_2Dx2D_f32(%arg0: tensor<4096x4096xf32>, %arg1: tensor<4096x409
     %cst_0 = arith.constant 0.000000e+00 : f32
     %0 = tensor.empty() : tensor<4096x4096xf32>
     %1 = linalg.fill ins(%cst_0 : f32) outs(%0 : tensor<4096x4096xf32>) -> tensor<4096x4096xf32>
-    // CHECK: scf.forall {{.*}} (4) {{.*}}  (tensor<4096x4096xf32>) {
+    // CHECK: scf.forall {{.*}} (0) to (4096) step (1024) {{.*}}  (tensor<4096x4096xf32>) {
     // CHECK: tensor.extract_slice {{.*}} [1024, 4096] [1, 1]
-    // CHECK: scf.forall {{.*}} (2) {{.*}}  (tensor<1024x4096xf32>)
+    // CHECK: scf.forall {{.*}} (0) to (4096) step (2048) {{.*}}  (tensor<1024x4096xf32>)
     // CHECK: tensor.extract_slice {{.*}} [1024, 2048] [1, 1]
     // CHECK: scf.for
     // CHECK: tensor.extract_slice {{.*}} [256, 2048] [1, 1]
@@ -43,9 +43,9 @@ func.func @matmul_4Dx4D_bf16(%arg0: tensor<128x128x32x32xbf16>, %arg1: tensor<12
     %0 = tensor.empty() : tensor<128x128x32x32xbf16>
     // CHECK-NOT: linalg.fill
     %1 = linalg.fill ins(%cst_0 : bf16) outs(%0 : tensor<128x128x32x32xbf16>) -> tensor<128x128x32x32xbf16>
-    // CHECK: scf.forall {{.*}} (16) {{.*}} (tensor<128x128x32x32xbf16>)
+    // CHECK: scf.forall {{.*}} (0) to (128) step (8) {{.*}} (tensor<128x128x32x32xbf16>)
     // CHECK: tensor.extract_slice {{.*}} [8, 128, 32, 32] [1, 1, 1, 1]
-    // CHECK: scf.forall {{.*}} (2) {{.*}} (tensor<8x128x32x32xbf16>)
+    // CHECK: scf.forall {{.*}} (0) to (128) step (64) {{.*}} (tensor<8x128x32x32xbf16>)
     // CHECK: tensor.extract_slice {{.*}} [8, 64, 32, 32] [1, 1, 1, 1]
     // CHECK: scf.for
     // CHECK: tensor.extract_slice {{.*}} [8, 8, 32, 32] [1, 1, 1, 1]
@@ -80,9 +80,9 @@ func.func @matmul_2Dx4D_bf16(%arg0: tensor<4096x4096xbf16>, %arg1: tensor<128x12
     %1 = linalg.fill ins(%cst_0 : bf16) outs(%0 : tensor<4096x4096xbf16>) -> tensor<4096x4096xbf16>
     // CHECK: scf.forall {{.*}} (2) {{.*}} (tensor<2x1x1x4096x4096xf32>)
     // CHECK: tensor.extract_slice {{.*}} [1, 1, 1, 4096, 4096] [1, 1, 1, 1, 1]
-    // CHECK: scf.forall {{.*}} (16) {{.*}} (tensor<4096x4096xf32>)
+    // CHECK: scf.forall {{.*}} (0) to (4096) step (256) {{.*}} (tensor<4096x4096xf32>)
     // CHECK: tensor.extract_slice {{.*}} [256, 4096] [1, 1]
-    // CHECK: scf.forall {{.*}} (2) {{.*}} (tensor<256x4096xf32>)
+    // CHECK: scf.forall {{.*}} (0) to (128) step (64) {{.*}} (tensor<256x4096xf32>)
     // CHECK: tensor.extract_slice {{.*}} [256, 2048] [1, 1]
     // CHECK: scf.for
     // CHECK: tensor.extract_slice {{.*}} [256, 256] [1, 1]