Support partial reduction in tileAllUsingForall (add newParallelDims)

zhczhong · zhczhong · commit da852fd30b3f · 2024-05-30T20:00:54.000-07:00
diff --git a/include/gc/Dialect/Arith/Utils/EasyBuild.h b/include/gc/Dialect/Arith/Utils/EasyBuild.h
@@ -1,17 +1,10 @@
-//===- EasyBuild.h - Easy Arith IR Builder utilities ------------*- C++ -*-===//
+//===-- EasyBuild.h - Easy Arith IR Builder utilities -----------*- C++ -*-===//
 //
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
-//
-// This header file defines the easy-build utilities for arith dialects. It
-// provides the utility functions, classes and operators to make it easir to
-// program arith dialect operations in C++
-//
-//===----------------------------------------------------------------------===//
-
 #ifndef MLIR_DIALECT_ARITH_UTILS_EASYBUILD_H
 #define MLIR_DIALECT_ARITH_UTILS_EASYBUILD_H
 #include "gc/IR/EasyBuild.h"
@@ -28,12 +21,8 @@ namespace impl {
 
 template <std::size_t size> struct ToFloatType {};
 
-template <> struct ToFloatType<4> {
-  using type = Float32Type;
-};
-template <> struct ToFloatType<8> {
-  using type = Float64Type;
-};
+template <> struct ToFloatType<4> { using type = Float32Type; };
+template <> struct ToFloatType<8> { using type = Float64Type; };
 
 inline Type getElementType(Value v) {
   auto type = v.getType();
diff --git a/include/gc/IR/EasyBuild.h b/include/gc/IR/EasyBuild.h
@@ -1,16 +1,10 @@
-//===- EasyBuild.h - Easy IR Builder utilities ------------------*- C++ -*-===//
+//===-- EasyBuild.h - Easy IR Builder utilities -----------------*- C++ -*-===//
 //
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
-//
-// This header file defines the easy-build utilities core data structures for
-// building IR.
-//
-//===----------------------------------------------------------------------===//
-
 #ifndef MLIR_IR_EASYBUILD_H
 #define MLIR_IR_EASYBUILD_H
 #include "mlir/IR/Builders.h"
diff --git a/include/gc/IR/EasyBuildSCF.h b/include/gc/IR/EasyBuildSCF.h
@@ -1,10 +1,11 @@
-//===- EasyBuildSCF.h - Easy IR Builder for general control flow *- C++ -*-===//
+//===-- EasyBuildSCF.h - Easy IR Builder for control flow -------*- C++ -*-===//
 //
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 //
 //===----------------------------------------------------------------------===//
+
 //
 // This header file defines the helper classes, functions and macros to help to
 // build general structured control flow. Developers can use the utilities in
diff --git a/lib/gc/Transforms/DeepTileContractionNamedOp.cpp b/lib/gc/Transforms/DeepTileContractionNamedOp.cpp
@@ -1,9 +1,9 @@
 //===-- DeepTileContractionNamedOp.cpp - DESC -------------------*- C++ -*-===//
-// 
+//
 // This file is licensed under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-// 
+//
 //===----------------------------------------------------------------------===//
 
 #include "./Tiling.hpp"
@@ -273,9 +273,19 @@ generateOuterLoop(RewriterBase &b, linalg::LinalgOp linalgOp,
       b.setInsertionPoint(currentOp);
       if (auto partialInterface =
               dyn_cast<PartialReductionOpInterface>(currentOp.getOperation())) {
+        for (auto [idx, tile] : llvm::enumerate(tileSizes)) {
+          if (isConstantIntValue(tile, 0)) {
+            tileSizes[idx] = loopRanges[idx].size;
+          }
+        }
+
+        SmallVector<OpFoldResult> newParallelDims;
+        for (auto i = 0UL; i < reductionDims.size(); i++) {
+          newParallelDims.push_back(getAsIndexOpFoldResult(b.getContext(), i));
+        }
         auto tilingResult = linalgX::tileAllUsingForall(
-            b, cast<PartialReductionOpInterface>(currentOp.getOperation()),
-            numThreads, tileSizes, std::nullopt);
+            b, cast<PartialReductionOpInterface>(currentOp.getOperation()), {},
+            tileSizes, newParallelDims, std::nullopt);
         if (failed(tilingResult))
           return failure();
         currentOp = dyn_cast<linalg::LinalgOp>(tilingResult->parallelTiledOp);
diff --git a/lib/gc/Transforms/Tiling.cpp b/lib/gc/Transforms/Tiling.cpp
@@ -794,11 +794,10 @@ FailureOr<TiledLinalgOp> static tileLinalgOpImpl(
   return tileLinalgOpImpl<LoopTy>(b, op, tileSizeVector, options);
 }
 
-FailureOr<linalg::ForallReductionTilingResult>
-tileAllUsingForall(RewriterBase &b, PartialReductionOpInterface op,
-                   ArrayRef<OpFoldResult> numThreads,
-                   ArrayRef<OpFoldResult> tileSizes,
-                   std::optional<ArrayAttr> mapping) {
+FailureOr<linalg::ForallReductionTilingResult> tileAllUsingForall(
+    RewriterBase &b, PartialReductionOpInterface op,
+    ArrayRef<OpFoldResult> threadNums, ArrayRef<OpFoldResult> tileSizes,
+    ArrayRef<OpFoldResult> newParallelDims, std::optional<ArrayAttr> mapping) {
   Location loc = op.getLoc();
   OpBuilder::InsertionGuard g(b);
 
@@ -834,6 +833,24 @@ tileAllUsingForall(RewriterBase &b, PartialReductionOpInterface op,
     if (iteratorType == utils::IteratorType::reduction)
       redDims.push_back(idx);
   }
+
+  SmallVector<OpFoldResult> numThreads(threadNums.begin(), threadNums.end());
+  if (numThreads.empty()) {
+    SmallVector<Range> loopRanges = tilingInterfaceOp.getIterationDomain(b);
+    unsigned nLoops = loopRanges.size();
+    numThreads.reserve(nLoops);
+    AffineExpr s0, s1;
+    bindSymbols(b.getContext(), s0, s1);
+    AffineExpr divExpr = s0.ceilDiv(s1);
+    for (const auto &it : llvm::zip(tileSizes, loopRanges)) {
+      OpFoldResult numTiles = std::get<0>(it);
+      if (!isConstantIntValue(numTiles, 0))
+        numTiles = makeComposedFoldedAffineApply(
+            b, op.getLoc(), divExpr, {std::get<1>(it).size, std::get<0>(it)});
+      numThreads.push_back(numTiles);
+    }
+  }
+
   bool hasReductionThreads = false;
   for (auto dim : redDims) {
     if (!isConstantIntValue(numThreads[dim], 0) &&
@@ -850,13 +867,24 @@ tileAllUsingForall(RewriterBase &b, PartialReductionOpInterface op,
   if ((unsigned)redDims.front() >= numThreads.size())
     return b.notifyMatchFailure(
         op, "reduction dimension must be mapped to threads");
-
+  SmallVector<int> constantNewParallelDims;
+  for (auto dim : newParallelDims) {
+    if (getConstantIntValue(dim) == std::nullopt)
+      return b.notifyMatchFailure(
+          op, "Expected new parallel dims to be constant integers.");
+    constantNewParallelDims.push_back(*getConstantIntValue(dim));
+  }
+  if (newParallelDims.empty())
+    constantNewParallelDims = redDims;
+  if (constantNewParallelDims.size() != redDims.size())
+    return b.notifyMatchFailure(
+        op, "reduction dimension must be mapped to new parallel dims");
   // 1. Create the inital tensor value.
   FailureOr<Operation *> identityTensor = nullptr;
   if (hasReductionThreads) {
     identityTensor = LinalgOpPartialReductionInterface::
-        generateInitialTensorForPartialReduction(op, b, loc, numThreads,
-                                                 redDims, {});
+        generateInitialTensorForPartialReduction(
+            op, b, loc, numThreads, redDims, constantNewParallelDims);
   }
   if (failed(identityTensor))
     return b.notifyMatchFailure(op,
@@ -866,7 +894,6 @@ tileAllUsingForall(RewriterBase &b, PartialReductionOpInterface op,
   SmallVector<Value> dest;
   if (failed(tensor::getOrCreateDestinations(b, loc, op, dest)))
     return b.notifyMatchFailure(op, "failed to get destination tensors");
-
   Operation *tiledOp = nullptr;
 
   SmallVector<OpFoldResult> nonZeroNumThreads =
@@ -875,20 +902,21 @@ tileAllUsingForall(RewriterBase &b, PartialReductionOpInterface op,
       }));
   SmallVector<Value> materializedNonZeroNumThreads =
       getValueOrCreateConstantIndexOp(b, loc, nonZeroNumThreads);
-
   // 2. Create the ForallOp with an empty region.
   scf::ForallOp forallOp = b.create<scf::ForallOp>(
       loc, getAsOpFoldResult(materializedNonZeroNumThreads),
       hasReductionThreads ? (*identityTensor)->getResults() : dest, mapping);
-
   // 3. Calculate the tile offsets and sizes for the subsequent loop that will
   // be nested under `forallOp`.
   SmallVector<OpFoldResult> tiledOffsets, tiledSizes;
+  std::optional<ArrayRef<OpFoldResult>> nominalTileSizes = std::nullopt;
+  if (!tileSizes.empty() && threadNums.empty()) {
+    nominalTileSizes = tileSizes;
+  }
   calculateTileOffsetsAndSizes(b, loc, forallOp, numThreads, iterationDomain,
                                /*omitTileOffsetBoundsCheck =*/false,
-                               /*nominalTileSizes=*/tileSizes, tiledOffsets,
-                               tiledSizes);
-
+                               /*nominalTileSizes=*/nominalTileSizes,
+                               tiledOffsets, tiledSizes);
   // 4. Clone the tileable op and update its destination operands to use the
   // output bbArgs of the ForallOp.
   SmallVector<Value> tilingResults;
@@ -907,20 +935,26 @@ tileAllUsingForall(RewriterBase &b, PartialReductionOpInterface op,
         SmallVector<OpFoldResult> strides(numThreads.size(), b.getIndexAttr(1));
         SmallVector<OpFoldResult> outOffsets(numThreads.size(),
                                              b.getIndexAttr(0));
-        SmallVector<OpFoldResult> sizes;
-        for (auto s :
-             cast<RankedTensorType>(destBbArgs[destNum].getType()).getShape()) {
-          sizes.emplace_back(getAsIndexOpFoldResult(b.getContext(), (int)s));
-        }
-        for (auto dim : redDims) {
-          sizes[dim] = b.getIndexAttr(1);
+        SmallVector<OpFoldResult> sizes = tiledSizes;
+        for (const auto &iteratorType : llvm::enumerate(
+                 cast<RankedTensorType>(destBbArgs[destNum].getType())
+                     .getShape())) {
+          sizes[iteratorType.index()] =
+              getAsIndexOpFoldResult(b.getContext(), iteratorType.value());
+          if (llvm::find(constantNewParallelDims, iteratorType.index()) !=
+              constantNewParallelDims.end()) {
+            sizes[iteratorType.index()] = b.getIndexAttr(1);
+          }
         }
 
         auto nonZeroDimIdx = 0;
-        for (auto dim = 0UL; dim < numThreads.size(); dim++) {
-          if (!isConstantIntValue(numThreads[dim], 0)) {
-            if (llvm::find(redDims, dim) != redDims.end())
-              outOffsets[dim] = forallOp.getInductionVars()[nonZeroDimIdx];
+        auto currentReductionIdx = 0;
+        for (const auto &iteratorType : llvm::enumerate(numThreads)) {
+          if (!isConstantIntValue(iteratorType.value(), 0)) {
+            if (llvm::find(redDims, iteratorType.index()) != redDims.end()) {
+              outOffsets[constantNewParallelDims[currentReductionIdx++]] =
+                  forallOp.getInductionVars()[nonZeroDimIdx];
+            }
             nonZeroDimIdx++;
           }
         }
@@ -929,7 +963,10 @@ tileAllUsingForall(RewriterBase &b, PartialReductionOpInterface op,
             loc, cast<RankedTensorType>(initOperand.getType()),
             destBbArgs[destNum], outOffsets, sizes, strides));
       } else {
-        tiledDpsInitOperands.push_back(initOperand);
+        auto *it = llvm::find(dest, initOperand);
+        assert(it != dest.end() && "dest operand not found in dest");
+        unsigned destNum = std::distance(dest.begin(), it);
+        tiledDpsInitOperands.push_back(destBbArgs[destNum]);
       }
     }
 
@@ -944,19 +981,35 @@ tileAllUsingForall(RewriterBase &b, PartialReductionOpInterface op,
         initOperandPtr.set(tiledInitValue);
       }
     });
-
     // 5. Tile the cloned op and delete the clone.
-    FailureOr<TilingResult> tilingResult =
-        cast<TilingInterface>(clonedOp).getTiledImplementation(b, tiledOffsets,
-                                                               tiledSizes);
-    if (failed(tilingResult))
-      return clonedOp->emitError("Failed to tile op: ");
-    if (tilingResult->tiledOps.size() != 1) {
-      return clonedOp->emitError("expected a single produced tiled op, got ")
-             << tilingResult->tiledOps.size();
+    if (tileSizes.empty() || threadNums.empty()) {
+      FailureOr<TilingResult> tilingResult =
+          cast<TilingInterface>(clonedOp).getTiledImplementation(
+              b, tiledOffsets, tiledSizes);
+      if (failed(tilingResult))
+        return clonedOp->emitError("Failed to tile op: ");
+      if (tilingResult->tiledOps.size() != 1) {
+        return clonedOp->emitError("expected a single produced tiled op, got ")
+               << tilingResult->tiledOps.size();
+      }
+      tiledOp = tilingResult->tiledOps.front();
+      tilingResults = tilingResult->tiledValues;
+    } else {
+      LinalgTilingOptions options;
+      FailureOr<TiledLinalgOp> maybeTiled = tileLinalgOpImpl<scf::ForOp>(
+          b, cast<LinalgOp>(clonedOp), tileSizes, options);
+      if (failed(maybeTiled))
+        return b.notifyMatchFailure(op, "failed tileLinalgOpImpl");
+
+      SmallVector<Value> ids = forallOp.getInductionVars();
+      mapLoopToProcessorIds(cast<scf::ForOp>(maybeTiled->loops.back()), ids,
+                            materializedNonZeroNumThreads);
+      if (maybeTiled->loops.size() != 1) {
+        return clonedOp->emitError("expected a single produced loop");
+      }
+      tiledOp = maybeTiled->op;
+      tilingResults = maybeTiled->loops.front()->getResults();
     }
-    tiledOp = tilingResult->tiledOps.front();
-    tilingResults = tilingResult->tiledValues;
 
     b.eraseOp(clonedOp);
   }
@@ -974,23 +1027,33 @@ tileAllUsingForall(RewriterBase &b, PartialReductionOpInterface op,
       return op->emitOpError("output offsets couldn't be calculated");
     SmallVector<OpFoldResult> resultOffsetsRank, resultSizesRank;
     int64_t offIdx = 0;
-    int64_t sizeIdx = 0;
     int64_t nonZeroDimIdx = 0;
+    SmallVector<Value> reductionInductionVars;
     for (auto i = 0UL; i < numThreads.size(); ++i) {
-      if (llvm::find(redDims, i) != redDims.end()) {
+      if (llvm::find(constantNewParallelDims, i) !=
+          constantNewParallelDims.end()) {
         if (hasReductionThreads) {
-          resultOffsetsRank.push_back(
-              forallOp.getInductionVars()[nonZeroDimIdx]);
+          resultOffsetsRank.push_back(b.getIndexAttr(1));
           resultSizesRank.push_back(b.getIndexAttr(1));
         }
-        nonZeroDimIdx++;
-        continue;
+      } else {
+        resultOffsetsRank.push_back(resultOffsets[offIdx]);
+        resultSizesRank.push_back(resultSizes[offIdx++]);
+      }
+      if (llvm::find(redDims, i) != redDims.end()) {
+        reductionInductionVars.push_back(
+            forallOp.getInductionVars()[nonZeroDimIdx]);
       }
       if (!isConstantIntValue(numThreads[i], 0)) {
         nonZeroDimIdx++;
       }
-      resultOffsetsRank.push_back(resultOffsets[offIdx++]);
-      resultSizesRank.push_back(resultSizes[sizeIdx++]);
+    }
+    if (hasReductionThreads) {
+      for (auto [parallelDims, redVar] :
+           llvm::zip(constantNewParallelDims, reductionInductionVars)) {
+        resultOffsetsRank[parallelDims] = redVar;
+        resultSizesRank[parallelDims] = b.getIndexAttr(1);
+      }
     }
     SmallVector<OpFoldResult> strides(resultSizesRank.size(),
                                       b.getIndexAttr(1));
@@ -1001,18 +1064,16 @@ tileAllUsingForall(RewriterBase &b, PartialReductionOpInterface op,
     b.create<tensor::ParallelInsertSliceOp>(
         loc, result, bbArg, resultOffsetsRank, resultSizesRank, strides);
   }
-
   // 7. Merge the partial reductions.
   Operation *mergeOp = nullptr;
   b.setInsertionPointAfter(forallOp);
   if (hasReductionThreads) {
-    Operation *mergeOp =
-        op.mergeReductions(b, loc, forallOp->getResults(), redDims);
+    Operation *mergeOp = op.mergeReductions(b, loc, forallOp->getResults(),
+                                            constantNewParallelDims);
     b.replaceOp(op, mergeOp->getResults());
   } else {
     b.replaceOp(op, forallOp->getResults());
   }
-
   // 8. Return.
   ForallReductionTilingResult results;
   results.initialOp = *identityTensor;
diff --git a/lib/gc/Transforms/Tiling.hpp b/lib/gc/Transforms/Tiling.hpp
@@ -44,11 +44,10 @@ FailureOr<linalg::ForallReductionTilingResult> tileReductionUsingForall(
     ArrayRef<OpFoldResult> threadNums, ArrayRef<OpFoldResult> tileSizes,
     ArrayRef<OpFoldResult> newParallelDims, std::optional<ArrayAttr> mapping);
 
-FailureOr<linalg::ForallReductionTilingResult>
-tileAllUsingForall(RewriterBase &b, PartialReductionOpInterface op,
-                   ArrayRef<OpFoldResult> numThreads,
-                   ArrayRef<OpFoldResult> tileSizes,
-                   std::optional<ArrayAttr> mapping);
+FailureOr<linalg::ForallReductionTilingResult> tileAllUsingForall(
+    RewriterBase &b, PartialReductionOpInterface op,
+    ArrayRef<OpFoldResult> numThreads, ArrayRef<OpFoldResult> tileSizes,
+    ArrayRef<OpFoldResult> newParallelDims, std::optional<ArrayAttr> mapping);
 
 } // namespace linalgX
 } // namespace mlir