[AffineToStaticLogic] Flesh out support for converting to pipelines. (#…

…2387) This adds the necessary support to lower single affine loops to pipelines. Broadly, this encompasses lowering affine structures like if conditions into arithmetic, and building pipeline stages for each scheduled group.
llvm · Dec 24, 2021 · fba1ab7 · fba1ab7
1 parent 86b1a2a
commit fba1ab7
Show file tree

Hide file tree

Showing 5 changed files with 327 additions and 20 deletions.
diff --git a/include/circt/Conversion/Passes.td b/include/circt/Conversion/Passes.td
@@ -29,7 +29,10 @@ def AffineToStaticLogic : FunctionPass<"convert-affine-to-staticlogic"> {
   let constructor = "circt::createAffineToStaticLogic()";
   let dependentDialects = [
     "circt::staticlogic::StaticLogicDialect",
-    "mlir::arith::ArithmeticDialect"
+    "mlir::arith::ArithmeticDialect",
+    "mlir::memref::MemRefDialect",
+    "mlir::scf::SCFDialect",
+    "mlir::StandardOpsDialect"
   ];
 }
 

diff --git a/lib/Conversion/AffineToStaticLogic/AffineToStaticLogic.cpp b/lib/Conversion/AffineToStaticLogic/AffineToStaticLogic.cpp
@@ -8,26 +8,35 @@
 
 #include "circt/Conversion/AffineToStaticLogic.h"
 #include "../PassDetail.h"
+#include "circt/Analysis/DependenceAnalysis.h"
 #include "circt/Analysis/SchedulingAnalysis.h"
 #include "circt/Dialect/StaticLogic/StaticLogic.h"
 #include "circt/Scheduling/Algorithms.h"
 #include "circt/Scheduling/Problems.h"
 #include "mlir/Analysis/AffineAnalysis.h"
+#include "mlir/Conversion/AffineToStandard/AffineToStandard.h"
 #include "mlir/Dialect/Affine/IR/AffineMemoryOpInterfaces.h"
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
 #include "mlir/Dialect/Arithmetic/IR/Arithmetic.h"
 #include "mlir/Dialect/MemRef/IR/MemRef.h"
+#include "mlir/Dialect/SCF/SCF.h"
 #include "mlir/Dialect/StandardOps/IR/Ops.h"
+#include "mlir/IR/BlockAndValueMapping.h"
+#include "mlir/IR/Dominance.h"
 #include "mlir/IR/ImplicitLocOpBuilder.h"
+#include "mlir/Transforms/DialectConversion.h"
 #include "mlir/Transforms/LoopUtils.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/TypeSwitch.h"
 #include "llvm/Support/Debug.h"
+#include <mlir/IR/BuiltinDialect.h>
 
 #define DEBUG_TYPE "affine-to-staticlogic"
 
 using namespace mlir;
 using namespace mlir::arith;
+using namespace mlir::memref;
+using namespace mlir::scf;
 using namespace circt;
 using namespace circt::analysis;
 using namespace circt::scheduling;
@@ -40,6 +49,8 @@ struct AffineToStaticLogic
   void runOnFunction() override;
 
 private:
+  LogicalResult
+  lowerAffineStructures(MemoryDependenceAnalysis &dependenceAnalysis);
   LogicalResult populateOperatorTypes(SmallVectorImpl<AffineForOp> &loopNest);
   LogicalResult solveSchedulingProblem(SmallVectorImpl<AffineForOp> &loopNest);
   LogicalResult
@@ -51,6 +62,13 @@ struct AffineToStaticLogic
 } // namespace
 
 void AffineToStaticLogic::runOnFunction() {
+  // Get dependence analysis for the whole function.
+  auto dependenceAnalysis = getAnalysis<MemoryDependenceAnalysis>();
+
+  // After dependence analysis, materialize affine structures.
+  if (failed(lowerAffineStructures(dependenceAnalysis)))
+    return signalPassFailure();
+
   // Get scheduling analysis for the whole function.
   schedulingAnalysis = &getAnalysis<CyclicSchedulingAnalysis>();
 
@@ -78,6 +96,141 @@ void AffineToStaticLogic::runOnFunction() {
   }
 }
 
+/// Apply the affine map from an 'affine.load' operation to its operands, and
+/// feed the results to a newly created 'memref.load' operation (which replaces
+/// the original 'affine.load').
+/// Also replaces the affine load with the memref load in dependenceAnalysis.
+/// TODO(mikeurbach): this is copied from AffineToStandard, see if we can reuse.
+class AffineLoadLowering : public OpConversionPattern<AffineLoadOp> {
+public:
+  AffineLoadLowering(MLIRContext *context,
+                     MemoryDependenceAnalysis &dependenceAnalysis)
+      : OpConversionPattern(context), dependenceAnalysis(dependenceAnalysis) {}
+
+  LogicalResult
+  matchAndRewrite(AffineLoadOp op, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    // Expand affine map from 'affineLoadOp'.
+    SmallVector<Value, 8> indices(op.getMapOperands());
+    auto resultOperands =
+        expandAffineMap(rewriter, op.getLoc(), op.getAffineMap(), indices);
+    if (!resultOperands)
+      return failure();
+
+    // Build memref.load memref[expandedMap.results].
+    auto memrefLoad = rewriter.replaceOpWithNewOp<memref::LoadOp>(
+        op, op.getMemRef(), *resultOperands);
+
+    dependenceAnalysis.replaceOp(op, memrefLoad);
+
+    return success();
+  }
+
+private:
+  MemoryDependenceAnalysis &dependenceAnalysis;
+};
+
+/// Apply the affine map from an 'affine.store' operation to its operands, and
+/// feed the results to a newly created 'memref.store' operation (which replaces
+/// the original 'affine.store').
+/// Also replaces the affine store with the memref store in dependenceAnalysis.
+/// TODO(mikeurbach): this is copied from AffineToStandard, see if we can reuse.
+class AffineStoreLowering : public OpConversionPattern<AffineStoreOp> {
+public:
+  AffineStoreLowering(MLIRContext *context,
+                      MemoryDependenceAnalysis &dependenceAnalysis)
+      : OpConversionPattern(context), dependenceAnalysis(dependenceAnalysis) {}
+
+  LogicalResult
+  matchAndRewrite(AffineStoreOp op, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    // Expand affine map from 'affineStoreOp'.
+    SmallVector<Value, 8> indices(op.getMapOperands());
+    auto maybeExpandedMap =
+        expandAffineMap(rewriter, op.getLoc(), op.getAffineMap(), indices);
+    if (!maybeExpandedMap)
+      return failure();
+
+    // Build memref.store valueToStore, memref[expandedMap.results].
+    auto memrefStore = rewriter.replaceOpWithNewOp<memref::StoreOp>(
+        op, op.getValueToStore(), op.getMemRef(), *maybeExpandedMap);
+
+    dependenceAnalysis.replaceOp(op, memrefStore);
+
+    return success();
+  }
+
+private:
+  MemoryDependenceAnalysis &dependenceAnalysis;
+};
+
+/// Helper to hoist computation out of scf::IfOp branches, turning it into a
+/// mux-like operation, and exposing potentially concurrent execution of its
+/// branches.
+struct IfOpHoisting : OpConversionPattern<IfOp> {
+  using OpConversionPattern<IfOp>::OpConversionPattern;
+
+  LogicalResult
+  matchAndRewrite(IfOp op, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    rewriter.updateRootInPlace(op, [&]() {
+      if (!op.thenBlock()->without_terminator().empty()) {
+        rewriter.splitBlock(op.thenBlock(), --op.thenBlock()->end());
+        rewriter.mergeBlockBefore(&op.thenRegion().front(), op);
+      }
+      if (op.elseBlock() && !op.elseBlock()->without_terminator().empty()) {
+        rewriter.splitBlock(op.elseBlock(), --op.elseBlock()->end());
+        rewriter.mergeBlockBefore(&op.elseRegion().front(), op);
+      }
+    });
+
+    return success();
+  }
+};
+
+/// Helper to determine if an scf::IfOp is in mux-like form.
+static bool ifOpLegalityCallback(IfOp op) {
+  return op.thenBlock()->without_terminator().empty() &&
+         (!op.elseBlock() || op.elseBlock()->without_terminator().empty());
+}
+
+/// Helper to mark AffineYieldOp legal, unless it is inside a partially
+/// converted scf::IfOp.
+static bool yieldOpLegalityCallback(AffineYieldOp op) {
+  return !op->getParentOfType<IfOp>();
+}
+
+/// After analyzing memory dependences, and before creating the schedule, we
+/// want to materialize affine operations with arithmetic, scf, and memref
+/// operations, which make the condition computation of addresses, etc.
+/// explicit. This is important so the schedule can consider potentially complex
+/// computations in the condition of ifs, or the addresses of loads and stores.
+/// The dependence analysis will be updated so the dependences from the affine
+/// loads and stores are now on the memref loads and stores.
+LogicalResult AffineToStaticLogic::lowerAffineStructures(
+    MemoryDependenceAnalysis &dependenceAnalysis) {
+  auto *context = &getContext();
+  auto op = getOperation();
+
+  ConversionTarget target(*context);
+  target.addLegalDialect<AffineDialect, ArithmeticDialect, MemRefDialect,
+                         SCFDialect>();
+  target.addIllegalOp<AffineIfOp, AffineLoadOp, AffineStoreOp>();
+  target.addDynamicallyLegalOp<IfOp>(ifOpLegalityCallback);
+  target.addDynamicallyLegalOp<AffineYieldOp>(yieldOpLegalityCallback);
+
+  RewritePatternSet patterns(context);
+  populateAffineToStdConversionPatterns(patterns);
+  patterns.add<AffineLoadLowering>(context, dependenceAnalysis);
+  patterns.add<AffineStoreLowering>(context, dependenceAnalysis);
+  patterns.add<IfOpHoisting>(context);
+
+  if (failed(applyPartialConversion(op, target, std::move(patterns))))
+    return failure();
+
+  return success();
+}
+
 /// Populate the schedling problem operator types for the dialect we are
 /// targetting. Right now, we assume Calyx, which has a standard library with
 /// well-defined operator latencies. Ultimately, we should move this to a
@@ -103,13 +256,13 @@ LogicalResult AffineToStaticLogic::populateOperatorTypes(
   Operation *unsupported;
   WalkResult result = forOp.getBody()->walk([&](Operation *op) {
     return TypeSwitch<Operation *, WalkResult>(op)
-        .Case<AddIOp, AffineIfOp, AffineYieldOp, mlir::ConstantOp, IndexCastOp,
-              memref::AllocaOp>([&](Operation *combOp) {
+        .Case<AddIOp, IfOp, AffineYieldOp, arith::ConstantOp, CmpIOp,
+              IndexCastOp, memref::AllocaOp, YieldOp>([&](Operation *combOp) {
           // Some known combinational ops.
           problem.setLinkedOperatorType(combOp, combOpr);
           return WalkResult::advance();
         })
-        .Case<AffineReadOpInterface, AffineWriteOpInterface>(
+        .Case<AffineLoadOp, AffineStoreOp, memref::LoadOp, memref::StoreOp>(
             [&](Operation *seqOp) {
               // Some known sequential ops. In certain cases, reads may be
               // combinational in Calyx, but taking advantage of that is left as
@@ -164,6 +317,10 @@ LogicalResult AffineToStaticLogic::solveSchedulingProblem(
   if (failed(scheduleSimplex(problem, anchor)))
     return failure();
 
+  // Verify the solution.
+  if (failed(problem.verify()))
+    return failure();
+
   // Optionally debug problem outputs.
   LLVM_DEBUG({
     llvm::dbgs() << "Scheduled initiation interval = "
@@ -194,13 +351,13 @@ LogicalResult AffineToStaticLogic::createStaticLogicPipeline(
   // Create constants for the loop's lower and upper bounds.
   int64_t lbValue = innerLoop.getConstantLowerBound();
   auto lowerBound = builder.create<arith::ConstantOp>(
-      IntegerAttr::get(builder.getI64Type(), lbValue));
+      IntegerAttr::get(builder.getIndexType(), lbValue));
   int64_t ubValue = innerLoop.getConstantUpperBound();
   auto upperBound = builder.create<arith::ConstantOp>(
-      IntegerAttr::get(builder.getI64Type(), ubValue));
+      IntegerAttr::get(builder.getIndexType(), ubValue));
   int64_t stepValue = innerLoop.getStep();
   auto step = builder.create<arith::ConstantOp>(
-      IntegerAttr::get(builder.getI64Type(), stepValue));
+      IntegerAttr::get(builder.getIndexType(), stepValue));
 
   // Create the pipeline op, with the same result types as the inner loop. An
   // iter arg is created for the induction variable.
@@ -225,26 +382,129 @@ LogicalResult AffineToStaticLogic::createStaticLogicPipeline(
       upperBound);
   condBlock.getTerminator()->insertOperands(0, {cmpResult});
 
-  // Create the first stage.
+  // Add the non-yield operations to their start time groups.
+  DenseMap<unsigned, SmallVector<Operation *>> startGroups;
+  for (auto *op : problem.getOperations()) {
+    if (isa<AffineYieldOp, YieldOp>(op))
+      continue;
+    auto startTime = problem.getStartTime(op);
+    startGroups[*startTime].push_back(op);
+  }
+
+  // Maintain mappings of values in the loop body and results of stages,
+  // initially populated with the iter args.
+  BlockAndValueMapping valueMap;
+  for (size_t i = 0; i < iterArgs.size(); ++i)
+    valueMap.map(forOp.getBody()->getArgument(i),
+                 pipeline.getStagesBlock().getArgument(i));
+
+  // Create the stages.
   Block &stagesBlock = pipeline.getStagesBlock();
   builder.setInsertionPointToStart(&stagesBlock);
-  auto stage = builder.create<PipelineStageOp>(lowerBound.getType());
-  auto &stageBlock = stage.getBodyBlock();
-  builder.setInsertionPointToStart(&stageBlock);
 
-  // Add the induction variable increment to the first stage.
-  auto incResult =
-      builder.create<arith::AddIOp>(stagesBlock.getArgument(0), step);
-  stageBlock.getTerminator()->insertOperands(0, {incResult});
+  // Iterate in order of the start times.
+  SmallVector<unsigned> startTimes;
+  for (auto group : startGroups)
+    startTimes.push_back(group.first);
+  llvm::sort(startTimes);
+
+  DominanceInfo dom(getOperation());
+  for (auto startTime : startTimes) {
+    auto group = startGroups[startTime];
+    OpBuilder::InsertionGuard g(builder);
+
+    // Collect the return types for this stage. Operations whose results are not
+    // used within this stage are returned.
+    auto isLoopTerminator = [forOp](Operation *op) {
+      return isa<AffineYieldOp>(op) && op->getParentOp() == forOp;
+    };
+    SmallVector<Type> stageTypes;
+    DenseSet<Operation *> opsWithReturns;
+    for (auto *op : group) {
+      for (auto *user : op->getUsers()) {
+        if (*problem.getStartTime(user) > startTime || isLoopTerminator(user)) {
+          opsWithReturns.insert(op);
+          stageTypes.append(op->getResultTypes().begin(),
+                            op->getResultTypes().end());
+        }
+      }
+    }
+
+    // Add the induction variable increment in the first stage.
+    if (startTime == 0)
+      stageTypes.push_back(lowerBound.getType());
+
+    // Create the stage itself.
+    auto stage = builder.create<PipelineStageOp>(stageTypes);
+    auto &stageBlock = stage.getBodyBlock();
+    auto *stageTerminator = stageBlock.getTerminator();
+    builder.setInsertionPointToStart(&stageBlock);
+
+    // Sort the group according to original dominance.
+    llvm::sort(group,
+               [&](Operation *a, Operation *b) { return dom.dominates(a, b); });
+
+    // Move over the operations and add their results to the terminator.
+    SmallVector<std::tuple<Operation *, Operation *, unsigned>> movedOps;
+    for (auto *op : group) {
+      unsigned resultIndex = stageTerminator->getNumOperands();
+      auto *newOp = builder.clone(*op, valueMap);
+      if (opsWithReturns.contains(op)) {
+        stageTerminator->insertOperands(resultIndex, newOp->getResults());
+        movedOps.emplace_back(op, newOp, resultIndex);
+      }
+    }
+
+    // Add the stage results to the value map for the original op.
+    for (auto tuple : movedOps) {
+      Operation *op = std::get<0>(tuple);
+      Operation *newOp = std::get<1>(tuple);
+      unsigned resultIndex = std::get<2>(tuple);
+      for (size_t i = 0; i < newOp->getNumResults(); ++i) {
+        auto newValue = stage->getResult(resultIndex + i);
+        auto oldValue = op->getResult(i);
+        valueMap.map(oldValue, newValue);
+      }
+    }
+
+    // Add the induction variable increment to the first stage.
+    if (startTime == 0) {
+      auto incResult =
+          builder.create<arith::AddIOp>(stagesBlock.getArgument(0), step);
+      stageTerminator->insertOperands(stageTerminator->getNumOperands(),
+                                      incResult->getResults());
+    }
+  }
 
-  // Add the induction variable result to the terminator iter args.
+  // Add the iter args and results to the terminator.
   auto stagesTerminator =
       cast<PipelineTerminatorOp>(stagesBlock.getTerminator());
-  stagesTerminator.iter_argsMutable().append({stage.getResult(0)});
+
+  // Collect iter args and results from the induction variable increment and any
+  // mapped values that were originally yielded.
+  SmallVector<Value> termIterArgs;
+  SmallVector<Value> termResults;
+  termIterArgs.push_back(
+      stagesBlock.front().getResult(stagesBlock.front().getNumResults() - 1));
+  for (auto value : forOp.getBody()->getTerminator()->getOperands()) {
+    termIterArgs.push_back(valueMap.lookup(value));
+    termResults.push_back(valueMap.lookup(value));
+  }
+
+  stagesTerminator.iter_argsMutable().append(termIterArgs);
+  stagesTerminator.resultsMutable().append(termResults);
+
+  // Replace loop results with pipeline results.
+  for (size_t i = 0; i < forOp.getNumResults(); ++i)
+    forOp.getResult(i).replaceAllUsesWith(pipeline.getResult(i));
 
   // Remove the loop nest from the IR.
-  for (auto loop : llvm::reverse(loopNest))
-    loop.erase();
+  loopNest.front().walk([](Operation *op) {
+    op->dropAllUses();
+    op->dropAllDefinedValueUses();
+    op->dropAllReferences();
+    op->erase();
+  });
 
   return success();
 }