Skip to content

Commit

Permalink
[AffineToStaticLogic] Flesh out support for converting to pipelines. (#…
Browse files Browse the repository at this point in the history
…2387)

This adds the necessary support to lower single affine loops to
pipelines. Broadly, this encompasses lowering affine structures like
if conditions into arithmetic, and building pipeline stages for each
scheduled group.
  • Loading branch information
mikeurbach authored Dec 24, 2021
1 parent 86b1a2a commit fba1ab7
Show file tree
Hide file tree
Showing 5 changed files with 327 additions and 20 deletions.
5 changes: 4 additions & 1 deletion include/circt/Conversion/Passes.td
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,10 @@ def AffineToStaticLogic : FunctionPass<"convert-affine-to-staticlogic"> {
let constructor = "circt::createAffineToStaticLogic()";
let dependentDialects = [
"circt::staticlogic::StaticLogicDialect",
"mlir::arith::ArithmeticDialect"
"mlir::arith::ArithmeticDialect",
"mlir::memref::MemRefDialect",
"mlir::scf::SCFDialect",
"mlir::StandardOpsDialect"
];
}

Expand Down
296 changes: 278 additions & 18 deletions lib/Conversion/AffineToStaticLogic/AffineToStaticLogic.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,26 +8,35 @@

#include "circt/Conversion/AffineToStaticLogic.h"
#include "../PassDetail.h"
#include "circt/Analysis/DependenceAnalysis.h"
#include "circt/Analysis/SchedulingAnalysis.h"
#include "circt/Dialect/StaticLogic/StaticLogic.h"
#include "circt/Scheduling/Algorithms.h"
#include "circt/Scheduling/Problems.h"
#include "mlir/Analysis/AffineAnalysis.h"
#include "mlir/Conversion/AffineToStandard/AffineToStandard.h"
#include "mlir/Dialect/Affine/IR/AffineMemoryOpInterfaces.h"
#include "mlir/Dialect/Affine/IR/AffineOps.h"
#include "mlir/Dialect/Arithmetic/IR/Arithmetic.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"
#include "mlir/Dialect/SCF/SCF.h"
#include "mlir/Dialect/StandardOps/IR/Ops.h"
#include "mlir/IR/BlockAndValueMapping.h"
#include "mlir/IR/Dominance.h"
#include "mlir/IR/ImplicitLocOpBuilder.h"
#include "mlir/Transforms/DialectConversion.h"
#include "mlir/Transforms/LoopUtils.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/TypeSwitch.h"
#include "llvm/Support/Debug.h"
#include <mlir/IR/BuiltinDialect.h>

#define DEBUG_TYPE "affine-to-staticlogic"

using namespace mlir;
using namespace mlir::arith;
using namespace mlir::memref;
using namespace mlir::scf;
using namespace circt;
using namespace circt::analysis;
using namespace circt::scheduling;
Expand All @@ -40,6 +49,8 @@ struct AffineToStaticLogic
void runOnFunction() override;

private:
LogicalResult
lowerAffineStructures(MemoryDependenceAnalysis &dependenceAnalysis);
LogicalResult populateOperatorTypes(SmallVectorImpl<AffineForOp> &loopNest);
LogicalResult solveSchedulingProblem(SmallVectorImpl<AffineForOp> &loopNest);
LogicalResult
Expand All @@ -51,6 +62,13 @@ struct AffineToStaticLogic
} // namespace

void AffineToStaticLogic::runOnFunction() {
// Get dependence analysis for the whole function.
auto dependenceAnalysis = getAnalysis<MemoryDependenceAnalysis>();

// After dependence analysis, materialize affine structures.
if (failed(lowerAffineStructures(dependenceAnalysis)))
return signalPassFailure();

// Get scheduling analysis for the whole function.
schedulingAnalysis = &getAnalysis<CyclicSchedulingAnalysis>();

Expand Down Expand Up @@ -78,6 +96,141 @@ void AffineToStaticLogic::runOnFunction() {
}
}

/// Apply the affine map from an 'affine.load' operation to its operands, and
/// feed the results to a newly created 'memref.load' operation (which replaces
/// the original 'affine.load').
/// Also replaces the affine load with the memref load in dependenceAnalysis.
/// TODO(mikeurbach): this is copied from AffineToStandard, see if we can reuse.
class AffineLoadLowering : public OpConversionPattern<AffineLoadOp> {
public:
AffineLoadLowering(MLIRContext *context,
MemoryDependenceAnalysis &dependenceAnalysis)
: OpConversionPattern(context), dependenceAnalysis(dependenceAnalysis) {}

LogicalResult
matchAndRewrite(AffineLoadOp op, OpAdaptor adaptor,
ConversionPatternRewriter &rewriter) const override {
// Expand affine map from 'affineLoadOp'.
SmallVector<Value, 8> indices(op.getMapOperands());
auto resultOperands =
expandAffineMap(rewriter, op.getLoc(), op.getAffineMap(), indices);
if (!resultOperands)
return failure();

// Build memref.load memref[expandedMap.results].
auto memrefLoad = rewriter.replaceOpWithNewOp<memref::LoadOp>(
op, op.getMemRef(), *resultOperands);

dependenceAnalysis.replaceOp(op, memrefLoad);

return success();
}

private:
MemoryDependenceAnalysis &dependenceAnalysis;
};

/// Apply the affine map from an 'affine.store' operation to its operands, and
/// feed the results to a newly created 'memref.store' operation (which replaces
/// the original 'affine.store').
/// Also replaces the affine store with the memref store in dependenceAnalysis.
/// TODO(mikeurbach): this is copied from AffineToStandard, see if we can reuse.
class AffineStoreLowering : public OpConversionPattern<AffineStoreOp> {
public:
AffineStoreLowering(MLIRContext *context,
MemoryDependenceAnalysis &dependenceAnalysis)
: OpConversionPattern(context), dependenceAnalysis(dependenceAnalysis) {}

LogicalResult
matchAndRewrite(AffineStoreOp op, OpAdaptor adaptor,
ConversionPatternRewriter &rewriter) const override {
// Expand affine map from 'affineStoreOp'.
SmallVector<Value, 8> indices(op.getMapOperands());
auto maybeExpandedMap =
expandAffineMap(rewriter, op.getLoc(), op.getAffineMap(), indices);
if (!maybeExpandedMap)
return failure();

// Build memref.store valueToStore, memref[expandedMap.results].
auto memrefStore = rewriter.replaceOpWithNewOp<memref::StoreOp>(
op, op.getValueToStore(), op.getMemRef(), *maybeExpandedMap);

dependenceAnalysis.replaceOp(op, memrefStore);

return success();
}

private:
MemoryDependenceAnalysis &dependenceAnalysis;
};

/// Helper to hoist computation out of scf::IfOp branches, turning it into a
/// mux-like operation, and exposing potentially concurrent execution of its
/// branches.
struct IfOpHoisting : OpConversionPattern<IfOp> {
using OpConversionPattern<IfOp>::OpConversionPattern;

LogicalResult
matchAndRewrite(IfOp op, OpAdaptor adaptor,
ConversionPatternRewriter &rewriter) const override {
rewriter.updateRootInPlace(op, [&]() {
if (!op.thenBlock()->without_terminator().empty()) {
rewriter.splitBlock(op.thenBlock(), --op.thenBlock()->end());
rewriter.mergeBlockBefore(&op.thenRegion().front(), op);
}
if (op.elseBlock() && !op.elseBlock()->without_terminator().empty()) {
rewriter.splitBlock(op.elseBlock(), --op.elseBlock()->end());
rewriter.mergeBlockBefore(&op.elseRegion().front(), op);
}
});

return success();
}
};

/// Helper to determine if an scf::IfOp is in mux-like form.
static bool ifOpLegalityCallback(IfOp op) {
return op.thenBlock()->without_terminator().empty() &&
(!op.elseBlock() || op.elseBlock()->without_terminator().empty());
}

/// Helper to mark AffineYieldOp legal, unless it is inside a partially
/// converted scf::IfOp.
static bool yieldOpLegalityCallback(AffineYieldOp op) {
return !op->getParentOfType<IfOp>();
}

/// After analyzing memory dependences, and before creating the schedule, we
/// want to materialize affine operations with arithmetic, scf, and memref
/// operations, which make the condition computation of addresses, etc.
/// explicit. This is important so the schedule can consider potentially complex
/// computations in the condition of ifs, or the addresses of loads and stores.
/// The dependence analysis will be updated so the dependences from the affine
/// loads and stores are now on the memref loads and stores.
LogicalResult AffineToStaticLogic::lowerAffineStructures(
MemoryDependenceAnalysis &dependenceAnalysis) {
auto *context = &getContext();
auto op = getOperation();

ConversionTarget target(*context);
target.addLegalDialect<AffineDialect, ArithmeticDialect, MemRefDialect,
SCFDialect>();
target.addIllegalOp<AffineIfOp, AffineLoadOp, AffineStoreOp>();
target.addDynamicallyLegalOp<IfOp>(ifOpLegalityCallback);
target.addDynamicallyLegalOp<AffineYieldOp>(yieldOpLegalityCallback);

RewritePatternSet patterns(context);
populateAffineToStdConversionPatterns(patterns);
patterns.add<AffineLoadLowering>(context, dependenceAnalysis);
patterns.add<AffineStoreLowering>(context, dependenceAnalysis);
patterns.add<IfOpHoisting>(context);

if (failed(applyPartialConversion(op, target, std::move(patterns))))
return failure();

return success();
}

/// Populate the schedling problem operator types for the dialect we are
/// targetting. Right now, we assume Calyx, which has a standard library with
/// well-defined operator latencies. Ultimately, we should move this to a
Expand All @@ -103,13 +256,13 @@ LogicalResult AffineToStaticLogic::populateOperatorTypes(
Operation *unsupported;
WalkResult result = forOp.getBody()->walk([&](Operation *op) {
return TypeSwitch<Operation *, WalkResult>(op)
.Case<AddIOp, AffineIfOp, AffineYieldOp, mlir::ConstantOp, IndexCastOp,
memref::AllocaOp>([&](Operation *combOp) {
.Case<AddIOp, IfOp, AffineYieldOp, arith::ConstantOp, CmpIOp,
IndexCastOp, memref::AllocaOp, YieldOp>([&](Operation *combOp) {
// Some known combinational ops.
problem.setLinkedOperatorType(combOp, combOpr);
return WalkResult::advance();
})
.Case<AffineReadOpInterface, AffineWriteOpInterface>(
.Case<AffineLoadOp, AffineStoreOp, memref::LoadOp, memref::StoreOp>(
[&](Operation *seqOp) {
// Some known sequential ops. In certain cases, reads may be
// combinational in Calyx, but taking advantage of that is left as
Expand Down Expand Up @@ -164,6 +317,10 @@ LogicalResult AffineToStaticLogic::solveSchedulingProblem(
if (failed(scheduleSimplex(problem, anchor)))
return failure();

// Verify the solution.
if (failed(problem.verify()))
return failure();

// Optionally debug problem outputs.
LLVM_DEBUG({
llvm::dbgs() << "Scheduled initiation interval = "
Expand Down Expand Up @@ -194,13 +351,13 @@ LogicalResult AffineToStaticLogic::createStaticLogicPipeline(
// Create constants for the loop's lower and upper bounds.
int64_t lbValue = innerLoop.getConstantLowerBound();
auto lowerBound = builder.create<arith::ConstantOp>(
IntegerAttr::get(builder.getI64Type(), lbValue));
IntegerAttr::get(builder.getIndexType(), lbValue));
int64_t ubValue = innerLoop.getConstantUpperBound();
auto upperBound = builder.create<arith::ConstantOp>(
IntegerAttr::get(builder.getI64Type(), ubValue));
IntegerAttr::get(builder.getIndexType(), ubValue));
int64_t stepValue = innerLoop.getStep();
auto step = builder.create<arith::ConstantOp>(
IntegerAttr::get(builder.getI64Type(), stepValue));
IntegerAttr::get(builder.getIndexType(), stepValue));

// Create the pipeline op, with the same result types as the inner loop. An
// iter arg is created for the induction variable.
Expand All @@ -225,26 +382,129 @@ LogicalResult AffineToStaticLogic::createStaticLogicPipeline(
upperBound);
condBlock.getTerminator()->insertOperands(0, {cmpResult});

// Create the first stage.
// Add the non-yield operations to their start time groups.
DenseMap<unsigned, SmallVector<Operation *>> startGroups;
for (auto *op : problem.getOperations()) {
if (isa<AffineYieldOp, YieldOp>(op))
continue;
auto startTime = problem.getStartTime(op);
startGroups[*startTime].push_back(op);
}

// Maintain mappings of values in the loop body and results of stages,
// initially populated with the iter args.
BlockAndValueMapping valueMap;
for (size_t i = 0; i < iterArgs.size(); ++i)
valueMap.map(forOp.getBody()->getArgument(i),
pipeline.getStagesBlock().getArgument(i));

// Create the stages.
Block &stagesBlock = pipeline.getStagesBlock();
builder.setInsertionPointToStart(&stagesBlock);
auto stage = builder.create<PipelineStageOp>(lowerBound.getType());
auto &stageBlock = stage.getBodyBlock();
builder.setInsertionPointToStart(&stageBlock);

// Add the induction variable increment to the first stage.
auto incResult =
builder.create<arith::AddIOp>(stagesBlock.getArgument(0), step);
stageBlock.getTerminator()->insertOperands(0, {incResult});
// Iterate in order of the start times.
SmallVector<unsigned> startTimes;
for (auto group : startGroups)
startTimes.push_back(group.first);
llvm::sort(startTimes);

DominanceInfo dom(getOperation());
for (auto startTime : startTimes) {
auto group = startGroups[startTime];
OpBuilder::InsertionGuard g(builder);

// Collect the return types for this stage. Operations whose results are not
// used within this stage are returned.
auto isLoopTerminator = [forOp](Operation *op) {
return isa<AffineYieldOp>(op) && op->getParentOp() == forOp;
};
SmallVector<Type> stageTypes;
DenseSet<Operation *> opsWithReturns;
for (auto *op : group) {
for (auto *user : op->getUsers()) {
if (*problem.getStartTime(user) > startTime || isLoopTerminator(user)) {
opsWithReturns.insert(op);
stageTypes.append(op->getResultTypes().begin(),
op->getResultTypes().end());
}
}
}

// Add the induction variable increment in the first stage.
if (startTime == 0)
stageTypes.push_back(lowerBound.getType());

// Create the stage itself.
auto stage = builder.create<PipelineStageOp>(stageTypes);
auto &stageBlock = stage.getBodyBlock();
auto *stageTerminator = stageBlock.getTerminator();
builder.setInsertionPointToStart(&stageBlock);

// Sort the group according to original dominance.
llvm::sort(group,
[&](Operation *a, Operation *b) { return dom.dominates(a, b); });

// Move over the operations and add their results to the terminator.
SmallVector<std::tuple<Operation *, Operation *, unsigned>> movedOps;
for (auto *op : group) {
unsigned resultIndex = stageTerminator->getNumOperands();
auto *newOp = builder.clone(*op, valueMap);
if (opsWithReturns.contains(op)) {
stageTerminator->insertOperands(resultIndex, newOp->getResults());
movedOps.emplace_back(op, newOp, resultIndex);
}
}

// Add the stage results to the value map for the original op.
for (auto tuple : movedOps) {
Operation *op = std::get<0>(tuple);
Operation *newOp = std::get<1>(tuple);
unsigned resultIndex = std::get<2>(tuple);
for (size_t i = 0; i < newOp->getNumResults(); ++i) {
auto newValue = stage->getResult(resultIndex + i);
auto oldValue = op->getResult(i);
valueMap.map(oldValue, newValue);
}
}

// Add the induction variable increment to the first stage.
if (startTime == 0) {
auto incResult =
builder.create<arith::AddIOp>(stagesBlock.getArgument(0), step);
stageTerminator->insertOperands(stageTerminator->getNumOperands(),
incResult->getResults());
}
}

// Add the induction variable result to the terminator iter args.
// Add the iter args and results to the terminator.
auto stagesTerminator =
cast<PipelineTerminatorOp>(stagesBlock.getTerminator());
stagesTerminator.iter_argsMutable().append({stage.getResult(0)});

// Collect iter args and results from the induction variable increment and any
// mapped values that were originally yielded.
SmallVector<Value> termIterArgs;
SmallVector<Value> termResults;
termIterArgs.push_back(
stagesBlock.front().getResult(stagesBlock.front().getNumResults() - 1));
for (auto value : forOp.getBody()->getTerminator()->getOperands()) {
termIterArgs.push_back(valueMap.lookup(value));
termResults.push_back(valueMap.lookup(value));
}

stagesTerminator.iter_argsMutable().append(termIterArgs);
stagesTerminator.resultsMutable().append(termResults);

// Replace loop results with pipeline results.
for (size_t i = 0; i < forOp.getNumResults(); ++i)
forOp.getResult(i).replaceAllUsesWith(pipeline.getResult(i));

// Remove the loop nest from the IR.
for (auto loop : llvm::reverse(loopNest))
loop.erase();
loopNest.front().walk([](Operation *op) {
op->dropAllUses();
op->dropAllDefinedValueUses();
op->dropAllReferences();
op->erase();
});

return success();
}
Expand Down
Loading

0 comments on commit fba1ab7

Please sign in to comment.