
Commit 2f37716 (parent: 41248c8)

[feature] support batch_matmul

File tree: 13 files changed, +235 -9 lines


include/soda/Conversion/KernelsToSODA/LinalgToCGRA.h (+4)

@@ -11,6 +11,7 @@ struct LogicalResult;
 namespace linalg {
 // class DotOp;
 class MatmulOp;
+class BatchMatmulOp;
 class Conv2DOp;
 class GenericOp;
 // TODO: add more ops
@@ -22,6 +23,9 @@ class GenericOp;
 /// Convert linalg Matmul op into CGRA.
 LogicalResult convertLinalgMatmulToCGRALaunch(linalg::MatmulOp matmulOp);
 
+/// Convert linalg BatchMatmul op into CGRA.
+LogicalResult convertLinalgBatchMatmulToCGRALaunch(linalg::BatchMatmulOp batchMatmulOp);
+
 /// Convert linalg Conv op into CGRA.
 LogicalResult convertLinalgConvToCGRALaunch(linalg::Conv2DOp convOp);

include/soda/Conversion/KernelsToSODA/LinalgToCGRAPass.h (+1)

@@ -18,6 +18,7 @@ class Pass;
 /// Create a pass that converts linalg ops into soda launch ops.
 // std::unique_ptr<OperationPass<func::FuncOp>> createLinalgDotToSODAPass();
 std::unique_ptr<OperationPass<func::FuncOp>> createLinalgMatmulToCGRAPass();
+std::unique_ptr<OperationPass<func::FuncOp>> createLinalgBatchMatmulToCGRAPass();
 std::unique_ptr<OperationPass<func::FuncOp>> createLinalgConvToCGRAPass();
 std::unique_ptr<OperationPass<func::FuncOp>> createLinalgGenericToCGRAPass();

include/soda/Conversion/Passes.td (+6)

@@ -87,6 +87,12 @@ def ConvertLinalgMatmulToCGRA : Pass<"convert-linalg-matmul-to-cgra", "func::FuncOp"> {
   let dependentDialects = ["soda::SODADialect"];
 }
 
+def ConvertLinalgBatchMatmulToCGRA : Pass<"convert-linalg-batch_matmul-to-cgra", "func::FuncOp"> {
+  let summary = "Offload (nested) linalg::batch_matmul Ops for CGRA acceleration";
+  let constructor = "mlir::createLinalgBatchMatmulToCGRAPass()";
+  let dependentDialects = ["soda::SODADialect"];
+}
+
 def ConvertLinalgConvToCGRA : Pass<"convert-linalg-conv-to-cgra", "func::FuncOp"> {
   let summary = "Offload (nested) linalg::conv Ops for CGRA acceleration";
   let constructor = "mlir::createLinalgConvToCGRAPass()";

include/soda/Dialect/SODA/SODAOps.td (+18)

@@ -453,6 +453,24 @@ def SODA_MatmulOp : SODA_Op<"cgra.matmul", [NoSideEffect]>,
   // let hasVerifier = 1;
 }
 
+def SODA_BatchMatmulOp : SODA_Op<"cgra.batch_matmul", [NoSideEffect]>,
+    Arguments<(ins Variadic<AnyType>:$operands)>, Results<(outs)> {
+  let summary = "CGRA BatchMatmul operation.";
+  let description = [{
+    A soda operation `cgra.batch_matmul` to replace `linalg.batch_matmul`. The operands
+    and output are the same.
+  }];
+
+  let builders = [OpBuilder<(ins), [{ // empty}]>];
+
+  let arguments = (ins Variadic<AnyType>:$operands);
+  // let arguments = (ins AnyType:$operandA, AnyType:$operandB, AnyType:$operandC);
+
+  let assemblyFormat = "attr-dict ($operands^ `:` type($operands))?";
+
+  // let hasVerifier = 1;
+}
+
 def SODA_FusionOp : SODA_Op<"cgra.fusion", [NoSideEffect]>,
     Arguments<(ins Variadic<AnyType>:$operands)>, Results<(outs)> {
   let summary = "CGRA fused operation.";

include/soda/Dialect/SODA/Utils.h (+1)

@@ -25,6 +25,7 @@ namespace soda {
 class SODAFuncOp;
 class LaunchOp;
 class MatmulOp;
+class BatchMatmulOp;
 } // namespace soda
 
 /// Get a soda.func created from outlining the region of a soda.launch op with the

lib/Conversion/KernelsToSODA/LinalgToCGRA.cpp (+37)

@@ -28,6 +28,8 @@ struct LinalgToCGRAConverter {
   template <class T>
   void createMatmulLaunch(T rootOp);
   template <class T>
+  void createBatchMatmulLaunch(T rootOp);
+  template <class T>
   void createGenericLaunch(T rootOp);
 };
 
@@ -84,6 +86,29 @@ void LinalgToCGRAConverter::createMatmulLaunch(T rootLinalgOp) {
   }
 }
 
+/// Add a CGRA launch operation around the "linalg.batch_matmul" op.
+template <class T>
+void LinalgToCGRAConverter::createBatchMatmulLaunch(T rootLinalgOp) {
+  OpBuilder builder(rootLinalgOp.getOperation());
+
+  if (dyn_cast<linalg::BatchMatmulOp>(&rootLinalgOp) != nullptr) {
+
+    // Create a launch op and move target op into the region
+    Location loc = rootLinalgOp.getLoc();
+    auto launchOp = builder.create<soda::LaunchOp>(loc);
+    builder.setInsertionPointToEnd(&launchOp.body().front());
+    builder.create<soda::TerminatorOp>(loc);
+    builder.setInsertionPointToStart(&launchOp.body().front());
+
+    Operation *newOp = builder.create<soda::BatchMatmulOp>(loc, rootLinalgOp->getOperands());
+
+    auto results = newOp->getResults();
+    rootLinalgOp->replaceAllUsesWith(results);
+    rootLinalgOp->erase();
+  }
+}
+
 /// Add a CGRA launch operation around the "linalg.generic" op.
 template <class T>
 void LinalgToCGRAConverter::createGenericLaunch(T rootLinalgOp) {
@@ -150,6 +175,18 @@ LogicalResult mlir::convertLinalgMatmulToCGRALaunch(linalg::MatmulOp op) {
   return ::convertLinalgMatmulToCGRALaunch(op);
 }
 
+static LogicalResult convertLinalgBatchMatmulToCGRALaunch(linalg::BatchMatmulOp op) {
+  LinalgToCGRAConverter converter;
+  converter.createBatchMatmulLaunch(op);
+  return success();
+}
+
+LogicalResult mlir::convertLinalgBatchMatmulToCGRALaunch(linalg::BatchMatmulOp op) {
+  return ::convertLinalgBatchMatmulToCGRALaunch(op);
+}
+
 static LogicalResult convertLinalgConvToCGRALaunch(linalg::Conv2DOp op) {
 
   LinalgToCGRAConverter converter;
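Note: to make the effect of createBatchMatmulLaunch concrete, the linalg op is replaced in place by a launch region that holds the CGRA op and a terminator. A hand-written before/after sketch on bufferized IR (value names, shapes, and the exact printed spelling of the launch and terminator ops are assumptions):

  // before (illustrative)
  linalg.batch_matmul ins(%A, %B : memref<2x4x8xf32>, memref<2x8x16xf32>)
                      outs(%C : memref<2x4x16xf32>)

  // after (illustrative)
  soda.launch {
    soda.cgra.batch_matmul %A, %B, %C : memref<2x4x8xf32>, memref<2x8x16xf32>, memref<2x4x16xf32>
    soda.terminator
  }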

lib/Conversion/KernelsToSODA/LinalgToCGRAPass.cpp (+38 -5)

@@ -44,8 +44,7 @@ namespace {
 // };
 
 // A pass that traverses top-level matmuls in the function and converts them to
-// SODA launch operations. Nested launches are not allowed, so this does not
-// walk the function recursively to avoid considering nested matmuls.
+// CGRA launch operations.
 struct LinalgMatmulMapper : public ConvertLinalgMatmulToCGRABase<LinalgMatmulMapper> {
   LinalgMatmulMapper() = default;
 
@@ -73,8 +72,38 @@ struct LinalgMatmulMapper : public ConvertLinalgMatmulToCGRABase<LinalgMatmulMapper> {
   }
 };
 
+// A pass that traverses top-level batch_matmuls in the function and converts them to
+// CGRA launch operations.
+struct LinalgBatchMatmulMapper : public ConvertLinalgBatchMatmulToCGRABase<LinalgBatchMatmulMapper> {
+  LinalgBatchMatmulMapper() = default;
+
+  void runOnInnerOp(scf::ForOp &forOp) {
+    for (Operation &innerOp : llvm::make_early_inc_range(forOp.getBody()->getOperations())) {
+      if (auto innerMatmulOp = dyn_cast<linalg::BatchMatmulOp>(&innerOp)) {
+        if (failed(convertLinalgBatchMatmulToCGRALaunch(innerMatmulOp))) {
+          signalPassFailure();
+        }
+      } else if (auto forOp = dyn_cast<scf::ForOp>(&innerOp)) {
+        runOnInnerOp(forOp);
+      }
+    }
+  }
+
+  void runOnOperation() override {
+    for (Operation &op : llvm::make_early_inc_range(getOperation().getOps())) {
+      if (auto matmulOp = dyn_cast<linalg::BatchMatmulOp>(&op)) {
+        if (failed(convertLinalgBatchMatmulToCGRALaunch(matmulOp)))
+          signalPassFailure();
+      } else if (auto forOp = dyn_cast<scf::ForOp>(&op)) {
+        runOnInnerOp(forOp);
+      }
+    }
+  }
+};
+
 // A pass that traverses top-level conv in the function and converts them to
-// SODA launch operations. Nested launches are not allowed, so this does not
+// CGRA launch operations. Nested launches are not allowed, so this does not
 // walk the function recursively to avoid considering nested conv.
 struct LinalgConvMapper : public ConvertLinalgConvToCGRABase<LinalgConvMapper> {
   LinalgConvMapper() = default;
@@ -90,8 +119,7 @@ struct LinalgConvMapper : public ConvertLinalgConvToCGRABase<LinalgConvMapper> {
 };
 
 // A pass that traverses top-level GenericOps in the function and converts them
-// to SODA launch operations. Nested launches are not allowed, so this does not
-// walk the function recursively to avoid considering nested GenericOp.
+// to CGRA launch operations.
 struct LinalgGenericMapper : public ConvertLinalgGenericToCGRABase<LinalgGenericMapper> {
   LinalgGenericMapper() = default;
 
@@ -130,6 +158,11 @@ mlir::createLinalgMatmulToCGRAPass() {
   return std::make_unique<LinalgMatmulMapper>();
 }
 
+std::unique_ptr<OperationPass<func::FuncOp>>
+mlir::createLinalgBatchMatmulToCGRAPass() {
+  return std::make_unique<LinalgBatchMatmulMapper>();
+}
+
 std::unique_ptr<OperationPass<func::FuncOp>>
 mlir::createLinalgConvToCGRAPass() {
   return std::make_unique<LinalgConvMapper>();

lib/Dialect/SODA/Transforms/HostGeneration.cpp (+12 -4)

@@ -19,7 +19,10 @@
 #include "soda/Dialect/SODA/Utils.h"
 
 #include <iostream>
+#include <map>
+#include <string>
 
+using namespace std;
 using namespace mlir;
 
 namespace {
@@ -78,6 +81,7 @@ class SODALaunchFuncLowering : public OpRewritePattern<soda::LaunchFuncOp> {
                       .str();
 
     auto func = module.lookupSymbol<func::FuncOp>(newName);
+
     if (!func) {
 
       // Get callee
@@ -118,7 +122,11 @@ class SODALaunchCGRALowering : public OpRewritePattern<soda::LaunchCGRAOp> {
     auto newName = "cgra_" + Twine(op.getKernelName()).str();
     auto func = module.lookupSymbol<func::FuncOp>(newName);
 
-    if (!func) {
+    // std::cout<<"found func... "<<newName<<std::endl;
+    while (func) {
+      newName += "_";
+      func = module.lookupSymbol<func::FuncOp>(newName);
+    }
 
     // Get callee
     Operation *kernelFunc = module.lookupSymbol(op.kernelAttr());
@@ -130,12 +138,12 @@ class SODALaunchCGRALowering : public OpRewritePattern<soda::LaunchCGRAOp> {
     if (kernelSODAFunction == NULL)
       std::cout<<"kernelSODAFunction is NULL"<<std::endl;
     FunctionType funcTy = kernelSODAFunction.getFunctionType();
-    func::FuncOp func = rewriter.create<func::FuncOp>(
+    func::FuncOp updatedFunc = rewriter.create<func::FuncOp>(
        rewriter.getUnknownLoc(), newName, funcTy);
-    func.setPrivate();
+    updatedFunc.setPrivate();
 
     rewriter.setInsertionPoint(op);
-  }
+    // }
 
     assert(
         isa<FunctionOpInterface>(SymbolTable::lookupSymbolIn(module, newName)));

lib/Dialect/SODA/Transforms/KernelOutlining.cpp (+2)

@@ -350,6 +350,8 @@ class CGRAKernelOutliningPass
       kernelFnName += (*op.body().front().op_begin<soda::FusionOp>())->getAttr("pattern").cast<StringAttr>().str();
     } else if (op.body().front().op_begin<soda::MatmulOp>() != op.body().front().op_end<soda::MatmulOp>()) {
       kernelFnName = "matmul";
+    } else if (op.body().front().op_begin<soda::BatchMatmulOp>() != op.body().front().op_end<soda::BatchMatmulOp>()) {
+      kernelFnName = "batch_matmul";
     } else {
       kernelFnName = "generic_" + to_string(genericFuncCount);
       isGenericFunc = true;

sim/CGRAFunc.h (+37)

@@ -36,6 +36,43 @@ void matmul(DataReq& input, DataReq& output, Simulator& sim) {
   }
 }
 
+void batch_matmul(DataReq& input, DataReq& output, Simulator& sim) {
+
+  MemRef inA = input.memRefs[0];
+  MemRef inB = input.memRefs[1];
+  MemRef out = output.memRefs[0];
+
+  // work-around for the current bug in MLIR tiling memref
+  int row = out.offset / (out.sizes[1] * out.strides[1]);
+  int col = out.offset % (out.sizes[1] * out.strides[1]) / out.sizes[2];
+  string locKey = to_string(row) + "," + to_string(col);
+  if (sim.matmulLocCount.find(locKey) == sim.matmulLocCount.end()) {
+    sim.matmulLocCount.insert({locKey, -1});
+  }
+  sim.matmulLocCount[locKey] += 1;
+
+  int64_t offsetA = row * inA.sizes[1] * inA.strides[1] + sim.matmulLocCount[locKey] * inA.sizes[2];
+  int64_t offsetB = col * inB.sizes[2] + sim.matmulLocCount[locKey] * inB.sizes[1] * inB.strides[1];
+
+  cout<<"offsetA: "<<offsetA<<"; offsetB: "<<offsetB<<endl;
+
+  for (int b=0; b<out.sizes[0]; ++b) {
+    for (int i=0; i<out.sizes[1]; ++i) {
+      for (int j=0; j<out.sizes[2]; ++j) {
+        for (int k=0; k<inB.sizes[1]; ++k) {
+          out.aligned[b*out.strides[0]+out.offset+i*out.strides[1]+j] += inA.aligned[b*inA.strides[0]+offsetA+i*inA.strides[1]+k] * inB.aligned[b*inB.strides[0]+offsetB+k*inB.strides[1]+j];
+        }
+      }
+    }
+  }
+
+  // reset the locCount
+  if (sim.matmulLocCount[locKey] == inA.strides[1] / inA.sizes[2] - 1) {
+    sim.matmulLocCount[locKey] = -1;
+  }
+}
+
 void fusion_add_max_add(DataReq& input, DataReq& output, Simulator& sim) {
 
   MemRef inA = input.memRefs[0];
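Note: the four-level loop above is an ordinary batched matrix multiply applied to the tile selected by offsetA/offsetB. A plain, untiled reference that the simulated kernel can be checked against, assuming dense row-major buffers (this helper is not part of the commit):

  // reference_batch_matmul: hypothetical standalone check, not part of this commit.
  // Computes C[b][i][j] += A[b][i][k] * B[b][k][j] over dense row-major buffers.
  #include <cstdint>
  #include <vector>

  void reference_batch_matmul(const std::vector<float>& A, const std::vector<float>& B,
                              std::vector<float>& C,
                              int64_t batch, int64_t M, int64_t K, int64_t N) {
    for (int64_t b = 0; b < batch; ++b)
      for (int64_t i = 0; i < M; ++i)
        for (int64_t j = 0; j < N; ++j)
          for (int64_t k = 0; k < K; ++k)
            C[(b * M + i) * N + j] += A[(b * M + i) * K + k] * B[(b * K + k) * N + j];
  }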

sim/GlobalRuntime.cpp (+76)

@@ -56,6 +56,82 @@ extern "C" void cgra_matmul(float* a_allocated, float* a_aligned, int64_t a_offs
 */
 }
 
+extern "C" void cgra_matmul_(float* a_allocated, float* a_aligned, int64_t a_offset, int64_t a_size0, int64_t a_size1, int64_t a_stride0, int64_t a_stride1,
+                             float* b_allocated, float* b_aligned, int64_t b_offset, int64_t b_size0, int64_t b_size1, int64_t b_stride0, int64_t b_stride1,
+                             float* c_allocated, float* c_aligned, int64_t c_offset, int64_t c_size0, int64_t c_size1, int64_t c_stride0, int64_t c_stride1) {
+  cgra_matmul(a_allocated, a_aligned, a_offset, a_size0, a_size1, a_stride0, a_stride1,
+              b_allocated, b_aligned, b_offset, b_size0, b_size1, b_stride0, b_stride1,
+              c_allocated, c_aligned, c_offset, c_size0, c_size1, c_stride0, c_stride1);
+}
+
+extern "C" void cgra_batch_matmul(float* a_allocated, float* a_aligned, int64_t a_offset, int64_t a_size0, int64_t a_size1, int64_t a_size2, int64_t a_stride0, int64_t a_stride1, int64_t a_stride2,
+                                  float* b_allocated, float* b_aligned, int64_t b_offset, int64_t b_size0, int64_t b_size1, int64_t b_size2, int64_t b_stride0, int64_t b_stride1, int64_t b_stride2,
+                                  float* c_allocated, float* c_aligned, int64_t c_offset, int64_t c_size0, int64_t c_size1, int64_t c_size2, int64_t c_stride0, int64_t c_stride1, int64_t c_stride2) {
+
+  // prepare inputs
+  vector<int64_t> a_sizes = {a_size0, a_size1, a_size2};
+  vector<int64_t> a_strides = {a_stride0, a_stride1, a_stride2};
+  MemRef memRef0(a_allocated, a_aligned, a_offset, a_sizes, a_strides, 3);
+
+  vector<int64_t> b_sizes = {b_size0, b_size1, b_size2};
+  vector<int64_t> b_strides = {b_stride0, b_stride1, b_stride2};
+  MemRef memRef1(b_allocated, b_aligned, b_offset, b_sizes, b_strides, 3);
+
+  DataReq input;
+  input.assembleReq(memRef0);
+  input.assembleReq(memRef1);
+
+  // prepare outputs
+  vector<int64_t> c_sizes = {c_size0, c_size1, c_size2};
+  vector<int64_t> c_strides = {c_stride0, c_stride1, c_stride2};
+  MemRef memRef2(c_allocated, c_aligned, c_offset, c_sizes, c_strides, 3);
+
+  DataReq output;
+  output.assembleReq(memRef2);
+
+  // issue READ/EXECUTE/WRITE requests for simulation
+  cgra->issueRD(input);
+  cgra->issueEX("batch_matmul");
+  cgra->issueWR(output, true);
+}
+
+extern "C" void cgra_batch_matmul_(float* a_allocated, float* a_aligned, int64_t a_offset, int64_t a_size0, int64_t a_size1, int64_t a_size2, int64_t a_stride0, int64_t a_stride1, int64_t a_stride2,
+                                   float* b_allocated, float* b_aligned, int64_t b_offset, int64_t b_size0, int64_t b_size1, int64_t b_size2, int64_t b_stride0, int64_t b_stride1, int64_t b_stride2,
+                                   float* c_allocated, float* c_aligned, int64_t c_offset, int64_t c_size0, int64_t c_size1, int64_t c_size2, int64_t c_stride0, int64_t c_stride1, int64_t c_stride2) {
+
+  // prepare inputs
+  vector<int64_t> a_sizes = {a_size0, a_size1, a_size2};
+  vector<int64_t> a_strides = {a_stride0, a_stride1, a_stride2};
+  MemRef memRef0(a_allocated, a_aligned, a_offset, a_sizes, a_strides, 3);
+
+  vector<int64_t> b_sizes = {b_size0, b_size1, b_size2};
+  vector<int64_t> b_strides = {b_stride0, b_stride1, b_stride2};
+  MemRef memRef1(b_allocated, b_aligned, b_offset, b_sizes, b_strides, 3);
+
+  DataReq input;
+  input.assembleReq(memRef0);
+  input.assembleReq(memRef1);
+
+  // prepare outputs
+  vector<int64_t> c_sizes = {c_size0, c_size1, c_size2};
+  vector<int64_t> c_strides = {c_stride0, c_stride1, c_stride2};
+  MemRef memRef2(c_allocated, c_aligned, c_offset, c_sizes, c_strides, 3);
+
+  DataReq output;
+  output.assembleReq(memRef2);
+
+  // issue READ/EXECUTE/WRITE requests for simulation
+  cgra->issueRD(input);
+  cgra->issueEX("batch_matmul");
+  cgra->issueWR(output, true);
+
+  cout<<"calculated output for cgra_batch_matmul() a_alloc: "<<a_allocated<<"; a_aligned: "<<a_aligned<<"; a_offset: "<<a_offset<<"; a_size0: "<<a_size0<<"; a_size1: "<<a_size1<<"; a_size2: "<<a_size2<<"; a_stride0: "<<a_stride0<<"; a_stride1: "<<a_stride1<<"; a_stride2: "<<a_stride2<<endl;
+  cout<<"calculated output for cgra_batch_matmul() b_alloc: "<<b_allocated<<"; b_aligned: "<<b_aligned<<"; b_offset: "<<b_offset<<"; b_size0: "<<b_size0<<"; b_size1: "<<b_size1<<"; b_size2: "<<b_size2<<"; b_stride0: "<<b_stride0<<"; b_stride1: "<<b_stride1<<"; b_stride2: "<<b_stride2<<endl;
+  cout<<"calculated output for cgra_batch_matmul() c_alloc: "<<c_allocated<<"; c_aligned: "<<c_aligned<<"; c_offset: "<<c_offset<<"; c_size0: "<<c_size0<<"; c_size1: "<<c_size1<<"; c_size2: "<<c_size2<<"; c_stride0: "<<c_stride0<<"; c_stride1: "<<c_stride1<<"; c_stride2: "<<c_stride2<<endl;
+  cout<<"check total cycles: "<<cgra->getTotalCycles()<<endl;
+}
+
 // This fusion is an example for add+max+add. A robust fusion call should
 // be able to figure out what type of operation chain is targeted.
 extern "C" void cgra_fusion_add_max_add(float* a_allocated, float* a_aligned, int64_t a_offset, int64_t a_size0, int64_t a_size1, int64_t a_stride0, int64_t a_stride1,

sim/Simulator.cpp (+2)

@@ -13,8 +13,10 @@ Simulator::Simulator(bool enableDoubleBuffer) {
 
 void Simulator::registerPredefinedMappings() {
   exCycleMap.insert({"matmul", 20});
+  exCycleMap.insert({"batch_matmul", 20});
   exCycleMap.insert({"fusion_add_max_add", 20});
   exFuncMap["matmul"] = matmul;
+  exFuncMap["batch_matmul"] = batch_matmul;
   exFuncMap["fusion_add_max_add"] = fusion_add_max_add;
 }
