Skip to content

Commit 9d7b57a

Browse files
committed
init sdpa op and flash attention pass
1 parent d69856f commit 9d7b57a

File tree

5 files changed

+145
-1
lines changed

5 files changed

+145
-1
lines changed

include/gc/Dialect/Linalgx/LinalgxStructuredOps.td

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,9 @@ include "mlir/Interfaces/InferTypeOpInterface.td"
2323
include "mlir/Interfaces/SideEffectInterfaces.td"
2424
include "mlir/IR/OpAsmInterface.td"
2525

26+
// Base class for ops declared directly in the Linalgx dialect (as opposed to
// ops generated through the Linalg structured-op base classes below).
class Linalgx_Op<string mnemonic, list<Trait> traits = []> :
    Op<LinalgxDialect, mnemonic, traits>;
28+
2629
// Base Tablegen class for Linalg ops.
2730
// Linalg ops that correspond to library calls operate on ShapedType as their
2831
// first operands. These may be optionally followed by non-view operands
@@ -312,4 +315,22 @@ def Linalgx_MultiBatchMatmulOp : LinalgxStructuredBase_Op<"multi_batch_matmul",
312315
}];
313316
}
314317

318+
// Aggregate attention op. It carries AggregatedOpInterface so it can be
// decomposed into primitive linalg ops via decomposeOperation().
def Linalgx_ScaledDotProductAttentionOp
    : Linalgx_Op<"scaled_dot_product_attention",
        [AttrSizedOperandSegments,
         DeclareOpInterfaceMethods<AggregatedOpInterface,
                                   ["decomposeOperation"]>]> {
  let summary = "Attention structure.";
  let description = [{
    Scaled dot product attention over the inputs Q (query), K (key),
    V (value) and attention_mask:
      Output = SoftMax(Q @ K.transpose(-2, -1) + attention_mask) @ V.
  }];
  let arguments = (ins
    Variadic<TensorOrMemref>:$inputs,
    Variadic<TensorOrMemref>:$outputs);
  let results = (outs Variadic<TensorOrMemref>:$results);
  let regions = (region AnyRegion:$region);

  let hasVerifier = 1;
}
335+
315336
#endif // LINALGX_STRUCTURED_OPS

include/gc/Transforms/Passes.td

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,18 @@ def DeepTileContractionNamedOp
3434
];
3535
}
3636

37-
def GCCPUPipeline : Pass<"gc-cpu-pipeline"> {
37+
// Function-scoped pass that rewrites multi-head attention into a
// flash-attention implementation.
def FlashAttentionConversion
    : Pass<"flash-attention-conversion", "func::FuncOp"> {
  let summary = "Flash Attention Conversion";
  let description =
      [{The pass converts MHA to a flash-attention implementation.}];
  let dependentDialects = [
    "func::FuncDialect", "linalg::LinalgDialect", "scf::SCFDialect",
    "tensor::TensorDialect"
  ];
}
47+
48+
def GCCPUPipeline: Pass<"gc-cpu-pipeline"> {
3849
let summary = "All-in-one pipeline for GC for CPU";
3950
let dependentDialects = [
4051
"onednn_graph::OneDNNGraphDialect", "tensor::TensorDialect",

lib/gc/Dialect/Linalgx/LinalgxOps.cpp

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
#include "gc/Dialect/Linalgx/LinalgxOps.h"
1010
#include "gc/Dialect/Linalgx/LinalgxDialect.h"
1111
#include "mlir/IR/OpImplementation.h"
12+
#include <utility>
1213

1314
//===----------------------------------------------------------------------===//
1415
// Builder helper from mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp
@@ -613,6 +614,58 @@ void MultiBatchMatmulOp::getEffects(
613614
getDpsInits());
614615
}
615616

617+
//===----------------------------------------------------------------------===//
618+
// ScaledDotProductAttentionOp
619+
//===----------------------------------------------------------------------===//
620+
621+
LogicalResult ScaledDotProductAttentionOp::verify() { return success(); }
622+
623+
/// Given an N-dimensional tensor x, this method converts
624+
/// softmax(x) to the following sequence of operations:
625+
///
626+
/// 1. transpose ins[1]
627+
/// 2. matmul ins[0] @ 1
628+
///
629+
FailureOr<SmallVector<Value>>
630+
ScaledDotProductAttentionOp::decomposeOperation(OpBuilder &b) {
631+
OpBuilder::InsertionGuard guard(b);
632+
b.setInsertionPoint(*this);
633+
Location loc = getLoc();
634+
Value query = getInputs()[0], key = getInputs()[1], value = getInputs()[2],
635+
mask = getInputs()[3];
636+
auto dtype = cast<RankedTensorType>(query.getType()).getElementType();
637+
auto shape = cast<RankedTensorType>(query.getType()).getShape();
638+
639+
SmallVector<int64_t> permutation{0, 1, 3, 2};
640+
SmallVector<int64_t> transposeShape{shape[0], shape[1], shape[3], shape[2]};
641+
auto transposeOut = b.create<tensor::EmptyOp>(loc, transposeShape, dtype);
642+
auto transpose = b.create<linalg::TransposeOp>(
643+
/*location=*/loc,
644+
/*inputs=*/key,
645+
/*outputs=*/transposeOut,
646+
/*permutation=*/permutation);
647+
648+
SmallVector<int64_t> matmulQKShape{shape[0], shape[1], shape[2], shape[2]};
649+
auto matmulQKOut = b.create<tensor::EmptyOp>(loc, matmulQKShape, dtype);
650+
auto matmulQK = b.create<linalgx::MultiBatchMatmulOp>(
651+
/*location=*/loc, matmulQKOut.getResult().getType(),
652+
/*inputs=*/ValueRange{query, transpose->getResult(0)},
653+
/*outputs=*/ValueRange{matmulQKOut.getResult()});
654+
655+
auto addOut = b.create<tensor::EmptyOp>(loc, matmulQKShape, dtype);
656+
auto add = b.create<linalg::AddOp>(
657+
/*location=*/loc, addOut.getResult().getType(),
658+
/*inputs=*/ValueRange{matmulQK->getResult(0), mask},
659+
/*outputs=*/ValueRange{addOut.getResult()});
660+
661+
auto matmulVOut = b.create<tensor::EmptyOp>(loc, shape, dtype);
662+
auto matmulV = b.create<linalgx::MultiBatchMatmulOp>(
663+
/*location=*/loc, matmulVOut.getResult().getType(),
664+
/*inputs=*/ValueRange{add->getResult(0), value},
665+
/*outputs=*/ValueRange{matmulVOut.getResult()});
666+
return SmallVector<Value>{matmulV.getResults()[0]};
667+
}
668+
616669
/////// Operations corresponding to library calls defined with Tablegen ////////
617670

618671
#define GET_OP_CLASSES

lib/gc/Transforms/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ add_mlir_library(GCPasses
1111
OneDNNGraphToLinalg.cpp
1212
Pipeline.cpp
1313
DeepTileContractionNamedOp.cpp
14+
FlashAttentionConversion.cpp
1415
Tiling.cpp
1516

1617
ADDITIONAL_HEADER_DIRS
Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
//===-- FlashAttentionConversion.cpp ----------------------------*- C++ -*-===//
2+
//
3+
// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
9+
10+
#include "./Tiling.hpp"
11+
#include "gc/Dialect/Arith/Utils/EasyBuild.h"
12+
#include "gc/Dialect/Linalgx/LinalgxOps.h"
13+
#include "gc/IR/EasyBuild.h"
14+
#include "gc/IR/EasyBuildSCF.h"
15+
#include "mlir/AsmParser/AsmParser.h"
16+
#include "mlir/Dialect/Affine/IR/AffineOps.h"
17+
#include "mlir/Dialect/Func/IR/FuncOps.h"
18+
#include "mlir/Dialect/Linalg/IR/Linalg.h"
19+
#include "mlir/Dialect/Linalg/Transforms/TilingInterfaceImpl.h"
20+
#include "mlir/Dialect/Linalg/Transforms/Transforms.h"
21+
#include "mlir/Dialect/Linalg/Utils/Utils.h"
22+
#include "mlir/Dialect/SCF/IR/SCF.h"
23+
#include "mlir/Dialect/SCF/Transforms/TileUsingInterface.h"
24+
#include "mlir/Dialect/Tensor/Transforms/Transforms.h"
25+
#include "mlir/IR/Builders.h"
26+
#include "mlir/IR/Operation.h"
27+
#include "mlir/IR/PatternMatch.h"
28+
#include "mlir/IR/Region.h"
29+
#include "mlir/IR/Visitors.h"
30+
#include "mlir/Interfaces/DestinationStyleOpInterface.h"
31+
#include "mlir/Interfaces/TilingInterface.h"
32+
#include "mlir/Parser/Parser.h"
33+
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
34+
#include <iostream>
35+
36+
#include "gc/Transforms/Passes.h"
37+
38+
#include <llvm/Support/Debug.h>
39+
40+
#include <memory>
41+
42+
namespace mlir {
namespace gc {
#define GEN_PASS_DEF_FLASHATTENTIONCONVERSION
#include "gc/Transforms/Passes.h.inc"

namespace {
/// Pass intended to convert MHA (scaled dot product attention) into a
/// flash-attention style implementation.
/// NOTE(review): currently an empty stub — runOnOperation() performs no
/// transformation yet.
struct FlashAttentionConversion
    : public impl::FlashAttentionConversionBase<FlashAttentionConversion> {
public:
  void runOnOperation() final {
    // TODO: implement the MHA -> flash-attention rewrite.
    return;
  }
};

} // namespace
} // namespace gc
} // namespace mlir

0 commit comments

Comments
 (0)