
Commit edc6cc0

fix correctness issue

1 parent: 0457df5

2 files changed, +47 -40 lines changed


lib/gc/Transforms/FlashAttentionConversion.cpp

Lines changed: 30 additions & 23 deletions
@@ -162,9 +162,12 @@ struct MHAToFlashAttention
         rewriter.create<linalg::FillOp>(loc, minusInf, maxSlice).getResult(0);
     Value sumSliceFilled =
         rewriter.create<linalg::FillOp>(loc, zero, sumSlice).getResult(0);
+    Value collapsedOSliceFilled =
+        rewriter.create<linalg::FillOp>(loc, zero, collapsedOSlice)
+            .getResult(0);
     // create the innermost for loop for columnBlock
     SmallVector<Value> innermostDestinationTensors{
-        collapsedOSlice, maxSliceFilled, sumSliceFilled};
+        collapsedOSliceFilled, maxSliceFilled, sumSliceFilled};
     auto columnBlockLoop = rewriter.create<scf::ForOp>(
         loc,
         getValueOrCreateConstantIndexOp(
@@ -241,9 +244,9 @@ struct MHAToFlashAttention
                     ValueRange args) {
                   Value constant = nestedBuilder.create<arith::ConstantOp>(
                       loc, nestedBuilder.getFloatAttr(dtype, rsqrtHead));
-                  Value added = nestedBuilder.create<arith::MulFOp>(
+                  Value scaled = nestedBuilder.create<arith::MulFOp>(
                       loc, args[0], constant);
-                  nestedBuilder.create<linalg::YieldOp>(nestedLoc, added);
+                  nestedBuilder.create<linalg::YieldOp>(nestedLoc, scaled);
                 })
             .getResult(0);
     Value add = rewriter
@@ -338,22 +341,32 @@ struct MHAToFlashAttention
                                    ValueRange{PSlice, collapsedVSlice},
                                    ValueRange{matmulVOutFilled})
             .getResult(0);
-    Value expMaxDiffRecip =
-        rewriter
-            .create<linalg::ReciprocalOp>(loc, reducedShapeOut.getType(),
-                                          ValueRange{expMaxDiff},
-                                          ValueRange{reducedShapeOut})
-            .getResult(0);
-    Value expMaxDiffRecipBroadcasted =
+    Value expMaxDiffBroadcasted =
         rewriter
-            .create<linalg::BroadcastOp>(loc, expMaxDiffRecip, VShapeOut,
+            .create<linalg::BroadcastOp>(loc, expMaxDiff, VShapeOut,
                                          SmallVector<int64_t>{1})
             .getResults()[0];
+    Value expMaxDiffBroadcastedEps =
+        rewriter
+            .create<linalg::GenericOp>(
+                loc, VShapeOut.getType(), ValueRange{expMaxDiffBroadcasted},
+                ValueRange{VShapeOut}, indexingMaps,
+                SmallVector<utils::IteratorType>(2,
+                                                 utils::IteratorType::parallel),
+                [&](OpBuilder &nestedBuilder, Location nestedLoc,
+                    ValueRange args) {
+                  Value eps = nestedBuilder.create<arith::ConstantOp>(
+                      loc, nestedBuilder.getFloatAttr(dtype, 1e-9));
+                  Value added =
+                      nestedBuilder.create<arith::AddFOp>(loc, args[0], eps);
+                  nestedBuilder.create<linalg::YieldOp>(nestedLoc, added);
+                })
+            .getResult(0);
     Value rescaledOSlice =
         rewriter
-            .create<linalg::MulOp>(
+            .create<linalg::DivOp>(
                 loc, VShapeOut.getType(),
-                ValueRange{prevOSlice, expMaxDiffRecipBroadcasted},
+                ValueRange{prevOSlice, expMaxDiffBroadcastedEps},
                 ValueRange{VShapeOut})
             .getResult(0);
     Value newOSlice =
@@ -372,25 +385,19 @@ struct MHAToFlashAttention
     sumSliceFinal = innermostLoopResults[2];
     Value sliceShapeOut =
         rewriter.create<tensor::EmptyOp>(loc, reducedShape, dtype);
-    Value sumSliceFinalRecip =
-        rewriter
-            .create<linalg::ReciprocalOp>(loc, sliceShapeOut.getType(),
-                                          ValueRange{sumSliceFinal},
-                                          ValueRange{sliceShapeOut})
-            .getResult(0);
     Value broadcastedSliceShapeOut =
         rewriter.create<tensor::EmptyOp>(loc, VShape, dtype);
-    Value sumSliceFinalRecipBroadcasted =
+    Value sumSliceFinalBroadcasted =
         rewriter
-            .create<linalg::BroadcastOp>(loc, sumSliceFinalRecip,
+            .create<linalg::BroadcastOp>(loc, sumSliceFinal,
                                          broadcastedSliceShapeOut,
                                          SmallVector<int64_t>{1})
             .getResults()[0];
     Value rescaledOSliceFinal =
         rewriter
-            .create<linalg::MulOp>(
+            .create<linalg::DivOp>(
                 loc, broadcastedSliceShapeOut.getType(),
-                ValueRange{sumSliceFinalRecipBroadcasted, OSliceFinal},
+                ValueRange{OSliceFinal, sumSliceFinalBroadcasted},
                 ValueRange{broadcastedSliceShapeOut})
             .getResult(0);
     SmallVector<OpFoldResult> outputOffsets;
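
For orientation, the hunks above all sit inside the streaming (online-softmax) recurrence that flash attention evaluates per row block, and they touch its two rescaling points: the per-iteration correction of the running output when the row maximum grows, and the final normalization by the row sum. A minimal sketch of that recurrence, in generic flash-attention notation rather than names from the pass (m is the running row maximum, l the running row sum, O the output accumulator, S_j the scaled and masked scores of column block j):

% standard online-softmax / flash-attention recurrence over column blocks j
\begin{aligned}
m_j &= \max\bigl(m_{j-1},\ \operatorname{rowmax}(S_j)\bigr), & P_j &= \exp(S_j - m_j),\\
l_j &= e^{\,m_{j-1}-m_j}\, l_{j-1} + \operatorname{rowsum}(P_j), & O_j &= e^{\,m_{j-1}-m_j}\, O_{j-1} + P_j V_j,\\
O_{\mathrm{final}} &= \operatorname{diag}(l_N)^{-1} O_N, && \text{with } m_0 = -\infty,\ l_0 = 0,\ O_0 = 0.
\end{aligned}

Read against that recurrence, the hunks zero-fill the O accumulator before the column-block loop (the O_0 = 0 initial state), turn both reciprocal-then-multiply rescalings into direct linalg::DivOp divisions, and add a 1e-9 epsilon to the per-iteration exp(.) divisor before dividing. The commit message does not say which of these was the observed correctness issue, so treat this mapping as a reading of the diff rather than a statement from the authors.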

test/gc/Transform/flashAttention.mlir

Lines changed: 17 additions & 17 deletions
@@ -1,31 +1,31 @@
 // RUN: gc-opt --split-input-file --flash-attention-conversion --gc-cpu-pipeline %s | gc-cpu-runner -e main -entry-point-result=void
+// | FileCheck --allow-empty
 
-func.func @flash_attention(%arg0: tensor<1x16x384x64xf32>, %arg1: tensor<1x16x384x64xf32>, %arg2: tensor<1x16x384x64xf32>, %arg3: tensor<1x16x384x384xf32>) -> tensor<1x16x384x64xf32> {
-  %0 = tensor.empty() : tensor<1x16x384x64xf32>
-  %1 = linalgx.scaled_dot_product_attention ins(%arg0, %arg1, %arg2, %arg3: tensor<1x16x384x64xf32>, tensor<1x16x384x64xf32>, tensor<1x16x384x64xf32>, tensor<1x16x384x384xf32>) outs(%0 : tensor<1x16x384x64xf32>) -> tensor<1x16x384x64xf32>
-  return %1 : tensor<1x16x384x64xf32>
+func.func @flash_attention(%arg0: tensor<4x4x384x64xf32>, %arg1: tensor<4x4x384x64xf32>, %arg2: tensor<4x4x384x64xf32>, %arg3: tensor<4x4x384x384xf32>) -> tensor<4x4x384x64xf32> {
+  %0 = tensor.empty() : tensor<4x4x384x64xf32>
+  %1 = linalgx.scaled_dot_product_attention ins(%arg0, %arg1, %arg2, %arg3: tensor<4x4x384x64xf32>, tensor<4x4x384x64xf32>, tensor<4x4x384x64xf32>, tensor<4x4x384x384xf32>) outs(%0 : tensor<4x4x384x64xf32>) -> tensor<4x4x384x64xf32>
+  return %1 : tensor<4x4x384x64xf32>
 }
 
 func.func @main() {
-  %cst = arith.constant 1.000000e+00 : f32
+  %cst = arith.constant 4.000000e+00 : f32
 
-  %QKVShape = tensor.empty() : tensor<1x16x384x64xf32>
-  %maskShape = tensor.empty() : tensor<1x16x384x384xf32>
+  %QKVShape = tensor.empty() : tensor<4x4x384x64xf32>
+  %maskShape = tensor.empty() : tensor<4x4x384x384xf32>
 
-  %Q = linalg.fill ins(%cst : f32) outs(%QKVShape : tensor<1x16x384x64xf32>) -> tensor<1x16x384x64xf32>
-  %K = linalg.fill ins(%cst : f32) outs(%QKVShape : tensor<1x16x384x64xf32>) -> tensor<1x16x384x64xf32>
-  %V = linalg.fill ins(%cst : f32) outs(%QKVShape : tensor<1x16x384x64xf32>) -> tensor<1x16x384x64xf32>
-  %mask = linalg.fill ins(%cst : f32) outs(%maskShape : tensor<1x16x384x384xf32>) -> tensor<1x16x384x384xf32>
+  %Q = linalg.fill ins(%cst : f32) outs(%QKVShape : tensor<4x4x384x64xf32>) -> tensor<4x4x384x64xf32>
+  %K = linalg.fill ins(%cst : f32) outs(%QKVShape : tensor<4x4x384x64xf32>) -> tensor<4x4x384x64xf32>
+  %V = linalg.fill ins(%cst : f32) outs(%QKVShape : tensor<4x4x384x64xf32>) -> tensor<4x4x384x64xf32>
+  %mask = linalg.fill ins(%cst : f32) outs(%maskShape : tensor<4x4x384x384xf32>) -> tensor<4x4x384x384xf32>
 
   %out = func.call @flash_attention(%Q, %K, %V, %mask) :
-    (tensor<1x16x384x64xf32>, tensor<1x16x384x64xf32>, tensor<1x16x384x64xf32>, tensor<1x16x384x384xf32>)
-    -> (tensor<1x16x384x64xf32>)
+    (tensor<4x4x384x64xf32>, tensor<4x4x384x64xf32>, tensor<4x4x384x64xf32>, tensor<4x4x384x384xf32>)
+    -> (tensor<4x4x384x64xf32>)
 
   %idx = arith.constant 0 : index
-  %val = tensor.extract %out[%idx, %idx, %idx, %idx] : tensor<1x16x384x64xf32>
-  cpuruntime.printf "output[0, 0, 0]: %f\n" %val : f32
+  %val = tensor.extract %out[%idx, %idx, %idx, %idx] : tensor<4x4x384x64xf32>
+  cpuruntime.printf "output[0, 0, 0, 0]: %f\n" %val : f32
 
   return
 }
-// CHECK: output[0, 0, 0]: 1.0
-
+// CHECK: output[0, 0, 0]: 4.0
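
The expected value in the test follows from the constant inputs alone: each softmax row is a set of convex weights summing to 1, so when V is filled with a single constant every output element equals that constant, whatever Q, K, and the mask hold. A one-line check for the updated 4x4x384x64 case (w_ij below is generic notation for the attention weights, not a symbol from the test):

\mathrm{out}[b,h,i,d] \;=\; \sum_{j} w_{ij}\, V[b,h,j,d] \;=\; \Bigl(\sum_{j} w_{ij}\Bigr)\cdot 4.0 \;=\; 4.0,
\qquad w_{ij} = \operatorname{softmax}(\text{scores} + \text{mask})_{ij}.

The same argument produced the previous expectation of 1.0 for the 1x16x384x64 shapes; only the fill constant and the batch/head sizes changed.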
