buddy-compiler · FloatingcloudKnight · Oct 1, 2025
diff --git a/examples/BuddyNext/makefile b/examples/BuddyNext/makefile
@@ -1235,54 +1235,65 @@ next-compass-run:
 		-shared-libs=${MLIR_RUNNER_UTILS} \
 		-shared-libs=${MLIR_C_RUNNER_UTILS}
 
-tosa-matmul-transpose2-lower:
-	@${BUDDY_OPT} ./tosa-matmultranspose2.mlir \
+next-matmul-transpose2-lower:
+	@${BUDDY_OPT} ./next-matmul-transpose2.mlir \
 			-transpose-fusion-vectorization \
 			-o log.mlir
 
-tosa-matmul-transpose2-run:
-	@${BUDDY_OPT} ./tosa-matmultranspose2.mlir \
+next-matmul-transpose2-run:
+	@${BUDDY_OPT} ./next-matmul-transpose2.mlir \
 			-pass-pipeline "builtin.module(transpose-fusion-vectorization, func.func(tosa-to-linalg-named, tosa-to-linalg, tosa-to-tensor, tosa-to-arith))" | \
     ${BUDDY_OPT} \
+		-convert-elementwise-to-linalg \
+		-arith-expand \
 		-eliminate-empty-tensors \
-		-convert-tensor-to-linalg \
-		-linalg-bufferize \
+		-empty-tensor-to-alloc-tensor \
+		-one-shot-bufferize="bufferize-function-boundaries" \
 		-convert-linalg-to-affine-loops \
-		-lower-affine \
-		-func-bufferize \
-		-arith-bufferize \
-		-tensor-bufferize \
-		-buffer-deallocation \
-		-finalizing-bufferize \
+		-affine-loop-fusion \
+		-affine-parallelize \
+		-convert-scf-to-openmp \
 		-convert-vector-to-scf \
 		-expand-strided-metadata \
+		-lower-affine \
+		-cse \
 		-convert-vector-to-llvm \
+		-memref-expand \
+		-arith-expand \
 		-convert-arith-to-llvm \
 		-finalize-memref-to-llvm \
 		-convert-scf-to-cf \
+		-convert-cf-to-llvm \
+		-convert-openmp-to-llvm \
 		-convert-arith-to-llvm \
+		-convert-math-to-llvm \
+		-convert-math-to-libm \
 		-convert-func-to-llvm \
 		-reconcile-unrealized-casts | \
 	${MLIR_CPU_RUNNER} ${OPT_FLAG} -e main -entry-point-result=void \
 		-shared-libs=${MLIR_RUNNER_UTILS} -shared-libs=${MLIR_C_RUNNER_UTILS}
 
-tosa-matmul-transpose2-vec-run:
-	@${BUDDY_OPT} ./tosa-matmultranspose2-vec.mlir\
+next-matmul-transpose2-vec-run:
+	@${BUDDY_OPT} ./next-matmul-transpose2-vec.mlir \
 		-convert-linalg-to-affine-loops \
+		-affine-loop-fusion \
 		-affine-parallelize \
-		-lower-affine \
 		-convert-scf-to-openmp \
 		-convert-vector-to-scf \
 		-expand-strided-metadata \
+		-lower-affine \
+		-cse \
 		-convert-vector-to-llvm \
 		-memref-expand \
 		-arith-expand \
 		-convert-arith-to-llvm \
-		-finalize-memref-to-llvm  \
+		-finalize-memref-to-llvm \
 		-convert-scf-to-cf \
+		-convert-cf-to-llvm \
 		-convert-openmp-to-llvm \
+		-convert-arith-to-llvm \
 		-convert-math-to-llvm \
-		-convert-math-to-libm  \
+		-convert-math-to-libm \
 		-convert-func-to-llvm \
 		-reconcile-unrealized-casts | \
 	${MLIR_CPU_RUNNER} ${OPT_FLAG} -e main -entry-point-result=void \

diff --git a/examples/BuddyNext/next-matmul-transpose2-vec.mlir b/examples/BuddyNext/next-matmul-transpose2-vec.mlir
@@ -2,15 +2,13 @@
 // RUN:     -convert-linalg-to-affine-loops \
 // RUN:     -lower-affine \
 // RUN:     -convert-vector-to-scf \
+// RUN:     -expand-strided-metadata \
 // RUN:     -convert-scf-to-cf \
 // RUN:     -convert-cf-to-llvm \
 // RUN:     -convert-vector-to-llvm \
-// RUN:     -convert-math-to-llvm \
-// RUN:     -convert-math-to-libm \
+// RUN:     -finalize-memref-to-llvm \
 // RUN:     -convert-arith-to-llvm \
 // RUN:     -convert-func-to-llvm \
-// RUN:     -expand-strided-metadata \
-// RUN:     -finalize-memref-to-llvm \
 // RUN:     -reconcile-unrealized-casts \
 // RUN: | mlir-runner -e main -entry-point-result=void \
 // RUN:     -shared-libs=%mlir_runner_utils_dir/libmlir_runner_utils%shlibext \
@@ -57,9 +55,9 @@ func.func @test(%a : memref<?x?x?xf32>, %b : memref<?x?x?xf32>, %c : memref<?x?x
         scf.yield %idx_next : index
       }
 
       // Compute the tail size and Process the remaining elements
       // using masked vector operations.
-      %tail_size = arith.subi %dim_1, %iter_idx : index
+      %tail_size = arith.subi %dim_2, %iter_idx : index
       %mask = vector.create_mask %tail_size : vector<32xi1>
       %0 = vector.maskedload %c[%arg4, %arg3, %iter_idx], %mask, %v0 : memref<?x?x?xf32>, vector<32xi1>, vector<32xf32> into vector<32xf32>
       %iter_value = scf.for %arg6 = %c0 to %dim_1 step %c1 iter_args(%value_init = %0) -> (vector<32xf32>){
@@ -111,8 +109,9 @@ func.func @main(){
 
   %printed_m2 = memref.cast %m2 : memref<?x?x?xf32> to memref<*xf32>
 
   // CHECK: Unranked Memref base@ = {{.*}} rank = 3 offset = 0 sizes = [40, 32, 128] strides = [4096, 128, 1] data =
   // CHECK-NEXT: [
+  // CHECK: [
   // CHECK: [240{{(, 240)*}}]
   call @printMemrefF32(%printed_m2) : (memref<*xf32>) -> ()
 

diff --git a/examples/BuddyNext/next-matmul-transpose2.mlir b/examples/BuddyNext/next-matmul-transpose2.mlir
@@ -1,21 +1,32 @@
 // RUN: buddy-opt %s \
 // RUN:     -pass-pipeline "builtin.module(func.func(tosa-to-linalg-named),func.func(tosa-to-linalg),func.func(tosa-to-tensor),func.func(tosa-to-arith))" \
 // RUN: | buddy-opt \
+// RUN:     -convert-elementwise-to-linalg \
+// RUN:     -arith-expand \
 // RUN:     -eliminate-empty-tensors \
-// RUN:     -convert-tensor-to-linalg \
+// RUN:     -empty-tensor-to-alloc-tensor \
 // RUN:     -one-shot-bufferize="bufferize-function-boundaries" \
 // RUN:     -convert-linalg-to-affine-loops \
-// RUN:     -lower-affine \
+// RUN:     -affine-loop-fusion \
+// RUN:     -affine-parallelize \
+// RUN:     -convert-scf-to-openmp \
 // RUN:     -convert-vector-to-scf \
 // RUN:     -expand-strided-metadata \
-// RUN:     -convert-vector-to-llvm \
-// RUN:     -convert-arith-to-llvm \
-// RUN:     -finalize-memref-to-llvm \
-// RUN:     -convert-scf-to-cf \
-// RUN:     -convert-cf-to-llvm \
-// RUN:     -convert-arith-to-llvm \
-// RUN:     -convert-func-to-llvm \
-// RUN:     -reconcile-unrealized-casts \
+// RUN:     -lower-affine \
+// RUN:     -cse \
+// RUN:     -convert-vector-to-llvm \
+// RUN:     -memref-expand \
+// RUN:     -arith-expand \
+// RUN:     -convert-arith-to-llvm \
+// RUN:     -finalize-memref-to-llvm \
+// RUN:     -convert-scf-to-cf \
+// RUN:     -convert-cf-to-llvm \
+// RUN:     -convert-openmp-to-llvm \
+// RUN:     -convert-arith-to-llvm \
+// RUN:     -convert-math-to-llvm \
+// RUN:     -convert-math-to-libm \
+// RUN:     -convert-func-to-llvm \
+// RUN:     -reconcile-unrealized-casts \
 // RUN: | mlir-runner -e main -entry-point-result=void \
 // RUN:     -shared-libs=%mlir_runner_utils_dir/libmlir_runner_utils%shlibext \
 // RUN:     -shared-libs=%mlir_runner_utils_dir/libmlir_c_runner_utils%shlibext \
@@ -44,8 +55,6 @@ func.func @main(){
 
   %v2 = arith.constant dense<2.0> : tensor<32x40x40xf32>
   %v3 = arith.constant dense<3.0> : tensor<1x40x32x128xf32>
-  // %m0 = tensor.cast %v2 : tensor<32x40x40xf32> to tensor<?x?x?xf32>
-  // %m1 = tensor.cast %v3 : tensor<40x32x128xf32> to tensor<?x?x?xf32>
 
   %m2 = call @test(%v3, %v2) : (tensor<1x40x32x128xf32>, tensor<32x40x40xf32>) -> (tensor<1x40x32x128xf32>)
 

diff --git a/midend/lib/Conversion/TransposeOptimization/CMakeLists.txt b/midend/lib/Conversion/TransposeOptimization/CMakeLists.txt
@@ -1,5 +1,6 @@
 add_mlir_library(TransposeOptimization
   BuiltinTransposeVectorization.cpp
+  TransposeFusionVectorization.cpp
   LINK_LIBS PUBLIC
   BuddyUtils
 )