buddy-compiler
diff --git a/‎examples/BuddyBert/CMakeLists.txt‎
Lines changed: 5 additions & 1 deletion b/‎examples/BuddyBert/CMakeLists.txt‎
Lines changed: 5 additions & 1 deletion
diff --git a/‎examples/BuddyDeepSeekR1/CMakeLists.txt‎
Lines changed: 14 additions & 0 deletions b/‎examples/BuddyDeepSeekR1/CMakeLists.txt‎
Lines changed: 14 additions & 0 deletions
diff --git a/‎examples/BuddyLeNet/CMakeLists.txt‎
Lines changed: 6 additions & 0 deletions b/‎examples/BuddyLeNet/CMakeLists.txt‎
Lines changed: 6 additions & 0 deletions
diff --git a/‎examples/BuddyLlama/CMakeLists.txt‎
Lines changed: 7 additions & 0 deletions b/‎examples/BuddyLlama/CMakeLists.txt‎
Lines changed: 7 additions & 0 deletions
diff --git a/‎examples/BuddyMobileNetV3/CMakeLists.txt‎
Lines changed: 3 additions & 0 deletions b/‎examples/BuddyMobileNetV3/CMakeLists.txt‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎examples/BuddyResNet18/CMakeLists.txt‎
Lines changed: 3 additions & 1 deletion b/‎examples/BuddyResNet18/CMakeLists.txt‎
Lines changed: 3 additions & 1 deletion
diff --git a/‎examples/BuddyStableDiffusion/CMakeLists.txt‎
Lines changed: 23 additions & 4 deletions b/‎examples/BuddyStableDiffusion/CMakeLists.txt‎
Lines changed: 23 additions & 4 deletions
diff --git a/‎examples/BuddyWhisper/CMakeLists.txt‎
Lines changed: 4 additions & 0 deletions b/‎examples/BuddyWhisper/CMakeLists.txt‎
Lines changed: 4 additions & 0 deletions
diff --git a/‎examples/VIRDialect/.gitignore‎
Lines changed: 3 additions & 1 deletion b/‎examples/VIRDialect/.gitignore‎
Lines changed: 3 additions & 1 deletion
diff --git a/‎examples/VIRDialect/fixed-kernels-gen.sh‎
Lines changed: 48 additions & 0 deletions b/‎examples/VIRDialect/fixed-kernels-gen.sh‎
Lines changed: 48 additions & 0 deletions
@@ -33,11 +33,15 @@ add_custom_command(
           -eliminate-empty-tensors
           -empty-tensor-to-alloc-tensor
           -one-shot-bufferize="bufferize-function-boundaries"
+          -ownership-based-buffer-deallocation
+          -buffer-deallocation-simplification
+          -bufferization-lower-deallocations
           -matmul-parallel-vectorization-optimize
           -batchmatmul-optimize
           -convert-linalg-to-affine-loops
           -affine-loop-fusion
           -affine-parallelize
+          -lower-affine
           -convert-scf-to-openmp
           -convert-vector-to-scf
           -expand-strided-metadata
@@ -81,5 +85,5 @@ target_compile_definitions(buddy-bert-run PRIVATE
 
 target_link_directories(buddy-bert-run PRIVATE ${LLVM_LIBRARY_DIR})
 
-set(BUDDY_BERT_LIBS BERT mlir_c_runner_utils)
+set(BUDDY_BERT_LIBS BERT mlir_c_runner_utils omp)
 target_link_libraries(buddy-bert-run ${BUDDY_BERT_LIBS})
@@ -26,6 +26,9 @@ add_custom_command(
             -eliminate-empty-tensors
             -empty-tensor-to-alloc-tensor
             -one-shot-bufferize="bufferize-function-boundaries"
+            -ownership-based-buffer-deallocation
+            -buffer-deallocation-simplification
+            -bufferization-lower-deallocations
             -matmul-parallel-vectorization-optimize
             -batchmatmul-optimize
             -convert-linalg-to-affine-loops
@@ -67,11 +70,15 @@ add_custom_command(
             -eliminate-empty-tensors
             -empty-tensor-to-alloc-tensor
             -one-shot-bufferize="bufferize-function-boundaries"
+            -ownership-based-buffer-deallocation
+            -buffer-deallocation-simplification
+            -bufferization-lower-deallocations
             -matmul-parallel-vectorization-optimize
             -batchmatmul-optimize
             -convert-linalg-to-affine-loops
             -affine-loop-fusion
             -affine-parallelize
+            -lower-affine
             -convert-scf-to-openmp
             -func-bufferize-dynamic-offset
             -convert-vector-to-scf
@@ -109,6 +116,9 @@ add_custom_command(
             -eliminate-empty-tensors
             -empty-tensor-to-alloc-tensor
             -one-shot-bufferize="bufferize-function-boundaries"
+            -ownership-based-buffer-deallocation
+            -buffer-deallocation-simplification
+            -bufferization-lower-deallocations
             -matmul-parallel-vectorization-optimize
             -batchmatmul-optimize
             -convert-linalg-to-affine-loops
@@ -150,11 +160,15 @@ add_custom_command(
             -eliminate-empty-tensors
             -empty-tensor-to-alloc-tensor
             -one-shot-bufferize="bufferize-function-boundaries"
+            -ownership-based-buffer-deallocation
+            -buffer-deallocation-simplification
+            -bufferization-lower-deallocations
             -matmul-parallel-vectorization-optimize
             -batchmatmul-optimize
             -convert-linalg-to-affine-loops
             -affine-loop-fusion
             -affine-parallelize
+            -lower-affine
             -convert-scf-to-openmp
             -func-bufferize-dynamic-offset
             -convert-vector-to-scf
 
@@ -29,6 +29,9 @@ add_custom_command(
             -eliminate-empty-tensors
             -convert-tensor-to-linalg
             -one-shot-bufferize="bufferize-function-boundaries"
+            -ownership-based-buffer-deallocation
+            -buffer-deallocation-simplification
+            -bufferization-lower-deallocations
             -batchmatmul-optimize
             -convert-linalg-to-affine-loops
             -func-bufferize-dynamic-offset
@@ -98,6 +101,9 @@ add_custom_command(
           ${BUDDY_BINARY_DIR}/buddy-opt
           -one-shot-bufferize="bufferize-function-boundaries"
           -func-bufferize-dynamic-offset
+          -ownership-based-buffer-deallocation
+          -buffer-deallocation-simplification
+          -bufferization-lower-deallocations
           -convert-linalg-to-parallel-loops
           -canonicalize
           -gpu-map-parallel-loops
 
@@ -17,6 +17,9 @@ add_custom_command(
             -eliminate-empty-tensors
             -empty-tensor-to-alloc-tensor
             -one-shot-bufferize="bufferize-function-boundaries"
+            -ownership-based-buffer-deallocation
+            -buffer-deallocation-simplification
+            -bufferization-lower-deallocations
             -matmul-parallel-vectorization-optimize
             -batchmatmul-optimize
             -convert-linalg-to-affine-loops
@@ -58,11 +61,15 @@ add_custom_command(
             -eliminate-empty-tensors
             -empty-tensor-to-alloc-tensor
             -one-shot-bufferize="bufferize-function-boundaries"
+            -ownership-based-buffer-deallocation
+            -buffer-deallocation-simplification
+            -bufferization-lower-deallocations
             -matmul-parallel-vectorization-optimize
             -batchmatmul-optimize
             -convert-linalg-to-affine-loops
             -affine-loop-fusion
             -affine-parallelize
+            -lower-affine
             -convert-scf-to-openmp
             -convert-vector-to-scf
             -expand-strided-metadata
 
@@ -38,6 +38,9 @@ add_custom_command(
           ${BUDDY_BINARY_DIR}/buddy-opt
             -convert-elementwise-to-linalg
             -one-shot-bufferize="bufferize-function-boundaries"
+            -ownership-based-buffer-deallocation
+            -buffer-deallocation-simplification
+            -bufferization-lower-deallocations
             -func-bufferize-dynamic-offset
             -convert-linalg-to-loops
             -convert-scf-to-cf
 
@@ -38,7 +38,9 @@ add_custom_command(
           ${BUDDY_BINARY_DIR}/buddy-opt
           -convert-elementwise-to-linalg
           -one-shot-bufferize="bufferize-function-boundaries"
-          -func-bufferize-dynamic-offset
+          -ownership-based-buffer-deallocation
+          -buffer-deallocation-simplification
+          -bufferization-lower-deallocations
           -convert-linalg-to-loops
           -convert-scf-to-cf
           -convert-cf-to-llvm
 
@@ -1,6 +1,6 @@
 add_custom_command(
   OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/arg0_text_encoder.data
-         ${CMAKE_CURRENT_BINARY_DIR}arg1_text_encoder.data
+         ${CMAKE_CURRENT_BINARY_DIR}/arg1_text_encoder.data
          ${CMAKE_CURRENT_BINARY_DIR}/arg0_unet.data
          ${CMAKE_CURRENT_BINARY_DIR}/arg0_vae.data
          ${CMAKE_CURRENT_BINARY_DIR}/forward_text_encoder.mlir
@@ -23,6 +23,9 @@ add_custom_command(
             -eliminate-empty-tensors
             -empty-tensor-to-alloc-tensor
             -one-shot-bufferize="bufferize-function-boundaries"
+            -ownership-based-buffer-deallocation
+            -buffer-deallocation-simplification
+            -bufferization-lower-deallocations
             -matmul-parallel-vectorization-optimize
             -batchmatmul-optimize
             -convert-linalg-to-affine-loops
@@ -64,13 +67,17 @@ add_custom_command(
             -eliminate-empty-tensors
             -empty-tensor-to-alloc-tensor
             -one-shot-bufferize="bufferize-function-boundaries"
+            -ownership-based-buffer-deallocation
+            -buffer-deallocation-simplification
+            -bufferization-lower-deallocations
             -func-bufferize-dynamic-offset
             -conv-nhwc-fhwc-optimize
             -matmul-parallel-vectorization-optimize
             -batchmatmul-optimize
             -convert-linalg-to-affine-loops
             -affine-loop-fusion
             -affine-parallelize
+            -lower-affine
             -convert-scf-to-openmp
             -convert-vector-to-scf
             -expand-strided-metadata
@@ -106,7 +113,10 @@ add_custom_command(
             -arith-expand
             -eliminate-empty-tensors
             -empty-tensor-to-alloc-tensor
-            -one-shot-bufferize
+            -one-shot-bufferize="bufferize-function-boundaries"
+            -ownership-based-buffer-deallocation
+            -buffer-deallocation-simplification
+            -bufferization-lower-deallocations
             -matmul-parallel-vectorization-optimize
             -batchmatmul-optimize
             -convert-linalg-to-affine-loops
@@ -147,13 +157,16 @@ add_custom_command(
             -eliminate-empty-tensors
             -empty-tensor-to-alloc-tensor
             -one-shot-bufferize="bufferize-function-boundaries"
-            -func-bufferize-dynamic-offset
+            -ownership-based-buffer-deallocation
+            -buffer-deallocation-simplification
+            -bufferization-lower-deallocations
             -conv-nhwc-fhwc-optimize
             -matmul-parallel-vectorization-optimize
             -batchmatmul-optimize
             -convert-linalg-to-affine-loops
             -affine-loop-fusion
             -affine-parallelize
+            -lower-affine
             -convert-scf-to-openmp
             -convert-vector-to-scf
             -expand-strided-metadata
@@ -190,6 +203,9 @@ add_custom_command(
             -eliminate-empty-tensors
             -empty-tensor-to-alloc-tensor
             -one-shot-bufferize="bufferize-function-boundaries"
+            -ownership-based-buffer-deallocation
+            -buffer-deallocation-simplification
+            -bufferization-lower-deallocations
             -matmul-parallel-vectorization-optimize
             -batchmatmul-optimize
             -convert-linalg-to-affine-loops
@@ -231,13 +247,16 @@ add_custom_command(
             -eliminate-empty-tensors
             -empty-tensor-to-alloc-tensor
             -one-shot-bufferize="bufferize-function-boundaries"
-            -func-bufferize-dynamic-offset
+            -ownership-based-buffer-deallocation
+            -buffer-deallocation-simplification
+            -bufferization-lower-deallocations
             -conv-nhwc-fhwc-optimize
             -matmul-parallel-vectorization-optimize
             -batchmatmul-optimize
             -convert-linalg-to-affine-loops
             -affine-loop-fusion
             -affine-parallelize
+            -lower-affine
             -convert-scf-to-openmp
             -convert-vector-to-scf
             -expand-strided-metadata
 
@@ -32,11 +32,15 @@ add_custom_command(
               -convert-elementwise-to-linalg
               -empty-tensor-to-alloc-tensor
               -one-shot-bufferize="bufferize-function-boundaries"
+              -ownership-based-buffer-deallocation
+              -buffer-deallocation-simplification
+              -bufferization-lower-deallocations
               -matmul-parallel-vectorization-optimize
               -batchmatmul-optimize
               -convert-linalg-to-affine-loops
               -affine-loop-fusion
               -affine-parallelize
+              -lower-affine
               -convert-scf-to-openmp
               -func-bufferize-dynamic-offset
               -convert-linalg-to-loops
 
@@ -1,4 +1,6 @@
 log*
 core
-a.out
+*.s
+*.out
 *.elf
+*.csv
@@ -0,0 +1,48 @@
+#!/usr/bin/env bash
+
+# Usage: ./fixed-kernels-gen.sh
+# This script generates test cases for the AVX2 / AVX512 / ARM NEON kernels.
+
+set -u  # Treat unset variables as errors; -e was removed and is not set here.
+
+# ==== Configuration ====
+RUN=1  # 0=print only, 1=execute make
+# STEP values handed to make; gen_cases receives this array by name.
+FIXED_STEPS=(4 8 16 32 64 128 256 512 1024)
+# SIZE values handed to make; each one is combined with every step value.
+SIZES=(4096 4098 131072 131074 4194304 4194306 67108864 67108866)
+# make target invoked once per (SIZE, STEP) combination.
+FIXED_TARGET="vector-saxpy-fixed-aot"
+
+gen_cases () {
+  # Print, and when RUN is 1 also execute, "make <target> STEP=<step> SIZE=<size>"
+  # for every SIZES entry combined with every entry of the array named by $2.
+  #   $1: make target    $2: name of the array holding the step values
+  local target="$1"; shift
+  local steps_array_name="$1"; shift
+
+  # "name[@]" expanded through ${!ref} reads the named array without eval.
+  local steps_ref="${steps_array_name}[@]"
+  local size step cmd count=0
+
+  for size in "${SIZES[@]}"; do
+    for step in "${!steps_ref}"; do
+      # Hold the command as an argv array so it can run without re-parsing.
+      cmd=(make "$target" "STEP=$step" "SIZE=$size")
+      echo "${cmd[*]}"
+
+      # Execute command if RUN flag is set to 1
+      if [[ "$RUN" -eq 1 ]]; then
+        "${cmd[@]}"
+      fi
+
+      # Plain arithmetic assignment; ((count++)) returns status 1 when count is 0.
+      count=$((count + 1))
+    done
+  done
+
+  # Print summary of generated test cases
+  echo "===> $target generated ${count} test cases in total"
+}
+# Entry point: generate one case per SIZES x FIXED_STEPS pair for FIXED_TARGET.
+gen_cases "$FIXED_TARGET" FIXED_STEPS
-Original file line number
+Diff line change
@@ @@ -1,4 +1,6 @@ @@
 log*
 core
 -a.out
 +*.s
 +*.out
 *.elf
 +*.csv