update: examples/BuddySparse: add more comments and verify correctness with FileCheck

meinie0826 · meinie0826 · commit f67e15a5ffd1 · 2025-05-20T08:41:00.000Z
diff --git a/examples/BuddyMatmul/linalg-batchmatmul-f32.mlir b/examples/BuddyMatmul/linalg-batchmatmul-f32.mlir
@@ -77,10 +77,11 @@ func.func @main(){
   %m4 = call @alloc_f32(%c1, %c1024, %c1000, %f3) : (index, index, index, f32) -> memref<?x?x?xf32>
   %m5 = call @alloc_f32(%c1, %c1, %c1000, %f0) : (index, index, index, f32) -> memref<?x?x?xf32>
 
-  // CHECK: Unranked Memref base@ = {{.*}} rank = 3 offset = 0 sizes = [1, 1, 1000] strides = [1000, 1000, 1] data =
-  // CHECK-NEXT: [
-  // CHECK: [
-  // CHECK: [6144{{(, 6144)*}}]
+  // Expected output for the 1x1x1000 result buffer %m5 (rank 3).
+  // CHECK: Unranked Memref base@ = {{.*}} rank = 3 offset = 0 sizes = [1, 1, 1000] strides = [1000, 1000, 1] data =
+  // CHECK-NEXT: [
+  // CHECK: [
+  // CHECK: [6144{{(, 6144)*}}]
   call @batch_matmul(%m3, %m4, %m5) : (memref<?x?x?xf32>, memref<?x?x?xf32>, memref<?x?x?xf32>) -> ()
 
   return
diff --git a/examples/BuddySpMM/verify_spmm.py b/examples/BuddySpMM/verify_spmm.py
diff --git a/examples/BuddySparse/linalg-spmm-f32.mlir b/examples/BuddySparse/linalg-spmm-f32.mlir
@@ -16,45 +16,74 @@
 // RUN:     -shared-libs=%mlir_runner_utils_dir/libmlir_c_runner_utils%shlibext \
 // RUN: | FileCheck %s
 
+
+// External functions for utilities
+// printMemrefF32: Prints contents of a float32 memref buffer
+// rtclock: Returns current time for performance measurement
 func.func private @printMemrefF32(memref<*xf32>)
 func.func private @rtclock() -> f64
 
+// Main SpMM computation kernel
+// Parameters:
+// - values: Non-zero values of sparse matrix A in CSR format
+// - col_indices: Column indices for each non-zero in A
+// - row_pointers: CSR row offsets; the non-zeros of row i occupy [row_pointers[i], row_pointers[i+1]) in values/col_indices
+// - dense: Dense input matrix B
+// - result: Output dense matrix C = A * B
 func.func @spmm(%values: memref<?xf32>, %col_indices: memref<?xi32>, 
                 %row_pointers: memref<?xi32>, %dense: memref<?x?xf32>, 
                 %result: memref<?x?xf32>) {
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
   
+  // Get dimensions for iteration bounds
+  // num_rows: Number of rows in sparse matrix A and result C
+  // num_cols: Number of columns in result C
+  // dense_cols: Number of columns in dense matrix B
   %num_rows = memref.dim %result, %c0 : memref<?x?xf32>
   %num_cols = memref.dim %result, %c1 : memref<?x?xf32>
   %dense_cols = memref.dim %dense, %c1 : memref<?x?xf32>
   
+  // Start timing the computation
   %t_start = call @rtclock() : () -> f64
   
+  // Main computation loops:
+  // 1. Outer loop: Iterate over each row of sparse matrix A
+  // 2. Middle loop: Iterate over each column of dense matrix B
+  // 3. Inner loop: Process non-zeros in current row of A
   scf.for %i = %c0 to %num_rows step %c1 {
+    // Get start and end indices for current row in CSR format
+    // These indices mark the range of non-zeros in the current row
     %row_start_ptr = memref.load %row_pointers[%i] : memref<?xi32>
     %row_start = arith.index_cast %row_start_ptr : i32 to index
     
     %i_plus_1 = arith.addi %i, %c1 : index
     %row_end_ptr = memref.load %row_pointers[%i_plus_1] : memref<?xi32>
     %row_end = arith.index_cast %row_end_ptr : i32 to index
     
+    // Process each column in result matrix C
     scf.for %j = %c0 to %dense_cols step %c1 {
+      // Initialize accumulator for dot product computation
       %sum = arith.constant 0.0 : f32
       
+      // Compute dot product of current row of A with column j of B
       %result_sum = scf.for %k = %row_start to %row_end step %c1 iter_args(%current_sum = %sum) -> (f32) {
+        // Load non-zero value from A and its column index
         %val = memref.load %values[%k] : memref<?xf32>
         %col_ptr = memref.load %col_indices[%k] : memref<?xi32>
         %col = arith.index_cast %col_ptr : i32 to index
         
+        // Load corresponding value from dense matrix B
         %dense_val = memref.load %dense[%col, %j] : memref<?x?xf32>
         
+        // Multiply and accumulate into partial sum
         %prod = arith.mulf %val, %dense_val : f32
         %new_sum = arith.addf %current_sum, %prod : f32
         
         scf.yield %new_sum : f32
       }
       
+      // Store computed result in output matrix C
       memref.store %result_sum, %result[%i, %j] : memref<?x?xf32>
     }
   }
@@ -71,6 +100,8 @@ func.func @spmm(%values: memref<?xf32>, %col_indices: memref<?xi32>,
   return
 }
 
+// Helper function to allocate and initialize values array
+// Allocates memory for given size and fills with specified value
 func.func @alloc_values(%size: index, %val: f32) -> memref<?xf32> {
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
@@ -83,6 +114,8 @@ func.func @alloc_values(%size: index, %val: f32) -> memref<?xf32> {
   return %values : memref<?xf32>
 }
 
+// Helper function to allocate and initialize column indices array
+// Creates cyclic pattern of indices modulo pattern size
 func.func @alloc_col_indices(%size: index, %pattern: index) -> memref<?xi32> {
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
@@ -98,6 +131,8 @@ func.func @alloc_col_indices(%size: index, %pattern: index) -> memref<?xi32> {
   return %indices : memref<?xi32>
 }
 
+// Helper function to allocate and initialize row pointers array
+// Creates regular pattern with fixed number of non-zeros per row
 func.func @alloc_row_pointers(%rows: index, %nnz_per_row: index) -> memref<?xi32> {
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
@@ -113,6 +148,8 @@ func.func @alloc_row_pointers(%rows: index, %nnz_per_row: index) -> memref<?xi32
   return %pointers : memref<?xi32>
 }
 
+// Helper function to allocate and initialize 2D float array
+// Allocates memory for given dimensions and fills with specified value
 func.func @alloc_f32(%arg0: index, %arg1: index, %arg2: f32) -> memref<?x?xf32> {
   %c0 = arith.constant 0 : index
   %c1 = arith.constant 1 : index
@@ -127,6 +164,11 @@ func.func @alloc_f32(%arg0: index, %arg1: index, %arg2: f32) -> memref<?x?xf32>
   return %0 : memref<?x?xf32>
 }
 
+// Main function that sets up test case and runs SpMM
+// Creates a test case with:
+// - 4x4 sparse matrix with 2 non-zeros per row (values = 2.0)
+// - 4x3 dense matrix (values = 3.0)
+// - 4x3 result matrix (initialized to 0.0)
 func.func @main() {
   %c3 = arith.constant 3 : index
   %c4 = arith.constant 4 : index
@@ -135,19 +177,27 @@ func.func @main() {
   %f2 = arith.constant 2.0 : f32
   %f3 = arith.constant 3.0 : f32
   
-  //test:
+  // Test parameters:
+  // nnz: Total number of non-zeros in sparse matrix (8)
+  // nnz_per_row: Number of non-zeros per row (2)
   %nnz = arith.constant 8 : index  
   %nnz_per_row = arith.constant 2 : index  
   
+  // Allocate and initialize sparse matrix components
   %values = call @alloc_values(%nnz, %f2) : (index, f32) -> memref<?xf32>
   %col_indices = call @alloc_col_indices(%nnz, %c4) : (index, index) -> memref<?xi32>
   %row_pointers = call @alloc_row_pointers(%c4, %nnz_per_row) : (index, index) -> memref<?xi32>
 
-  //4x3
+  // Allocate and initialize dense input and result matrices
   %dense = call @alloc_f32(%c4, %c3, %f3) : (index, index, f32) -> memref<?x?xf32>
   //4x3
   %result = call @alloc_f32(%c4, %c3, %f0) : (index, index, f32) -> memref<?x?xf32>
   
+  // CHECK: Unranked Memref base@ = {{.*}} rank = 2 offset = 0 sizes = [4, 3] strides = [3, 1] data =
+  // CHECK-NEXT: [[12,   12,   12],
+  // CHECK-NEXT:  [12,   12,   12],
+  // CHECK-NEXT:  [12,   12,   12],
+  // CHECK-NEXT:  [12,   12,   12]]
   call @spmm(%values, %col_indices, %row_pointers, %dense, %result) : 
     (memref<?xf32>, memref<?xi32>, memref<?xi32>, memref<?x?xf32>, memref<?x?xf32>) -> ()
   
diff --git a/examples/BuddySparse/makefile b/examples/BuddySparse/makefile
@@ -34,4 +34,4 @@ linalg-spmm-f32-run:
 		-finalize-memref-to-llvm \
 		-reconcile-unrealized-casts | \
 	${MLIR_CPU_RUNNER} ${OPT_FLAG} -e main -entry-point-result=void \
-		-shared-libs=${MLIR_RUNNER_UTILS} -shared-libs=${MLIR_C_RUNNER_UTILS}
+		-shared-libs=${MLIR_RUNNER_UTILS} -shared-libs=${MLIR_C_RUNNER_UTILS}