// RUN: buddy-opt %s \
// RUN: -pass-pipeline "builtin.module(func.func(tosa-to-linalg-named),func.func(tosa-to-linalg),func.func(tosa-to-tensor),func.func(tosa-to-arith))" \
// RUN: | buddy-opt \
// RUN: -eliminate-empty-tensors \
// RUN: -empty-tensor-to-alloc-tensor \
// RUN: -convert-elementwise-to-linalg \
// RUN: -one-shot-bufferize="unknown-type-conversion=identity-layout-map function-boundary-type-conversion=identity-layout-map bufferize-function-boundaries" \
// RUN: -expand-strided-metadata \
// RUN: -ownership-based-buffer-deallocation \
// RUN: -buffer-deallocation-simplification \
// RUN: -bufferization-lower-deallocations \
// RUN: -convert-bufferization-to-memref \
// RUN: -matmul-parallel-vectorization-optimize \
// RUN: -batchmatmul-optimize \
// RUN: -convert-linalg-to-affine-loops \
// RUN: -affine-loop-fusion \
// RUN: -affine-parallelize \
// RUN: -convert-vector-to-scf \
// RUN: -lower-affine \
// RUN: -convert-scf-to-openmp \
// RUN: -func-bufferize-dynamic-offset \
// RUN: -cse \
// RUN: -memref-expand \
// RUN: -arith-expand \
// RUN: -convert-vector-to-llvm \
// RUN: -convert-arith-to-llvm \
// RUN: -finalize-memref-to-llvm \
// RUN: -convert-scf-to-cf \
// RUN: -convert-cf-to-llvm \
// RUN: -convert-openmp-to-llvm \
// RUN: -convert-arith-to-llvm \
// RUN: -convert-math-to-llvm \
// RUN: -convert-math-to-libm \
// RUN: -convert-func-to-llvm \
// RUN: -reconcile-unrealized-casts \
// RUN: | mlir-runner -e main -entry-point-result=void \
// RUN: -shared-libs=%mlir_runner_utils_dir/libmlir_runner_utils%shlibext \
// RUN: -shared-libs=%mlir_runner_utils_dir/libmlir_c_runner_utils%shlibext \
// RUN: -shared-libs=%mlir_runner_utils_dir/libomp%shlibext \
// RUN: | FileCheck %s

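// RMSNorm test case: fills a 1x1024x1536 activation tensor and a 1536-element
// gain vector, runs the hand-written kernel below, and checks the printed
// elapsed time with FileCheck.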
func.func private @rtclock() -> f64
func.func private @printMemrefF32(%ptr : tensor<*xf32>)

#map = affine_map<(d0, d1, d2) -> (d1)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d2)>
#map2 = affine_map<(d0, d1, d2) -> (d0, d1)>
#map3 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
#map4 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>

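// Hand-written RMSNorm kernel: for each row x of the input,
//   y[i] = x[i] * rsqrt(mean(x^2) + eps) * g[i]
// where g is the per-channel gain.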
func.func @kernel(%arg0: tensor<1x1024x1536xf32>, %arg1: tensor<1536xf32>) -> tensor<1x1024x1536xf32> {
  %t_start = call @rtclock() : () -> f64

  // 9.99999997E-7 is 1e-6 rounded to f32; it keeps rsqrt finite for a zero mean.
  %eps = arith.constant 9.99999997E-7 : f32
  %zero = arith.constant 0.0 : f32
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %c1024 = arith.constant 1024 : index
  %c1536 = arith.constant 1536 : index
  // Hidden size (1536) as f32, the divisor for the mean of squares.
  %dim = arith.constant 1.536000e+03 : f32

  %x_memref = bufferization.to_memref %arg0 : tensor<1x1024x1536xf32> to memref<1x1024x1536xf32>
  %g_memref = bufferization.to_memref %arg1 : tensor<1536xf32> to memref<1536xf32>
  %y_memref = memref.alloc() : memref<1x1024x1536xf32>

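  // Per row: phase 1 reduces the sum of squares in parallel;
  // phase 2 rescales each element with the resulting inverse RMS.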
  scf.for %b = %c0 to %c1024 step %c1 {
    %acc = scf.parallel (%i) = (%c0) to (%c1536) step (%c1) init(%zero) -> (f32) {
      %x = memref.load %x_memref[%c0, %b, %i] : memref<1x1024x1536xf32>
      %x_sq = arith.mulf %x, %x : f32
      scf.reduce(%x_sq : f32) {
      ^bb0(%lhs : f32, %rhs : f32):
        %res = arith.addf %lhs, %rhs : f32
        scf.reduce.return %res : f32
      }
    }
    %mean = arith.divf %acc, %dim : f32
    %m_eps = arith.addf %mean, %eps : f32
    // inv_rms = 1 / sqrt(mean(x^2) + eps)
    %inv_rms = math.rsqrt %m_eps : f32
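    // Phase 2: normalize and apply the per-channel gain.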
    scf.for %i = %c0 to %c1536 step %c1 {
      %x = memref.load %x_memref[%c0, %b, %i] : memref<1x1024x1536xf32>
      %g = memref.load %g_memref[%i] : memref<1536xf32>
      %x_norm = arith.mulf %x, %inv_rms : f32
      %y = arith.mulf %x_norm, %g : f32
      memref.store %y, %y_memref[%c0, %b, %i] : memref<1x1024x1536xf32>
    }
  }

  %out = bufferization.to_tensor %y_memref restrict : memref<1x1024x1536xf32> to tensor<1x1024x1536xf32>

  %t_end = call @rtclock() : () -> f64
  %time = arith.subf %t_end, %t_start : f64

  // Print timings.
  vector.print %time : f64
  // CHECK: {{[0-9]+\.[0-9]+}}

  return %out : tensor<1x1024x1536xf32>
}

func.func @main() {

  // Input activations: every element set to 3.0.
  %cst_3 = arith.constant 3.0 : f32
  %empty_0 = tensor.empty() : tensor<1x1024x1536xf32>
  %c0 = linalg.fill ins(%cst_3 : f32) outs(%empty_0 : tensor<1x1024x1536xf32>) -> tensor<1x1024x1536xf32>

  // Per-channel gain: every element set to 2.0.
  %cst_2 = arith.constant 2.0 : f32
  %empty_1 = tensor.empty() : tensor<1536xf32>
  %c1 = linalg.fill ins(%cst_2 : f32) outs(%empty_1 : tensor<1536xf32>) -> tensor<1536xf32>

  %res = call @kernel(%c0, %c1) : (tensor<1x1024x1536xf32>, tensor<1536xf32>) -> tensor<1x1024x1536xf32>

  %tensor_unranked = tensor.cast %res : tensor<1x1024x1536xf32> to tensor<*xf32>
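  // With x = 3.0 and g = 2.0, mean(x^2) = 9 and rsqrt(9 + eps) is about 1/3,
  // so every output element is approximately 2.0.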
  // Print results.
  // call @printMemrefF32(%tensor_unranked) : (tensor<*xf32>) -> ()

  return
}