
one-shot-bufferize pass generates memref.alloc()s in GPU kernel code and breaks the pipeline #360

Open

Description

@zhczhong

The input code is shown below; the insertGPUAllocs pass cannot handle this case properly. "gpu.dealloc"(%51) : (memref<16x16xf16>) -> () should be inserted inside the kernel code, but is currently inserted outside it.
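A minimal sketch of the failing pattern (hand-written illustration with hypothetical names, not taken from the dump below; it trips the same "operand does not dominate this use" verifier error):

func.func @sketch() {
  %c1 = arith.constant 1 : index
  gpu.launch blocks(%bx, %by, %bz) in (%gx = %c1, %gy = %c1, %gz = %c1)
             threads(%tx, %ty, %tz) in (%sx = %c1, %sy = %c1, %sz = %c1) {
    // allocation materialized inside the kernel body
    %buf = gpu.alloc () : memref<16x16xf16>
    gpu.terminator
  }
  // dealloc emitted at function scope: %buf is defined inside the launch
  // region and does not dominate this use
  gpu.dealloc %buf : memref<16x16xf16>
  return
}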

Reproducer

gc-opt --gc-gpu-pipeline file.mlir

file.mlir:

module @fragment_name attributes {"#dlti.sys_spec" = #dlti.target_system_spec<"CPU" : #dlti.target_device_spec<#dlti.dl_entry<"tile_size", 32 : i32>, #dlti.dl_entry<"num_threads", 4 : i32>, #dlti.dl_entry<"L1_cache_size_in_bytes", 49152 : i32>, #dlti.dl_entry<"L2_cache_size_in_bytes", 2097152 : i32>, #dlti.dl_entry<"L3_cache_size_in_bytes", 1966080 : i32>, #dlti.dl_entry<"max_vector_width", 512 : i32>>>} {
  func.func @entry(%arg0: memref<128x1024xf16>, %arg1: memref<1024x1024xf16>, %arg2: memref<128x1024xf16>, %arg3: memref<128x1024xf16>) attributes {compiletime_const_args_index = [1 : i32, 2 : i32]} {
    %0 = bufferization.to_tensor %arg0 restrict : memref<128x1024xf16>
    %1 = bufferization.to_tensor %arg1 restrict : memref<1024x1024xf16>
    %2 = bufferization.to_tensor %arg2 restrict : memref<128x1024xf16>
    %3 = tensor.empty() : tensor<1024x1024xf16>
    %transposed = linalg.transpose ins(%1 : tensor<1024x1024xf16>) outs(%3 : tensor<1024x1024xf16>) permutation = [1, 0]
    %4 = tensor.empty() : tensor<128x1024xf16>
    %cst = arith.constant 0.000000e+00 : f16
    %5 = linalg.fill ins(%cst : f16) outs(%4 : tensor<128x1024xf16>) -> tensor<128x1024xf16>
    %6 = linalg.matmul ins(%0, %transposed : tensor<128x1024xf16>, tensor<1024x1024xf16>) outs(%5 : tensor<128x1024xf16>) -> tensor<128x1024xf16>
    %7 = tensor.empty() : tensor<128x1024xf16>
    %8 = linalg.add ins(%6, %2 : tensor<128x1024xf16>, tensor<128x1024xf16>) outs(%7 : tensor<128x1024xf16>) -> tensor<128x1024xf16>
    %9 = tensor.empty() : tensor<128x1024xf16>
    %cst_0 = arith.constant 0.000000e+00 : f16
    %10 = linalg.fill ins(%cst_0 : f16) outs(%9 : tensor<128x1024xf16>) -> tensor<128x1024xf16>
    %res = tensor.empty() : tensor<128x1024xf16>
    %11 = linalg.max ins(%8, %10 : tensor<128x1024xf16>, tensor<128x1024xf16>) outs(%res : tensor<128x1024xf16>) -> tensor<128x1024xf16>
    bufferization.materialize_in_destination %11 in restrict writable %arg3 : (tensor<128x1024xf16>, memref<128x1024xf16>) -> ()
    return
  }
}
Long log (full log attached as test.txt); the relevant excerpt and the failing IR dump follow:

/home/jovyan/code/graph-compiler/build/gpu_input.mlir:23:13: error: operand #0 does not dominate this use
      %20 = linalg.fill ins(%cst_0 : f16) outs(%extracted_slice_9 : tensor<16x16xf16>) -> tensor<16x16xf16>
            ^
/home/jovyan/code/graph-compiler/build/gpu_input.mlir:23:13: note: see current operation: "gpu.dealloc"(%159) : (memref<16x16xf16>) -> ()
/home/jovyan/code/graph-compiler/build/gpu_input.mlir:23:13: note: operand defined here (op in a child region)
// -----// IR Dump After InsertGPUAllocs Failed (insert-gpu-allocs) //----- //
"func.func"() <{function_type = (memref<1024x1024xf16>, memref<256x1024xf16>, memref<256x1024xf16>, memref<256x1024xf16>, memref<256x1024xf16>, memref<256x1024xf16>, memref<i8>) -> (), sym_name = "entry"}> ({
^bb0(%arg0: memref<1024x1024xf16>, %arg1: memref<256x1024xf16>, %arg2: memref<256x1024xf16>, %arg3: memref<256x1024xf16>, %arg4: memref<256x1024xf16>, %arg5: memref<256x1024xf16>, %arg6: memref<i8>):
  %0 = "arith.constant"() <{value = dense<0.000000e+00> : vector<256xf16>}> : () -> vector<256xf16>
  %1 = "arith.constant"() <{value = 8 : index}> : () -> index
  %2 = "arith.constant"() <{value = 16 : index}> : () -> index
  %3 = "arith.constant"() <{value = 1024 : index}> : () -> index
  %4 = "arith.constant"() <{value = 256 : index}> : () -> index
  %5 = "arith.constant"() <{value = 0 : index}> : () -> index
  %6 = "gpu.alloc"() <{operandSegmentSizes = array<i32: 0, 0, 0>}> : () -> memref<256x1024xf16>
  %7 = "gpu.alloc"() <{operandSegmentSizes = array<i32: 0, 0, 0>}> : () -> memref<256x1024xf16>
  %8 = "gpu.alloc"() <{operandSegmentSizes = array<i32: 0, 0, 0>}> : () -> memref<256x1024xf16>
  %9 = "arith.constant"() <{value = 1 : index}> : () -> index
  %10 = "affine.apply"(%4, %5, %2) <{map = affine_map<(d0)[s0, s1] -> ((d0 - s0) ceildiv s1)>}> : (index, index, index) -> index
  %11 = "affine.apply"(%3, %5, %2) <{map = affine_map<(d0)[s0, s1] -> ((d0 - s0) ceildiv s1)>}> : (index, index, index) -> index
  "gpu.launch"(%10, %11, %9, %9, %9, %9) <{operandSegmentSizes = array<i32: 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0>}> ({
  ^bb0(%arg41: index, %arg42: index, %arg43: index, %arg44: index, %arg45: index, %arg46: index, %arg47: index, %arg48: index, %arg49: index, %arg50: index, %arg51: index, %arg52: index):
    %131 = "affine.apply"(%arg41, %2, %5) <{map = affine_map<(d0)[s0, s1] -> (d0 * s0 + s1)>}> : (index, index, index) -> index
    %132 = "affine.apply"(%arg42, %2, %5) <{map = affine_map<(d0)[s0, s1] -> (d0 * s0 + s1)>}> : (index, index, index) -> index
    %133 = "xegpu.create_nd_tdesc"(%6, %131, %132) <{const_offsets = array<i64: -9223372036854775808, -9223372036854775808>, const_strides = array<i64: 1024, 1>, operandSegmentSizes = array<i32: 1, 2, 0, 0>}> : (memref<256x1024xf16>, index, index) -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>
    %134 = "xegpu.update_nd_offset"(%133, %5, %5) <{const_offsets = array<i64: -9223372036854775808, -9223372036854775808>}> : (!xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>, index, index) -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>
    %135 = "vector.shape_cast"(%0) : (vector<256xf16>) -> vector<16x16xf16>
    "xegpu.store_nd"(%135, %134) <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<write_back>, l3_hint = #xegpu.cache_hint<write_back>}> : (vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>) -> ()
    %136 = "xegpu.create_nd_tdesc"(%6, %131, %132) <{const_offsets = array<i64: -9223372036854775808, -9223372036854775808>, const_strides = array<i64: 1024, 1>, operandSegmentSizes = array<i32: 1, 2, 0, 0>}> : (memref<256x1024xf16>, index, index) -> !xegpu.tensor_desc<8x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>
    %137 = "xegpu.update_nd_offset"(%136, %5, %5) <{const_offsets = array<i64: -9223372036854775808, -9223372036854775808>}> : (!xegpu.tensor_desc<8x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>, index, index) -> !xegpu.tensor_desc<8x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>
    %138 = "xegpu.update_nd_offset"(%136, %1, %5) <{const_offsets = array<i64: -9223372036854775808, -9223372036854775808>}> : (!xegpu.tensor_desc<8x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>, index, index) -> !xegpu.tensor_desc<8x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>
    %139 = "xegpu.load_nd"(%137) <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<cached>, l3_hint = #xegpu.cache_hint<cached>}> : (!xegpu.tensor_desc<8x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>) -> vector<8x16xf16>
    %140 = "xegpu.load_nd"(%138) <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<cached>, l3_hint = #xegpu.cache_hint<cached>}> : (!xegpu.tensor_desc<8x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>) -> vector<8x16xf16>
    %141 = "arith.extf"(%139) : (vector<8x16xf16>) -> vector<8x16xf32>
    %142 = "arith.extf"(%140) : (vector<8x16xf16>) -> vector<8x16xf32>
    %143 = "xegpu.create_nd_tdesc"(%arg1, %131) <{const_offsets = array<i64: -9223372036854775808, 0>, const_strides = array<i64: 1024, 1>, operandSegmentSizes = array<i32: 1, 1, 0, 0>}> : (memref<256x1024xf16>, index) -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>
    %144 = "xegpu.update_nd_offset"(%143, %5, %5) <{const_offsets = array<i64: -9223372036854775808, -9223372036854775808>}> : (!xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>, index, index) -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>
    %145 = "xegpu.create_nd_tdesc"(%arg0, %132) <{const_offsets = array<i64: 0, -9223372036854775808>, const_strides = array<i64: 1024, 1>, operandSegmentSizes = array<i32: 1, 1, 0, 0>}> : (memref<1024x1024xf16>, index) -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>
    %146 = "xegpu.update_nd_offset"(%145, %5, %5) <{const_offsets = array<i64: -9223372036854775808, -9223372036854775808>}> : (!xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>, index, index) -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>
    %147:4 = "scf.for"(%5, %3, %2, %141, %142, %144, %146) ({
    ^bb0(%arg53: index, %arg54: vector<8x16xf32>, %arg55: vector<8x16xf32>, %arg56: !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>, %arg57: !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>):
      %172 = "arith.remui"(%arg53, %3) : (index, index) -> index
      %173 = "arith.cmpi"(%172, %5) <{predicate = 0 : i64}> : (index, index) -> i1
      "scf.if"(%173) ({
        "gpu.barrier"() : () -> ()
        "scf.yield"() : () -> ()
      }, {
      }) : (i1) -> ()
      %174 = "xegpu.load_nd"(%arg56) <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<cached>, l3_hint = #xegpu.cache_hint<cached>}> : (!xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>) -> vector<16x16xf16>
      %175 = "xegpu.load_nd"(%arg57) <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<cached>, l3_hint = #xegpu.cache_hint<cached>, packed}> : (!xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>) -> vector<8x16x2xf16>
      %176 = "xegpu.update_nd_offset"(%arg56, %5, %2) <{const_offsets = array<i64: -9223372036854775808, -9223372036854775808>}> : (!xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>, index, index) -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>
      %177 = "xegpu.update_nd_offset"(%arg57, %2, %5) <{const_offsets = array<i64: -9223372036854775808, -9223372036854775808>}> : (!xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>, index, index) -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>
      "xegpu.prefetch_nd"(%176) <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<cached>, l3_hint = #xegpu.cache_hint<cached>}> : (!xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>) -> ()
      "xegpu.prefetch_nd"(%177) <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<cached>, l3_hint = #xegpu.cache_hint<cached>}> : (!xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>) -> ()
      %178 = "vector.shape_cast"(%174) : (vector<16x16xf16>) -> vector<256xf16>
      %179 = "vector.extract_strided_slice"(%178) <{offsets = [0], sizes = [128], strides = [1]}> : (vector<256xf16>) -> vector<128xf16>
      %180 = "vector.shape_cast"(%179) : (vector<128xf16>) -> vector<8x8x2xf16>
      %181 = "vector.extract_strided_slice"(%178) <{offsets = [128], sizes = [128], strides = [1]}> : (vector<256xf16>) -> vector<128xf16>
      %182 = "vector.shape_cast"(%181) : (vector<128xf16>) -> vector<8x8x2xf16>
      %183 = "xegpu.dpas"(%180, %175, %arg54) : (vector<8x8x2xf16>, vector<8x16x2xf16>, vector<8x16xf32>) -> vector<8x16xf32>
      %184 = "xegpu.dpas"(%182, %175, %arg55) : (vector<8x8x2xf16>, vector<8x16x2xf16>, vector<8x16xf32>) -> vector<8x16xf32>
      "scf.yield"(%183, %184, %176, %177) : (vector<8x16xf32>, vector<8x16xf32>, !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>, !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>) -> ()
    }) : (index, index, index, vector<8x16xf32>, vector<8x16xf32>, !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>, !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>) -> (vector<8x16xf32>, vector<8x16xf32>, !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>, !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>)
    %148 = "arith.truncf"(%147#0) : (vector<8x16xf32>) -> vector<8x16xf16>
    %149 = "arith.truncf"(%147#1) : (vector<8x16xf32>) -> vector<8x16xf16>
    "xegpu.store_nd"(%148, %137) <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<write_back>, l3_hint = #xegpu.cache_hint<write_back>}> : (vector<8x16xf16>, !xegpu.tensor_desc<8x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>) -> ()
    "xegpu.store_nd"(%149, %138) <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<write_back>, l3_hint = #xegpu.cache_hint<write_back>}> : (vector<8x16xf16>, !xegpu.tensor_desc<8x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>) -> ()
    %150 = "xegpu.create_nd_tdesc"(%6, %131, %132) <{const_offsets = array<i64: -9223372036854775808, -9223372036854775808>, const_strides = array<i64: 1024, 1>, operandSegmentSizes = array<i32: 1, 2, 0, 0>}> : (memref<256x1024xf16>, index, index) -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>
    %151 = "xegpu.update_nd_offset"(%150, %5, %5) <{const_offsets = array<i64: -9223372036854775808, -9223372036854775808>}> : (!xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>, index, index) -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>
    %152 = "xegpu.load_nd"(%151) : (!xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>) -> vector<16x16xf16>
    %153 = "xegpu.create_nd_tdesc"(%arg2, %131, %132) <{const_offsets = array<i64: -9223372036854775808, -9223372036854775808>, const_strides = array<i64: 1024, 1>, operandSegmentSizes = array<i32: 1, 2, 0, 0>}> : (memref<256x1024xf16>, index, index) -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>
    %154 = "xegpu.update_nd_offset"(%153, %5, %5) <{const_offsets = array<i64: -9223372036854775808, -9223372036854775808>}> : (!xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>, index, index) -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>
    %155 = "xegpu.load_nd"(%154) : (!xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>) -> vector<16x16xf16>
    %156 = "arith.addf"(%152, %155) <{fastmath = #arith.fastmath<none>}> : (vector<16x16xf16>, vector<16x16xf16>) -> vector<16x16xf16>
    %157 = "xegpu.create_nd_tdesc"(%7, %131, %132) <{const_offsets = array<i64: -9223372036854775808, -9223372036854775808>, const_strides = array<i64: 1024, 1>, operandSegmentSizes = array<i32: 1, 2, 0, 0>}> : (memref<256x1024xf16>, index, index) -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>
    %158 = "xegpu.update_nd_offset"(%157, %5, %5) <{const_offsets = array<i64: -9223372036854775808, -9223372036854775808>}> : (!xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>, index, index) -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>
    "xegpu.store_nd"(%156, %158) <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<write_back>, l3_hint = #xegpu.cache_hint<write_back>}> : (vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>) -> ()
    %159 = "gpu.alloc"() <{operandSegmentSizes = array<i32: 0, 0, 0>}> : () -> memref<16x16xf16>
    %160 = "xegpu.create_nd_tdesc"(%159, %5, %5) <{const_offsets = array<i64: -9223372036854775808, -9223372036854775808>, const_strides = array<i64: 16, 1>, operandSegmentSizes = array<i32: 1, 2, 0, 0>}> : (memref<16x16xf16>, index, index) -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>
    %161 = "xegpu.update_nd_offset"(%160, %5, %5) <{const_offsets = array<i64: -9223372036854775808, -9223372036854775808>}> : (!xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>, index, index) -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>
    %162 = "vector.shape_cast"(%0) : (vector<256xf16>) -> vector<16x16xf16>
    "xegpu.store_nd"(%162, %161) <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<write_back>, l3_hint = #xegpu.cache_hint<write_back>}> : (vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>) -> ()
    %163 = "xegpu.create_nd_tdesc"(%7, %131, %132) <{const_offsets = array<i64: -9223372036854775808, -9223372036854775808>, const_strides = array<i64: 1024, 1>, operandSegmentSizes = array<i32: 1, 2, 0, 0>}> : (memref<256x1024xf16>, index, index) -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>
    %164 = "xegpu.update_nd_offset"(%163, %5, %5) <{const_offsets = array<i64: -9223372036854775808, -9223372036854775808>}> : (!xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>, index, index) -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>
    %165 = "xegpu.load_nd"(%164) : (!xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>) -> vector<16x16xf16>
    %166 = "xegpu.create_nd_tdesc"(%159, %5, %5) <{const_offsets = array<i64: -9223372036854775808, -9223372036854775808>, const_strides = array<i64: 16, 1>, operandSegmentSizes = array<i32: 1, 2, 0, 0>}> : (memref<16x16xf16>, index, index) -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>
    %167 = "xegpu.update_nd_offset"(%166, %5, %5) <{const_offsets = array<i64: -9223372036854775808, -9223372036854775808>}> : (!xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>, index, index) -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>
    %168 = "xegpu.load_nd"(%167) : (!xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>) -> vector<16x16xf16>
    %169 = "arith.maximumf"(%165, %168) <{fastmath = #arith.fastmath<none>}> : (vector<16x16xf16>, vector<16x16xf16>) -> vector<16x16xf16>
    %170 = "xegpu.create_nd_tdesc"(%8, %131, %132) <{const_offsets = array<i64: -9223372036854775808, -9223372036854775808>, const_strides = array<i64: 1024, 1>, operandSegmentSizes = array<i32: 1, 2, 0, 0>}> : (memref<256x1024xf16>, index, index) -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>
    %171 = "xegpu.update_nd_offset"(%170, %5, %5) <{const_offsets = array<i64: -9223372036854775808, -9223372036854775808>}> : (!xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>, index, index) -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>
    "xegpu.store_nd"(%169, %171) <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<write_back>, l3_hint = #xegpu.cache_hint<write_back>}> : (vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>) -> ()
    "gpu.terminator"() : () -> ()
  }) {SCFToGPU_visited, workgroup_attributions = 0 : i64} : (index, index, index, index, index, index) -> ()
  %12 = "gpu.alloc"() <{operandSegmentSizes = array<i32: 0, 0, 0>}> : () -> memref<256x1024xf16>
  %13 = "gpu.alloc"() <{operandSegmentSizes = array<i32: 0, 0, 0>}> : () -> memref<256x1024xf16>
  %14 = "gpu.alloc"() <{operandSegmentSizes = array<i32: 0, 0, 0>}> : () -> memref<256x1024xf16>
  %15 = "arith.constant"() <{value = 1 : index}> : () -> index
  %16 = "affine.apply"(%4, %5, %2) <{map = affine_map<(d0)[s0, s1] -> ((d0 - s0) ceildiv s1)>}> : (index, index, index) -> index
  %17 = "affine.apply"(%3, %5, %2) <{map = affine_map<(d0)[s0, s1] -> ((d0 - s0) ceildiv s1)>}> : (index, index, index) -> index
  "gpu.launch"(%16, %17, %15, %15, %15, %15) <{operandSegmentSizes = array<i32: 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0>}> ({
  ^bb0(%arg24: index, %arg25: index, %arg26: index, %arg27: index, %arg28: index, %arg29: index, %arg30: index, %arg31: index, %arg32: index, %arg33: index, %arg34: index, %arg35: index):
    %77 = "affine.apply"(%arg24, %2, %5) <{map = affine_map<(d0)[s0, s1] -> (d0 * s0 + s1)>}> : (index, index, index) -> index
    %78 = "affine.apply"(%arg25, %2, %5) <{map = affine_map<(d0)[s0, s1] -> (d0 * s0 + s1)>}> : (index, index, index) -> index
    %79 = "xegpu.create_nd_tdesc"(%12, %77, %78) <{const_offsets = array<i64: -9223372036854775808, -9223372036854775808>, const_strides = array<i64: 1024, 1>, operandSegmentSizes = array<i32: 1, 2, 0, 0>}> : (memref<256x1024xf16>, index, index) -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>
    %80 = "xegpu.update_nd_offset"(%79, %5, %5) <{const_offsets = array<i64: -9223372036854775808, -9223372036854775808>}> : (!xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>, index, index) -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>
    %81 = "vector.shape_cast"(%0) : (vector<256xf16>) -> vector<16x16xf16>
    "xegpu.store_nd"(%81, %80) <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<write_back>, l3_hint = #xegpu.cache_hint<write_back>}> : (vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>) -> ()
    %82 = "xegpu.create_nd_tdesc"(%12, %77, %78) <{const_offsets = array<i64: -9223372036854775808, -9223372036854775808>, const_strides = array<i64: 1024, 1>, operandSegmentSizes = array<i32: 1, 2, 0, 0>}> : (memref<256x1024xf16>, index, index) -> !xegpu.tensor_desc<8x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>
    %83 = "xegpu.update_nd_offset"(%82, %5, %5) <{const_offsets = array<i64: -9223372036854775808, -9223372036854775808>}> : (!xegpu.tensor_desc<8x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>, index, index) -> !xegpu.tensor_desc<8x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>
    %84 = "xegpu.update_nd_offset"(%82, %1, %5) <{const_offsets = array<i64: -9223372036854775808, -9223372036854775808>}> : (!xegpu.tensor_desc<8x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>, index, index) -> !xegpu.tensor_desc<8x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>
    %85 = "xegpu.load_nd"(%83) <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<cached>, l3_hint = #xegpu.cache_hint<cached>}> : (!xegpu.tensor_desc<8x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>) -> vector<8x16xf16>
    %86 = "xegpu.load_nd"(%84) <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<cached>, l3_hint = #xegpu.cache_hint<cached>}> : (!xegpu.tensor_desc<8x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>) -> vector<8x16xf16>
    %87 = "arith.extf"(%85) : (vector<8x16xf16>) -> vector<8x16xf32>
    %88 = "arith.extf"(%86) : (vector<8x16xf16>) -> vector<8x16xf32>
    %89 = "xegpu.create_nd_tdesc"(%8, %77) <{const_offsets = array<i64: -9223372036854775808, 0>, const_strides = array<i64: 1024, 1>, operandSegmentSizes = array<i32: 1, 1, 0, 0>}> : (memref<256x1024xf16>, index) -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>
    %90 = "xegpu.update_nd_offset"(%89, %5, %5) <{const_offsets = array<i64: -9223372036854775808, -9223372036854775808>}> : (!xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>, index, index) -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>
    %91 = "xegpu.create_nd_tdesc"(%arg0, %78) <{const_offsets = array<i64: 0, -9223372036854775808>, const_strides = array<i64: 1024, 1>, operandSegmentSizes = array<i32: 1, 1, 0, 0>}> : (memref<1024x1024xf16>, index) -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>
    %92 = "xegpu.update_nd_offset"(%91, %5, %5) <{const_offsets = array<i64: -9223372036854775808, -9223372036854775808>}> : (!xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>, index, index) -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>
    %93:4 = "scf.for"(%5, %3, %2, %87, %88, %90, %92) ({
    ^bb0(%arg36: index, %arg37: vector<8x16xf32>, %arg38: vector<8x16xf32>, %arg39: !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>, %arg40: !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>):
      %118 = "arith.remui"(%arg36, %3) : (index, index) -> index
      %119 = "arith.cmpi"(%118, %5) <{predicate = 0 : i64}> : (index, index) -> i1
      "scf.if"(%119) ({
        "gpu.barrier"() : () -> ()
        "scf.yield"() : () -> ()
      }, {
      }) : (i1) -> ()
      %120 = "xegpu.load_nd"(%arg39) <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<cached>, l3_hint = #xegpu.cache_hint<cached>}> : (!xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>) -> vector<16x16xf16>
      %121 = "xegpu.load_nd"(%arg40) <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<cached>, l3_hint = #xegpu.cache_hint<cached>, packed}> : (!xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>) -> vector<8x16x2xf16>
      %122 = "xegpu.update_nd_offset"(%arg39, %5, %2) <{const_offsets = array<i64: -9223372036854775808, -9223372036854775808>}> : (!xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>, index, index) -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>
      %123 = "xegpu.update_nd_offset"(%arg40, %2, %5) <{const_offsets = array<i64: -9223372036854775808, -9223372036854775808>}> : (!xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>, index, index) -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>
      "xegpu.prefetch_nd"(%122) <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<cached>, l3_hint = #xegpu.cache_hint<cached>}> : (!xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>) -> ()
      "xegpu.prefetch_nd"(%123) <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<cached>, l3_hint = #xegpu.cache_hint<cached>}> : (!xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>) -> ()
      %124 = "vector.shape_cast"(%120) : (vector<16x16xf16>) -> vector<256xf16>
      %125 = "vector.extract_strided_slice"(%124) <{offsets = [0], sizes = [128], strides = [1]}> : (vector<256xf16>) -> vector<128xf16>
      %126 = "vector.shape_cast"(%125) : (vector<128xf16>) -> vector<8x8x2xf16>
      %127 = "vector.extract_strided_slice"(%124) <{offsets = [128], sizes = [128], strides = [1]}> : (vector<256xf16>) -> vector<128xf16>
      %128 = "vector.shape_cast"(%127) : (vector<128xf16>) -> vector<8x8x2xf16>
      %129 = "xegpu.dpas"(%126, %121, %arg37) : (vector<8x8x2xf16>, vector<8x16x2xf16>, vector<8x16xf32>) -> vector<8x16xf32>
      %130 = "xegpu.dpas"(%128, %121, %arg38) : (vector<8x8x2xf16>, vector<8x16x2xf16>, vector<8x16xf32>) -> vector<8x16xf32>
      "scf.yield"(%129, %130, %122, %123) : (vector<8x16xf32>, vector<8x16xf32>, !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>, !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>) -> ()
    }) : (index, index, index, vector<8x16xf32>, vector<8x16xf32>, !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>, !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>) -> (vector<8x16xf32>, vector<8x16xf32>, !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>, !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>)
    %94 = "arith.truncf"(%93#0) : (vector<8x16xf32>) -> vector<8x16xf16>
    %95 = "arith.truncf"(%93#1) : (vector<8x16xf32>) -> vector<8x16xf16>
    "xegpu.store_nd"(%94, %83) <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<write_back>, l3_hint = #xegpu.cache_hint<write_back>}> : (vector<8x16xf16>, !xegpu.tensor_desc<8x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>) -> ()
    "xegpu.store_nd"(%95, %84) <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<write_back>, l3_hint = #xegpu.cache_hint<write_back>}> : (vector<8x16xf16>, !xegpu.tensor_desc<8x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>) -> ()
    %96 = "xegpu.create_nd_tdesc"(%12, %77, %78) <{const_offsets = array<i64: -9223372036854775808, -9223372036854775808>, const_strides = array<i64: 1024, 1>, operandSegmentSizes = array<i32: 1, 2, 0, 0>}> : (memref<256x1024xf16>, index, index) -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>
    %97 = "xegpu.update_nd_offset"(%96, %5, %5) <{const_offsets = array<i64: -9223372036854775808, -9223372036854775808>}> : (!xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>, index, index) -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>
    %98 = "xegpu.load_nd"(%97) : (!xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>) -> vector<16x16xf16>
    %99 = "xegpu.create_nd_tdesc"(%arg3, %77, %78) <{const_offsets = array<i64: -9223372036854775808, -9223372036854775808>, const_strides = array<i64: 1024, 1>, operandSegmentSizes = array<i32: 1, 2, 0, 0>}> : (memref<256x1024xf16>, index, index) -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>
    %100 = "xegpu.update_nd_offset"(%99, %5, %5) <{const_offsets = array<i64: -9223372036854775808, -9223372036854775808>}> : (!xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>, index, index) -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>
    %101 = "xegpu.load_nd"(%100) : (!xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>) -> vector<16x16xf16>
    %102 = "arith.addf"(%98, %101) <{fastmath = #arith.fastmath<none>}> : (vector<16x16xf16>, vector<16x16xf16>) -> vector<16x16xf16>
    %103 = "xegpu.create_nd_tdesc"(%13, %77, %78) <{const_offsets = array<i64: -9223372036854775808, -9223372036854775808>, const_strides = array<i64: 1024, 1>, operandSegmentSizes = array<i32: 1, 2, 0, 0>}> : (memref<256x1024xf16>, index, index) -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>
    %104 = "xegpu.update_nd_offset"(%103, %5, %5) <{const_offsets = array<i64: -9223372036854775808, -9223372036854775808>}> : (!xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>, index, index) -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>
    "xegpu.store_nd"(%102, %104) <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<write_back>, l3_hint = #xegpu.cache_hint<write_back>}> : (vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>) -> ()
    %105 = "gpu.alloc"() <{operandSegmentSizes = array<i32: 0, 0, 0>}> : () -> memref<16x16xf16>
    %106 = "xegpu.create_nd_tdesc"(%105, %5, %5) <{const_offsets = array<i64: -9223372036854775808, -9223372036854775808>, const_strides = array<i64: 16, 1>, operandSegmentSizes = array<i32: 1, 2, 0, 0>}> : (memref<16x16xf16>, index, index) -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>
    %107 = "xegpu.update_nd_offset"(%106, %5, %5) <{const_offsets = array<i64: -9223372036854775808, -9223372036854775808>}> : (!xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>, index, index) -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>
    %108 = "vector.shape_cast"(%0) : (vector<256xf16>) -> vector<16x16xf16>
    "xegpu.store_nd"(%108, %107) <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<write_back>, l3_hint = #xegpu.cache_hint<write_back>}> : (vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>) -> ()
    %109 = "xegpu.create_nd_tdesc"(%13, %77, %78) <{const_offsets = array<i64: -9223372036854775808, -9223372036854775808>, const_strides = array<i64: 1024, 1>, operandSegmentSizes = array<i32: 1, 2, 0, 0>}> : (memref<256x1024xf16>, index, index) -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>
    %110 = "xegpu.update_nd_offset"(%109, %5, %5) <{const_offsets = array<i64: -9223372036854775808, -9223372036854775808>}> : (!xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>, index, index) -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>
    %111 = "xegpu.load_nd"(%110) : (!xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>) -> vector<16x16xf16>
    %112 = "xegpu.create_nd_tdesc"(%105, %5, %5) <{const_offsets = array<i64: -9223372036854775808, -9223372036854775808>, const_strides = array<i64: 16, 1>, operandSegmentSizes = array<i32: 1, 2, 0, 0>}> : (memref<16x16xf16>, index, index) -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>
    %113 = "xegpu.update_nd_offset"(%112, %5, %5) <{const_offsets = array<i64: -9223372036854775808, -9223372036854775808>}> : (!xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>, index, index) -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>
    %114 = "xegpu.load_nd"(%113) : (!xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>) -> vector<16x16xf16>
    %115 = "arith.maximumf"(%111, %114) <{fastmath = #arith.fastmath<none>}> : (vector<16x16xf16>, vector<16x16xf16>) -> vector<16x16xf16>
    %116 = "xegpu.create_nd_tdesc"(%14, %77, %78) <{const_offsets = array<i64: -9223372036854775808, -9223372036854775808>, const_strides = array<i64: 1024, 1>, operandSegmentSizes = array<i32: 1, 2, 0, 0>}> : (memref<256x1024xf16>, index, index) -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>
    %117 = "xegpu.update_nd_offset"(%116, %5, %5) <{const_offsets = array<i64: -9223372036854775808, -9223372036854775808>}> : (!xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>, index, index) -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>
    "xegpu.store_nd"(%115, %117) <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<write_back>, l3_hint = #xegpu.cache_hint<write_back>}> : (vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>) -> ()
    "gpu.terminator"() : () -> ()
  }) {SCFToGPU_visited, workgroup_attributions = 0 : i64} : (index, index, index, index, index, index) -> ()
  %18 = "gpu.alloc"() <{operandSegmentSizes = array<i32: 0, 0, 0>}> : () -> memref<256x1024xf16>
  %19 = "gpu.alloc"() <{operandSegmentSizes = array<i32: 0, 0, 0>}> : () -> memref<256x1024xf16>
  %20 = "arith.constant"() <{value = 1 : index}> : () -> index
  %21 = "affine.apply"(%4, %5, %2) <{map = affine_map<(d0)[s0, s1] -> ((d0 - s0) ceildiv s1)>}> : (index, index, index) -> index
  %22 = "affine.apply"(%3, %5, %2) <{map = affine_map<(d0)[s0, s1] -> ((d0 - s0) ceildiv s1)>}> : (index, index, index) -> index
  "gpu.launch"(%21, %22, %20, %20, %20, %20) <{operandSegmentSizes = array<i32: 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0>}> ({
  ^bb0(%arg7: index, %arg8: index, %arg9: index, %arg10: index, %arg11: index, %arg12: index, %arg13: index, %arg14: index, %arg15: index, %arg16: index, %arg17: index, %arg18: index):
    %23 = "affine.apply"(%arg7, %2, %5) <{map = affine_map<(d0)[s0, s1] -> (d0 * s0 + s1)>}> : (index, index, index) -> index
    %24 = "affine.apply"(%arg8, %2, %5) <{map = affine_map<(d0)[s0, s1] -> (d0 * s0 + s1)>}> : (index, index, index) -> index
    %25 = "xegpu.create_nd_tdesc"(%18, %23, %24) <{const_offsets = array<i64: -9223372036854775808, -9223372036854775808>, const_strides = array<i64: 1024, 1>, operandSegmentSizes = array<i32: 1, 2, 0, 0>}> : (memref<256x1024xf16>, index, index) -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>
    %26 = "xegpu.update_nd_offset"(%25, %5, %5) <{const_offsets = array<i64: -9223372036854775808, -9223372036854775808>}> : (!xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>, index, index) -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>
    %27 = "vector.shape_cast"(%0) : (vector<256xf16>) -> vector<16x16xf16>
    "xegpu.store_nd"(%27, %26) <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<write_back>, l3_hint = #xegpu.cache_hint<write_back>}> : (vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>) -> ()
    %28 = "xegpu.create_nd_tdesc"(%18, %23, %24) <{const_offsets = array<i64: -9223372036854775808, -9223372036854775808>, const_strides = array<i64: 1024, 1>, operandSegmentSizes = array<i32: 1, 2, 0, 0>}> : (memref<256x1024xf16>, index, index) -> !xegpu.tensor_desc<8x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>
    %29 = "xegpu.update_nd_offset"(%28, %5, %5) <{const_offsets = array<i64: -9223372036854775808, -9223372036854775808>}> : (!xegpu.tensor_desc<8x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>, index, index) -> !xegpu.tensor_desc<8x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>
    %30 = "xegpu.update_nd_offset"(%28, %1, %5) <{const_offsets = array<i64: -9223372036854775808, -9223372036854775808>}> : (!xegpu.tensor_desc<8x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>, index, index) -> !xegpu.tensor_desc<8x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>
    %31 = "xegpu.load_nd"(%29) <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<cached>, l3_hint = #xegpu.cache_hint<cached>}> : (!xegpu.tensor_desc<8x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>) -> vector<8x16xf16>
    %32 = "xegpu.load_nd"(%30) <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<cached>, l3_hint = #xegpu.cache_hint<cached>}> : (!xegpu.tensor_desc<8x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>) -> vector<8x16xf16>
    %33 = "arith.extf"(%31) : (vector<8x16xf16>) -> vector<8x16xf32>
    %34 = "arith.extf"(%32) : (vector<8x16xf16>) -> vector<8x16xf32>
    %35 = "xegpu.create_nd_tdesc"(%14, %23) <{const_offsets = array<i64: -9223372036854775808, 0>, const_strides = array<i64: 1024, 1>, operandSegmentSizes = array<i32: 1, 1, 0, 0>}> : (memref<256x1024xf16>, index) -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>
    %36 = "xegpu.update_nd_offset"(%35, %5, %5) <{const_offsets = array<i64: -9223372036854775808, -9223372036854775808>}> : (!xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>, index, index) -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>
    %37 = "xegpu.create_nd_tdesc"(%arg0, %24) <{const_offsets = array<i64: 0, -9223372036854775808>, const_strides = array<i64: 1024, 1>, operandSegmentSizes = array<i32: 1, 1, 0, 0>}> : (memref<1024x1024xf16>, index) -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>
    %38 = "xegpu.update_nd_offset"(%37, %5, %5) <{const_offsets = array<i64: -9223372036854775808, -9223372036854775808>}> : (!xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>, index, index) -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>
    %39:4 = "scf.for"(%5, %3, %2, %33, %34, %36, %38) ({
    ^bb0(%arg19: index, %arg20: vector<8x16xf32>, %arg21: vector<8x16xf32>, %arg22: !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>, %arg23: !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>):
      %64 = "arith.remui"(%arg19, %3) : (index, index) -> index
      %65 = "arith.cmpi"(%64, %5) <{predicate = 0 : i64}> : (index, index) -> i1
      "scf.if"(%65) ({
        "gpu.barrier"() : () -> ()
        "scf.yield"() : () -> ()
      }, {
      }) : (i1) -> ()
      %66 = "xegpu.load_nd"(%arg22) <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<cached>, l3_hint = #xegpu.cache_hint<cached>}> : (!xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>) -> vector<16x16xf16>
      %67 = "xegpu.load_nd"(%arg23) <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<cached>, l3_hint = #xegpu.cache_hint<cached>, packed}> : (!xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>) -> vector<8x16x2xf16>
      %68 = "xegpu.update_nd_offset"(%arg22, %5, %2) <{const_offsets = array<i64: -9223372036854775808, -9223372036854775808>}> : (!xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>, index, index) -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>
      %69 = "xegpu.update_nd_offset"(%arg23, %2, %5) <{const_offsets = array<i64: -9223372036854775808, -9223372036854775808>}> : (!xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>, index, index) -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>
      "xegpu.prefetch_nd"(%68) <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<cached>, l3_hint = #xegpu.cache_hint<cached>}> : (!xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>) -> ()
      "xegpu.prefetch_nd"(%69) <{l1_hint = #xegpu.cache_hint<cached>, l2_hint = #xegpu.cache_hint<cached>, l3_hint = #xegpu.cache_hint<cached>}> : (!xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>) -> ()
      %70 = "vector.shape_cast"(%66) : (vector<16x16xf16>) -> vector<256xf16>
      %71 = "vector.extract_strided_slice"(%70) <{offsets = [0], sizes = [128], strides = [1]}> : (vector<256xf16>) -> vector<128xf16>
      %72 = "vector.shape_cast"(%71) : (vector<128xf16>) -> vector<8x8x2xf16>
      %73 = "vector.extract_strided_slice"(%70) <{offsets = [128], sizes = [128], strides = [1]}> : (vector<256xf16>) -> vector<128xf16>
      %74 = "vector.shape_cast"(%73) : (vector<128xf16>) -> vector<8x8x2xf16>
      %75 = "xegpu.dpas"(%72, %67, %arg20) : (vector<8x8x2xf16>, vector<8x16x2xf16>, vector<8x16xf32>) -> vector<8x16xf32>
      %76 = "xegpu.dpas"(%74, %67, %arg21) : (vector<8x8x2xf16>, vector<8x16x2xf16>, vector<8x16xf32>) -> vector<8x16xf32>
      "scf.yield"(%75, %76, %68, %69) : (vector<8x16xf32>, vector<8x16xf32>, !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>, !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>) -> ()
    }) : (index, index, index, vector<8x16xf32>, vector<8x16xf32>, !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>, !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>) -> (vector<8x16xf32>, vector<8x16xf32>, !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>, !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>)
    %40 = "arith.truncf"(%39#0) : (vector<8x16xf32>) -> vector<8x16xf16>
    %41 = "arith.truncf"(%39#1) : (vector<8x16xf32>) -> vector<8x16xf16>
    "xegpu.store_nd"(%40, %29) <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<write_back>, l3_hint = #xegpu.cache_hint<write_back>}> : (vector<8x16xf16>, !xegpu.tensor_desc<8x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>) -> ()
    "xegpu.store_nd"(%41, %30) <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<write_back>, l3_hint = #xegpu.cache_hint<write_back>}> : (vector<8x16xf16>, !xegpu.tensor_desc<8x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>) -> ()
    %42 = "xegpu.create_nd_tdesc"(%18, %23, %24) <{const_offsets = array<i64: -9223372036854775808, -9223372036854775808>, const_strides = array<i64: 1024, 1>, operandSegmentSizes = array<i32: 1, 2, 0, 0>}> : (memref<256x1024xf16>, index, index) -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>
    %43 = "xegpu.update_nd_offset"(%42, %5, %5) <{const_offsets = array<i64: -9223372036854775808, -9223372036854775808>}> : (!xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>, index, index) -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>
    %44 = "xegpu.load_nd"(%43) : (!xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>) -> vector<16x16xf16>
    %45 = "xegpu.create_nd_tdesc"(%arg4, %23, %24) <{const_offsets = array<i64: -9223372036854775808, -9223372036854775808>, const_strides = array<i64: 1024, 1>, operandSegmentSizes = array<i32: 1, 2, 0, 0>}> : (memref<256x1024xf16>, index, index) -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>
    %46 = "xegpu.update_nd_offset"(%45, %5, %5) <{const_offsets = array<i64: -9223372036854775808, -9223372036854775808>}> : (!xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>, index, index) -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>
    %47 = "xegpu.load_nd"(%46) : (!xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>) -> vector<16x16xf16>
    %48 = "arith.addf"(%44, %47) <{fastmath = #arith.fastmath<none>}> : (vector<16x16xf16>, vector<16x16xf16>) -> vector<16x16xf16>
    %49 = "xegpu.create_nd_tdesc"(%19, %23, %24) <{const_offsets = array<i64: -9223372036854775808, -9223372036854775808>, const_strides = array<i64: 1024, 1>, operandSegmentSizes = array<i32: 1, 2, 0, 0>}> : (memref<256x1024xf16>, index, index) -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>
    %50 = "xegpu.update_nd_offset"(%49, %5, %5) <{const_offsets = array<i64: -9223372036854775808, -9223372036854775808>}> : (!xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>, index, index) -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>
    "xegpu.store_nd"(%48, %50) <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<write_back>, l3_hint = #xegpu.cache_hint<write_back>}> : (vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>) -> ()
    %51 = "gpu.alloc"() <{operandSegmentSizes = array<i32: 0, 0, 0>}> : () -> memref<16x16xf16>
    %52 = "xegpu.create_nd_tdesc"(%51, %5, %5) <{const_offsets = array<i64: -9223372036854775808, -9223372036854775808>, const_strides = array<i64: 16, 1>, operandSegmentSizes = array<i32: 1, 2, 0, 0>}> : (memref<16x16xf16>, index, index) -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>
    %53 = "xegpu.update_nd_offset"(%52, %5, %5) <{const_offsets = array<i64: -9223372036854775808, -9223372036854775808>}> : (!xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>, index, index) -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>
    %54 = "vector.shape_cast"(%0) : (vector<256xf16>) -> vector<16x16xf16>
    "xegpu.store_nd"(%54, %53) <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<write_back>, l3_hint = #xegpu.cache_hint<write_back>}> : (vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>) -> ()
    %55 = "xegpu.create_nd_tdesc"(%19, %23, %24) <{const_offsets = array<i64: -9223372036854775808, -9223372036854775808>, const_strides = array<i64: 1024, 1>, operandSegmentSizes = array<i32: 1, 2, 0, 0>}> : (memref<256x1024xf16>, index, index) -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>
    %56 = "xegpu.update_nd_offset"(%55, %5, %5) <{const_offsets = array<i64: -9223372036854775808, -9223372036854775808>}> : (!xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>, index, index) -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>
    %57 = "xegpu.load_nd"(%56) : (!xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>) -> vector<16x16xf16>
    %58 = "xegpu.create_nd_tdesc"(%51, %5, %5) <{const_offsets = array<i64: -9223372036854775808, -9223372036854775808>, const_strides = array<i64: 16, 1>, operandSegmentSizes = array<i32: 1, 2, 0, 0>}> : (memref<16x16xf16>, index, index) -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>
    %59 = "xegpu.update_nd_offset"(%58, %5, %5) <{const_offsets = array<i64: -9223372036854775808, -9223372036854775808>}> : (!xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>, index, index) -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>
    %60 = "xegpu.load_nd"(%59) : (!xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>) -> vector<16x16xf16>
    %61 = "arith.maximumf"(%57, %60) <{fastmath = #arith.fastmath<none>}> : (vector<16x16xf16>, vector<16x16xf16>) -> vector<16x16xf16>
    %62 = "xegpu.create_nd_tdesc"(%arg5, %23, %24) <{const_offsets = array<i64: -9223372036854775808, -9223372036854775808>, const_strides = array<i64: 1024, 1>, operandSegmentSizes = array<i32: 1, 2, 0, 0>}> : (memref<256x1024xf16>, index, index) -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>
    %63 = "xegpu.update_nd_offset"(%62, %5, %5) <{const_offsets = array<i64: -9223372036854775808, -9223372036854775808>}> : (!xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>, index, index) -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>
    "xegpu.store_nd"(%61, %63) <{l1_hint = #xegpu.cache_hint<write_back>, l2_hint = #xegpu.cache_hint<write_back>, l3_hint = #xegpu.cache_hint<write_back>}> : (vector<16x16xf16>, !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr<memory_scope =  global, array_length = 1 : i64, boundary_check = true>>) -> ()
    "gpu.terminator"() : () -> ()
  }) {SCFToGPU_visited, workgroup_attributions = 0 : i64} : (index, index, index, index, index, index) -> ()
  "gpu.dealloc"(%6) : (memref<256x1024xf16>) -> ()
  "gpu.dealloc"(%7) : (memref<256x1024xf16>) -> ()
  "gpu.dealloc"(%159) : (memref<16x16xf16>) -> ()
  "gpu.dealloc"(%8) : (memref<256x1024xf16>) -> ()
  "gpu.dealloc"(%12) : (memref<256x1024xf16>) -> ()
  "gpu.dealloc"(%13) : (memref<256x1024xf16>) -> ()
  "gpu.dealloc"(%105) : (memref<16x16xf16>) -> ()
  "gpu.dealloc"(%14) : (memref<256x1024xf16>) -> ()
  "gpu.dealloc"(%18) : (memref<256x1024xf16>) -> ()
  "gpu.dealloc"(%19) : (memref<256x1024xf16>) -> ()
  "gpu.dealloc"(%51) : (memref<16x16xf16>) -> ()
  "func.return"() : () -> ()
}) : () -> ()
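For reference, a sketch of the expected placement (illustrative, same hypothetical names as above): the dealloc for a buffer allocated inside the launch region should stay in that region, before gpu.terminator, so the dominance check passes:

func.func @expected() {
  %c1 = arith.constant 1 : index
  gpu.launch blocks(%bx, %by, %bz) in (%gx = %c1, %gy = %c1, %gz = %c1)
             threads(%tx, %ty, %tz) in (%sx = %c1, %sy = %c1, %sz = %c1) {
    %buf = gpu.alloc () : memref<16x16xf16>
    // kernel body would use %buf here
    gpu.dealloc %buf : memref<16x16xf16>
    gpu.terminator
  }
  return
}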

Labels: bug (Something isn't working)