diff --git a/.gitmodules b/.gitmodules
index 77bef44d1e..294a3091cf 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,7 +1,7 @@
 [submodule "llvm"]
 	path = llvm
 	url = https://github.com/llvm/llvm-project.git
-	branch = main
+	branch = llvmorg-21.1.0-rc2
 	shallow = true
 [submodule "thirdparty/mimalloc"]
 	path = thirdparty/mimalloc
diff --git a/backend/llvm/lib/CodeGen/CMakeLists.txt b/backend/llvm/lib/CodeGen/CMakeLists.txt
index 55110234cf..f52d5ffaa4 100644
--- a/backend/llvm/lib/CodeGen/CMakeLists.txt
+++ b/backend/llvm/lib/CodeGen/CMakeLists.txt
@@ -31,10 +31,10 @@ add_llvm_component_library(LLVMBuddyCodeGen
   ${LLVM_CodeGen_DIR}/DwarfEHPrepare.cpp
   ${LLVM_CodeGen_DIR}/EarlyIfConversion.cpp
   ${LLVM_CodeGen_DIR}/EdgeBundles.cpp
-  ${LLVM_CodeGen_DIR}/EHContGuardCatchret.cpp
+  ${LLVM_CodeGen_DIR}/EHContGuardTargets.cpp
   ${LLVM_CodeGen_DIR}/ExecutionDomainFix.cpp
   ${LLVM_CodeGen_DIR}/ExpandLargeDivRem.cpp
-  ${LLVM_CodeGen_DIR}/ExpandLargeFpConvert.cpp
+  ${LLVM_CodeGen_DIR}/ExpandFp.cpp
   ${LLVM_CodeGen_DIR}/ExpandMemCmp.cpp
   ${LLVM_CodeGen_DIR}/ExpandPostRAPseudos.cpp
   ${LLVM_CodeGen_DIR}/ExpandReductions.cpp
diff --git a/backend/llvm/lib/IR/CMakeLists.txt b/backend/llvm/lib/IR/CMakeLists.txt
index 2ffa737b42..4f1882ac3f 100644
--- a/backend/llvm/lib/IR/CMakeLists.txt
+++ b/backend/llvm/lib/IR/CMakeLists.txt
@@ -74,7 +74,6 @@ add_llvm_component_library(LLVMBuddyCore
   ${LLVM_IR_DIR}/User.cpp
   ${LLVM_IR_DIR}/Value.cpp
   ${LLVM_IR_DIR}/ValueSymbolTable.cpp
-  ${LLVM_IR_DIR}/VectorBuilder.cpp
   ${LLVM_IR_DIR}/VectorTypeUtils.cpp
   ${LLVM_IR_DIR}/Verifier.cpp
   ${LLVM_IR_DIR}/VFABIDemangler.cpp
diff --git a/backend/llvm/lib/Target/CMakeLists.txt b/backend/llvm/lib/Target/CMakeLists.txt
index 1dd5cd34f3..0115351ad5 100644
--- a/backend/llvm/lib/Target/CMakeLists.txt
+++ b/backend/llvm/lib/Target/CMakeLists.txt
@@ -6,7 +6,6 @@ set(LLVM_Target_DIR ${LLVM_MAIN_SRC_DIR}/lib/Target)
 
 add_llvm_component_library(LLVMBuddyTarget
   ${LLVM_Target_DIR}/Target.cpp
-  ${LLVM_Target_DIR}/TargetIntrinsicInfo.cpp
   ${LLVM_Target_DIR}/TargetLoweringObjectFile.cpp
   ${LLVM_Target_DIR}/TargetMachine.cpp
   ${LLVM_Target_DIR}/TargetMachineC.cpp
diff --git a/backend/llvm/lib/Target/RISCV/CMakeLists.txt b/backend/llvm/lib/Target/RISCV/CMakeLists.txt
index 12e3371e09..aaf345f51f 100644
--- a/backend/llvm/lib/Target/RISCV/CMakeLists.txt
+++ b/backend/llvm/lib/Target/RISCV/CMakeLists.txt
@@ -49,6 +49,7 @@ tablegen(LLVM RISCVGenRegisterInfo.inc -gen-register-info)
 tablegen(LLVM RISCVGenSearchableTables.inc -gen-searchable-tables)
 tablegen(LLVM RISCVGenSubtargetInfo.inc -gen-subtarget)
 tablegen(LLVM RISCVGenExegesis.inc -gen-exegesis)
+tablegen(LLVM RISCVGenSDNodeInfo.inc -gen-sd-node-info)
 
 set(LLVM_TARGET_DEFINITIONS ${LLVM_TARGET_RISCV_DIR}/RISCVGISel.td)
 tablegen(LLVM RISCVGenGlobalISel.inc -gen-global-isel)
@@ -189,42 +190,30 @@ endforeach()
 
 # Build LLVMBuddyRISCVDesc target.
 add_llvm_component_library(LLVMBuddyRISCVDesc
-  ${CMAKE_CURRENT_BINARY_DIR}/RISCVAsmPrinter.cpp
-  ${CMAKE_CURRENT_BINARY_DIR}/RISCVCallingConv.cpp
-  ${CMAKE_CURRENT_BINARY_DIR}/RISCVCodeGenPrepare.cpp
-  ${CMAKE_CURRENT_BINARY_DIR}/RISCVConstantPoolValue.cpp
-  ${CMAKE_CURRENT_BINARY_DIR}/RISCVDeadRegisterDefinitions.cpp
-  ${CMAKE_CURRENT_BINARY_DIR}/RISCVExpandAtomicPseudoInsts.cpp
-  ${CMAKE_CURRENT_BINARY_DIR}/RISCVExpandPseudoInsts.cpp
-  ${CMAKE_CURRENT_BINARY_DIR}/RISCVFrameLowering.cpp
-  ${CMAKE_CURRENT_BINARY_DIR}/RISCVGatherScatterLowering.cpp
-  ${CMAKE_CURRENT_BINARY_DIR}/RISCVISelDAGToDAG.cpp
-  ${CMAKE_CURRENT_BINARY_DIR}/RISCVLandingPadSetup.cpp
-  ${CMAKE_CURRENT_BINARY_DIR}/RISCVMergeBaseOffset.cpp
-  ${CMAKE_CURRENT_BINARY_DIR}/RISCVMoveMerger.cpp
-  ${CMAKE_CURRENT_BINARY_DIR}/RISCVOptWInstrs.cpp
-  ${CMAKE_CURRENT_BINARY_DIR}/RISCVPostRAExpandPseudoInsts.cpp
-  ${CMAKE_CURRENT_BINARY_DIR}/RISCVPushPopOptimizer.cpp
-  ${CMAKE_CURRENT_BINARY_DIR}/RISCVRedundantCopyElimination.cpp
-  ${CMAKE_CURRENT_BINARY_DIR}/RISCVRegisterInfo.cpp
-  ${CMAKE_CURRENT_BINARY_DIR}/RISCVSelectionDAGInfo.cpp
-  ${CMAKE_CURRENT_BINARY_DIR}/RISCVSubtarget.cpp
-  ${CMAKE_CURRENT_BINARY_DIR}/RISCVTargetMachine.cpp
-  ${CMAKE_CURRENT_BINARY_DIR}/RISCVTargetObjectFile.cpp
-  ${CMAKE_CURRENT_BINARY_DIR}/RISCVTargetTransformInfo.cpp
-  ${CMAKE_CURRENT_BINARY_DIR}/RISCVVectorMaskDAGMutation.cpp
-  ${CMAKE_CURRENT_BINARY_DIR}/RISCVVectorPeephole.cpp
-  ${CMAKE_CURRENT_BINARY_DIR}/RISCVZacasABIFix.cpp
-
+  ${CMAKE_CURRENT_BINARY_DIR}/RISCVAsmBackend.cpp
+  ${CMAKE_CURRENT_BINARY_DIR}/RISCVBaseInfo.cpp
+  ${CMAKE_CURRENT_BINARY_DIR}/RISCVELFObjectWriter.cpp
+  ${CMAKE_CURRENT_BINARY_DIR}/RISCVInstPrinter.cpp
+  ${CMAKE_CURRENT_BINARY_DIR}/RISCVMCAsmInfo.cpp
+  ${CMAKE_CURRENT_BINARY_DIR}/RISCVMCCodeEmitter.cpp
+  ${CMAKE_CURRENT_BINARY_DIR}/RISCVMCExpr.cpp
+  ${CMAKE_CURRENT_BINARY_DIR}/RISCVMCObjectFileInfo.cpp
+  ${CMAKE_CURRENT_BINARY_DIR}/RISCVMCTargetDesc.cpp
+  ${CMAKE_CURRENT_BINARY_DIR}/RISCVMatInt.cpp
+  ${CMAKE_CURRENT_BINARY_DIR}/RISCVTargetStreamer.cpp
+  ${CMAKE_CURRENT_BINARY_DIR}/RISCVELFStreamer.cpp
   # Add *.h files to track the copies above.
-  ${CMAKE_CURRENT_BINARY_DIR}/RISCVCallingConv.h
-  ${CMAKE_CURRENT_BINARY_DIR}/RISCVInstrInfo.h
-  ${CMAKE_CURRENT_BINARY_DIR}/RISCVRegisterInfo.h
-  ${CMAKE_CURRENT_BINARY_DIR}/RISCVSelectionDAGInfo.h
-  ${CMAKE_CURRENT_BINARY_DIR}/RISCVSubtarget.h
-  ${CMAKE_CURRENT_BINARY_DIR}/RISCVTargetMachine.h
-  ${CMAKE_CURRENT_BINARY_DIR}/RISCVTargetObjectFile.h
+  ${CMAKE_CURRENT_BINARY_DIR}/RISCVAsmBackend.h
+  ${CMAKE_CURRENT_BINARY_DIR}/RISCVBaseInfo.h
+  ${CMAKE_CURRENT_BINARY_DIR}/RISCVELFStreamer.h
+  ${CMAKE_CURRENT_BINARY_DIR}/RISCVFixupKinds.h
+  ${CMAKE_CURRENT_BINARY_DIR}/RISCVInstPrinter.h
+  ${CMAKE_CURRENT_BINARY_DIR}/RISCVMatInt.h
+  ${CMAKE_CURRENT_BINARY_DIR}/RISCVMCAsmInfo.h
+  ${CMAKE_CURRENT_BINARY_DIR}/RISCVMCObjectFileInfo.h
+  ${CMAKE_CURRENT_BINARY_DIR}/RISCVMCTargetDesc.h
+  ${CMAKE_CURRENT_BINARY_DIR}/RISCVTargetStreamer.h
 
   LINK_COMPONENTS
diff --git a/backend/llvm/lib/Target/RISCV/RISCVInstrInfoBuddyExt.td b/backend/llvm/lib/Target/RISCV/RISCVInstrInfoBuddyExt.td
index adc172ab2f..e640e514f3 100644
--- a/backend/llvm/lib/Target/RISCV/RISCVInstrInfoBuddyExt.td
+++ b/backend/llvm/lib/Target/RISCV/RISCVInstrInfoBuddyExt.td
@@ -17,6 +17,22 @@
 // This is the instruction information file of RISC-V buddy extension.
 //
 //===----------------------------------------------------------------------===//
+//
+// IMPORTANT: Instruction Encoding Fix (LLVM 21.0.0 RC1 Upgrade)
+// ------------------------------------------------------------
+// The CONFIG_* instructions originally had identical encodings (func7=0b0000000),
+// which caused decoding conflicts in LLVM's TableGen disassembler.
+//
+// Fixed encodings:
+// - CONFIG_LD:   func7 = 0b0010110 (was 0b0000000)
+// - CONFIG_ST:   func7 = 0b0010111 (was 0b0000000)
+// - CONFIG_EX:   func7 = 0b0011000 (was 0b0000000)
+// - CONFIG_NORM: func7 = 0b0011001 (was 0b0000000)
+//
+// All instructions retain func3=0b011 and use OPC_CUSTOM_3.
+// This ensures each instruction has a unique bit pattern for proper decoding.
+//
+//===----------------------------------------------------------------------===//
 
 include "llvm/IR/IntrinsicsRISCVBuddyExt.td"
 
@@ -60,25 +76,25 @@ def FLUSH : RVInstR<0b0000111, 0b011, OPC_CUSTOM_3, (outs),
 }
 
 let Predicates = [HasBuddyExt] in
-def CONFIG_LD : RVInstR<0b0000000, 0b011, OPC_CUSTOM_3, (outs),
+def CONFIG_LD : RVInstR<0b0010110, 0b011, OPC_CUSTOM_3, (outs),
                         (ins GPR:$rs1, GPR:$rs2),
                         "config_ld", "$rs1, $rs2"> {
   let rd = 0;
 }
 
 let Predicates = [HasBuddyExt] in
-def CONFIG_ST : RVInstR<0b0000000, 0b011, OPC_CUSTOM_3, (outs),
+def CONFIG_ST : RVInstR<0b0010111, 0b011, OPC_CUSTOM_3, (outs),
                         (ins GPR:$rs1, GPR:$rs2),
                         "config_st", "$rs1, $rs2"> {
   let rd = 0;
 }
 
 let Predicates = [HasBuddyExt] in
-def CONFIG_EX : RVInstR<0b0000000, 0b011, OPC_CUSTOM_3,(outs),
+def CONFIG_EX : RVInstR<0b0011000, 0b011, OPC_CUSTOM_3,(outs),
                         (ins GPR:$rs1, GPR:$rs2),
                         "config_ex", "$rs1, $rs2"> {
   let rd = 0;
 }
 
 let Predicates = [HasBuddyExt] in
-def CONFIG_NORM : RVInstR<0b0000000, 0b011, OPC_CUSTOM_3,(outs),
+def CONFIG_NORM : RVInstR<0b0011001, 0b011, OPC_CUSTOM_3,(outs),
                           (ins GPR:$rs1, GPR:$rs2),
                           "config_norm", "$rs1, $rs2"> {
   let rd = 0;
 }
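
The encoding comment above is easy to check by hand against the base R-type format. As a hedged sketch (assuming `RVInstR<funct7, funct3, opcode>` follows the standard RISC-V R-type field layout and that `OPC_CUSTOM_3` denotes the custom-3 major opcode `0b1111011`; neither is spelled out in this patch), the new `config_ld rs1, rs2` packs as:

    31      25 24   20 19   15 14  12 11    7 6       0
    | 0010110 |  rs2  |  rs1  | 011  | 00000 | 1111011 |
       func7                   func3   rd = 0  custom-3

Only the seven `func7` bits differ between CONFIG_LD, CONFIG_ST, CONFIG_EX, and CONFIG_NORM, which is exactly the field the generated disassembler needs to tell four otherwise-identical R-type patterns apart.
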
diff --git a/backend/llvm/lib/Transforms/Vectorize/CMakeLists.txt b/backend/llvm/lib/Transforms/Vectorize/CMakeLists.txt
index 669aae5850..9446121807 100644
--- a/backend/llvm/lib/Transforms/Vectorize/CMakeLists.txt
+++ b/backend/llvm/lib/Transforms/Vectorize/CMakeLists.txt
@@ -8,7 +8,6 @@ add_llvm_component_library(LLVMBuddyVectorize
   ${LLVM_Vectorize_DIR}/Vectorize.cpp
   ${LLVM_Vectorize_DIR}/VectorCombine.cpp
   ${LLVM_Vectorize_DIR}/VPlan.cpp
-  ${LLVM_Vectorize_DIR}/VPlanHCFGBuilder.cpp
   ${LLVM_Vectorize_DIR}/VPlanRecipes.cpp
   ${LLVM_Vectorize_DIR}/VPlanSLP.cpp
   ${LLVM_Vectorize_DIR}/VPlanTransforms.cpp
diff --git a/examples/BuddyConvolution/conv2d-nhwc-fhwc-opt.mlir b/examples/BuddyConvolution/conv2d-nhwc-fhwc-opt.mlir
index 5155feee72..08592577c3 100644
--- a/examples/BuddyConvolution/conv2d-nhwc-fhwc-opt.mlir
+++ b/examples/BuddyConvolution/conv2d-nhwc-fhwc-opt.mlir
@@ -4,7 +4,7 @@
 // RUN: --one-shot-bufferize="bufferize-function-boundaries" \
 // RUN: -convert-scf-to-cf \
 // RUN: -convert-cf-to-llvm \
-// RUN: -convert-vector-to-llvm \
+// RUN: -convert-vector-to-llvm -convert-ub-to-llvm \
 // RUN: -convert-arith-to-llvm \
 // RUN: -finalize-memref-to-llvm \
 // RUN: -convert-func-to-llvm \
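
The added `-convert-ub-to-llvm` is a consequence of the LLVM bump rather than a change to the convolution itself: newer vector and arith lowerings can materialize `ub.poison` values, and `-convert-vector-to-llvm` alone does not legalize them. A minimal sketch of the kind of IR that would otherwise survive to the LLVM translation and be rejected (hypothetical input, not taken from this repository):

    // -convert-ub-to-llvm rewrites ub.poison into llvm.mlir.poison;
    // without it, this op has no LLVM-dialect equivalent in the pipeline.
    func.func @needs_ub_lowering() -> vector<4xf32> {
      %p = ub.poison : vector<4xf32>
      return %p : vector<4xf32>
    }
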
diff --git a/examples/BuddyNext/next-attention-fusion.mlir b/examples/BuddyNext/next-attention-fusion.mlir
index 98c5d8e072..61af9aaec1 100644
--- a/examples/BuddyNext/next-attention-fusion.mlir
+++ b/examples/BuddyNext/next-attention-fusion.mlir
@@ -29,18 +29,13 @@ module {
   memref.global "private" constant @__constant_32x128x40xf32 : memref<32x128x40xf32> = dense<2.000000e+00> {alignment = 64 : i64}
   memref.global "private" constant @__constant_32x40x128xf32 : memref<32x40x128xf32> = dense<3.000000e+00> {alignment = 64 : i64}
   memref.global "private" constant @__constant_1x32x40x40xf32 : memref<1x32x40x40xf32> = dense<11.3137083> {alignment = 64 : i64}
-  func.func @kenerl(%arg0: tensor<32x40x128xf32>, %arg1: tensor<32x128x40xf32>, %arg2: tensor<1x1x40x40xf32>, %arg3: tensor<1x32x40x128xf32>) {
+  func.func @kenerl(%arg0: memref<32x40x128xf32>, %arg1: memref<32x128x40xf32>, %arg2: memref<1x1x40x40xf32>, %arg3: memref<1x32x40x128xf32>) {
     %t_start = call @rtclock() : () -> f64
     %cst = arith.constant 0.0883883461 : f32
     %c0 = arith.constant 0 : index
     %cst_0 = arith.constant 0.000000e+00 : f32
     %cst_1 = arith.constant 1.000000e+00 : f32
    %cst_2 = arith.constant -3.40282347E+38 : f32
-    %0 = bufferization.to_memref %arg3 : tensor<1x32x40x128xf32> to memref<1x32x40x128xf32, strided<[?, ?, ?, ?], offset: ?>>
-    %1 = bufferization.to_memref %arg2 : tensor<1x1x40x40xf32> to memref<1x1x40x40xf32, strided<[?, ?, ?, ?], offset: ?>>
-    %2 = bufferization.to_memref %arg1 : tensor<32x128x40xf32> to memref<32x128x40xf32, strided<[?, ?, ?], offset: ?>>
-    %3 = bufferization.to_memref %arg0 : tensor<32x40x128xf32> to memref<32x40x128xf32, strided<[?, ?, ?], offset: ?>>
-
     // MatMul
     // %0 = tosa.matmul %t0, %t1 : (tensor<32x40x128xf32>, tensor<32x128x40xf32>) -> tensor<32x40x40xf32>
     // Initialize MatMul Output.
@@ -57,8 +52,8 @@ module {
       affine.for %arg5 = 0 to 40 {
        affine.for %arg6 = 0 to 40 {
          affine.for %arg7 = 0 to 128 {
-            %5 = affine.load %3[%arg4, %arg5, %arg7] : memref<32x40x128xf32, strided<[?, ?, ?], offset: ?>>
-            %6 = affine.load %2[%arg4, %arg7, %arg6] : memref<32x128x40xf32, strided<[?, ?, ?], offset: ?>>
+            %5 = affine.load %arg0[%arg4, %arg5, %arg7] : memref<32x40x128xf32>
+            %6 = affine.load %arg1[%arg4, %arg7, %arg6] : memref<32x128x40xf32>
            %7 = affine.load %alloc[%arg4, %arg5, %arg6] : memref<32x40x40xf32>
            %8 = arith.mulf %5, %6 : f32
            %9 = arith.addf %7, %8 : f32
@@ -72,7 +67,8 @@ module {
    // %1 = tosa.reshape %0 {new_shape = array<i64: 1, 32, 40, 40>} : (tensor<32x40x40xf32>) -> tensor<1x32x40x40xf32>
    // %2 = "tosa.const"() <{value = dense<11.3137083> : tensor<1x32x40x40xf32>}> : () -> tensor<1x32x40x40xf32>
    // %3 = tosa.reciprocal %2 : (tensor<1x32x40x40xf32>) -> tensor<1x32x40x40xf32>
-    // %4 = tosa.mul %1, %3 {shift = 0 : i8} : (tensor<1x32x40x40xf32>, tensor<1x32x40x40xf32>) -> tensor<1x32x40x40xf32>
+    // %shift_4 = "tosa.const"() <{values = dense<0> : tensor<1xi8>}> : () -> tensor<1xi8>
+    // %4 = tosa.mul %1, %3, %shift_4 : (tensor<1x32x40x40xf32>, tensor<1x32x40x40xf32>, tensor<1xi8>) -> tensor<1x32x40x40xf32>
    // %5 = tosa.add %4, %t2 : (tensor<1x32x40x40xf32>, tensor<1x1x40x40xf32>) -> tensor<1x32x40x40xf32>
    // %6 = tosa.reduce_max %5 {axis = 3 : i32} : (tensor<1x32x40x40xf32>) -> tensor<1x32x40x1xf32>
    %expand_shape = memref.expand_shape %alloc [[0, 1], [2], [3]] output_shape [1, 32, 40, 40]: memref<32x40x40xf32> into memref<1x32x40x40xf32>
@@ -93,7 +89,7 @@ module {
          // Fusion point: reshape + constant + reciprocal -> %cst
          %6 = arith.mulf %5, %cst : f32
          // Fusion point: addition
-          %7 = affine.load %1[%c0, %c0, %arg6, %arg7] : memref<1x1x40x40xf32, strided<[?, ?, ?, ?], offset: ?>>
+          %7 = affine.load %arg2[%c0, %c0, %arg6, %arg7] : memref<1x1x40x40xf32>
          %8 = arith.addf %6, %7 : f32
          // Fusion point: reduce max
          %9 = affine.load %alloc_6[%arg4, %arg5, %arg6] : memref<1x32x40xf32>
@@ -142,7 +138,8 @@ module {
    // Fusion: Reciprocal + Multiplication
    // %10 = tosa.reciprocal %9 : (tensor<1x32x40x1xf32>) -> tensor<1x32x40x1xf32>
-    // %11 = tosa.mul %8, %10 {shift = 0 : i8} : (tensor<1x32x40x40xf32>, tensor<1x32x40x1xf32>) -> tensor<1x32x40x40xf32>
+    // %shift_11 = "tosa.const"() <{values = dense<0> : tensor<1xi8>}> : () -> tensor<1xi8>
+    // %11 = tosa.mul %8, %10, %shift_11 : (tensor<1x32x40x40xf32>, tensor<1x32x40x1xf32>, tensor<1xi8>) -> tensor<1x32x40x40xf32>
    %expand_shape_11 = memref.expand_shape %alloc_10 [[0], [1], [2, 3]] output_shape [1, 32, 40, 1]: memref<1x32x40xf32> into memref<1x32x40x1xf32>
    %alloc_13 = memref.alloc() {alignment = 64 : i64} : memref<1x32x40x40xf32>
    affine.for %arg4 = 0 to 1 {
@@ -171,7 +168,7 @@ module {
    %collapse_shape = memref.collapse_shape %alloc_13 [[0, 1], [2], [3]] : memref<1x32x40x40xf32> into memref<32x40x40xf32>
    %alloc_14 = memref.alloc() {alignment = 64 : i64} : memref<1x32x40x128xf32>
    // SSA value %0 is from %arg3
-    memref.copy %0, %alloc_14 : memref<1x32x40x128xf32, strided<[?, ?, ?, ?], offset: ?>> to memref<1x32x40x128xf32>
+    memref.copy %arg3, %alloc_14 : memref<1x32x40x128xf32> to memref<1x32x40x128xf32>
    %collapse_shape_15 = memref.collapse_shape %alloc_14 [[0, 1], [2], [3]] : memref<1x32x40x128xf32> into memref<32x40x128xf32>
 
    // MatMul
@@ -222,14 +219,10 @@ module {
  }
  func.func @main() {
    %0 = memref.get_global @__constant_32x40x128xf32 : memref<32x40x128xf32>
-    %1 = bufferization.to_tensor %0 restrict: memref<32x40x128xf32> to tensor<32x40x128xf32>
-    %2 = memref.get_global @__constant_32x128x40xf32 : memref<32x128x40xf32>
-    %3 = bufferization.to_tensor %2 restrict: memref<32x128x40xf32> to tensor<32x128x40xf32>
-    %4 = memref.get_global @__constant_1x1x40x40xf32 : memref<1x1x40x40xf32>
-    %5 = bufferization.to_tensor %4 restrict: memref<1x1x40x40xf32> to tensor<1x1x40x40xf32>
-    %6 = memref.get_global @__constant_1x32x40x128xf32 : memref<1x32x40x128xf32>
-    %7 = bufferization.to_tensor %6 restrict: memref<1x32x40x128xf32> to tensor<1x32x40x128xf32>
-    call @kenerl(%1, %3, %5, %7) : (tensor<32x40x128xf32>, tensor<32x128x40xf32>, tensor<1x1x40x40xf32>, tensor<1x32x40x128xf32>) -> ()
+    %1 = memref.get_global @__constant_32x128x40xf32 : memref<32x128x40xf32>
+    %2 = memref.get_global @__constant_1x1x40x40xf32 : memref<1x1x40x40xf32>
+    %3 = memref.get_global @__constant_1x32x40x128xf32 : memref<1x32x40x128xf32>
+    call @kenerl(%0, %1, %2, %3) : (memref<32x40x128xf32>, memref<32x128x40xf32>, memref<1x1x40x40xf32>, memref<1x32x40x128xf32>) -> ()
    return
  }
  func.func private @printMemrefF32(memref<*xf32>)
diff --git a/examples/BuddyNext/next-attention-loop.mlir b/examples/BuddyNext/next-attention-loop.mlir
index aff05a5d6d..21656e2a63 100644
--- a/examples/BuddyNext/next-attention-loop.mlir
+++ b/examples/BuddyNext/next-attention-loop.mlir
@@ -29,18 +29,13 @@ module {
   memref.global "private" constant @__constant_32x128x40xf32 : memref<32x128x40xf32> = dense<2.000000e+00> {alignment = 64 : i64}
   memref.global "private" constant @__constant_32x40x128xf32 : memref<32x40x128xf32> = dense<3.000000e+00> {alignment = 64 : i64}
   memref.global "private" constant @__constant_1x32x40x40xf32 : memref<1x32x40x40xf32> = dense<11.3137083> {alignment = 64 : i64}
-  func.func @kenerl(%arg0: tensor<32x40x128xf32>, %arg1: tensor<32x128x40xf32>, %arg2: tensor<1x1x40x40xf32>, %arg3: tensor<1x32x40x128xf32>) {
+  func.func @kenerl(%arg0: memref<32x40x128xf32>, %arg1: memref<32x128x40xf32>, %arg2: memref<1x1x40x40xf32>, %arg3: memref<1x32x40x128xf32>) {
    %t_start = call @rtclock() : () -> f64
    %cst = arith.constant 0.0883883461 : f32
    %c0 = arith.constant 0 : index
    %cst_0 = arith.constant 0.000000e+00 : f32
    %cst_1 = arith.constant 1.000000e+00 : f32
    %cst_2 = arith.constant -3.40282347E+38 : f32
-    %0 = bufferization.to_memref %arg3 : tensor<1x32x40x128xf32> to memref<1x32x40x128xf32, strided<[?, ?, ?, ?], offset: ?>>
-    %1 = bufferization.to_memref %arg2 : tensor<1x1x40x40xf32> to memref<1x1x40x40xf32, strided<[?, ?, ?, ?], offset: ?>>
-    %2 = bufferization.to_memref %arg1 : tensor<32x128x40xf32> to memref<32x128x40xf32, strided<[?, ?, ?], offset: ?>>
-    %3 = bufferization.to_memref %arg0 : tensor<32x40x128xf32> to memref<32x40x128xf32, strided<[?, ?, ?], offset: ?>>
-
    // MatMul
    // %0 = tosa.matmul %t0, %t1 : (tensor<32x40x128xf32>, tensor<32x128x40xf32>) -> tensor<32x40x40xf32>
    // Initialize MatMul Output.
@@ -57,8 +52,8 @@ module {
      affine.for %arg5 = 0 to 40 {
        affine.for %arg6 = 0 to 40 {
          affine.for %arg7 = 0 to 128 {
-            %5 = affine.load %3[%arg4, %arg5, %arg7] : memref<32x40x128xf32, strided<[?, ?, ?], offset: ?>>
-            %6 = affine.load %2[%arg4, %arg7, %arg6] : memref<32x128x40xf32, strided<[?, ?, ?], offset: ?>>
+            %5 = affine.load %arg0[%arg4, %arg5, %arg7] : memref<32x40x128xf32>
+            %6 = affine.load %arg1[%arg4, %arg7, %arg6] : memref<32x128x40xf32>
            %7 = affine.load %alloc[%arg4, %arg5, %arg6] : memref<32x40x40xf32>
            %8 = arith.mulf %5, %6 : f32
            %9 = arith.addf %7, %8 : f32
@@ -85,7 +80,8 @@ module {
    }
 
    // Multiplication
-    // %4 = tosa.mul %1, %3 {shift = 0 : i8} : (tensor<1x32x40x40xf32>, tensor<1x32x40x40xf32>) -> tensor<1x32x40x40xf32>
+    // %shift_4 = "tosa.const"() <{values = dense<0> : tensor<1xi8>}> : () -> tensor<1xi8>
+    // %4 = tosa.mul %1, %3, %shift_4 : (tensor<1x32x40x40xf32>, tensor<1x32x40x40xf32>, tensor<1xi8>) -> tensor<1x32x40x40xf32>
    %alloc_4 = memref.alloc() {alignment = 64 : i64} : memref<1x32x40x40xf32>
    affine.for %arg4 = 0 to 1 {
      affine.for %arg5 = 0 to 32 {
@@ -108,7 +104,7 @@ module {
        affine.for %arg6 = 0 to 40 {
          affine.for %arg7 = 0 to 40 {
            %5 = affine.load %alloc_4[%c0, %arg5, %arg6, %arg7] : memref<1x32x40x40xf32>
-            %6 = affine.load %1[%c0, %c0, %arg6, %arg7] : memref<1x1x40x40xf32, strided<[?, ?, ?, ?], offset: ?>>
+            %6 = affine.load %arg2[%c0, %c0, %arg6, %arg7] : memref<1x1x40x40xf32>
            %7 = arith.addf %5, %6 : f32
            affine.store %7, %alloc_5[%arg4, %arg5, %arg6, %arg7] : memref<1x32x40x40xf32>
          }
@@ -220,7 +216,8 @@ module {
    }
 
    // Multiplication
-    // %11 = tosa.mul %8, %10 {shift = 0 : i8} : (tensor<1x32x40x40xf32>, tensor<1x32x40x1xf32>) -> tensor<1x32x40x40xf32>
+    // %shift_11 = "tosa.const"() <{values = dense<0> : tensor<1xi8>}> : () -> tensor<1xi8>
+    // %11 = tosa.mul %8, %10, %shift_11 : (tensor<1x32x40x40xf32>, tensor<1x32x40x1xf32>, tensor<1xi8>) -> tensor<1x32x40x40xf32>
    %alloc_13 = memref.alloc() {alignment = 64 : i64} : memref<1x32x40x40xf32>
    affine.for %arg4 = 0 to 1 {
      affine.for %arg5 = 0 to 32 {
@@ -245,7 +242,7 @@ module {
    %collapse_shape = memref.collapse_shape %alloc_13 [[0, 1], [2], [3]] : memref<1x32x40x40xf32> into memref<32x40x40xf32>
    %alloc_14 = memref.alloc() {alignment = 64 : i64} : memref<1x32x40x128xf32>
    // SSA value %0 is from %arg3
-    memref.copy %0, %alloc_14 : memref<1x32x40x128xf32, strided<[?, ?, ?, ?], offset: ?>> to memref<1x32x40x128xf32>
+    memref.copy %arg3, %alloc_14 : memref<1x32x40x128xf32> to memref<1x32x40x128xf32>
    %collapse_shape_15 = memref.collapse_shape %alloc_14 [[0, 1], [2], [3]] : memref<1x32x40x128xf32> into memref<32x40x128xf32>
 
    // MatMul
@@ -297,14 +294,10 @@ module {
  }
  func.func @main() {
    %0 = memref.get_global @__constant_32x40x128xf32 : memref<32x40x128xf32>
-    %1 = bufferization.to_tensor %0 restrict: memref<32x40x128xf32> to tensor<32x40x128xf32>
-    %2 = memref.get_global @__constant_32x128x40xf32 : memref<32x128x40xf32>
-    %3 = bufferization.to_tensor %2 restrict: memref<32x128x40xf32> to tensor<32x128x40xf32>
-    %4 = memref.get_global @__constant_1x1x40x40xf32 : memref<1x1x40x40xf32>
-    %5 = bufferization.to_tensor %4 restrict: memref<1x1x40x40xf32> to tensor<1x1x40x40xf32>
-    %6 = memref.get_global @__constant_1x32x40x128xf32 : memref<1x32x40x128xf32>
-    %7 = bufferization.to_tensor %6 restrict: memref<1x32x40x128xf32> to tensor<1x32x40x128xf32>
-    call @kenerl(%1, %3, %5, %7) : (tensor<32x40x128xf32>, tensor<32x128x40xf32>, tensor<1x1x40x40xf32>, tensor<1x32x40x128xf32>) -> ()
+    %1 = memref.get_global @__constant_32x128x40xf32 : memref<32x128x40xf32>
+    %2 = memref.get_global @__constant_1x1x40x40xf32 : memref<1x1x40x40xf32>
+    %3 = memref.get_global @__constant_1x32x40x128xf32 : memref<1x32x40x128xf32>
+    call @kenerl(%0, %1, %2, %3) : (memref<32x40x128xf32>, memref<32x128x40xf32>, memref<1x1x40x40xf32>, memref<1x32x40x128xf32>) -> ()
    return
  }
  func.func private @printMemrefF32(memref<*xf32>)
diff --git a/examples/BuddyNext/next-attention.mlir b/examples/BuddyNext/next-attention.mlir
index 0dad20b11a..ad1b025804 100644
--- a/examples/BuddyNext/next-attention.mlir
+++ b/examples/BuddyNext/next-attention.mlir
@@ -33,25 +33,34 @@ func.func private @rtclock() -> f64
 
 func.func @kernel(%t0 : tensor<32x40x128xf32>, %t1 : tensor<32x128x40xf32>, %t2 : tensor<1x1x40x40xf32>, %t3 : tensor<1x32x40x128xf32>) {
   %t_start = call @rtclock() : () -> f64
 
-  %0 = tosa.matmul %t0, %t1 : (tensor<32x40x128xf32>, tensor<32x128x40xf32>) -> tensor<32x40x40xf32>
-  %1 = tosa.reshape %0 {new_shape = array<i64: 1, 32, 40, 40>} : (tensor<32x40x40xf32>) -> tensor<1x32x40x40xf32>
-  %2 = "tosa.const"() <{value = dense<11.3137083> : tensor<1x32x40x40xf32>}> : () -> tensor<1x32x40x40xf32>
+  %zp1_0 = "tosa.const"() <{values = dense<0.0> : tensor<1xf32>}> : () -> tensor<1xf32>
+  %zp2_0 = "tosa.const"() <{values = dense<0.0> : tensor<1xf32>}> : () -> tensor<1xf32>
+  %0 = tosa.matmul %t0, %t1, %zp1_0, %zp2_0 : (tensor<32x40x128xf32>, tensor<32x128x40xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor<32x40x40xf32>
+  %shape_1 = tosa.const_shape {values = dense<[1, 32, 40, 40]> : tensor<4xindex>} : () -> !tosa.shape<4>
+  %1 = tosa.reshape %0, %shape_1 : (tensor<32x40x40xf32>, !tosa.shape<4>) -> tensor<1x32x40x40xf32>
+  %2 = "tosa.const"() <{values = dense<11.3137083> : tensor<1x32x40x40xf32>}> : () -> tensor<1x32x40x40xf32>
   %3 = tosa.reciprocal %2 : (tensor<1x32x40x40xf32>) -> tensor<1x32x40x40xf32>
-  %4 = tosa.mul %1, %3 {shift = 0 : i8} : (tensor<1x32x40x40xf32>, tensor<1x32x40x40xf32>) -> tensor<1x32x40x40xf32>
+  %shift_4 = "tosa.const"() <{values = dense<0> : tensor<1xi8>}> : () -> tensor<1xi8>
+  %4 = tosa.mul %1, %3, %shift_4 : (tensor<1x32x40x40xf32>, tensor<1x32x40x40xf32>, tensor<1xi8>) -> tensor<1x32x40x40xf32>
   %5 = tosa.add %4, %t2 : (tensor<1x32x40x40xf32>, tensor<1x1x40x40xf32>) -> tensor<1x32x40x40xf32>
   %6 = tosa.reduce_max %5 {axis = 3 : i32} : (tensor<1x32x40x40xf32>) -> tensor<1x32x40x1xf32>
   %7 = tosa.sub %5, %6 : (tensor<1x32x40x40xf32>, tensor<1x32x40x1xf32>) -> tensor<1x32x40x40xf32>
   %8 = tosa.exp %7 : (tensor<1x32x40x40xf32>) -> tensor<1x32x40x40xf32>
   %9 = tosa.reduce_sum %8 {axis = 3 : i32} : (tensor<1x32x40x40xf32>) -> tensor<1x32x40x1xf32>
   %10 = tosa.reciprocal %9 : (tensor<1x32x40x1xf32>) -> tensor<1x32x40x1xf32>
-  %11 = tosa.mul %8, %10 {shift = 0 : i8} : (tensor<1x32x40x40xf32>, tensor<1x32x40x1xf32>) -> tensor<1x32x40x40xf32>
-  %12 = "tosa.const"() <{value = dense<0.000000e+00> : tensor<1x32x40x40xf32>}> : () -> tensor<1x32x40x40xf32>
+  %shift_11 = "tosa.const"() <{values = dense<0> : tensor<1xi8>}> : () -> tensor<1xi8>
+  %11 = tosa.mul %8, %10, %shift_11 : (tensor<1x32x40x40xf32>, tensor<1x32x40x1xf32>, tensor<1xi8>) -> tensor<1x32x40x40xf32>
+  %12 = "tosa.const"() <{values = dense<0.000000e+00> : tensor<1x32x40x40xf32>}> : () -> tensor<1x32x40x40xf32>
   %13 = tosa.add %11, %12 : (tensor<1x32x40x40xf32>, tensor<1x32x40x40xf32>) -> tensor<1x32x40x40xf32>
-  %14 = tosa.reshape %13 {new_shape = array<i64: 32, 40, 40>} : (tensor<1x32x40x40xf32>) -> tensor<32x40x40xf32>
-  %15 = "tosa.const"() <{value = dense<0.000000e+00> : tensor<1x32x40x128xf32>}> : () -> tensor<1x32x40x128xf32>
+  %shape_14 = tosa.const_shape {values = dense<[32, 40, 40]> : tensor<3xindex>} : () -> !tosa.shape<3>
+  %14 = tosa.reshape %13, %shape_14 : (tensor<1x32x40x40xf32>, !tosa.shape<3>) -> tensor<32x40x40xf32>
+  %15 = "tosa.const"() <{values = dense<0.000000e+00> : tensor<1x32x40x128xf32>}> : () -> tensor<1x32x40x128xf32>
   %16 = tosa.add %t3, %15 : (tensor<1x32x40x128xf32>, tensor<1x32x40x128xf32>) -> tensor<1x32x40x128xf32>
-  %17 = tosa.reshape %16 {new_shape = array<i64: 32, 40, 128>} : (tensor<1x32x40x128xf32>) -> tensor<32x40x128xf32>
-  %18 = tosa.matmul %14, %17 : (tensor<32x40x40xf32>, tensor<32x40x128xf32>) -> tensor<32x40x128xf32>
+  %shape_17 = tosa.const_shape {values = dense<[32, 40, 128]> : tensor<3xindex>} : () -> !tosa.shape<3>
+  %17 = tosa.reshape %16, %shape_17 : (tensor<1x32x40x128xf32>, !tosa.shape<3>) -> tensor<32x40x128xf32>
+  %zp1_18 = "tosa.const"() <{values = dense<0.0> : tensor<1xf32>}> : () -> tensor<1xf32>
+  %zp2_18 = "tosa.const"() <{values = dense<0.0> : tensor<1xf32>}> : () -> tensor<1xf32>
+  %18 = tosa.matmul %14, %17, %zp1_18, %zp2_18 : (tensor<32x40x40xf32>, tensor<32x40x128xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor<32x40x128xf32>
 
   %t_end = call @rtclock() : () -> f64
   %time = arith.subf %t_end, %t_start : f64
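
The attention files above exercise the whole TOSA migration in one place; the remaining example updates repeat the same substitutions. As a condensed sketch with made-up shapes (the op spellings are taken from the diff itself; the function name and shapes are illustrative only):

    func.func @tosa_migration(%a: tensor<2x3x4xf32>, %b: tensor<2x4x3xf32>) -> tensor<1x2x3x3xf32> {
      // tosa.matmul now takes explicit zero-point operands (dense<0.0> for f32).
      %azp = "tosa.const"() <{values = dense<0.0> : tensor<1xf32>}> : () -> tensor<1xf32>
      %bzp = "tosa.const"() <{values = dense<0.0> : tensor<1xf32>}> : () -> tensor<1xf32>
      %mm = tosa.matmul %a, %b, %azp, %bzp : (tensor<2x3x4xf32>, tensor<2x4x3xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor<2x3x3xf32>
      // tosa.reshape takes a !tosa.shape operand instead of a new_shape attribute.
      %shape = tosa.const_shape {values = dense<[1, 2, 3, 3]> : tensor<4xindex>} : () -> !tosa.shape<4>
      %r = tosa.reshape %mm, %shape : (tensor<2x3x3xf32>, !tosa.shape<4>) -> tensor<1x2x3x3xf32>
      // tosa.mul takes the shift as a tensor<1xi8> operand; tosa.const spells
      // its attribute `values` rather than `value`.
      %shift = "tosa.const"() <{values = dense<0> : tensor<1xi8>}> : () -> tensor<1xi8>
      %out = tosa.mul %r, %r, %shift : (tensor<1x2x3x3xf32>, tensor<1x2x3x3xf32>, tensor<1xi8>) -> tensor<1x2x3x3xf32>
      return %out : tensor<1x2x3x3xf32>
    }
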
diff --git a/examples/BuddyNext/next-eliminate-add-zero.mlir b/examples/BuddyNext/next-eliminate-add-zero.mlir
index bf3cb85f5c..86cb9f00d0 100644
--- a/examples/BuddyNext/next-eliminate-add-zero.mlir
+++ b/examples/BuddyNext/next-eliminate-add-zero.mlir
@@ -35,9 +35,10 @@ module {
    %t0_original = call @rtclock() : () -> f64
 
    %84 = arith.constant dense<2.0> : tensor<1x32x40x128xf32>
-    %92 = "tosa.const"() <{value = dense<0.000000e+00> : tensor<1x32x40x128xf32>}> : () -> tensor<1x32x40x128xf32>
+    %92 = "tosa.const"() <{values = dense<0.000000e+00> : tensor<1x32x40x128xf32>}> : () -> tensor<1x32x40x128xf32>
    %93 = tosa.add %84, %92 : (tensor<1x32x40x128xf32>, tensor<1x32x40x128xf32>) -> tensor<1x32x40x128xf32>
-    %94 = tosa.reshape %93 {new_shape = array<i64: 32, 40, 128>} : (tensor<1x32x40x128xf32>) -> tensor<32x40x128xf32>
+    %shape_94 = tosa.const_shape {values = dense<[32, 40, 128]> : tensor<3xindex>} : () -> !tosa.shape<3>
+    %94 = tosa.reshape %93, %shape_94 : (tensor<1x32x40x128xf32>, !tosa.shape<3>) -> tensor<32x40x128xf32>
    %t1_original = call @rtclock() : () -> f64
 
    %tensor_unranked = tensor.cast %94 : tensor<32x40x128xf32> to tensor<*xf32>
@@ -63,7 +64,8 @@ module {
    %t0_optimized = call @rtclock() : () -> f64
 
    %84 = arith.constant dense<2.0> : tensor<1x32x40x128xf32>
-    %94 = tosa.reshape %84 {new_shape = array<i64: 32, 40, 128>} : (tensor<1x32x40x128xf32>) -> tensor<32x40x128xf32>
+    %shape_94 = tosa.const_shape {values = dense<[32, 40, 128]> : tensor<3xindex>} : () -> !tosa.shape<3>
+    %94 = tosa.reshape %84, %shape_94 : (tensor<1x32x40x128xf32>, !tosa.shape<3>) -> tensor<32x40x128xf32>
    %t1_optimized = call @rtclock() : () -> f64
 
    %tensor_unranked = tensor.cast %94 : tensor<32x40x128xf32> to tensor<*xf32>
diff --git a/examples/BuddyNext/next-eliminate-identity.mlir b/examples/BuddyNext/next-eliminate-identity.mlir
index b4af852f52..e8c86afff3 100644
--- a/examples/BuddyNext/next-eliminate-identity.mlir
+++ b/examples/BuddyNext/next-eliminate-identity.mlir
@@ -36,7 +36,8 @@ module {
 
    %119 = arith.constant dense<1.0> : tensor<1x40x32x128xf32>
    %120 = tosa.identity %119 : (tensor<1x40x32x128xf32>) -> tensor<1x40x32x128xf32>
-    %121 = tosa.reshape %120 {new_shape = array<i64: 1, 40, 4096>} : (tensor<1x40x32x128xf32>) -> tensor<1x40x4096xf32>
+    %shape_121 = tosa.const_shape {values = dense<[1, 40, 4096]> : tensor<3xindex>} : () -> !tosa.shape<3>
+    %121 = tosa.reshape %120, %shape_121 : (tensor<1x40x32x128xf32>, !tosa.shape<3>) -> tensor<1x40x4096xf32>
    %t1_original = call @rtclock() : () -> f64
 
    %tensor_unranked = tensor.cast %121 : tensor<1x40x4096xf32> to tensor<*xf32>
@@ -61,7 +62,8 @@ module {
    %t0_optimized = call @rtclock() : () -> f64
 
    %119 = arith.constant dense<1.0> : tensor<1x40x32x128xf32>
-    %121 = tosa.reshape %119 {new_shape = array<i64: 1, 40, 4096>} : (tensor<1x40x32x128xf32>) -> tensor<1x40x4096xf32>
+    %shape_121 = tosa.const_shape {values = dense<[1, 40, 4096]> : tensor<3xindex>} : () -> !tosa.shape<3>
+    %121 = tosa.reshape %119, %shape_121 : (tensor<1x40x32x128xf32>, !tosa.shape<3>) -> tensor<1x40x4096xf32>
    %t1_optimized = call @rtclock() : () -> f64
 
    %tensor_unranked = tensor.cast %121 : tensor<1x40x4096xf32> to tensor<*xf32>
diff --git a/examples/BuddyNext/next-embedding.mlir b/examples/BuddyNext/next-embedding.mlir
index aaabf8cb13..482bbbdf9f 100644
--- a/examples/BuddyNext/next-embedding.mlir
+++ b/examples/BuddyNext/next-embedding.mlir
@@ -33,13 +33,17 @@ func.func private @rtclock() -> f64
 func.func @kernel(%t0: tensor<32000x4096xf32>, %t1: tensor<1x40xi64>) {
   %t_start = call @rtclock() : () -> f64
 
-  %0 = "tosa.const"() <{value = dense<[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39]> : tensor<40xi64>}> : () -> tensor<40xi64>
-  %1 = tosa.reshape %0 {new_shape = array<i64: 1, 40>} : (tensor<40xi64>) -> tensor<1x40xi64>
-  %2 = tosa.reshape %1 {new_shape = array<i64: 1, 40>} : (tensor<1x40xi64>) -> tensor<1x40xi64>
+  %0 = "tosa.const"() <{values = dense<[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39]> : tensor<40xi64>}> : () -> tensor<40xi64>
+  %shape_1 = tosa.const_shape {values = dense<[1, 40]> : tensor<2xindex>} : () -> !tosa.shape<2>
+  %1 = tosa.reshape %0, %shape_1 : (tensor<40xi64>, !tosa.shape<2>) -> tensor<1x40xi64>
+  %shape_2 = tosa.const_shape {values = dense<[1, 40]> : tensor<2xindex>} : () -> !tosa.shape<2>
+  %2 = tosa.reshape %1, %shape_2 : (tensor<1x40xi64>, !tosa.shape<2>) -> tensor<1x40xi64>
   %3 = tosa.cast %t1 : (tensor<1x40xi64>) -> tensor<1x40xi32>
-  %4 = tosa.reshape %t0 {new_shape = array<i64: 1, 32000, 4096>} : (tensor<32000x4096xf32>) -> tensor<1x32000x4096xf32>
+  %shape_4 = tosa.const_shape {values = dense<[1, 32000, 4096]> : tensor<3xindex>} : () -> !tosa.shape<3>
+  %4 = tosa.reshape %t0, %shape_4 : (tensor<32000x4096xf32>, !tosa.shape<3>) -> tensor<1x32000x4096xf32>
   %5 = tosa.gather %4, %3 : (tensor<1x32000x4096xf32>, tensor<1x40xi32>) -> tensor<1x40x4096xf32>
-  %6 = tosa.reshape %5 {new_shape = array<i64: 1, 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
+  %shape_6 = tosa.const_shape {values = dense<[1, 40, 4096]> : tensor<3xindex>} : () -> !tosa.shape<3>
+  %6 = tosa.reshape %5, %shape_6 : (tensor<1x40x4096xf32>, !tosa.shape<3>) -> tensor<1x40x4096xf32>
 
   %t_end = call @rtclock() : () -> f64
   %time = arith.subf %t_end, %t_start : f64
diff --git a/examples/BuddyNext/next-ffn.mlir b/examples/BuddyNext/next-ffn.mlir
index 0ec595f454..ebd40d89f7 100644
--- a/examples/BuddyNext/next-ffn.mlir
+++ b/examples/BuddyNext/next-ffn.mlir
@@ -53,38 +53,51 @@ func.func @kernel(%t0: tensor<1x40x4096xf32>, %t1: tensor<4096xf32>, %t2: tensor
     linalg.yield %4175 : f32
   } -> tensor<1x40x4096xf32>
   %130 = tosa.reduce_sum %129 {axis = 2 : i32} : (tensor<1x40x4096xf32>) -> tensor<1x40x1xf32>
-  %131 = "tosa.const"() <{value = dense<4.096000e+03> : tensor<1xf32>}> : () -> tensor<1xf32>
+  %131 = "tosa.const"() <{values = dense<4.096000e+03> : tensor<1xf32>}> : () -> tensor<1xf32>
   %132 = tosa.reciprocal %131 : (tensor<1xf32>) -> tensor<1xf32>
-  %shift = "tosa.const"() <{value = dense<0> : tensor<1xi8>}> : () -> tensor<1xi8>
-  %temp = tosa.reshape %132 {new_shape = array<i64: 1, 1, 1>} : (tensor<1xf32>) -> tensor<1x1x1xf32>
-  %133 = tosa.mul %temp, %130 {shift = 0 : i8} : (tensor<1x1x1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-  %134 = "tosa.const"() <{value = dense<9.99999974E-6> : tensor<1x40x1xf32>}> : () -> tensor<1x40x1xf32>
+  %shift = "tosa.const"() <{values = dense<0> : tensor<1xi8>}> : () -> tensor<1xi8>
+  %shape_temp = tosa.const_shape {values = dense<[1, 1, 1]> : tensor<3xindex>} : () -> !tosa.shape<3>
+  %temp = tosa.reshape %132, %shape_temp : (tensor<1xf32>, !tosa.shape<3>) -> tensor<1x1x1xf32>
+  %shift_133 = "tosa.const"() <{values = dense<0> : tensor<1xi8>}> : () -> tensor<1xi8>
+  %133 = tosa.mul %temp, %130, %shift_133 : (tensor<1x1x1xf32>, tensor<1x40x1xf32>, tensor<1xi8>) -> tensor<1x40x1xf32>
+  %134 = "tosa.const"() <{values = dense<9.99999974E-6> : tensor<1x40x1xf32>}> : () -> tensor<1x40x1xf32>
   %135 = tosa.add %133, %134 : (tensor<1x40x1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
   %136 = tosa.rsqrt %135 : (tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-  %137 = tosa.mul %t0, %136 {shift = 0 : i8} : (tensor<1x40x4096xf32>, tensor<1x40x1xf32>) -> tensor<1x40x4096xf32>
-  %138 = tosa.reshape %t1 {new_shape = array<i64: 1, 1, 4096>} : (tensor<4096xf32>) -> tensor<1x1x4096xf32>
-  %139 = tosa.mul %138, %137 {shift = 0 : i8} : (tensor<1x1x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
-  %140 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-  %141 = tosa.transpose %t2, %140 : (tensor<11008x4096xf32>, tensor<2xi32>) -> tensor<4096x11008xf32>
-  %142 = tosa.reshape %139 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
+  %shift_137 = "tosa.const"() <{values = dense<0> : tensor<1xi8>}> : () -> tensor<1xi8>
+  %137 = tosa.mul %t0, %136, %shift_137 : (tensor<1x40x4096xf32>, tensor<1x40x1xf32>, tensor<1xi8>) -> tensor<1x40x4096xf32>
+  %shape_138 = tosa.const_shape {values = dense<[1, 1, 4096]> : tensor<3xindex>} : () -> !tosa.shape<3>
+  %138 = tosa.reshape %t1, %shape_138 : (tensor<4096xf32>, !tosa.shape<3>) -> tensor<1x1x4096xf32>
+  %shift_139 = "tosa.const"() <{values = dense<0> : tensor<1xi8>}> : () -> tensor<1xi8>
+  %139 = tosa.mul %138, %137, %shift_139 : (tensor<1x1x4096xf32>, tensor<1x40x4096xf32>, tensor<1xi8>) -> tensor<1x40x4096xf32>
+  %140 = "tosa.const"() <{values = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
+  %141 = tosa.transpose %t2 {perms = array<i32: 1, 0>} : (tensor<11008x4096xf32>) -> tensor<4096x11008xf32>
+  %shape_142 = tosa.const_shape {values = dense<[40, 4096]> : tensor<2xindex>} : () -> !tosa.shape<2>
+  %142 = tosa.reshape %139, %shape_142 : (tensor<1x40x4096xf32>, !tosa.shape<2>) -> tensor<40x4096xf32>
   %cst_24 = arith.constant dense<0.000000e+00> : tensor<40x11008xf32>
   %143 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%142, %141 : tensor<40x4096xf32>, tensor<4096x11008xf32>) outs(%cst_24 : tensor<40x11008xf32>) -> tensor<40x11008xf32>
-  %144 = tosa.reshape %143 {new_shape = array<i64: 1, 40, 11008>} : (tensor<40x11008xf32>) -> tensor<1x40x11008xf32>
+  %shape_144 = tosa.const_shape {values = dense<[1, 40, 11008]> : tensor<3xindex>} : () -> !tosa.shape<3>
+  %144 = tosa.reshape %143, %shape_144 : (tensor<40x11008xf32>, !tosa.shape<3>) -> tensor<1x40x11008xf32>
   %145 = tosa.sigmoid %144 : (tensor<1x40x11008xf32>) -> tensor<1x40x11008xf32>
-  %146 = tosa.mul %144, %145 {shift = 0 : i8} : (tensor<1x40x11008xf32>, tensor<1x40x11008xf32>) -> tensor<1x40x11008xf32>
-  %147 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-  %148 = tosa.transpose %t3, %147 : (tensor<11008x4096xf32>, tensor<2xi32>) -> tensor<4096x11008xf32>
-  %149 = tosa.reshape %139 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
+  %shift_146 = "tosa.const"() <{values = dense<0> : tensor<1xi8>}> : () -> tensor<1xi8>
+  %146 = tosa.mul %144, %145, %shift_146 : (tensor<1x40x11008xf32>, tensor<1x40x11008xf32>, tensor<1xi8>) -> tensor<1x40x11008xf32>
+  %147 = "tosa.const"() <{values = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
+  %148 = tosa.transpose %t3 {perms = array<i32: 1, 0>} : (tensor<11008x4096xf32>) -> tensor<4096x11008xf32>
+  %shape_149 = tosa.const_shape {values = dense<[40, 4096]> : tensor<2xindex>} : () -> !tosa.shape<2>
+  %149 = tosa.reshape %139, %shape_149 : (tensor<1x40x4096xf32>, !tosa.shape<2>) -> tensor<40x4096xf32>
   %cst_25 = arith.constant dense<0.000000e+00> : tensor<40x11008xf32>
   %150 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%149, %148 : tensor<40x4096xf32>, tensor<4096x11008xf32>) outs(%cst_25 : tensor<40x11008xf32>) -> tensor<40x11008xf32>
-  %151 = tosa.reshape %150 {new_shape = array<i64: 1, 40, 11008>} : (tensor<40x11008xf32>) -> tensor<1x40x11008xf32>
-  %152 = tosa.mul %146, %151 {shift = 0 : i8} : (tensor<1x40x11008xf32>, tensor<1x40x11008xf32>) -> tensor<1x40x11008xf32>
-  %153 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-  %154 = tosa.transpose %t4, %153 : (tensor<4096x11008xf32>, tensor<2xi32>) -> tensor<11008x4096xf32>
-  %155 = tosa.reshape %152 {new_shape = array<i64: 40, 11008>} : (tensor<1x40x11008xf32>) -> tensor<40x11008xf32>
+  %shape_151 = tosa.const_shape {values = dense<[1, 40, 11008]> : tensor<3xindex>} : () -> !tosa.shape<3>
+  %151 = tosa.reshape %150, %shape_151 : (tensor<40x11008xf32>, !tosa.shape<3>) -> tensor<1x40x11008xf32>
+  %shift_152 = "tosa.const"() <{values = dense<0> : tensor<1xi8>}> : () -> tensor<1xi8>
+  %152 = tosa.mul %146, %151, %shift_152 : (tensor<1x40x11008xf32>, tensor<1x40x11008xf32>, tensor<1xi8>) -> tensor<1x40x11008xf32>
+  %153 = "tosa.const"() <{values = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
+  %154 = tosa.transpose %t4 {perms = array<i32: 1, 0>} : (tensor<4096x11008xf32>) -> tensor<11008x4096xf32>
+  %shape_155 = tosa.const_shape {values = dense<[40, 11008]> : tensor<2xindex>} : () -> !tosa.shape<2>
+  %155 = tosa.reshape %152, %shape_155 : (tensor<1x40x11008xf32>, !tosa.shape<2>) -> tensor<40x11008xf32>
   %cst_26 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
   %156 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%155, %154 : tensor<40x11008xf32>, tensor<11008x4096xf32>) outs(%cst_26 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-  %157 = tosa.reshape %156 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
+  %shape_157 = tosa.const_shape {values = dense<[1, 40, 4096]> : tensor<3xindex>} : () -> !tosa.shape<3>
+  %157 = tosa.reshape %156, %shape_157 : (tensor<40x4096xf32>, !tosa.shape<3>) -> tensor<1x40x4096xf32>
   %158 = tosa.add %t0, %157 : (tensor<1x40x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
 
   %t_end = call @rtclock() : () -> f64
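
One detail of the `tosa.transpose` change deserves a note: the permutation moved from a `tensor<2xi32>` SSA operand into a `perms` attribute, so the old permutation constants (`%140`, `%147`, `%153` above) no longer feed the transpose and are now dead values that canonicalization would drop. A minimal before/after sketch (hypothetical 2-D case mirroring the weight transposes above):

    // Old form: permutation supplied as a const tensor operand.
    //   %p = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
    //   %t = tosa.transpose %w, %p : (tensor<8x4xf32>, tensor<2xi32>) -> tensor<4x8xf32>
    // New form: permutation supplied inline as an attribute.
    func.func @transpose_migration(%w: tensor<8x4xf32>) -> tensor<4x8xf32> {
      %t = tosa.transpose %w {perms = array<i32: 1, 0>} : (tensor<8x4xf32>) -> tensor<4x8xf32>
      return %t : tensor<4x8xf32>
    }
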
diff --git a/examples/BuddyNext/next-mask.mlir b/examples/BuddyNext/next-mask.mlir
index f13fc9a3bb..7251d5709a 100644
--- a/examples/BuddyNext/next-mask.mlir
+++ b/examples/BuddyNext/next-mask.mlir
@@ -45,10 +45,11 @@ func.func @kernel() {
 
   %cst = arith.constant dense<true> : tensor<1x40xi1>
   %cst_0 = arith.constant dense<-3.40282347E+38> : tensor<40x40xf32>
-  %7 = "tosa.const"() <{value = dense<[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39]> : tensor<40xi64>}> : () -> tensor<40xi64>
-  %8 = "tosa.const"() <{value = dense<1> : tensor<40xi64>}> : () -> tensor<40xi64>
+  %7 = "tosa.const"() <{values = dense<[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39]> : tensor<40xi64>}> : () -> tensor<40xi64>
+  %8 = "tosa.const"() <{values = dense<1> : tensor<40xi64>}> : () -> tensor<40xi64>
   %9 = tosa.add %7, %8 : (tensor<40xi64>, tensor<40xi64>) -> tensor<40xi64>
-  %10 = tosa.reshape %9 {new_shape = array<i64: 40, 1>} : (tensor<40xi64>) -> tensor<40x1xi64>
+  %shape_10 = tosa.const_shape {values = dense<[40, 1]> : tensor<2xindex>} : () -> !tosa.shape<2>
+  %10 = tosa.reshape %9, %shape_10 : (tensor<40xi64>, !tosa.shape<2>) -> tensor<40x1xi64>
   %11 = tensor.empty() : tensor<40x40xi1>
   %12 = linalg.generic {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7, %10 : tensor<40xi64>, tensor<40x1xi64>) outs(%11 : tensor<40x40xi1>) {
   ^bb0(%in: i64, %in_742: i64, %out: i1):
@@ -63,13 +64,15 @@ func.func @kernel() {
     linalg.yield %4175 : f32
   } -> tensor<40x40xf32>
   %extracted_slice = tensor.extract_slice %cst[0, 0] [1, 40] [1, 1] : tensor<1x40xi1> to tensor<1x40xi1>
-  %15 = tosa.reshape %extracted_slice {new_shape = array<i64: 1, 1, 40>} : (tensor<1x40xi1>) -> tensor<1x1x40xi1>
-  %16 = tosa.reshape %15 {new_shape = array<i64: 1, 1, 1, 40>} : (tensor<1x1x40xi1>) -> tensor<1x1x1x40xi1>
+  %shape_15 = tosa.const_shape {values = dense<[1, 1, 40]> : tensor<3xindex>} : () -> !tosa.shape<3>
+  %15 = tosa.reshape %extracted_slice, %shape_15 : (tensor<1x40xi1>, !tosa.shape<3>) -> tensor<1x1x40xi1>
+  %shape_16 = tosa.const_shape {values = dense<[1, 1, 1, 40]> : tensor<4xindex>} : () -> !tosa.shape<4>
+  %16 = tosa.reshape %15, %shape_16 : (tensor<1x1x40xi1>, !tosa.shape<4>) -> tensor<1x1x1x40xi1>
   %extracted_slice_2 = tensor.extract_slice %16[0, 0, 0, 0] [1, 1, 1, 40] [1, 1, 1, 1] : tensor<1x1x1x40xi1> to tensor<1x1x1x40xi1>
-  %17 = "tosa.const"() <{value = dense<false> : tensor<1x1x40x40xi1>}> : () -> tensor<1x1x40x40xi1>
+  %17 = "tosa.const"() <{values = dense<false> : tensor<1x1x40x40xi1>}> : () -> tensor<1x1x40x40xi1>
   %18 = tosa.add %extracted_slice_2, %17 : (tensor<1x1x1x40xi1>, tensor<1x1x40x40xi1>) -> tensor<1x1x40x40xi1>
   %19 = tosa.cast %18 : (tensor<1x1x40x40xi1>) -> tensor<1x1x40x40xf32>
-  %20 = "tosa.const"() <{value = dense<1.000000e+00> : tensor<1x1x40x40xf32>}> : () -> tensor<1x1x40x40xf32>
+  %20 = "tosa.const"() <{values = dense<1.000000e+00> : tensor<1x1x40x40xf32>}> : () -> tensor<1x1x40x40xf32>
   %21 = tosa.sub %20, %19 : (tensor<1x1x40x40xf32>, tensor<1x1x40x40xf32>) -> tensor<1x1x40x40xf32>
   %22 = tosa.cast %21 : (tensor<1x1x40x40xf32>) -> tensor<1x1x40x40xi1>
   %cst_3 = arith.constant -3.40282347E+38 : f32
@@ -79,11 +82,13 @@ func.func @kernel() {
     %4175 = arith.select %in, %cst_3, %in_742 : f32
     linalg.yield %4175 : f32
   } -> tensor<1x1x40x40xf32>
-  %25 = tosa.reshape %14 {new_shape = array<i64: 1, 40, 40>} : (tensor<40x40xf32>) -> tensor<1x40x40xf32>
-  %26 = tosa.reshape %25 {new_shape = array<i64: 1, 1, 40, 40>} : (tensor<1x40x40xf32>) -> tensor<1x1x40x40xf32>
+  %shape_25 = tosa.const_shape {values = dense<[1, 40, 40]> : tensor<3xindex>} : () -> !tosa.shape<3>
+  %25 = tosa.reshape %14, %shape_25 : (tensor<40x40xf32>, !tosa.shape<3>) -> tensor<1x40x40xf32>
+  %shape_26 = tosa.const_shape {values = dense<[1, 1, 40, 40]> : tensor<4xindex>} : () -> !tosa.shape<4>
+  %26 = tosa.reshape %25, %shape_26 : (tensor<1x40x40xf32>, !tosa.shape<4>) -> tensor<1x1x40x40xf32>
   %extracted_slice_4 = tensor.extract_slice %26[0, 0, 0, 0] [1, 1, 40, 40] [1, 1, 1, 1] : tensor<1x1x40x40xf32> to tensor<1x1x40x40xf32>
   %extracted_slice_5 = tensor.extract_slice %extracted_slice_4[0, 0, 0, 0] [1, 1, 40, 40] [1, 1, 1, 1] : tensor<1x1x40x40xf32> to tensor<1x1x40x40xf32>
-  %27 = "tosa.const"() <{value = dense<0.000000e+00> : tensor<1x1x40x40xf32>}> : () -> tensor<1x1x40x40xf32>
+  %27 = "tosa.const"() <{values = dense<0.000000e+00> : tensor<1x1x40x40xf32>}> : () -> tensor<1x1x40x40xf32>
   %28 = tosa.add %extracted_slice_5, %27 : (tensor<1x1x40x40xf32>, tensor<1x1x40x40xf32>) -> tensor<1x1x40x40xf32>
   %29 = tosa.add %24, %28 : (tensor<1x1x40x40xf32>, tensor<1x1x40x40xf32>) -> tensor<1x1x40x40xf32>
diff --git a/examples/BuddyNext/next-matmul-transpose2.mlir b/examples/BuddyNext/next-matmul-transpose2.mlir
index b15150c76f..055f5ed862 100644
--- a/examples/BuddyNext/next-matmul-transpose2.mlir
+++ b/examples/BuddyNext/next-matmul-transpose2.mlir
@@ -26,13 +26,15 @@ func.func private @printMemrefF32(tensor<*xf32>)
 
 func.func @test(%a : tensor<1x40x32x128xf32>, %b : tensor<32x40x40xf32>) -> (tensor<1x40x32x128xf32>) {
   %t_start = call @rtclock() : () -> f64
 
-  %0 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
-  %1 = tosa.transpose %a, %0 : (tensor<1x40x32x128xf32>, tensor<4xi32>) -> tensor<1x32x40x128xf32>
-  %2 = tosa.reshape %1 {new_shape = array<i64: 32, 40, 128>} : (tensor<1x32x40x128xf32>) -> tensor<32x40x128xf32>
-  %3 = tosa.matmul %b, %2 : (tensor<32x40x40xf32>, tensor<32x40x128xf32>) -> tensor<32x40x128xf32>
-  %4 = tosa.reshape %3 {new_shape = array<i64: 1, 32, 40, 128>} : (tensor<32x40x128xf32>) -> tensor<1x32x40x128xf32>
-  %5 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
-  %6 = tosa.transpose %4, %5 : (tensor<1x32x40x128xf32>, tensor<4xi32>) -> tensor<1x40x32x128xf32>
+  %1 = tosa.transpose %a {perms = array<i32: 0, 2, 1, 3>} : (tensor<1x40x32x128xf32>) -> tensor<1x32x40x128xf32>
+  %shape_2 = tosa.const_shape {values = dense<[32, 40, 128]> : tensor<3xindex>} : () -> !tosa.shape<3>
+  %2 = tosa.reshape %1, %shape_2 : (tensor<1x32x40x128xf32>, !tosa.shape<3>) -> tensor<32x40x128xf32>
+  %zp1_3 = "tosa.const"() <{values = dense<0.0> : tensor<1xf32>}> : () -> tensor<1xf32>
+  %zp2_3 = "tosa.const"() <{values = dense<0.0> : tensor<1xf32>}> : () -> tensor<1xf32>
+  %3 = tosa.matmul %b, %2, %zp1_3, %zp2_3 : (tensor<32x40x40xf32>, tensor<32x40x128xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor<32x40x128xf32>
+  %shape_4 = tosa.const_shape {values = dense<[1, 32, 40, 128]> : tensor<4xindex>} : () -> !tosa.shape<4>
+  %4 = tosa.reshape %3, %shape_4 : (tensor<32x40x128xf32>, !tosa.shape<4>) -> tensor<1x32x40x128xf32>
+  %6 = tosa.transpose %4 {perms = array<i32: 0, 2, 1, 3>} : (tensor<1x32x40x128xf32>) -> tensor<1x40x32x128xf32>
   %t_end = call @rtclock() : () -> f64
   %time = arith.subf %t_end, %t_start : f64
   // Print timings.
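
The zero-point operands that `tosa.matmul` now requires come from TOSA's quantization model: `a_zp` and `b_zp` recentre asymmetric integer inputs before accumulation. For the floating-point examples in this patch they are always `dense<0.0>` and contribute nothing. A hypothetical quantized variant (not in this patch, and sketched under the assumption that integer zero points match the i8 input element type) shows why the operands exist:

    // Assumed i8 GEMM: nonzero zero points shift the raw i8 data so that the
    // i32 accumulation is performed on the true (recentred) values.
    %azp = "tosa.const"() <{values = dense<-128> : tensor<1xi8>}> : () -> tensor<1xi8>
    %bzp = "tosa.const"() <{values = dense<0> : tensor<1xi8>}> : () -> tensor<1xi8>
    %q = tosa.matmul %qa, %qb, %azp, %bzp : (tensor<1x4x8xi8>, tensor<1x8x4xi8>, tensor<1xi8>, tensor<1xi8>) -> tensor<1x4x4xi32>
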
diff --git a/examples/BuddyNext/next-mhsa-context.mlir b/examples/BuddyNext/next-mhsa-context.mlir
index bd1d8d12d6..607eb1a97d 100644
--- a/examples/BuddyNext/next-mhsa-context.mlir
+++ b/examples/BuddyNext/next-mhsa-context.mlir
@@ -34,14 +34,19 @@ func.func private @printMemrefF32(%ptr : tensor<*xf32>)
 func.func @kernel(%t0: tensor<1x32x40x40xf32>, %t1: tensor<1x32x40x128xf32>) {
   %t_start = call @rtclock() : () -> f64
 
-  %110 = "tosa.const"() <{value = dense<0.000000e+00> : tensor<1x32x40x40xf32>}> : () -> tensor<1x32x40x40xf32>
+  %110 = "tosa.const"() <{values = dense<0.000000e+00> : tensor<1x32x40x40xf32>}> : () -> tensor<1x32x40x40xf32>
   %111 = tosa.add %t0, %110 : (tensor<1x32x40x40xf32>, tensor<1x32x40x40xf32>) -> tensor<1x32x40x40xf32>
-  %112 = tosa.reshape %111 {new_shape = array<i64: 32, 40, 40>} : (tensor<1x32x40x40xf32>) -> tensor<32x40x40xf32>
-  %113 = "tosa.const"() <{value = dense<0.000000e+00> : tensor<1x32x40x128xf32>}> : () -> tensor<1x32x40x128xf32>
+  %shape_112 = tosa.const_shape {values = dense<[32, 40, 40]> : tensor<3xindex>} : () -> !tosa.shape<3>
+  %112 = tosa.reshape %111, %shape_112 : (tensor<1x32x40x40xf32>, !tosa.shape<3>) -> tensor<32x40x40xf32>
+  %113 = "tosa.const"() <{values = dense<0.000000e+00> : tensor<1x32x40x128xf32>}> : () -> tensor<1x32x40x128xf32>
   %114 = tosa.add %t1, %113 : (tensor<1x32x40x128xf32>, tensor<1x32x40x128xf32>) -> tensor<1x32x40x128xf32>
-  %115 = tosa.reshape %114 {new_shape = array<i64: 32, 40, 128>} : (tensor<1x32x40x128xf32>) -> tensor<32x40x128xf32>
-  %116 = tosa.matmul %112, %115 : (tensor<32x40x40xf32>, tensor<32x40x128xf32>) -> tensor<32x40x128xf32>
-  %117 = tosa.reshape %116 {new_shape = array<i64: 1, 32, 40, 128>} : (tensor<32x40x128xf32>) -> tensor<1x32x40x128xf32>
+  %shape_115 = tosa.const_shape {values = dense<[32, 40, 128]> : tensor<3xindex>} : () -> !tosa.shape<3>
+  %115 = tosa.reshape %114, %shape_115 : (tensor<1x32x40x128xf32>, !tosa.shape<3>) -> tensor<32x40x128xf32>
+  %zp1_116 = "tosa.const"() <{values = dense<0.0> : tensor<1xf32>}> : () -> tensor<1xf32>
+  %zp2_116 = "tosa.const"() <{values = dense<0.0> : tensor<1xf32>}> : () -> tensor<1xf32>
+  %116 = tosa.matmul %112, %115, %zp1_116, %zp2_116 : (tensor<32x40x40xf32>, tensor<32x40x128xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor<32x40x128xf32>
+  %shape_117 = tosa.const_shape {values = dense<[1, 32, 40, 128]> : tensor<4xindex>} : () -> !tosa.shape<4>
+  %117 = tosa.reshape %116, %shape_117 : (tensor<32x40x128xf32>, !tosa.shape<4>) -> tensor<1x32x40x128xf32>
 
   %t_end = call @rtclock() : () -> f64
   %time = arith.subf %t_end, %t_start : f64
diff --git a/examples/BuddyNext/next-mhsa-core.mlir b/examples/BuddyNext/next-mhsa-core.mlir
index 3dd39622a3..5f802571b1 100644
--- a/examples/BuddyNext/next-mhsa-core.mlir
+++ b/examples/BuddyNext/next-mhsa-core.mlir
@@ -35,28 +35,35 @@ func.func @kernel(%t0: tensor<1x32x40x128xf32>, %t1: tensor<1x1x40x128xf32>, %t2
   %t_start = call @rtclock() : () -> f64
 
   // end of RoPE, begin of Softmax(QK/sqrt(d_k)):
-  %88 = tosa.mul %t0, %t1 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
+  %shift_88 = "tosa.const"() <{values = dense<0> : tensor<1xi8>}> : () -> tensor<1xi8>
+  %88 = tosa.mul %t0, %t1, %shift_88 : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>, tensor<1xi8>) -> tensor<1x32x40x128xf32>
   %89 = tosa.add %t2, %88 : (tensor<1x32x40x128xf32>, tensor<1x32x40x128xf32>) -> tensor<1x32x40x128xf32>
-  %90 = "tosa.const"() <{value = dense<[0, 1, 3, 2]> : tensor<4xi32>}> : () -> tensor<4xi32>
-  %91 = tosa.transpose %89, %90 : (tensor<1x32x40x128xf32>, tensor<4xi32>) -> tensor<1x32x128x40xf32>
-  %92 = "tosa.const"() <{value = dense<0.000000e+00> : tensor<1x32x40x128xf32>}> : () -> tensor<1x32x40x128xf32>
+  %91 = tosa.transpose %89 {perms = array<i32: 0, 1, 3, 2>} : (tensor<1x32x40x128xf32>) -> tensor<1x32x128x40xf32>
+  %92 = "tosa.const"() <{values = dense<0.000000e+00> : tensor<1x32x40x128xf32>}> : () -> tensor<1x32x40x128xf32>
   %93 = tosa.add %t3, %92 : (tensor<1x32x40x128xf32>, tensor<1x32x40x128xf32>) -> tensor<1x32x40x128xf32>
-  %94 = tosa.reshape %93 {new_shape = array<i64: 32, 40, 128>} : (tensor<1x32x40x128xf32>) -> tensor<32x40x128xf32>
-  %95 = "tosa.const"() <{value = dense<0.000000e+00> : tensor<1x32x128x40xf32>}> : () -> tensor<1x32x128x40xf32>
+  %shape_94 = tosa.const_shape {values = dense<[32, 40, 128]> : tensor<3xindex>} : () -> !tosa.shape<3>
+  %94 = tosa.reshape %93, %shape_94 : (tensor<1x32x40x128xf32>, !tosa.shape<3>) -> tensor<32x40x128xf32>
+  %95 = "tosa.const"() <{values = dense<0.000000e+00> : tensor<1x32x128x40xf32>}> : () -> tensor<1x32x128x40xf32>
   %96 = tosa.add %91, %95 : (tensor<1x32x128x40xf32>, tensor<1x32x128x40xf32>) -> tensor<1x32x128x40xf32>
-  %97 = tosa.reshape %96 {new_shape = array<i64: 32, 128, 40>} : (tensor<1x32x128x40xf32>) -> tensor<32x128x40xf32>
-  %98 = tosa.matmul %94, %97 : (tensor<32x40x128xf32>, tensor<32x128x40xf32>) -> tensor<32x40x40xf32>
-  %99 = tosa.reshape %98 {new_shape = array<i64: 1, 32, 40, 40>} : (tensor<32x40x40xf32>) -> tensor<1x32x40x40xf32>
-  %100 = "tosa.const"() <{value = dense<11.3137083> : tensor<1x32x40x40xf32>}> : () -> tensor<1x32x40x40xf32>
+  %shape_97 = tosa.const_shape {values = dense<[32, 128, 40]> : tensor<3xindex>} : () -> !tosa.shape<3>
+  %97 = tosa.reshape %96, %shape_97 : (tensor<1x32x128x40xf32>, !tosa.shape<3>) -> tensor<32x128x40xf32>
+  %zp1_98 = "tosa.const"() <{values = dense<0.0> : tensor<1xf32>}> : () -> tensor<1xf32>
+  %zp2_98 = "tosa.const"() <{values = dense<0.0> : tensor<1xf32>}> : () -> tensor<1xf32>
+  %98 = tosa.matmul %94, %97, %zp1_98, %zp2_98 : (tensor<32x40x128xf32>, tensor<32x128x40xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor<32x40x40xf32>
+  %shape_99 = tosa.const_shape {values = dense<[1, 32, 40, 40]> : tensor<4xindex>} : () -> !tosa.shape<4>
+  %99 = tosa.reshape %98, %shape_99 : (tensor<32x40x40xf32>, !tosa.shape<4>) -> tensor<1x32x40x40xf32>
+  %100 = "tosa.const"() <{values = dense<11.3137083> : tensor<1x32x40x40xf32>}> : () -> tensor<1x32x40x40xf32>
   %101 = tosa.reciprocal %100 : (tensor<1x32x40x40xf32>) -> tensor<1x32x40x40xf32>
-  %102 = tosa.mul %99, %101 {shift = 0 : i8} : (tensor<1x32x40x40xf32>, tensor<1x32x40x40xf32>) -> tensor<1x32x40x40xf32>
+  %shift_102 = "tosa.const"() <{values = dense<0> : tensor<1xi8>}> : () -> tensor<1xi8>
+  %102 = tosa.mul %99, %101, %shift_102 : (tensor<1x32x40x40xf32>, tensor<1x32x40x40xf32>, tensor<1xi8>) -> tensor<1x32x40x40xf32>
   %103 = tosa.add %102, %t4 : (tensor<1x32x40x40xf32>, tensor<1x1x40x40xf32>) -> tensor<1x32x40x40xf32>
   %104 = tosa.reduce_max %103 {axis = 3 : i32} : (tensor<1x32x40x40xf32>) -> tensor<1x32x40x1xf32>
   %105 = tosa.sub %103, %104 : (tensor<1x32x40x40xf32>, tensor<1x32x40x1xf32>) -> tensor<1x32x40x40xf32>
   %106 = tosa.exp %105 : (tensor<1x32x40x40xf32>) -> tensor<1x32x40x40xf32>
   %107 = tosa.reduce_sum %106 {axis = 3 : i32} : (tensor<1x32x40x40xf32>) -> tensor<1x32x40x1xf32>
   %108 = tosa.reciprocal %107 : (tensor<1x32x40x1xf32>) -> tensor<1x32x40x1xf32>
-  %109 = tosa.mul %106, %108 {shift = 0 : i8} : (tensor<1x32x40x40xf32>, tensor<1x32x40x1xf32>) -> tensor<1x32x40x40xf32>
+  %shift_109 = "tosa.const"() <{values = dense<0> : tensor<1xi8>}> : () -> tensor<1xi8>
+  %109 = tosa.mul %106, %108, %shift_109 : (tensor<1x32x40x40xf32>, tensor<1x32x40x1xf32>, tensor<1xi8>) -> tensor<1x32x40x40xf32>
 
   %t_end = call @rtclock() : () -> f64
   %time = arith.subf %t_end, %t_start : f64
diff --git a/examples/BuddyNext/next-mhsa-qkv-fusion.mlir b/examples/BuddyNext/next-mhsa-qkv-fusion.mlir
index 5bfa25c492..ad1f6c60a4 100644
--- a/examples/BuddyNext/next-mhsa-qkv-fusion.mlir
+++ b/examples/BuddyNext/next-mhsa-qkv-fusion.mlir
@@ -34,20 +34,26 @@ func.func private @printMemrefF32(%ptr : tensor<*xf32>)
 func.func @kernel(%t0: tensor<1x40x4096xf32>, %t1: tensor<4096x4096xf32>, %t2: tensor<4096x4096xf32>, %t3: tensor<4096x4096xf32>) {
   %t_start = call @rtclock() : () -> f64
 
-  %42 = tosa.reshape %t0 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
+  %shape_42 = tosa.const_shape {values = dense<[40, 4096]> : tensor<2xindex>} : () -> !tosa.shape<2>
+  %42 = tosa.reshape %t0, %shape_42 : (tensor<1x40x4096xf32>, !tosa.shape<2>) -> tensor<40x4096xf32>
   %cst_6 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
   %43 = linalg.matmul_transpose_b {cast = #linalg.type_fn<cast_signed>} ins(%42, %t1 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_6 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-  %44 = tosa.reshape %43 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
+  %shape_44 = tosa.const_shape {values = dense<[1, 40, 4096]> : tensor<3xindex>} : () -> !tosa.shape<3>
+  %44 = tosa.reshape %43, %shape_44 : (tensor<40x4096xf32>, !tosa.shape<3>) -> tensor<1x40x4096xf32>
 
-  %45 = tosa.reshape %t0 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
+  %shape_45 = tosa.const_shape {values = dense<[40, 4096]> : tensor<2xindex>} : () -> !tosa.shape<2>
+  %45 = tosa.reshape %t0, %shape_45 : (tensor<1x40x4096xf32>, !tosa.shape<2>) -> tensor<40x4096xf32>
   %cst_7 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
   %46 = linalg.matmul_transpose_b {cast = #linalg.type_fn<cast_signed>} ins(%45, %t2 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_7 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-  %47 = tosa.reshape %46 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
+  %shape_47 = tosa.const_shape {values = dense<[1, 40, 4096]> : tensor<3xindex>} : () -> !tosa.shape<3>
+  %47 = tosa.reshape %46, %shape_47 : (tensor<40x4096xf32>, !tosa.shape<3>) -> tensor<1x40x4096xf32>
 
-  %48 = tosa.reshape %t0 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
+  %shape_48 = tosa.const_shape {values = dense<[40, 4096]> : tensor<2xindex>} : () -> !tosa.shape<2>
+  %48 = tosa.reshape %t0, %shape_48 : (tensor<1x40x4096xf32>, !tosa.shape<2>) -> tensor<40x4096xf32>
   %cst_8 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
   %49 = linalg.matmul_transpose_b {cast = #linalg.type_fn<cast_signed>} ins(%48, %t3 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_8 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-  %50 = tosa.reshape %49 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
+  %shape_50 = tosa.const_shape {values = dense<[1, 40, 4096]> : tensor<3xindex>} : () -> !tosa.shape<3>
+  %50 = tosa.reshape %49, %shape_50 : (tensor<40x4096xf32>, !tosa.shape<3>) -> tensor<1x40x4096xf32>
 
   %t_end = call @rtclock() : () -> f64
   %time = arith.subf %t_end, %t_start : f64
diff --git
 a/examples/BuddyNext/next-mhsa-qkv.mlir b/examples/BuddyNext/next-mhsa-qkv.mlir
index 3cafcbdcb8..2eb9c8c54d 100644
--- a/examples/BuddyNext/next-mhsa-qkv.mlir
+++ b/examples/BuddyNext/next-mhsa-qkv.mlir
@@ -34,26 +34,32 @@ func.func private @printMemrefF32(%ptr : tensor<*xf32>)
 func.func @kernel(%t0: tensor<1x40x4096xf32>, %t1: tensor<4096x4096xf32>, %t2: tensor<4096x4096xf32>, %t3: tensor<4096x4096xf32>) {
   %t_start = call @rtclock() : () -> f64
 
-  %42 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-  %43 = tosa.transpose %t1, %42 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
-  %44 = tosa.reshape %t0 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
+  %42 = "tosa.const"() <{values = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
+  %43 = tosa.transpose %t1 {perms = array<i32: 1, 0>} : (tensor<4096x4096xf32>) -> tensor<4096x4096xf32>
+  %shape_44 = tosa.const_shape {values = dense<[40, 4096]> : tensor<2xindex>} : () -> !tosa.shape<2>
+  %44 = tosa.reshape %t0, %shape_44 : (tensor<1x40x4096xf32>, !tosa.shape<2>) -> tensor<40x4096xf32>
   %cst_6 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
   %45 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%44, %43 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_6 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-  %46 = tosa.reshape %45 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
+  %shape_46 = tosa.const_shape {values = dense<[1, 40, 4096]> : tensor<3xindex>} : () -> !tosa.shape<3>
+  %46 = tosa.reshape %45, %shape_46 : (tensor<40x4096xf32>, !tosa.shape<3>) -> tensor<1x40x4096xf32>
 
-  %47 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-  %48 = tosa.transpose %t2, %47 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
-  %49 = tosa.reshape %t0 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
+  %47 = "tosa.const"() <{values = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
+  %48 = tosa.transpose %t2 {perms = array<i32: 1, 0>} : (tensor<4096x4096xf32>) -> tensor<4096x4096xf32>
+  %shape_49 = tosa.const_shape {values = dense<[40, 4096]> : tensor<2xindex>} : () -> !tosa.shape<2>
+  %49 = tosa.reshape %t0, %shape_49 : (tensor<1x40x4096xf32>, !tosa.shape<2>) -> tensor<40x4096xf32>
   %cst_7 = arith.constant dense<0.000000e+00> : tensor<40x4096xf32>
   %50 = linalg.matmul {cast = #linalg.type_fn<cast_signed>} ins(%49, %48 : tensor<40x4096xf32>, tensor<4096x4096xf32>) outs(%cst_7 : tensor<40x4096xf32>) -> tensor<40x4096xf32>
-  %51 = tosa.reshape %50 {new_shape = array<i64: 1, 40, 4096>} : (tensor<40x4096xf32>) -> tensor<1x40x4096xf32>
+  %shape_51 = tosa.const_shape {values = dense<[1, 40, 4096]> : tensor<3xindex>} : () -> !tosa.shape<3>
+  %51 = tosa.reshape %50, %shape_51 : (tensor<40x4096xf32>, !tosa.shape<3>) -> tensor<1x40x4096xf32>
 
-  %52 = "tosa.const"() <{value = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
-  %53 = tosa.transpose %t3, %52 : (tensor<4096x4096xf32>, tensor<2xi32>) -> tensor<4096x4096xf32>
-  %54 = tosa.reshape %t0 {new_shape = array<i64: 40, 4096>} : (tensor<1x40x4096xf32>) -> tensor<40x4096xf32>
+  %52 = "tosa.const"() <{values = dense<[1, 0]> : tensor<2xi32>}> : () -> tensor<2xi32>
+  %53 = tosa.transpose %t3 {perms = array<i32: 1, 0>} : (tensor<4096x4096xf32>) -> tensor<4096x4096xf32>
+  %shape_54 = tosa.const_shape {values = dense<[40, 4096]> : tensor<2xindex>} : () -> !tosa.shape<2>
+  %54 = tosa.reshape %t0, %shape_54 : (tensor<1x40x4096xf32>, !tosa.shape<2>) -> tensor<40x4096xf32>
   %cst_8 = arith.constant dense<0.000000e+00> :
diff --git a/examples/BuddyNext/next-norm.mlir b/examples/BuddyNext/next-norm.mlir
index ee1e8c3041..1f5c584efb 100644
--- a/examples/BuddyNext/next-norm.mlir
+++ b/examples/BuddyNext/next-norm.mlir
@@ -52,16 +52,21 @@ func.func @kernel(%t0: tensor<1x40x4096xf32>, %t1: tensor<4096xf32>) {
     linalg.yield %4175 : f32
   } -> tensor<1x40x4096xf32>
   %32 = tosa.reduce_sum %31 {axis = 2 : i32} : (tensor<1x40x4096xf32>) -> tensor<1x40x1xf32>
-  %33 = "tosa.const"() <{value = dense<4.096000e+03> : tensor<1xf32>}> : () -> tensor<1xf32>
+  %33 = "tosa.const"() <{values = dense<4.096000e+03> : tensor<1xf32>}> : () -> tensor<1xf32>
   %34 = tosa.reciprocal %33 : (tensor<1xf32>) -> tensor<1xf32>
-  %temp = tosa.reshape %34 {new_shape = array<i64: 1, 1, 1>} : (tensor<1xf32>) -> tensor<1x1x1xf32>
-  %35 = tosa.mul %temp, %32 {shift = 0 : i8} : (tensor<1x1x1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-  %36 = "tosa.const"() <{value = dense<9.99999974E-6> : tensor<1x40x1xf32>}> : () -> tensor<1x40x1xf32>
+  %shape_temp = tosa.const_shape {values = dense<[1, 1, 1]> : tensor<3xindex>} : () -> !tosa.shape<3>
+  %temp = tosa.reshape %34, %shape_temp : (tensor<1xf32>, !tosa.shape<3>) -> tensor<1x1x1xf32>
+  %shift_35 = "tosa.const"() <{values = dense<0> : tensor<1xi8>}> : () -> tensor<1xi8>
+  %35 = tosa.mul %temp, %32, %shift_35 : (tensor<1x1x1xf32>, tensor<1x40x1xf32>, tensor<1xi8>) -> tensor<1x40x1xf32>
+  %36 = "tosa.const"() <{values = dense<9.99999974E-6> : tensor<1x40x1xf32>}> : () -> tensor<1x40x1xf32>
   %37 = tosa.add %35, %36 : (tensor<1x40x1xf32>, tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
   %38 = tosa.rsqrt %37 : (tensor<1x40x1xf32>) -> tensor<1x40x1xf32>
-  %39 = tosa.mul %t0, %38 {shift = 0 : i8} : (tensor<1x40x4096xf32>, tensor<1x40x1xf32>) -> tensor<1x40x4096xf32>
-  %40 = tosa.reshape %t1 {new_shape = array<i64: 1, 1, 4096>} : (tensor<4096xf32>) -> tensor<1x1x4096xf32>
-  %41 = tosa.mul %40, %39 {shift = 0 : i8} : (tensor<1x1x4096xf32>, tensor<1x40x4096xf32>) -> tensor<1x40x4096xf32>
+  %shift_39 = "tosa.const"() <{values = dense<0> : tensor<1xi8>}> : () -> tensor<1xi8>
+  %39 = tosa.mul %t0, %38, %shift_39 : (tensor<1x40x4096xf32>, tensor<1x40x1xf32>, tensor<1xi8>) -> tensor<1x40x4096xf32>
+  %shape_40 = tosa.const_shape {values = dense<[1, 1, 4096]> : tensor<3xindex>} : () -> !tosa.shape<3>
+  %40 = tosa.reshape %t1, %shape_40 : (tensor<4096xf32>, !tosa.shape<3>) -> tensor<1x1x4096xf32>
+  %shift_41 = "tosa.const"() <{values = dense<0> : tensor<1xi8>}> : () -> tensor<1xi8>
+  %41 = tosa.mul %40, %39, %shift_41 : (tensor<1x1x4096xf32>, tensor<1x40x4096xf32>, tensor<1xi8>) -> tensor<1x40x4096xf32>
   %t_end = call @rtclock() : () -> f64
   %time = arith.subf %t_end, %t_start : f64
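The norm hunk also carries the rename of the tosa.const payload attribute, a mechanical one-token change repeated throughout these tests:

  // old: "tosa.const"() <{value  = dense<4.096000e+03> : tensor<1xf32>}> : () -> tensor<1xf32>
  // new: "tosa.const"() <{values = dense<4.096000e+03> : tensor<1xf32>}> : () -> tensor<1xf32>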
diff --git a/examples/BuddyNext/next-rope.mlir b/examples/BuddyNext/next-rope.mlir
index e8df59fa8d..dd18e2a996 100644
--- a/examples/BuddyNext/next-rope.mlir
+++ b/examples/BuddyNext/next-rope.mlir
@@ -6,7 +6,6 @@
 // RUN: -empty-tensor-to-alloc-tensor \
 // RUN: -one-shot-bufferize="bufferize-function-boundaries" \
 // RUN: -convert-linalg-to-affine-loops \
-// RUN: -affine-loop-fusion \
 // RUN: -affine-parallelize \
 // RUN: -convert-vector-to-scf \
 // RUN: -expand-strided-metadata \
@@ -43,17 +42,17 @@ func.func private @rtclock() -> f64
 func.func @kernel(%arg0 : tensor<1x40x4096xf32>, %arg1 : tensor<1x40x4096xf32>, %arg2 : tensor<1x40x4096xf32>, %arg3 : tensor<1x1x2048x128xf32>, %arg4 : tensor<1x1x2048x128xf32>, %arg5 : tensor<1x40xi64>) {
   %t_start = call @rtclock() : () -> f64
-  %57 = tosa.reshape %arg0 {new_shape = array<i64: 1, 40, 32, 128>} : (tensor<1x40x4096xf32>) -> tensor<1x40x32x128xf32>
-  %58 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
-  %59 = tosa.transpose %57, %58 : (tensor<1x40x32x128xf32>, tensor<4xi32>) -> tensor<1x32x40x128xf32>
+  %1 = tosa.const_shape {values = dense<[1, 40, 32, 128]> : tensor<4xindex>} : () -> !tosa.shape<4>
+  %57 = tosa.reshape %arg0, %1 : (tensor<1x40x4096xf32>, !tosa.shape<4>) -> tensor<1x40x32x128xf32>
+  %59 = tosa.transpose %57 {perms = array<i32: 0, 2, 1, 3>} : (tensor<1x40x32x128xf32>) -> tensor<1x32x40x128xf32>
 
-  %60 = tosa.reshape %arg1 {new_shape = array<i64: 1, 40, 32, 128>} : (tensor<1x40x4096xf32>) -> tensor<1x40x32x128xf32>
-  %61 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
-  %62 = tosa.transpose %60, %61 : (tensor<1x40x32x128xf32>, tensor<4xi32>) -> tensor<1x32x40x128xf32>
+  %4 = tosa.const_shape {values = dense<[1, 40, 32, 128]> : tensor<4xindex>} : () -> !tosa.shape<4>
+  %60 = tosa.reshape %arg1, %4 : (tensor<1x40x4096xf32>, !tosa.shape<4>) -> tensor<1x40x32x128xf32>
+  %62 = tosa.transpose %60 {perms = array<i32: 0, 2, 1, 3>} : (tensor<1x40x32x128xf32>) -> tensor<1x32x40x128xf32>
 
-  %63 = tosa.reshape %arg2 {new_shape = array<i64: 1, 40, 32, 128>} : (tensor<1x40x4096xf32>) -> tensor<1x40x32x128xf32>
-  %64 = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
-  %65 = tosa.transpose %63, %64 : (tensor<1x40x32x128xf32>, tensor<4xi32>) -> tensor<1x32x40x128xf32>
+  %7 = tosa.const_shape {values = dense<[1, 40, 32, 128]> : tensor<4xindex>} : () -> !tosa.shape<4>
+  %63 = tosa.reshape %arg2, %7 : (tensor<1x40x4096xf32>, !tosa.shape<4>) -> tensor<1x40x32x128xf32>
+  %65 = tosa.transpose %63 {perms = array<i32: 0, 2, 1, 3>} : (tensor<1x40x32x128xf32>) -> tensor<1x32x40x128xf32>
   %extracted_slice_9 = tensor.extract_slice %arg3[0, 0, 0, 0] [1, 1, 2048, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x2048x128xf32>
   %extracted_slice_10 = tensor.extract_slice %extracted_slice_9[0, 0, 0, 0] [1, 1, 2048, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x2048x128xf32>
@@ -90,7 +89,9 @@ func.func @kernel(%arg0 : tensor<1x40x4096xf32>, %arg1 : tensor<1x40x4096xf32>,
     %extracted = tensor.extract %69[%4175, %4176] : tensor<40x128xf32>
     linalg.yield %extracted : f32
   } -> tensor<1x40x128xf32>
-  %76 = tosa.reshape %75 {new_shape = array<i64: 1, 1, 40, 128>} : (tensor<1x40x128xf32>) -> tensor<1x1x40x128xf32>
+
+  %20 = tosa.const_shape {values = dense<[1, 1, 40, 128]> : tensor<4xindex>} : () -> !tosa.shape<4>
+  %76 = tosa.reshape %75, %20 : (tensor<1x40x128xf32>, !tosa.shape<4>) -> tensor<1x1x40x128xf32>
   %77 = tensor.empty() : tensor<1x40x128xf32>
   %78 = linalg.generic {indexing_maps = [#map2, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%arg5 : tensor<1x40xi64>) outs(%77 : tensor<1x40x128xf32>) {
   ^bb0(%in: i64, %out: f32):
@@ -99,20 +100,26 @@ func.func @kernel(%arg0 : tensor<1x40x4096xf32>, %arg1 : tensor<1x40x4096xf32>,
     %extracted = tensor.extract %73[%4175, %4176] : tensor<40x128xf32>
     linalg.yield %extracted : f32
   } -> tensor<1x40x128xf32>
-  %79 = tosa.reshape %78 {new_shape = array<i64: 1, 1, 40, 128>} : (tensor<1x40x128xf32>) -> tensor<1x1x40x128xf32>
-  %80 = tosa.mul %59, %76 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
+  %24 = tosa.const_shape {values = dense<[1, 1, 40, 128]> : tensor<4xindex>} : () -> !tosa.shape<4>
+  %79 = tosa.reshape %78, %24 : (tensor<1x40x128xf32>, !tosa.shape<4>) -> tensor<1x1x40x128xf32>
+
+  %26 = "tosa.const"() <{values = dense<0> : tensor<1xi8>}> : () -> tensor<1xi8>
+  %80 = tosa.mul %59, %76, %26 : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>, tensor<1xi8>) -> tensor<1x32x40x128xf32>
   %extracted_slice_15 = tensor.extract_slice %59[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
   %extracted_slice_16 = tensor.extract_slice %59[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
-  %81 = tosa.negate %extracted_slice_16 : (tensor<1x32x40x64xf32>) -> tensor<1x32x40x64xf32>
+  %37 = "tosa.const"() <{values = dense<0.000000e+00> : tensor<1xf32>}> : () -> tensor<1xf32>
+  %38 = "tosa.const"() <{values = dense<0.000000e+00> : tensor<1xf32>}> : () -> tensor<1xf32>
+  %81 = tosa.negate %extracted_slice_16, %37, %38 : (tensor<1x32x40x64xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor<1x32x40x64xf32>
   %82 = tensor.empty() : tensor<1x32x40x128xf32>
   %inserted_slice = tensor.insert_slice %81 into %82[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
   %inserted_slice_17 = tensor.insert_slice %extracted_slice_15 into %inserted_slice[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
-  %83 = tosa.mul %inserted_slice_17, %79 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
+  %32 = "tosa.const"() <{values = dense<0> : tensor<1xi8>}> : () -> tensor<1xi8>
+  %83 = tosa.mul %inserted_slice_17, %79, %32 : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>, tensor<1xi8>) -> tensor<1x32x40x128xf32>
   %84 = tosa.add %80, %83 : (tensor<1x32x40x128xf32>, tensor<1x32x40x128xf32>) -> tensor<1x32x40x128xf32>
-  %85 = tosa.mul %62, %76 {shift = 0 : i8} : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) -> tensor<1x32x40x128xf32>
+  %85 = tosa.mul %62, %76, %32 : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>, tensor<1xi8>) -> tensor<1x32x40x128xf32>
   %extracted_slice_18 = tensor.extract_slice %62[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
   %extracted_slice_19 = tensor.extract_slice %62[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
-  %86 = tosa.negate %extracted_slice_19 : (tensor<1x32x40x64xf32>) -> tensor<1x32x40x64xf32>
+  %86 = tosa.negate %extracted_slice_19, %37, %38 : (tensor<1x32x40x64xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor<1x32x40x64xf32>
   %87 = tensor.empty() : tensor<1x32x40x128xf32>
   %inserted_slice_20 = tensor.insert_slice %86 into %87[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
   %inserted_slice_21 = tensor.insert_slice %extracted_slice_18 into %inserted_slice_20[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
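tosa.negate gained explicit input/output zero-point operands in this TOSA version; for floating-point tensors both are rank-1 f32 zeros, which is what %37/%38 provide above (and why one pair can be shared between %81 and %86). A minimal sketch with an illustrative %x:

  %izp = "tosa.const"() <{values = dense<0.000000e+00> : tensor<1xf32>}> : () -> tensor<1xf32>
  %ozp = "tosa.const"() <{values = dense<0.000000e+00> : tensor<1xf32>}> : () -> tensor<1xf32>
  %neg = tosa.negate %x, %izp, %ozp : (tensor<8xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor<8xf32>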
diff --git a/examples/BuddyNext/next-transpose.mlir b/examples/BuddyNext/next-transpose.mlir
index 671cb9e6cc..77b86cd48d 100644
--- a/examples/BuddyNext/next-transpose.mlir
+++ b/examples/BuddyNext/next-transpose.mlir
@@ -34,8 +34,7 @@ func.func private @printMemrefF32(%ptr : tensor<*xf32>)
 func.func @kernel(%t0 : tensor<1x32x40x128xf32>) {
   %t_start = call @rtclock() : () -> f64
-  %idx = "tosa.const"() <{value = dense<[0, 2, 1, 3]> : tensor<4xi32>}> : () -> tensor<4xi32>
-  %t1 = tosa.transpose %t0, %idx : (tensor<1x32x40x128xf32>, tensor<4xi32>) -> tensor<1x40x32x128xf32>
+  %t1 = tosa.transpose %t0 {perms = array<i32: 0, 2, 1, 3>} : (tensor<1x32x40x128xf32>) -> tensor<1x40x32x128xf32>
   %t_end = call @rtclock() : () -> f64
   %time = arith.subf %t_end, %t_start : f64
diff --git a/examples/MLIRLinalg/linalg-transpose-f32.mlir b/examples/MLIRLinalg/linalg-transpose-f32.mlir
index bd478f6daa..c2daa75261 100644
--- a/examples/MLIRLinalg/linalg-transpose-f32.mlir
+++ b/examples/MLIRLinalg/linalg-transpose-f32.mlir
@@ -1,4 +1,4 @@
-// RUN: buddy-opt -transpose-optimize="vector-size=16" -verify-diagnostics -lower-affine -expand-strided-metadata -convert-vector-to-scf -convert-vector-to-llvm -finalize-memref-to-llvm -convert-scf-to-cf -convert-cf-to-llvm -convert-arith-to-llvm -convert-func-to-llvm -lower-affine -llvm-request-c-wrappers -convert-arith-to-llvm -reconcile-unrealized-casts %s \
+// RUN: buddy-opt -transpose-optimize="vector-size=16" -verify-diagnostics -lower-affine -expand-strided-metadata -convert-vector-to-scf -convert-vector-to-llvm -convert-ub-to-llvm -finalize-memref-to-llvm -convert-scf-to-cf -convert-cf-to-llvm -convert-arith-to-llvm -convert-func-to-llvm -lower-affine -llvm-request-c-wrappers -convert-arith-to-llvm -reconcile-unrealized-casts %s \
 // RUN: | mlir-runner -O0 -e buddy_transpose_f32 -entry-point-result=void \
 // RUN: -shared-libs=%mlir_runner_utils_dir/libmlir_runner_utils%shlibext,%mlir_runner_utils_dir/libmlir_c_runner_utils%shlibext \
 // RUN: | FileCheck %s
diff --git a/examples/MLIRTensor/tensor-insert-slice.mlir b/examples/MLIRTensor/tensor-insert-slice.mlir
index 30e5ab836f..d8391e056c 100644
--- a/examples/MLIRTensor/tensor-insert-slice.mlir
+++ b/examples/MLIRTensor/tensor-insert-slice.mlir
@@ -37,21 +37,21 @@ func.func @main() {
   // TODO: Printed results with errors, currently skipping value test.
   // CHECK: {{.*}}
   func.call @printMemrefF32(%print_out1) : (tensor<*xf32>) -> ()
-  %t4 = tensor.insert_slice %t1 into %t0[0, 1, 0][1, 3, 3][1, 1, 1] :
+  %t4 = tensor.insert_slice %t1 into %t0[0, 0, 0][1, 3, 3][1, 1, 1] :
     tensor<3x3xf32> into tensor<3x3x3xf32>
   %print_out2 = tensor.cast %t4 : tensor<3x3x3xf32> to tensor<*xf32>
   // CHECK: Unranked Memref base@ = {{.*}} rank = 3 offset = 0 sizes = [3, 3, 3] strides = [9, 3, 1] data =
   // TODO: Printed results with errors, currently skipping value test.
   // CHECK: {{.*}}
   func.call @printMemrefF32(%print_out2) : (tensor<*xf32>) -> ()
-  %t5 = tensor.insert_slice %t1 into %t0[0, 0, 0][1, 3, 3][1, 2, 1] :
+  %t5 = tensor.insert_slice %t1 into %t0[0, 0, 0][1, 3, 3][1, 1, 1] :
     tensor<3x3xf32> into tensor<3x3x3xf32>
   %print_out3 = tensor.cast %t5 : tensor<3x3x3xf32> to tensor<*xf32>
   // CHECK: Unranked Memref base@ = {{.*}} rank = 3 offset = 0 sizes = [3, 3, 3] strides = [9, 3, 1] data =
   // TODO: Printed results with errors, currently skipping value test.
// CHECK: {{.*}} func.call @printMemrefF32(%print_out3) : (tensor<*xf32>) -> () - %t6 = tensor.insert_slice %t1 into %t0[0, 0, 0][1, 3, 3][1, 1, 3] : + %t6 = tensor.insert_slice %t1 into %t0[0, 0, 0][1, 3, 3][1, 1, 1] : tensor<3x3xf32> into tensor<3x3x3xf32> %print_out4 = tensor.cast %t6 : tensor<3x3x3xf32> to tensor<*xf32> // CHECK: Unranked Memref base@ = {{.*}} rank = 3 offset = 0 sizes = [3, 3, 3] strides = [9, 3, 1] data = diff --git a/examples/MLIRVector/vector-broadcast.mlir b/examples/MLIRVector/vector-broadcast.mlir index 87dc64047c..72ed679207 100644 --- a/examples/MLIRVector/vector-broadcast.mlir +++ b/examples/MLIRVector/vector-broadcast.mlir @@ -1,7 +1,7 @@ // RUN: buddy-opt %s \ // RUN: -convert-vector-to-scf -convert-scf-to-cf \ // RUN: -convert-cf-to-llvm \ -// RUN: -convert-vector-to-llvm -finalize-memref-to-llvm -convert-func-to-llvm \ +// RUN: -convert-vector-to-llvm -finalize-memref-to-llvm -convert-func-to-llvm -convert-ub-to-llvm \ // RUN: -convert-arith-to-llvm \ // RUN: -reconcile-unrealized-casts \ // RUN: | mlir-runner -e main -entry-point-result=i32 \ diff --git a/examples/MLIRVector/vector-contract.mlir b/examples/MLIRVector/vector-contract.mlir index 80653421c2..3581b96c41 100644 --- a/examples/MLIRVector/vector-contract.mlir +++ b/examples/MLIRVector/vector-contract.mlir @@ -1,7 +1,7 @@ // RUN: buddy-opt %s \ // RUN: -convert-vector-to-scf -lower-affine -convert-scf-to-cf \ // RUN: -convert-cf-to-llvm \ -// RUN: -convert-vector-to-llvm -finalize-memref-to-llvm -convert-func-to-llvm \ +// RUN: -convert-vector-to-llvm -finalize-memref-to-llvm -convert-func-to-llvm -convert-ub-to-llvm \ // RUN: -convert-arith-to-llvm \ // RUN: -reconcile-unrealized-casts \ // RUN: | mlir-runner -e main -entry-point-result=i32 \ diff --git a/examples/MLIRVector/vector-gather.mlir b/examples/MLIRVector/vector-gather.mlir index 900b46c2c7..71123a78d5 100644 --- a/examples/MLIRVector/vector-gather.mlir +++ b/examples/MLIRVector/vector-gather.mlir @@ -1,7 +1,7 @@ // RUN: buddy-opt %s \ // RUN: -convert-vector-to-scf -convert-scf-to-cf \ // RUN: -convert-cf-to-llvm \ -// RUN: -convert-vector-to-llvm -finalize-memref-to-llvm -convert-func-to-llvm \ +// RUN: -convert-vector-to-llvm -finalize-memref-to-llvm -convert-func-to-llvm -convert-ub-to-llvm \ // RUN: -convert-arith-to-llvm \ // RUN: -split-input-file -verify-diagnostics \ // RUN: -reconcile-unrealized-casts \ diff --git a/examples/MLIRVector/vector-shuffle.mlir b/examples/MLIRVector/vector-shuffle.mlir index fc46dfda2d..8b1ead46c3 100644 --- a/examples/MLIRVector/vector-shuffle.mlir +++ b/examples/MLIRVector/vector-shuffle.mlir @@ -1,7 +1,7 @@ // RUN: buddy-opt %s \ // RUN: -convert-vector-to-scf -convert-scf-to-cf \ // RUN: -convert-cf-to-llvm \ -// RUN: -convert-vector-to-llvm -finalize-memref-to-llvm -convert-func-to-llvm \ +// RUN: -convert-vector-to-llvm -finalize-memref-to-llvm -convert-func-to-llvm -convert-ub-to-llvm \ // RUN: -convert-arith-to-llvm \ // RUN: -split-input-file -verify-diagnostics \ // RUN: -reconcile-unrealized-casts \ diff --git a/examples/MLIRVector/vector-transfer-read.mlir b/examples/MLIRVector/vector-transfer-read.mlir index d428b6dcef..0478b4deb1 100644 --- a/examples/MLIRVector/vector-transfer-read.mlir +++ b/examples/MLIRVector/vector-transfer-read.mlir @@ -1,7 +1,7 @@ // RUN: buddy-opt %s \ // RUN: -convert-vector-to-scf -lower-affine -convert-scf-to-cf \ // RUN: -convert-cf-to-llvm \ -// RUN: -convert-vector-to-llvm -finalize-memref-to-llvm -convert-func-to-llvm \ +// RUN: -convert-vector-to-llvm 
-finalize-memref-to-llvm -convert-func-to-llvm -convert-ub-to-llvm \ // RUN: -convert-arith-to-llvm \ // RUN: -reconcile-unrealized-casts \ // RUN: | mlir-runner -e main -entry-point-result=i32 \ diff --git a/examples/MLIRVector/vector-transfer-write.mlir b/examples/MLIRVector/vector-transfer-write.mlir index bb69bc3a76..6f063e8683 100644 --- a/examples/MLIRVector/vector-transfer-write.mlir +++ b/examples/MLIRVector/vector-transfer-write.mlir @@ -1,7 +1,7 @@ // RUN: buddy-opt %s \ // RUN: -convert-vector-to-scf -lower-affine -convert-scf-to-cf \ // RUN: -convert-cf-to-llvm \ -// RUN: -convert-vector-to-llvm -finalize-memref-to-llvm -convert-func-to-llvm \ +// RUN: -convert-vector-to-llvm -finalize-memref-to-llvm -convert-func-to-llvm -convert-ub-to-llvm \ // RUN: -convert-arith-to-llvm \ // RUN: -reconcile-unrealized-casts \ // RUN: | mlir-runner -e main -entry-point-result=i32 \ diff --git a/examples/MLIRVector/vector-transpose.mlir b/examples/MLIRVector/vector-transpose.mlir index ec4d50efb5..7b0be6a6b5 100644 --- a/examples/MLIRVector/vector-transpose.mlir +++ b/examples/MLIRVector/vector-transpose.mlir @@ -1,7 +1,7 @@ // RUN: buddy-opt %s \ // RUN: -convert-vector-to-scf -lower-affine -convert-scf-to-cf \ // RUN: -convert-cf-to-llvm \ -// RUN: -convert-vector-to-llvm -finalize-memref-to-llvm -convert-func-to-llvm \ +// RUN: -convert-vector-to-llvm -finalize-memref-to-llvm -convert-func-to-llvm -convert-ub-to-llvm \ // RUN: -convert-arith-to-llvm \ // RUN: -reconcile-unrealized-casts \ // RUN: | mlir-runner -e main -entry-point-result=i32 \ diff --git a/examples/MLIRVector/vector-type-cast.mlir b/examples/MLIRVector/vector-type-cast.mlir index c47f345e91..f7f5639cfa 100644 --- a/examples/MLIRVector/vector-type-cast.mlir +++ b/examples/MLIRVector/vector-type-cast.mlir @@ -1,7 +1,7 @@ // RUN: buddy-opt %s \ // RUN: -convert-vector-to-scf -convert-scf-to-cf \ // RUN: -convert-cf-to-llvm \ -// RUN: -convert-vector-to-llvm -finalize-memref-to-llvm -convert-func-to-llvm \ +// RUN: -convert-vector-to-llvm -finalize-memref-to-llvm -convert-func-to-llvm -convert-ub-to-llvm \ // RUN: -convert-arith-to-llvm \ // RUN: -split-input-file -verify-diagnostics \ // RUN: -reconcile-unrealized-casts \ diff --git a/llvm b/llvm index 3bd3e06f3f..d8e9216c27 160000 --- a/llvm +++ b/llvm @@ -1 +1 @@ -Subproject commit 3bd3e06f3fe418e24af65457877f40cee0544f9d +Subproject commit d8e9216c27b82b4292e83437d58aebf594adb111 diff --git a/midend/include/Dialect/RVV/RVV.td b/midend/include/Dialect/RVV/RVV.td index 7906c22737..140d539af0 100644 --- a/midend/include/Dialect/RVV/RVV.td +++ b/midend/include/Dialect/RVV/RVV.td @@ -90,7 +90,7 @@ def RVVLoadOp : RVV_Op<"load">, }]; let extraClassDeclaration = [{ mlir::MemRefType getMemRefType() { - return getBase().getType().cast(); + return llvm::cast(getBase().getType()); } }]; let assemblyFormat = "$base `[` $index `]` `,` $length attr-dict `:` " @@ -106,7 +106,7 @@ def RVVStoreOp : RVV_Op<"store">, }]; let extraClassDeclaration = [{ mlir::MemRefType getMemRefType() { - return getBase().getType().cast(); + return llvm::cast(getBase().getType()); } }]; let assemblyFormat = "$value `,` $base `[` $index `]` `,` $length attr-dict " diff --git a/midend/include/Dialect/VectorExp/VectorExpOps.td b/midend/include/Dialect/VectorExp/VectorExpOps.td index e60eeb7fee..95b8e6413c 100644 --- a/midend/include/Dialect/VectorExp/VectorExpOps.td +++ b/midend/include/Dialect/VectorExp/VectorExpOps.td @@ -137,11 +137,11 @@ def VectorExp_LoadOp : VectorExp_Op<"load", [AttrSizedOperandSegments]> 
{ let extraClassDeclaration = [{ mlir::MemRefType getMemRefType() { - return getBase().getType().cast(); + return llvm::cast(getBase().getType()); } mlir::VectorType getVectorType() { - return getResult().getType().cast(); + return llvm::cast(getResult().getType()); } }]; diff --git a/midend/lib/Conversion/ConvOptimization/ConvNhwcFhwcOptimize.cpp b/midend/lib/Conversion/ConvOptimization/ConvNhwcFhwcOptimize.cpp index 81f76f66b2..360810acda 100644 --- a/midend/lib/Conversion/ConvOptimization/ConvNhwcFhwcOptimize.cpp +++ b/midend/lib/Conversion/ConvOptimization/ConvNhwcFhwcOptimize.cpp @@ -86,7 +86,7 @@ class ConvNhwcFhwcOptimizePattern : public ConversionPattern { [convOp.getDilations().getValues().size() - 1]; } - ShapedType inputTy = input.getType().cast(); + ShapedType inputTy = llvm::cast(input.getType()); Type elemTy = inputTy.getElementType(); VectorType vecTy = VectorType::get(vecSize, elemTy); diff --git a/midend/lib/Conversion/ConvOptimization/ConvNhwcFhwcTileOptimize.cpp b/midend/lib/Conversion/ConvOptimization/ConvNhwcFhwcTileOptimize.cpp index 41e1c066ee..446c1172da 100644 --- a/midend/lib/Conversion/ConvOptimization/ConvNhwcFhwcTileOptimize.cpp +++ b/midend/lib/Conversion/ConvOptimization/ConvNhwcFhwcTileOptimize.cpp @@ -92,7 +92,7 @@ class ConvNhwcFhwcTileOptimizePattern : public ConversionPattern { [convOp.getDilations().getValues().size() - 1]; } - ShapedType inputTy = input.getType().cast(); + ShapedType inputTy = llvm::cast(input.getType()); Type elemTy = inputTy.getElementType(); VectorType vecTy = VectorType::get(vecSize, elemTy); diff --git a/midend/lib/Conversion/ConvOptimization/ConvOptimize.cpp b/midend/lib/Conversion/ConvOptimization/ConvOptimize.cpp index 043b66498c..cc7fdeb9be 100644 --- a/midend/lib/Conversion/ConvOptimization/ConvOptimize.cpp +++ b/midend/lib/Conversion/ConvOptimization/ConvOptimize.cpp @@ -57,7 +57,7 @@ class ConvOptimizePattern : public ConversionPattern { Value filter = op->getOperand(1); Value output = op->getOperand(2); - ShapedType inputTy = input.getType().cast(); + ShapedType inputTy = llvm::cast(input.getType()); Type elemTy = inputTy.getElementType(); VectorType vecTy = VectorType::get(vecSize, elemTy); @@ -105,14 +105,14 @@ class ConvOptimizePattern : public ConversionPattern { Value columnInput = builder.create(loc, AffineMap::get(2, 0, d0 + d1 + j * vecSize), ValueRange{ivD, ivG}); Value columnFilter = builder.create(loc, AffineMap::get(1, 0, d0 + j * vecSize), ivG); - Value i = builder.create(loc, vecTy, input, ValueRange{ivA, ivE, rowInput, columnInput}); + Value i = builder.create(loc, vecTy, input, ValueRange{ivA, ivE, rowInput, columnInput}, /*padding=*/std::nullopt); auto protectedF = builder.create(loc, vecTy, IntegerSet::get(1, 1, {s0 - 1 - d0}, {false}), ValueRange{rowFilter, f}, true); // if row in range, read normally. auto thenBuilder = protectedF.getThenBodyBuilder(); - Value normalReadVec = thenBuilder.create(loc, vecTy, filter, ValueRange{ivB, ivE, rowFilter, columnFilter}); + Value normalReadVec = thenBuilder.create(loc, vecTy, filter, ValueRange{ivB, ivE, rowFilter, columnFilter}, /*padding=*/std::nullopt); thenBuilder.create(loc, normalReadVec); // if row out of range, give back a empty vector. 
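The .td and conversion-pass hunks from here on all make the same cast migration: the member-style `x.getType().cast<T>()`, removed upstream, becomes the free function `llvm::cast<T>(x.getType())`. A self-contained sketch; the helper name is illustrative:

  #include "mlir/IR/BuiltinTypes.h"
  #include "mlir/IR/Value.h"
  #include "llvm/Support/Casting.h"

  // Old: base.getType().cast<mlir::MemRefType>()
  static mlir::MemRefType getMemRefType(mlir::Value base) {
    // llvm::cast asserts on a type mismatch, matching the old .cast() behavior;
    // use llvm::dyn_cast where the original code checked the result for null.
    return llvm::cast<mlir::MemRefType>(base.getType());
  }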
diff --git a/midend/lib/Conversion/ConvVectorization/CBConvVectorization.cpp b/midend/lib/Conversion/ConvVectorization/CBConvVectorization.cpp index d15ba87abd..21c0f57955 100644 --- a/midend/lib/Conversion/ConvVectorization/CBConvVectorization.cpp +++ b/midend/lib/Conversion/ConvVectorization/CBConvVectorization.cpp @@ -54,7 +54,7 @@ void populateCBSplitingPattern(Operation *op, int64_t stride, Value c1 = rewriter.create(loc, 1); Value cStride = rewriter.create(loc, stride); Value f0 = rewriter.create( - loc, APFloat::getZero(f32.getFloatSemantics()), f32); + loc, f32, APFloat::getZero(f32.getFloatSemantics())); // Create pass through vector. Value passThroughVec = rewriter.create(loc, vectorTy32, f0); // Get input, kernel and output. @@ -199,7 +199,7 @@ void populateCBTilingPattern(Operation *op, ArrayRef tileSizes, Value kernelCol = rewriter.create(loc, kernel, c1); // Define padding value. Value f0 = rewriter.create( - loc, APFloat::getZero(f32.getFloatSemantics()), f32); + loc, f32, APFloat::getZero(f32.getFloatSemantics())); // Size of strip mining. AffineExpr d0; bindDims(ctx, d0); diff --git a/midend/lib/Conversion/ConvVectorization/GEMMPointwiseConv2DNhwcHwcf.cpp b/midend/lib/Conversion/ConvVectorization/GEMMPointwiseConv2DNhwcHwcf.cpp index 918a1388d6..39573c3f31 100644 --- a/midend/lib/Conversion/ConvVectorization/GEMMPointwiseConv2DNhwcHwcf.cpp +++ b/midend/lib/Conversion/ConvVectorization/GEMMPointwiseConv2DNhwcHwcf.cpp @@ -52,9 +52,9 @@ class GEMMPointwiseConvPattern : public ConversionPattern { Value kernel = op->getOperand(1); Value output = op->getOperand(2); // Get shape of input and output - ShapedType inputShapeType = input.getType().cast(); - ShapedType filterShapeType = kernel.getType().cast(); - ShapedType outputShapeType = output.getType().cast(); + ShapedType inputShapeType = llvm::cast(input.getType()); + ShapedType filterShapeType = llvm::cast(kernel.getType()); + ShapedType outputShapeType = llvm::cast(output.getType()); auto inputShape = inputShapeType.getShape(); auto filterShape = filterShapeType.getShape(); diff --git a/midend/lib/Conversion/ConvVectorization/PoolingNhwcMaxVectorization.cpp b/midend/lib/Conversion/ConvVectorization/PoolingNhwcMaxVectorization.cpp index 280d11b226..c1e871f279 100644 --- a/midend/lib/Conversion/ConvVectorization/PoolingNhwcMaxVectorization.cpp +++ b/midend/lib/Conversion/ConvVectorization/PoolingNhwcMaxVectorization.cpp @@ -115,7 +115,7 @@ class PoolingNhwcMaxVectorizationPattern : public ConversionPattern { Value dilWidth = rewriter.create(loc, dilations[1]); // Get ElementType of input. - Type elementTy = input.getType().cast().getElementType(); + Type elementTy = llvm::cast(input.getType()).getElementType(); VectorType vectorTy = mlir::VectorType::get({strip}, elementTy); // Get Constants. diff --git a/midend/lib/Conversion/ConvVectorization/PoolingVectorization.cpp b/midend/lib/Conversion/ConvVectorization/PoolingVectorization.cpp index 8f38c0bdb9..072759c404 100644 --- a/midend/lib/Conversion/ConvVectorization/PoolingVectorization.cpp +++ b/midend/lib/Conversion/ConvVectorization/PoolingVectorization.cpp @@ -66,7 +66,7 @@ class CBPoolingNhwcSumVectorizationPattern : public ConversionPattern { Value kernel = op->getOperand(1); Value output = op->getOperand(2); // Element type. - MemRefType inputMemRefTy = dyn_cast(input.getType()); + MemRefType inputMemRefTy = llvm::cast(input.getType()); // Element type. FloatType fTy = dyn_cast(inputMemRefTy.getElementType()); // Constants. 
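The float-constant changes in the hunk above swap the builder's argument order: the result type now precedes the APFloat payload. The op class is not visible in these hunks, but from the operand lists it is evidently the arith float-constant builder; a sketch under that assumption, with `ctx`, `rewriter`, and `loc` assumed in scope:

  // Assumption: the builder being called is mlir::arith::ConstantFloatOp.
  mlir::FloatType f32 = mlir::Float32Type::get(ctx);
  // old: rewriter.create<...>(loc, llvm::APFloat(0.0f), f32);
  mlir::Value f0 = rewriter.create<mlir::arith::ConstantFloatOp>(
      loc, f32, llvm::APFloat(0.0f));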
@@ -170,7 +170,7 @@ class CBPoolingNhwcSumVectorizationPattern : public ConversionPattern { }); // Load into a vector. Value vec = rewriter.create( - loc, vecTy, window, ValueRange{c0}); + loc, vecTy, window, ValueRange{c0}, /*padding=*/std::nullopt); // Reduce vector. Value res = rewriter.create( loc, vector::CombiningKind::ADD, vec); @@ -219,7 +219,7 @@ class CBPoolingNhwcSumVectorizationPattern : public ConversionPattern { }); // Load into a vector. Value vec = rewriter.create( - loc, vecTy, window, ValueRange{c0}); + loc, vecTy, window, ValueRange{c0}, /*padding=*/std::nullopt); // Reduce vector. Value res = rewriter.create( loc, vector::CombiningKind::ADD, vec); diff --git a/midend/lib/Conversion/DAPVectorization/DAPVectorization.cpp b/midend/lib/Conversion/DAPVectorization/DAPVectorization.cpp index e1234d5b64..8946490c50 100644 --- a/midend/lib/Conversion/DAPVectorization/DAPVectorization.cpp +++ b/midend/lib/Conversion/DAPVectorization/DAPVectorization.cpp @@ -219,8 +219,8 @@ class DAPIirVectorization : public OpRewritePattern { Value filterSize = rewriter.create(loc, kernel, c0); FloatType f32 = Float32Type::get(ctx); - Value f0 = rewriter.create(loc, APFloat(0.0f), f32); - Value f1 = rewriter.create(loc, APFloat(1.0f), f32); + Value f0 = rewriter.create(loc, f32, APFloat(0.0f)); + Value f1 = rewriter.create(loc, f32, APFloat(1.0f)); Value cond4 = rewriter.create(loc, CmpIPredicate::ule, filterSize, c4); diff --git a/midend/lib/Conversion/DepthwiseConvOptimization/DepthwiseConvNhwcHwc.cpp b/midend/lib/Conversion/DepthwiseConvOptimization/DepthwiseConvNhwcHwc.cpp index dea3046cab..960860b532 100644 --- a/midend/lib/Conversion/DepthwiseConvOptimization/DepthwiseConvNhwcHwc.cpp +++ b/midend/lib/Conversion/DepthwiseConvOptimization/DepthwiseConvNhwcHwc.cpp @@ -88,7 +88,7 @@ class DepthwiseConv2DNhwcHwcOptimizePattern : public ConversionPattern { [convOp.getDilations().getValues().size() - 1]; } - ShapedType inputTy = input.getType().cast(); + ShapedType inputTy = llvm::cast(input.getType()); Type elemTy = inputTy.getElementType(); VectorType vecTy = VectorType::get(vecSize, elemTy); diff --git a/midend/lib/Conversion/ExtendDAP/ExtendDAPPass.cpp b/midend/lib/Conversion/ExtendDAP/ExtendDAPPass.cpp index 92e76ac5ea..09df5e0629 100644 --- a/midend/lib/Conversion/ExtendDAP/ExtendDAPPass.cpp +++ b/midend/lib/Conversion/ExtendDAP/ExtendDAPPass.cpp @@ -247,18 +247,18 @@ Value initMelFilter(PatternRewriter &rewriter, Location loc, Value c0, Value c1, auto mTp = MemRefType::get(melFilterType.getShape(), melFilterType.getElementType()); Value melFilterMemRef = - rewriter.create(loc, mTp, melFilter); + rewriter.create(loc, mTp, melFilter); // TODO : remove tomemref & totensor, and use insert to replace store. !! 
Value c391 = rewriter.create(loc, 391); Value number, d1, d2; - // rewriter.create(loc, c0, c391, c1, std::nullopt, + // rewriter.create(loc, c0, c391, c1, ValueRange{}, // [&](OpBuilder &builder, Location loc, Value iv, ValueRange iargs) { // number = builder.create(loc, melFilterData, iv); // d1 = builder.create(loc, dim1Index, iv); // d2 = builder.create(loc, dim2Index, iv); // builder.create(loc, number, melFilterMemRef, - // ValueRange{d1, d2}); builder.create(loc, std::nullopt); + // ValueRange{d1, d2}); builder.create(loc, ValueRange{}); // }); auto loopOp = rewriter.create(loc, c0, c391, c1); rewriter.setInsertionPointToStart(loopOp.getBody()); @@ -272,8 +272,10 @@ Value initMelFilter(PatternRewriter &rewriter, Location loc, Value c0, Value c1, rewriter.setInsertionPointAfter(loopOp); + auto bufferType = llvm::cast(melFilterMemRef.getType()); + auto tensorType = RankedTensorType::get(bufferType.getShape(), bufferType.getElementType()); Value newMelFilter = rewriter.create( - loc, melFilterMemRef, /*restrict=*/true, /*writable=*/false); + loc, tensorType, melFilterMemRef); return newMelFilter; } @@ -766,12 +768,12 @@ void printMemref(OpBuilder &rewriter, Location loc, Value input, int l) { rewriter.create(loc, "Print Start:\n"); rewriter.create( - loc, c0, length, c1, std::nullopt, + loc, c0, length, c1, ValueRange{}, [&](OpBuilder &b, Location loc, Value i, ValueRange iargs) { Value x = b.create(loc, input, i); b.create(loc, x); - b.create(loc, std::nullopt); + b.create(loc, ValueRange{}); }); rewriter.create(loc, "\n"); @@ -941,18 +943,18 @@ void radfgExtend(OpBuilder &opBuilder, Location loc, Value cc, Value ch, Value idl1 = opBuilder.create(loc, ido, l1); opBuilder.create( - loc, c0, idl1, c1, std::nullopt, + loc, c0, idl1, c1, ValueRange{}, [&](OpBuilder &builder, Location loc, Value ik, ValueRange ik_args) { Value c2ik0 = C2(builder, loc, cc, ik, c0, idl1); CH2w(builder, loc, ch, ik, c0, idl1, c2ik0); - builder.create(loc, std::nullopt); + builder.create(loc, ValueRange{}); }); opBuilder.create( - loc, c1, ipph, c1, std::nullopt, + loc, c1, ipph, c1, ValueRange{}, [&](OpBuilder &builder, Location loc, Value j, ValueRange j_args) { builder.create( - loc, c0, idl1, c1, std::nullopt, + loc, c0, idl1, c1, ValueRange{}, [&](OpBuilder &b, Location loc, Value ik, ValueRange ik_args) { Value c2ikj = C2(b, loc, cc, ik, j, idl1); Value ch2ik0 = CH2(b, loc, ch, ik, c0, idl1); @@ -960,23 +962,23 @@ void radfgExtend(OpBuilder &opBuilder, Location loc, Value cc, Value ch, b.create(loc, ch2ik0, c2ikj); CH2w(b, loc, ch, ik, c0, idl1, ch2ik0_updated); - b.create(loc, std::nullopt); + b.create(loc, ValueRange{}); }); - builder.create(loc, std::nullopt); + builder.create(loc, ValueRange{}); }); opBuilder.create( - loc, c0, l1, c1, std::nullopt, + loc, c0, l1, c1, ValueRange{}, [&](OpBuilder &builder, Location loc, Value k, ValueRange k_args) { builder.create( - loc, c0, ido, c1, std::nullopt, + loc, c0, ido, c1, ValueRange{}, [&](OpBuilder &b, Location loc, Value i, ValueRange i_args) { Value chik0 = CH_radfg(b, loc, ch, i, k, c0, ido, l1); CCw(b, loc, cc, i, c0, k, ido, cdim, chik0); - b.create(loc, std::nullopt); + b.create(loc, ValueRange{}); }); - builder.create(loc, std::nullopt); + builder.create(loc, ValueRange{}); }); Value j_start_0 = opBuilder.create(loc, 1); @@ -994,7 +996,7 @@ void radfgExtend(OpBuilder &opBuilder, Location loc, Value cc, Value ch, Value j2p1 = builder.create(loc, j2, c1); builder.create( - loc, c0, l1, c1, std::nullopt, + loc, c0, l1, c1, ValueRange{}, [&](OpBuilder 
&b, Location loc, Value k, ValueRange k_args) { Value ch0kj = CH_radfg(b, loc, ch, c0, k, j, ido, l1); CCw(b, loc, cc, idom1, j2, k, ido, cdim, ch0kj); @@ -1002,7 +1004,7 @@ void radfgExtend(OpBuilder &opBuilder, Location loc, Value cc, Value ch, Value ch0kjc = CH_radfg(b, loc, ch, c0, k, jc, ido, l1); CCw(b, loc, cc, c0, j2p1, k, ido, cdim, ch0kjc); - b.create(loc, std::nullopt); + b.create(loc, ValueRange{}); }); Value j_next = builder.create(loc, j, c1); @@ -1030,7 +1032,7 @@ void radfgExtend(OpBuilder &opBuilder, Location loc, Value cc, Value ch, Value j2p1 = b.create(loc, j2, c1); b.create( - loc, c0, l1, c1, std::nullopt, + loc, c0, l1, c1, ValueRange{}, [&](OpBuilder &b2, Location loc, Value k, ValueRange k_args) { Value i_start_0 = b2.create(loc, 1); Value ic_start_0 = b2.create(loc, ido, c3); @@ -1071,14 +1073,14 @@ void radfgExtend(OpBuilder &opBuilder, Location loc, Value cc, Value ch, b3.create( loc, std::vector{i_next, ic_next}); }); - b2.create(loc, std::nullopt); + b2.create(loc, ValueRange{}); }); Value j_next = b.create(loc, j, c1); Value jc_next = b.create(loc, jc, c1); b.create(loc, std::vector{j_next, jc_next}); }); - builder.create(loc, std::nullopt); + builder.create(loc, ValueRange{}); }); return; @@ -1126,7 +1128,7 @@ void radfg(OpBuilder &opBuilder, Location loc, Value cc, Value ch, Value wa, Value is2 = b.create(loc, jcm1, idom1); b.create( - loc, c0, l1, c1, std::nullopt, + loc, c0, l1, c1, ValueRange{}, [&](OpBuilder &b2, Location loc, Value k, ValueRange k_args) { Value idij_start = b2.create(loc, is, c0); Value idij2_start = b2.create(loc, is2, c0); @@ -1204,7 +1206,7 @@ void radfg(OpBuilder &opBuilder, Location loc, Value cc, Value ch, Value wa, b3.create( loc, std::vector{idij_next, idij2_next}); }); - b2.create(loc, std::nullopt); + b2.create(loc, ValueRange{}); } ); @@ -1213,7 +1215,7 @@ void radfg(OpBuilder &opBuilder, Location loc, Value cc, Value ch, Value wa, b.create(loc, jc_next); }); - builder.create(loc, std::nullopt); + builder.create(loc, ValueRange{}); }); Value jc_a_start = opBuilder.create(loc, ip, c1); @@ -1224,7 +1226,7 @@ void radfg(OpBuilder &opBuilder, Location loc, Value cc, Value ch, Value wa, Value jc_a = j_a_args[0]; builder.create( - loc, c0, l1, c1, std::nullopt, + loc, c0, l1, c1, ValueRange{}, [&](OpBuilder &b, Location loc, Value k_a, ValueRange k_a_args) { Value t1_a = C1(b, loc, cc, c0, k_a, j_a, ido, l1); Value t2_a = C1(b, loc, cc, c0, k_a, jc_a, ido, l1); @@ -1234,7 +1236,7 @@ void radfg(OpBuilder &opBuilder, Location loc, Value cc, Value ch, Value wa, C1w(b, loc, cc, c0, k_a, j_a, ido, l1, tmp_a); C1w(b, loc, cc, c0, k_a, jc_a, ido, l1, tmp1_a); - b.create(loc, std::nullopt); + b.create(loc, ValueRange{}); }); Value jc_a_next = builder.create(loc, jc_a, c1); @@ -1249,7 +1251,7 @@ void radfg(OpBuilder &opBuilder, Location loc, Value cc, Value ch, Value wa, Value lc_b = l_b_args[0]; builder.create( - loc, c0, idl1, c1, std::nullopt, + loc, c0, idl1, c1, ValueRange{}, [&](OpBuilder &b, Location loc, Value ik_b, ValueRange ik_b_args) { Value m2l = b.create(loc, l_b, c2); Value m4l = b.create(loc, l_b, c4); @@ -1280,7 +1282,7 @@ void radfg(OpBuilder &opBuilder, Location loc, Value cc, Value ch, Value wa, Value tmp6_b = b.create(loc, tmp4_b, tmp5_b); CH2w(b, loc, ch, ik_b, lc_b, idl1, tmp6_b); - b.create(loc, std::nullopt); + b.create(loc, ValueRange{}); }); Value iang_start_c = builder.create(loc, c2, l_b); @@ -1315,7 +1317,7 @@ void radfg(OpBuilder &opBuilder, Location loc, Value cc, Value ch, Value wa, Value ai4 = AI(b, loc, 
csarr, iang_4_c); b.create( - loc, c0, idl1, c1, std::nullopt, + loc, c0, idl1, c1, ValueRange{}, [&](OpBuilder &b2, Location loc, Value ik_c, ValueRange ik_c_args) { Value jp1 = b2.create(loc, j, c1); @@ -1376,7 +1378,7 @@ void radfg(OpBuilder &opBuilder, Location loc, Value cc, Value ch, Value wa, b2.create(loc, tmp_ai7, ch2iklc); CH2w(b2, loc, ch, ik_c, lc_b, idl1, tmp_ai8); - b2.create(loc, std::nullopt); + b2.create(loc, ValueRange{}); }); Value j_next = b.create(loc, j, c4); @@ -1406,7 +1408,7 @@ void radfg(OpBuilder &opBuilder, Location loc, Value cc, Value ch, Value wa, Value ai2 = AI(b, loc, csarr, iang_2_d); b.create( - loc, c0, idl1, c1, std::nullopt, + loc, c0, idl1, c1, ValueRange{}, [&](OpBuilder &b2, Location loc, Value ik_d, ValueRange ik_d_args) { Value jp1 = b2.create(loc, j, c1); @@ -1440,7 +1442,7 @@ void radfg(OpBuilder &opBuilder, Location loc, Value cc, Value ch, Value wa, b2.create(loc, tmp_ai3, ch2iklc); CH2w(b2, loc, ch, ik_d, lc_b, idl1, tmp_ai4); - b2.create(loc, std::nullopt); + b2.create(loc, ValueRange{}); }); Value j_next = b.create(loc, j, c2); @@ -1466,7 +1468,7 @@ void radfg(OpBuilder &opBuilder, Location loc, Value cc, Value ch, Value wa, Value ai = AI(b, loc, csarr, iang_1_e); b.create( - loc, c0, idl1, c1, std::nullopt, + loc, c0, idl1, c1, ValueRange{}, [&](OpBuilder &b2, Location loc, Value ik_e, ValueRange ik_e_args) { Value c2ikj = C2(b2, loc, cc, ik_e, j, idl1); @@ -1482,7 +1484,7 @@ void radfg(OpBuilder &opBuilder, Location loc, Value cc, Value ch, Value wa, b2.create(loc, tmp_ai, ch2iklc); CH2w(b2, loc, ch, ik_e, lc_b, idl1, tmp2_ai); - b2.create(loc, std::nullopt); + b2.create(loc, ValueRange{}); }); Value j_next = b.create(loc, j, c2); @@ -1514,10 +1516,10 @@ void radf2Extend(OpBuilder &opBuilder, Location loc, Value cc, Value ch, // Value idom1 = opBuilder.create(loc, ido, c1); opBuilder.create( - loc, c0, l1, c1, std::nullopt, + loc, c0, l1, c1, ValueRange{}, [&](OpBuilder &builder, Location loc, Value k, ValueRange k_args) { builder.create( - loc, c2, ido, c2, std::nullopt, + loc, c2, ido, c2, ValueRange{}, [&](OpBuilder &b, Location loc, Value i, ValueRange i_args) { Value ic = b.create(loc, ido, i); Value icm1 = b.create(loc, ic, c1); @@ -1541,9 +1543,9 @@ void radf2Extend(OpBuilder &opBuilder, Location loc, Value cc, Value ch, CH(b, loc, ch, i, c0, k, ido, cdim, ti2_ccik0[0]); CH(b, loc, ch, ic, c1, k, ido, cdim, ti2_ccik0[1]); - b.create(loc, std::nullopt); + b.create(loc, ValueRange{}); }); - builder.create(loc, std::nullopt); + builder.create(loc, ValueRange{}); }); } @@ -1565,14 +1567,14 @@ void radf2(OpBuilder &opBuilder, Location loc, Value cc, Value ch, Value wa, Value idom1 = opBuilder.create(loc, ido, c1); opBuilder.create( - loc, c0, l1, c1, std::nullopt, + loc, c0, l1, c1, ValueRange{}, [&](OpBuilder &builder, Location loc, Value iv, ValueRange iv_args) { Value cc0k0 = CC(builder, loc, cc, c0, iv, c0, ido, l1); Value cc0k1 = CC(builder, loc, cc, c0, iv, c1, ido, l1); std::vector cc0k0_cc0k1 = PM(builder, loc, cc0k0, cc0k1); CH(builder, loc, ch, c0, c0, iv, ido, cdim, cc0k0_cc0k1[0]); CH(builder, loc, ch, idom1, c1, iv, ido, cdim, cc0k0_cc0k1[1]); - builder.create(loc, std::nullopt); + builder.create(loc, ValueRange{}); }); Value flag = opBuilder.create(loc, ido, c2); @@ -1582,16 +1584,16 @@ void radf2(OpBuilder &opBuilder, Location loc, Value cc, Value ch, Value wa, opBuilder.create( loc, condition, [&](OpBuilder &builder, Location loc) { builder.create( - loc, c0, l1, c1, std::nullopt, + loc, c0, l1, c1, ValueRange{}, 
[&](OpBuilder &b, Location loc, Value k, ValueRange k_args) { Value ccidom1k1 = CC(b, loc, cc, idom1, k, c1, ido, l1); Value tmp = b.create(loc, ccidom1k1); CH(b, loc, ch, c0, c1, k, ido, cdim, tmp); Value ccidom1k0 = CC(b, loc, cc, idom1, k, c0, ido, l1); CH(b, loc, ch, idom1, c0, k, ido, cdim, ccidom1k0); - b.create(loc, std::nullopt); + b.create(loc, ValueRange{}); }); - builder.create(loc, std::nullopt); + builder.create(loc, ValueRange{}); }); Value condition1 = @@ -1599,7 +1601,7 @@ void radf2(OpBuilder &opBuilder, Location loc, Value cc, Value ch, Value wa, opBuilder.create( loc, condition1, [&](OpBuilder &builder, Location loc) { radf2Extend(builder, loc, cc, ch, wa, ido, l1, cdim); - builder.create(loc, std::nullopt); + builder.create(loc, ValueRange{}); }); } @@ -1608,9 +1610,9 @@ void radf3Extend(OpBuilder &opBuilder, Location loc, Value cc, Value ch, FloatType f64Ty = opBuilder.getF64Type(); Value taur = - opBuilder.create(loc, APFloat(double(-0.5)), f64Ty); + opBuilder.create(loc, f64Ty, APFloat(double(-0.5))); Value taui = opBuilder.create( - loc, APFloat(double(0.86602540378443864676)), f64Ty); + loc, f64Ty, APFloat(double(0.86602540378443864676))); Value c0 = opBuilder.create(loc, 0); Value c1 = opBuilder.create(loc, 1); @@ -1620,10 +1622,10 @@ void radf3Extend(OpBuilder &opBuilder, Location loc, Value cc, Value ch, // Value c4 = opBuilder.create(loc, 4); opBuilder.create( - loc, c0, l1, c1, std::nullopt, + loc, c0, l1, c1, ValueRange{}, [&](OpBuilder &builder, Location loc, Value k, ValueRange k_args) { builder.create( - loc, c2, ido, c2, std::nullopt, + loc, c2, ido, c2, ValueRange{}, [&](OpBuilder &b, Location loc, Value i, ValueRange i_args) { Value ic = b.create(loc, ido, i); Value icm1 = b.create(loc, ic, c1); @@ -1677,9 +1679,9 @@ void radf3Extend(OpBuilder &opBuilder, Location loc, Value cc, Value ch, CH(builder, loc, ch, i, c2, k, ido, cdim, ti3_ti2[0]); CH(builder, loc, ch, ic, c1, k, ido, cdim, ti3_ti2[1]); - b.create(loc, std::nullopt); + b.create(loc, ValueRange{}); }); - builder.create(loc, std::nullopt); + builder.create(loc, ValueRange{}); }); } @@ -1690,9 +1692,9 @@ void radf3(OpBuilder &opBuilder, Location loc, Value cc, Value ch, Value wa, FloatType f64Ty = opBuilder.getF64Type(); Value cdim = opBuilder.create(loc, 3); Value taur = - opBuilder.create(loc, APFloat(double(-0.5)), f64Ty); + opBuilder.create(loc, f64Ty, APFloat(double(-0.5))); Value taui = opBuilder.create( - loc, APFloat(double(0.86602540378443864676)), f64Ty); + loc, f64Ty, APFloat(double(0.86602540378443864676))); Value c0 = opBuilder.create(loc, 0); Value c1 = opBuilder.create(loc, 1); @@ -1704,7 +1706,7 @@ void radf3(OpBuilder &opBuilder, Location loc, Value cc, Value ch, Value wa, Value idom1 = opBuilder.create(loc, ido, c1); opBuilder.create( - loc, c0, l1, c1, std::nullopt, + loc, c0, l1, c1, ValueRange{}, [&](OpBuilder &builder, Location loc, Value iv, ValueRange iv_args) { Value cc0k1 = CC(builder, loc, cc, c0, iv, c1, ido, l1); Value cc0k2 = CC(builder, loc, cc, c0, iv, c2, ido, l1); @@ -1722,7 +1724,7 @@ void radf3(OpBuilder &opBuilder, Location loc, Value cc, Value ch, Value wa, Value tmp4 = builder.create(loc, tmp3, cc0k0); CH(builder, loc, ch, idom1, c1, iv, ido, cdim, tmp4); - builder.create(loc, std::nullopt); + builder.create(loc, ValueRange{}); }); Value condition = @@ -1730,7 +1732,7 @@ void radf3(OpBuilder &opBuilder, Location loc, Value cc, Value ch, Value wa, opBuilder.create( loc, condition, [&](OpBuilder &builder, Location loc) { radf3Extend(builder, loc, cc, ch, wa, 
ido, l1, cdim); - builder.create(loc, std::nullopt); + builder.create(loc, ValueRange{}); }); } @@ -1738,10 +1740,10 @@ void radf4Extend(OpBuilder &opBuilder, Location loc, Value cc, Value ch, Value wa, Value ido, Value l1, Value cdim, Value c0, Value c1, Value c2, Value c3) { opBuilder.create( - loc, c0, l1, c1, std::nullopt, + loc, c0, l1, c1, ValueRange{}, [&](OpBuilder &builder, Location loc, Value k, ValueRange kargs) { builder.create( - loc, c2, ido, c2, std::nullopt, + loc, c2, ido, c2, ValueRange{}, [&](OpBuilder &b, Location loc, Value i, ValueRange iargs) { Value ic = b.create(loc, ido, i); Value icm1 = b.create(loc, ic, c1); @@ -1792,10 +1794,10 @@ void radf4Extend(OpBuilder &opBuilder, Location loc, Value cc, Value ch, CH(b, loc, ch, i, c2, k, ido, cdim, chtmp3[0]); CH(b, loc, ch, ic, c1, k, ido, cdim, chtmp3[1]); - b.create(loc, std::nullopt); + b.create(loc, ValueRange{}); }); - builder.create(loc, std::nullopt); + builder.create(loc, ValueRange{}); }); return; @@ -1807,11 +1809,11 @@ void radf4(OpBuilder &opBuilder, Location loc, Value cc, Value ch, Value wa, FloatType f64Ty = opBuilder.getF64Type(); Value cdim = opBuilder.create(loc, 4); Value hsqt2 = opBuilder.create( - loc, APFloat(double(0.70710678118654752440)), f64Ty); + loc, f64Ty, APFloat(double(0.70710678118654752440))); Value idom1 = opBuilder.create(loc, ido, c1); opBuilder.create( - loc, c0, l1, c1, std::nullopt, + loc, c0, l1, c1, ValueRange{}, [&](OpBuilder &builder, Location loc, Value iv, ValueRange iargs) { Value cc0k3 = CC(builder, loc, cc, c0, iv, c3, ido, l1); Value cc0k1 = CC(builder, loc, cc, c0, iv, c1, ido, l1); @@ -1828,7 +1830,7 @@ void radf4(OpBuilder &opBuilder, Location loc, Value cc, Value ch, Value wa, CH(builder, loc, ch, c0, c0, iv, ido, cdim, tmp2_tmp3[0]); CH(builder, loc, ch, idom1, c3, iv, ido, cdim, tmp2_tmp3[1]); - builder.create(loc, std::nullopt); + builder.create(loc, ValueRange{}); }); Value reminder = opBuilder.create(loc, ido, c2); @@ -1837,10 +1839,10 @@ void radf4(OpBuilder &opBuilder, Location loc, Value cc, Value ch, Value wa, opBuilder.create( loc, condition0, [&](OpBuilder &builder, Location loc) { Value negHsqt2 = builder.create( - loc, APFloat(double(-0.70710678118654752440)), f64Ty); + loc, f64Ty, APFloat(double(-0.70710678118654752440))); builder.create( - loc, c0, l1, c1, std::nullopt, + loc, c0, l1, c1, ValueRange{}, [&](OpBuilder &b, Location loc, Value iv, ValueRange iargs) { Value ccidom1k1 = CC(b, loc, cc, idom1, iv, c1, ido, l1); Value ccidom1k3 = CC(b, loc, cc, idom1, iv, c3, ido, l1); @@ -1860,10 +1862,10 @@ void radf4(OpBuilder &opBuilder, Location loc, Value cc, Value ch, Value wa, CH(b, loc, ch, c0, c3, iv, ido, cdim, tmp4_tmp5[0]); CH(b, loc, ch, c0, c1, iv, ido, cdim, tmp4_tmp5[1]); - b.create(loc, std::nullopt); + b.create(loc, ValueRange{}); }); - builder.create(loc, std::nullopt); + builder.create(loc, ValueRange{}); }); Value condition1 = @@ -1871,7 +1873,7 @@ void radf4(OpBuilder &opBuilder, Location loc, Value cc, Value ch, Value wa, opBuilder.create( loc, condition1, [&](OpBuilder &builder, Location loc) { radf4Extend(builder, loc, cc, ch, wa, ido, l1, cdim, c0, c1, c2, c3); - builder.create(loc, std::nullopt); + builder.create(loc, ValueRange{}); }); return; @@ -1882,10 +1884,10 @@ void radf5Extend(OpBuilder &opBuilder, Location loc, Value cc, Value ch, Value tr12, Value ti11, Value ti12, Value c0, Value c1, Value c2, Value c3, Value c4) { opBuilder.create( - loc, c0, l1, c1, std::nullopt, + loc, c0, l1, c1, ValueRange{}, [&](OpBuilder &builder, 
Location loc, Value k, ValueRange kargs) { builder.create( - loc, c2, ido, c2, std::nullopt, + loc, c2, ido, c2, ValueRange{}, [&](OpBuilder &b, Location loc, Value i, ValueRange iargs) { Value ic = b.create(loc, ido, i); Value icm1 = b.create(loc, ic, c1); @@ -1976,10 +1978,10 @@ void radf5Extend(OpBuilder &opBuilder, Location loc, Value cc, Value ch, CH(b, loc, ch, i, c4, k, ido, cdim, chtmp3[0]); CH(b, loc, ch, ic, c3, k, ido, cdim, chtmp3[1]); - b.create(loc, std::nullopt); + b.create(loc, ValueRange{}); }); - builder.create(loc, std::nullopt); + builder.create(loc, ValueRange{}); }); return; @@ -1993,17 +1995,17 @@ void radf5(OpBuilder &builder, Location loc, Value cc, Value ch, Value wa, FloatType f64Ty = builder.getF64Type(); Value cdim = builder.create(loc, 5); Value tr11 = builder.create( - loc, APFloat(double(0.3090169943749474241)), f64Ty); + loc, f64Ty, APFloat(double(0.3090169943749474241))); Value tr12 = builder.create( - loc, APFloat(double(-0.8090169943749474241)), f64Ty); + loc, f64Ty, APFloat(double(-0.8090169943749474241))); Value ti11 = builder.create( - loc, APFloat(double(0.95105651629515357212)), f64Ty); + loc, f64Ty, APFloat(double(0.95105651629515357212))); Value ti12 = builder.create( - loc, APFloat(double(0.58778525229247312917)), f64Ty); + loc, f64Ty, APFloat(double(0.58778525229247312917))); Value idom1 = builder.create(loc, ido, c1); builder.create( - loc, c0, l1, c1, std::nullopt, + loc, c0, l1, c1, ValueRange{}, [&](OpBuilder &b, Location loc, Value iv, ValueRange iargs) { Value cc0k4 = CC(b, loc, cc, c0, iv, c4, ido, l1); Value cc0k1 = CC(b, loc, cc, c0, iv, c1, ido, l1); @@ -2040,7 +2042,7 @@ void radf5(OpBuilder &builder, Location loc, Value cc, Value ch, Value wa, Value ch4 = b.create(loc, tmpch9, tmpch10); CH(b, loc, ch, c0, c4, iv, ido, cdim, ch4); - b.create(loc, std::nullopt); + b.create(loc, ValueRange{}); }); Value condition = @@ -2048,7 +2050,7 @@ void radf5(OpBuilder &builder, Location loc, Value cc, Value ch, Value wa, builder.create(loc, condition, [&](OpBuilder &b, Location loc) { radf5Extend(b, loc, cc, ch, wa, ido, l1, cdim, tr11, tr12, ti11, ti12, c0, c1, c2, c3, c4); - b.create(loc, std::nullopt); + b.create(loc, ValueRange{}); }); return; @@ -2145,7 +2147,7 @@ Value rfftp_factorize(OpBuilder &opBuilder, Location loc, Value nfctm1 = builder.create(loc, currnet_nfct_1, c1); index_SWAP(builder, loc, Rfftp_fctdata_fct, nfctm1, c0); - builder.create(loc, std::nullopt); + builder.create(loc, ValueRange{}); }); // TODO: remove type1 and type2? 
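Running through every FFT hunk in this file is one more mechanical substitution: empty iter-args and empty yields for scf.for are spelled `ValueRange{}` rather than `std::nullopt`, since the updated builders no longer accept `std::nullopt` for an operand range. The resulting idiom, assuming `builder`, `loc`, and index-typed `lb`/`ub`/`step` in scope:

  #include "mlir/Dialect/SCF/IR/SCF.h"

  // for %iv = %lb to %ub step %step { ... } with no iter_args:
  builder.create<mlir::scf::ForOp>(
      loc, lb, ub, step, mlir::ValueRange{},
      [](mlir::OpBuilder &b, mlir::Location l, mlir::Value iv,
         mlir::ValueRange iterArgs) {
        // loop body goes here; with no iter_args the terminator yields nothing
        b.create<mlir::scf::YieldOp>(l, mlir::ValueRange{});
      });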
@@ -2240,7 +2242,7 @@ Value rfftp_factorize(OpBuilder &opBuilder, Location loc, b.create(loc, maxl_index_index_1, c1); b.create(loc, maxl_final_1, maxl, c0); - b.create(loc, std::nullopt); + b.create(loc, ValueRange{}); }); Value divisor_next = builder.create(loc, divisor, c2); @@ -2256,7 +2258,7 @@ Value rfftp_factorize(OpBuilder &opBuilder, Location loc, builder.create(loc, current_length1, Rfftp_fctdata_fct, current_nfct); index_increment(builder, loc, nfct); - builder.create(loc, std::nullopt); + builder.create(loc, ValueRange{}); }); Value current_nfct1 = opBuilder.create(loc, nfct, c0); @@ -2308,19 +2310,19 @@ void my_sincosm1pi(OpBuilder &opBuilder, Location loc, Value a, Value res, Value s = opBuilder.create(loc, a, a); Value r1 = opBuilder.create( - loc, APFloat(double(-1.0369917389758117e-4)), f64Ty); + loc, f64Ty, APFloat(double(-1.0369917389758117e-4))); Value r2 = opBuilder.create( - loc, APFloat(double(1.9294935641298806e-3)), f64Ty); + loc, f64Ty, APFloat(double(1.9294935641298806e-3))); Value r3 = opBuilder.create( - loc, APFloat(double(-2.5806887942825395e-2)), f64Ty); + loc, f64Ty, APFloat(double(-2.5806887942825395e-2))); Value r4 = opBuilder.create( - loc, APFloat(double(2.3533063028328211e-1)), f64Ty); + loc, f64Ty, APFloat(double(2.3533063028328211e-1))); Value r5 = opBuilder.create( - loc, APFloat(double(-1.3352627688538006e+0)), f64Ty); + loc, f64Ty, APFloat(double(-1.3352627688538006e+0))); Value r6 = opBuilder.create( - loc, APFloat(double(4.0587121264167623e+0)), f64Ty); + loc, f64Ty, APFloat(double(4.0587121264167623e+0))); Value r7 = opBuilder.create( - loc, APFloat(double(-4.9348022005446790e+0)), f64Ty); + loc, f64Ty, APFloat(double(-4.9348022005446790e+0))); Value fma1 = opBuilder.create(loc, r1, s, r2); Value fma2 = opBuilder.create(loc, fma1, s, r3); @@ -2332,17 +2334,17 @@ void my_sincosm1pi(OpBuilder &opBuilder, Location loc, Value a, Value res, Value c = opBuilder.create(loc, fma6, s); Value r8 = opBuilder.create( - loc, APFloat(double(4.6151442520157035e-4)), f64Ty); + loc, f64Ty, APFloat(double(4.6151442520157035e-4))); Value r9 = opBuilder.create( - loc, APFloat(double(-7.3700183130883555e-3)), f64Ty); + loc, f64Ty, APFloat(double(-7.3700183130883555e-3))); Value r10 = opBuilder.create( - loc, APFloat(double(8.2145868949323936e-2)), f64Ty); + loc, f64Ty, APFloat(double(8.2145868949323936e-2))); Value r11 = opBuilder.create( - loc, APFloat(double(-5.9926452893214921e-1)), f64Ty); + loc, f64Ty, APFloat(double(-5.9926452893214921e-1))); Value r12 = opBuilder.create( - loc, APFloat(double(2.5501640398732688e+0)), f64Ty); + loc, f64Ty, APFloat(double(2.5501640398732688e+0))); Value r13 = opBuilder.create( - loc, APFloat(double(-5.1677127800499516e+0)), f64Ty); + loc, f64Ty, APFloat(double(-5.1677127800499516e+0))); Value fma7 = opBuilder.create(loc, r8, s, r9); Value fma8 = opBuilder.create(loc, fma7, s, r10); @@ -2354,7 +2356,7 @@ void my_sincosm1pi(OpBuilder &opBuilder, Location loc, Value a, Value res, Value r = opBuilder.create(loc, fma11, s_new); Value pi = opBuilder.create( - loc, APFloat(double(3.1415926535897931e+0)), f64Ty); + loc, f64Ty, APFloat(double(3.1415926535897931e+0))); Value s_final = opBuilder.create(loc, a, pi, r); opBuilder.create(loc, c, res_raw, c0); @@ -2395,19 +2397,19 @@ void calc_first_octant_extend2(OpBuilder &opBuilder, Location loc, Value den, SmallVector{remaining_size}, SmallVector{c1}); Value f2 = - opBuilder.create(loc, APFloat(double(2.0)), f64Ty); + opBuilder.create(loc, f64Ty, APFloat(double(2.0))); Value f1 = - 
opBuilder.create(loc, APFloat(double(1.0)), f64Ty); + opBuilder.create(loc, f64Ty, APFloat(double(1.0))); // TODO: remove f0? // Value f0 = - // opBuilder.create(loc, APFloat(double(0.0)), f64Ty); + // opBuilder.create(loc, f64Ty, APFloat(double(0.0))); Value n_f64 = index_to_f64(opBuilder, loc, n); Value l1_f64 = opBuilder.create(loc, n_f64); Value l1 = f64_to_index(opBuilder, loc, l1_f64); opBuilder.create( - loc, c1, l1, c1, std::nullopt, + loc, c1, l1, c1, ValueRange{}, [&](OpBuilder &builder, Location loc, Value i, ValueRange iargs) { Value i_f64 = index_to_f64(builder, loc, i); Value den_f64 = index_to_f64(builder, loc, den); @@ -2418,13 +2420,13 @@ void calc_first_octant_extend2(OpBuilder &opBuilder, Location loc, Value den, Value im2_bias = builder.create(loc, im2, bias); my_sincosm1pi(builder, loc, arg_scaled, res, im2_bias); - builder.create(loc, std::nullopt); + builder.create(loc, ValueRange{}); }); Value start_start = opBuilder.create(loc, l1, c0); opBuilder.create( - loc, start_start, n, l1, std::nullopt, + loc, start_start, n, l1, ValueRange{}, [&](OpBuilder &builder, Location loc, Value start_loop, ValueRange start_loop_args) { Value start_f64 = index_to_f64(builder, loc, start_loop); @@ -2455,7 +2457,7 @@ void calc_first_octant_extend2(OpBuilder &opBuilder, Location loc, Value den, n_minus_start, end_1); builder.create( - loc, c1, end, c1, std::nullopt, + loc, c1, end, c1, ValueRange{}, [&](OpBuilder &b, Location loc, Value i, ValueRange i_args) { Value i_2 = b.create(loc, i, c2); Value csx0 = b.create(loc, res_raw, i_2); @@ -2483,21 +2485,21 @@ void calc_first_octant_extend2(OpBuilder &opBuilder, Location loc, Value den, b.create(loc, res_real, res_raw, start_plus_i_2); b.create(loc, res_imag, res_raw, start_plus_i_2_plus_1); - b.create(loc, std::nullopt); + b.create(loc, ValueRange{}); }); builder.create(loc, cs); - builder.create(loc, std::nullopt); + builder.create(loc, ValueRange{}); }); opBuilder.create( - loc, c1, l1, c1, std::nullopt, + loc, c1, l1, c1, ValueRange{}, [&](OpBuilder &builder, Location loc, Value i, ValueRange i_args) { Value i_2 = builder.create(loc, i, c2); Value val = builder.create(loc, res_raw, i_2); Value val_plus_1 = builder.create(loc, val, f1); builder.create(loc, val_plus_1, res_raw, i_2); - builder.create(loc, std::nullopt); + builder.create(loc, ValueRange{}); }); return; @@ -2534,9 +2536,9 @@ void calc_first_octant_extend1(OpBuilder &opBuilder, Location loc, Value den, SmallVector{remaining_size}, SmallVector{c1}); Value f1 = - opBuilder.create(loc, APFloat(double(1.0)), f64Ty); + opBuilder.create(loc, f64Ty, APFloat(double(1.0))); Value f0 = - opBuilder.create(loc, APFloat(double(0.0)), f64Ty); + opBuilder.create(loc, f64Ty, APFloat(double(0.0))); opBuilder.create(loc, f1, res_raw, c0); opBuilder.create(loc, f0, res_raw, c1); @@ -2547,7 +2549,7 @@ void calc_first_octant_extend1(OpBuilder &opBuilder, Location loc, Value den, opBuilder.create( loc, condition, [&](OpBuilder &builder, Location loc) { calc_first_octant_extend2(builder, loc, den, res, bias); - builder.create(loc, std::nullopt); + builder.create(loc, ValueRange{}); }); } @@ -2570,7 +2572,7 @@ void calc_first_octant(OpBuilder &opBuilder, Location loc, Value den, Value res, opBuilder.create( loc, condition, [&](OpBuilder &builder, Location loc) { calc_first_octant_extend1(builder, loc, den, res, bias); - builder.create(loc, std::nullopt); + builder.create(loc, ValueRange{}); }); } @@ -2664,7 +2666,7 @@ void calc_first_quadrant(OpBuilder &opBuilder, Location loc, Value n, Value 
p_val_1 = builder.create(loc, p_raw, p_2i_plus_1); Value idx1_plus_1 = builder.create(loc, idx1_v, c1); builder.create(loc, p_val_1, res, idx1_plus_1); - builder.create(loc, std::nullopt); + builder.create(loc, ValueRange{}); }); return; @@ -2683,10 +2685,10 @@ void calc_first_half(OpBuilder &opBuilder, Location loc, Value n, Value res) { FloatType f64Ty = opBuilder.getF64Type(); Value f0 = - opBuilder.create(loc, APFloat(double(0.0)), f64Ty); + opBuilder.create(loc, f64Ty, APFloat(double(0.0))); // TODO: remove f1? // Value f1 = - // opBuilder.create(loc, APFloat(double(1.0)), f64Ty); + // opBuilder.create(loc, f64Ty, APFloat(double(1.0))); Value n_plus_1 = opBuilder.create(loc, n, c1); Value ndone = opBuilder.create(loc, n_plus_1, c1); @@ -2879,7 +2881,7 @@ void fill_first_quadrant(OpBuilder &opBuilder, Location loc, Value n, FloatType f64Ty = opBuilder.getF64Type(); Value hsqt2 = opBuilder.create( - loc, APFloat(double(0.707106781186547524400844362104849)), f64Ty); + loc, f64Ty, APFloat(double(0.707106781186547524400844362104849))); Value quart = opBuilder.create(loc, n, c2); Value n_mod_8 = opBuilder.create(loc, n, c8); @@ -2892,7 +2894,7 @@ void fill_first_quadrant(OpBuilder &opBuilder, Location loc, Value n, Value quart_plus_1 = builder.create(loc, quart, c1); builder.create(loc, hsqt2, res, quart); builder.create(loc, hsqt2, res, quart_plus_1); - builder.create(loc, std::nullopt); + builder.create(loc, ValueRange{}); }); Value two_quart = opBuilder.create(loc, quart, c2); @@ -2930,7 +2932,7 @@ void fill_first_half(OpBuilder &opBuilder, Location loc, Value n, Value res) { FloatType f64Ty = opBuilder.getF64Type(); Value c_1 = - opBuilder.create(loc, APFloat(double(-1.0)), f64Ty); + opBuilder.create(loc, f64Ty, APFloat(double(-1.0))); Value half = opBuilder.create(loc, n, c1); Value n_mod_4 = opBuilder.create(loc, n, c4); @@ -2942,7 +2944,7 @@ void fill_first_half(OpBuilder &opBuilder, Location loc, Value n, Value res) { loc, condition, [&](OpBuilder &builder, Location loc) { builder.create( - loc, c0, half, c2, std::nullopt, + loc, c0, half, c2, ValueRange{}, [&](OpBuilder &b, Location loc, Value i, ValueRange i_args) { Value i_plus_1 = b.create(loc, i, c1); Value i_plus_half = b.create(loc, i, half); @@ -2957,9 +2959,9 @@ void fill_first_half(OpBuilder &opBuilder, Location loc, Value n, Value res) { b.create(loc, neg_val_i_plus_1, res, i_plus_half); b.create(loc, val_i, res, i_plus_half_plus_1); - b.create(loc, std::nullopt); + b.create(loc, ValueRange{}); }); - builder.create(loc, std::nullopt); + builder.create(loc, ValueRange{}); }, [&](OpBuilder &builder, Location loc) { Value two_half_minus_2 = builder.create(loc, half, c1); @@ -2982,7 +2984,7 @@ void fill_first_half(OpBuilder &opBuilder, Location loc, Value n, Value res) { b.create(loc, j_next); }); - builder.create(loc, std::nullopt); + builder.create(loc, ValueRange{}); }); return; @@ -3011,7 +3013,7 @@ void sincos_2pibyn_half(OpBuilder &opBuilder, Location loc, Value n, fill_first_quadrant(builder, loc, n, res); fill_first_half(builder, loc, n, res); - builder.create(loc, std::nullopt); + builder.create(loc, ValueRange{}); }, [&](OpBuilder &builder, Location loc) { // TODO: remove the following values? 
@@ -3026,13 +3028,13 @@ void sincos_2pibyn_half(OpBuilder &opBuilder, Location loc, Value n, [&](OpBuilder &b, Location loc) { calc_first_quadrant(b, loc, n, res); fill_first_half(b, loc, n, res); - b.create(loc, std::nullopt); + b.create(loc, ValueRange{}); }, [&](OpBuilder &b, Location loc) { calc_first_half(b, loc, n, res); - b.create(loc, std::nullopt); + b.create(loc, ValueRange{}); }); - builder.create(loc, std::nullopt); + builder.create(loc, ValueRange{}); }); } @@ -3084,10 +3086,10 @@ Value rfftp_comp_twiddle(OpBuilder &opBuilder, Location loc, Value length, Value ido_m1_d2_p1 = b.create(loc, ido_m1_d2, c1); b.create( - loc, c1, ip, c1, std::nullopt, + loc, c1, ip, c1, ValueRange{}, [&](OpBuilder &b2, Location loc, Value j, ValueRange j_args) { b2.create( - loc, c1, ido_m1_d2_p1, c1, std::nullopt, + loc, c1, ido_m1_d2_p1, c1, ValueRange{}, [&](OpBuilder &b3, Location loc, Value i, ValueRange i_args) { Value j2 = b3.create(loc, j, c2); @@ -3121,12 +3123,12 @@ Value rfftp_comp_twiddle(OpBuilder &opBuilder, Location loc, Value length, b3.create(loc, twid_a, fct_k, tw_a); b3.create(loc, twid_b, fct_k, tw_b); - b3.create(loc, std::nullopt); + b3.create(loc, ValueRange{}); }); - b2.create(loc, std::nullopt); + b2.create(loc, ValueRange{}); }); - b.create(loc, std::nullopt); + b.create(loc, ValueRange{}); }); Value condition2 = builder.create( @@ -3136,9 +3138,9 @@ Value rfftp_comp_twiddle(OpBuilder &opBuilder, Location loc, Value length, loc, condition2, [&](OpBuilder &b, Location loc) { Value fct_k = b.create(loc, Rfftp_fctdata_tws, k); Value c_f0 = - b.create(loc, APFloat(double(0.0)), f64Ty); + b.create(loc, f64Ty, APFloat(double(0.0))); Value c_f1 = - b.create(loc, APFloat(double(1.0)), f64Ty); + b.create(loc, f64Ty, APFloat(double(1.0))); b.create(loc, c_f1, fct_k, c0); b.create(loc, c_f0, fct_k, c1); @@ -3147,7 +3149,7 @@ Value rfftp_comp_twiddle(OpBuilder &opBuilder, Location loc, Value length, Value ip_div_2_p1 = b.create(loc, ip_div_2, c1); b.create( - loc, c1, ip_div_2_p1, c1, std::nullopt, + loc, c1, ip_div_2_p1, c1, ValueRange{}, [&](OpBuilder &b2, Location loc, Value i, ValueRange i_args) { Value i2 = b2.create(loc, i, c2); Value i2_p1 = b2.create(loc, i2, c1); @@ -3174,10 +3176,10 @@ Value rfftp_comp_twiddle(OpBuilder &opBuilder, Location loc, Value length, b2.create(loc, twid_b, fct_k, i2_p1); b2.create(loc, twid_c, fct_k, ip_m_i_2); b2.create(loc, twid_d, fct_k, ip_m_i_2_p1); - b2.create(loc, std::nullopt); + b2.create(loc, ValueRange{}); }); - b.create(loc, std::nullopt); + b.create(loc, ValueRange{}); }); Value l1_next = builder.create(loc, l1, ip); @@ -3228,7 +3230,7 @@ std::vector make_rfftp_plan(OpBuilder &opBuilder, Location loc, opBuilder.create(loc, c0, Rfftp_plan_nfct, c0); opBuilder.create( - loc, c0, NFCT, c1, std::nullopt, + loc, c0, NFCT, c1, ValueRange{}, [&](OpBuilder &builder, Location loc, Value i, ValueRange iargs) { builder.create(loc, c0, Rfftp_fctdata_fct, i); @@ -3239,7 +3241,7 @@ std::vector make_rfftp_plan(OpBuilder &opBuilder, Location loc, loc, type1, /*dynamicOperands=*/length_2); builder.create(loc, tws_i, Rfftp_fctdata_tws, i); - builder.create(loc, std::nullopt); + builder.create(loc, ValueRange{}); }); Value condition = opBuilder.create( @@ -3255,7 +3257,7 @@ std::vector make_rfftp_plan(OpBuilder &opBuilder, Location loc, rfftp_comp_twiddle(builder, loc, length, Rfftp_fctdata_fct, Rfftp_fctdata_tw, Rfftp_fctdata_tws, Rfftp_plan_length, Rfftp_plan_nfct, Rfftp_plan_mem); - builder.create(loc, std::nullopt); + builder.create(loc, 
ValueRange{}); }); return {Rfftp_fctdata_fct, Rfftp_fctdata_tw, Rfftp_fctdata_tws, @@ -3274,14 +3276,14 @@ void memref_SWAP(OpBuilder &opBuilder, Location loc, Value p, Value p1) { Value length = opBuilder.create(loc, p, c0); opBuilder.create( - loc, c0, length, c1, std::nullopt, + loc, c0, length, c1, ValueRange{}, [&](OpBuilder builder, Location loc, Value i, ValueRange i_args) { Value val_p = builder.create(loc, p, i); Value val_p1 = builder.create(loc, p1, i); builder.create(loc, val_p, p1, i); builder.create(loc, val_p1, p, i); - builder.create(loc, std::nullopt); + builder.create(loc, ValueRange{}); }); } @@ -3307,7 +3309,7 @@ void copy_and_norm(OpBuilder &opBuilder, Location loc, Value c, Value p1, // Value c3 = opBuilder.create(loc, 3); FloatType f64Ty = opBuilder.getF64Type(); Value f1 = - opBuilder.create(loc, APFloat(double(1.0)), f64Ty); + opBuilder.create(loc, f64Ty, APFloat(double(1.0))); Value flag_val = opBuilder.create(loc, flag, c0); Value condition = opBuilder.create( @@ -3322,26 +3324,26 @@ void copy_and_norm(OpBuilder &opBuilder, Location loc, Value c, Value p1, loc, condition1, [&](OpBuilder &b, Location loc) { b.create( - loc, c0, n, c1, std::nullopt, + loc, c0, n, c1, ValueRange{}, [&](OpBuilder b2, Location loc, Value i, ValueRange i_args) { Value p1_i = b2.create(loc, p1, i); Value v = b2.create(loc, fct, p1_i); b2.create(loc, v, c, i); - b2.create(loc, std::nullopt); + b2.create(loc, ValueRange{}); }); - b.create(loc, std::nullopt); + b.create(loc, ValueRange{}); }, [&](OpBuilder &b, Location loc) { b.create( - loc, c0, n, c1, std::nullopt, + loc, c0, n, c1, ValueRange{}, [&](OpBuilder b2, Location loc, Value i, ValueRange i_args) { Value val = b2.create(loc, p1, i); b2.create(loc, val, c, i); - b2.create(loc, std::nullopt); + b2.create(loc, ValueRange{}); }); - b.create(loc, std::nullopt); + b.create(loc, ValueRange{}); }); - builder.create(loc, std::nullopt); + builder.create(loc, ValueRange{}); }, [&](OpBuilder &builder, Location loc) { Value condition2 = builder.create( @@ -3349,16 +3351,16 @@ void copy_and_norm(OpBuilder &opBuilder, Location loc, Value c, Value p1, builder.create( loc, condition2, [&](OpBuilder &b, Location loc) { b.create( - loc, c0, n, c1, std::nullopt, + loc, c0, n, c1, ValueRange{}, [&](OpBuilder &b2, Location loc, Value i, ValueRange i_args) { Value c_i = b2.create(loc, c, i); Value newC = b2.create(loc, fct, c_i); b2.create(loc, newC, c, i); - b2.create(loc, std::nullopt); + b2.create(loc, ValueRange{}); }); - b.create(loc, std::nullopt); + b.create(loc, ValueRange{}); }); - builder.create(loc, std::nullopt); + builder.create(loc, ValueRange{}); }); } @@ -3436,7 +3438,7 @@ void rfftp_forward(OpBuilder &opBuilder, Location loc, Value Rfftp_fctdata_fct, loc, condition1, [&](OpBuilder &b2, Location loc) { radf4(b2, loc, p1_raw, p2_raw, tw, ido, l1, c0, c1, c2, c3); - b2.create(loc, std::nullopt); + b2.create(loc, ValueRange{}); }, [&](OpBuilder &b2, Location loc) { Value condition2 = b2.create( @@ -3445,7 +3447,7 @@ void rfftp_forward(OpBuilder &opBuilder, Location loc, Value Rfftp_fctdata_fct, loc, condition2, [&](OpBuilder &b3, Location loc) { radf2(b3, loc, p1_raw, p2_raw, tw, ido, l1); - b3.create(loc, std::nullopt); + b3.create(loc, ValueRange{}); }, [&](OpBuilder &b3, Location loc) { Value condition3 = b3.create( @@ -3454,7 +3456,7 @@ void rfftp_forward(OpBuilder &opBuilder, Location loc, Value Rfftp_fctdata_fct, loc, condition3, [&](OpBuilder &b4, Location loc) { radf3(b4, loc, p1_raw, p2_raw, tw, ido, l1); - b4.create(loc, 
std::nullopt); + b4.create(loc, ValueRange{}); }, [&](OpBuilder &b4, Location loc) { Value condition4 = b4.create( @@ -3465,7 +3467,7 @@ void rfftp_forward(OpBuilder &opBuilder, Location loc, Value Rfftp_fctdata_fct, radf5(b5, loc, p1_raw, p2_raw, tw, ido, l1, c0, c1, c2, c3, c4); b5.create(loc, - std::nullopt); + ValueRange{}); }, [&](OpBuilder &b5, Location loc) { Value tws = b5.create( @@ -3475,15 +3477,15 @@ void rfftp_forward(OpBuilder &opBuilder, Location loc, Value Rfftp_fctdata_fct, memref_SWAP(b5, loc, p1_raw, p2_raw); flag_SWAP(b5, loc, flag); b5.create(loc, - std::nullopt); + ValueRange{}); }); - b4.create(loc, std::nullopt); + b4.create(loc, ValueRange{}); }); - b3.create(loc, std::nullopt); + b3.create(loc, ValueRange{}); } ); - b2.create(loc, std::nullopt); + b2.create(loc, ValueRange{}); }); memref_SWAP(b, loc, p1_raw, p2_raw); @@ -3494,7 +3496,7 @@ void rfftp_forward(OpBuilder &opBuilder, Location loc, Value Rfftp_fctdata_fct, copy_and_norm(builder, loc, c, p1_raw, n, fct, flag); - builder.create(loc, std::nullopt); + builder.create(loc, ValueRange{}); }); } @@ -3581,7 +3583,7 @@ Value spectrogram(PatternRewriter &rewriter, Location loc, Value f0, Value c0, Value multiplied = mulfOp.getResult(0); Value bufferMem_raw = - builder.create(loc, mTp, multiplied); + builder.create(loc, mTp, multiplied); MemRefType type0 = MemRefType::get({400}, f64Ty); MemRefType type1 = MemRefType::get(ShapedType::kDynamic, f64Ty); @@ -3617,8 +3619,10 @@ Value spectrogram(PatternRewriter &rewriter, Location loc, Value f0, Value c0, /*permutation=*/ArrayRef{1, 0}); Value melFiltersT = transposeOp0.getResult()[0]; + auto spectrogramBufferType = llvm::cast(spectrogram.getType()); + auto spectrogramTensorType = RankedTensorType::get(spectrogramBufferType.getShape(), spectrogramBufferType.getElementType()); Value gram = rewriter.create( - loc, spectrogram, /*restrict=*/true, /*writable=*/false); + loc, spectrogramTensorType, spectrogram); Value init1 = rewriter.create( loc, ArrayRef{201, 3001}, f64Ty); auto transposeOp1 = rewriter.create( @@ -3639,7 +3643,7 @@ Value spectrogram(PatternRewriter &rewriter, Location loc, Value f0, Value c0, // Initialize a tensor with constant `1e-10`. RankedTensorType tensorTy1 = RankedTensorType::get({80, 3001}, f64Ty); Value cMelFloor = rewriter.create( - loc, APFloat(double(0.0000000001)), f64Ty); + loc, f64Ty, APFloat(double(0.0000000001))); Value melFloor = rewriter.create(loc, tensorTy1, cMelFloor); auto linalgMaxOp = rewriter.create( @@ -3694,8 +3698,10 @@ class DAPRFFTLowering : public OpRewritePattern { // Value c25 = rewriter.create(loc, 25); // Value c50 = rewriter.create(loc, 50); + auto bufferMemType = llvm::cast(bufferMem.getType()); + auto bufferTensorType = RankedTensorType::get(bufferMemType.getShape(), bufferMemType.getElementType()); Value inputFeatures = rewriter.create( - loc, bufferMem, /*restrict=*/true, /*writable=*/true); + loc, bufferTensorType, bufferMem); Value inputFeaturesSize = rewriter.create(loc, inputFeatures, c0); @@ -3703,9 +3709,9 @@ class DAPRFFTLowering : public OpRewritePattern { // TODO: remove the following values? 
// Value f0 = - // rewriter.create(loc, APFloat(double(0.0)), f64Ty); + // rewriter.create(loc, f64Ty, APFloat(double(0.0))); Value f1 = - rewriter.create(loc, APFloat(double(1.0)), f64Ty); + rewriter.create(loc, f64Ty, APFloat(double(1.0))); std::vector plan = make_rfftp_plan(rewriter, loc, inputFeaturesSize); @@ -3752,8 +3758,10 @@ class DAPWhisperPreprocessLowering FloatType f32 = Float32Type::get(ctx); FloatType f64 = Float64Type::get(ctx); + auto inputType = llvm::cast(input.getType()); + auto inputTensorType = RankedTensorType::get(inputType.getShape(), inputType.getElementType()); Value inputFeatures = rewriter.create( - loc, input, /*restrict=*/true, /*writable=*/false); + loc, inputTensorType, input); Value inputFeaturesSize = rewriter.create(loc, inputFeatures, c0); Value padConstantHigh = @@ -3769,7 +3777,7 @@ class DAPWhisperPreprocessLowering highValues.push_back(padConstantHigh); Value f0 = - rewriter.create(loc, APFloat(double(0.0)), f64); + rewriter.create(loc, f64, APFloat(double(0.0))); auto padConstantOp = rewriter.create( loc, RankedTensorType::get(paddedShape, f64), inputFeatures, lowValues, highValues, f0); @@ -3795,7 +3803,7 @@ class DAPWhisperPreprocessLowering Value logSpecCut = extractSliceOp.getResult(); Value maxInit = - rewriter.create(loc, APFloat(double(-10.0)), f64); + rewriter.create(loc, f64, APFloat(double(-10.0))); auto forOp0 = rewriter.create( loc, c0, c80, c1, maxInit, [&](OpBuilder &builder, Location loc, Value i, ValueRange iargs0) { @@ -3814,7 +3822,7 @@ class DAPWhisperPreprocessLowering }); Value maxNum = forOp0.getResults()[0]; - Value f8 = rewriter.create(loc, APFloat(double(8.0)), f64); + Value f8 = rewriter.create(loc, f64, APFloat(double(8.0))); Value maxNumMinus8 = rewriter.create(loc, maxNum, f8); Value logSpecFloor = rewriter.create( loc, RankedTensorType::get({80, 3000}, f64), maxNumMinus8); @@ -3825,8 +3833,8 @@ class DAPWhisperPreprocessLowering Value logSpecMax = linalgMaxOp.getResultTensors()[0]; Value f0F32 = - rewriter.create(loc, APFloat(float(0.0)), f32); - Value f4 = rewriter.create(loc, APFloat(double(4.0)), f64); + rewriter.create(loc, f32, APFloat(float(0.0))); + Value f4 = rewriter.create(loc, f64, APFloat(double(4.0))); RankedTensorType resultTy = RankedTensorType::get({80, 3000}, f32); Value InputFeaturesF32 = rewriter.create(loc, resultTy, f0F32); @@ -3868,7 +3876,7 @@ class DAPWhisperPreprocessLowering auto resultMemTp = MemRefType::get(expandTy.getShape(), expandTy.getElementType()); - Value resultMemRef = rewriter.create( + Value resultMemRef = rewriter.create( loc, resultMemTp, resultExpand); // Replace 'dap.whisper_preprocess' operation with the generated result. The diff --git a/midend/lib/Conversion/FuncBufferize/FuncBufferizePass.cpp b/midend/lib/Conversion/FuncBufferize/FuncBufferizePass.cpp index 038a44cde7..54e90383c3 100644 --- a/midend/lib/Conversion/FuncBufferize/FuncBufferizePass.cpp +++ b/midend/lib/Conversion/FuncBufferize/FuncBufferizePass.cpp @@ -104,7 +104,7 @@ void FuncBufferizeDynamicOffsetPass::runOnOperation() { StridedLayoutAttr::get(context, mlirShapedTypeGetDynamicSize(), ArrayRef(stride))); }); - typeConverter.addArgumentMaterialization(materializeToTensor); + typeConverter.addSourceMaterialization(materializeToTensor); typeConverter.addSourceMaterialization(materializeToTensor); typeConverter.addTargetMaterialization([](OpBuilder &builder, BaseMemRefType type, @@ -129,7 +129,7 @@ void FuncBufferizeDynamicOffsetPass::runOnOperation() { if (isa(inputs[0].getType())) { // Tensor to MemRef cast. 
- return builder.create(loc, type, inputs[0]); + return builder.create(loc, type, inputs[0]); } llvm_unreachable("only tensor/memref input types supported"); @@ -149,7 +149,7 @@ void FuncBufferizeDynamicOffsetPass::runOnOperation() { // Bufferize func's return op. populateReturnOpTypeConversionPattern(patterns, typeConverter); target.addLegalOp(); + bufferization::ToBufferOp>(); target.markUnknownOpDynamicallyLegal([&](Operation *op) { return isLegalForReturnOpTypeConversionPattern(op, typeConverter) || isNotBranchOpInterfaceOrReturnLikeOp(op); diff --git a/midend/lib/Conversion/LowerBud/LowerBudPass.cpp b/midend/lib/Conversion/LowerBud/LowerBudPass.cpp index 00313093c3..3d76c4c4e1 100644 --- a/midend/lib/Conversion/LowerBud/LowerBudPass.cpp +++ b/midend/lib/Conversion/LowerBud/LowerBudPass.cpp @@ -118,8 +118,8 @@ class BudTestArrayAttrLowering : public OpRewritePattern { auto loc = op.getLoc(); // Get the attribute and the value. ArrayAttr coordinateAttr = op.getCoordinate(); - int64_t valX = coordinateAttr[0].cast().getInt(); - int64_t valY = coordinateAttr[1].cast().getInt(); + int64_t valX = llvm::cast(coordinateAttr[0]).getInt(); + int64_t valY = llvm::cast(coordinateAttr[1]).getInt(); // Get the index attribute and constant value. IntegerAttr attrX = rewriter.getIntegerAttr(rewriter.getIndexType(), valX); IntegerAttr attrY = rewriter.getIntegerAttr(rewriter.getIndexType(), valY); diff --git a/midend/lib/Conversion/LowerDAP/LowerDAPPass.cpp b/midend/lib/Conversion/LowerDAP/LowerDAPPass.cpp index 3c94e74a4b..e1004fb435 100644 --- a/midend/lib/Conversion/LowerDAP/LowerDAPPass.cpp +++ b/midend/lib/Conversion/LowerDAP/LowerDAPPass.cpp @@ -60,7 +60,7 @@ class DAPFirLowering : public OpRewritePattern { Value c1 = rewriter.create(loc, 1); FloatType f32 = Float32Type::get(ctx); Value f0 = - rewriter.create(loc, APFloat(float(0.0)), f32); + rewriter.create(loc, f32, APFloat(float(0.0))); Value kernelSize = rewriter.create(loc, kernel, c0); Value dataLen = rewriter.create(loc, output, c0); @@ -155,8 +155,8 @@ class DAPBiquadLowering : public OpRewritePattern { FloatType f32 = Float32Type::get(ctx); - Value z1 = rewriter.create(loc, APFloat(float(0)), f32); - Value z2 = rewriter.create(loc, APFloat(float(0)), f32); + Value z1 = rewriter.create(loc, f32, APFloat(float(0))); + Value z2 = rewriter.create(loc, f32, APFloat(float(0))); VectorType vectorTy32 = VectorType::get({stride}, f32); @@ -267,9 +267,9 @@ class DAPIirLowering : public OpRewritePattern { builder.create(loc, kernel, ValueRange{iv, c5}); Value z1 = - builder.create(loc, APFloat(float(0)), f32); + builder.create(loc, f32, APFloat(float(0))); Value z2 = - builder.create(loc, APFloat(float(0)), f32); + builder.create(loc, f32, APFloat(float(0))); // Loop reordering, compute z1 for next iteration, z2 for the second // following iteration. 
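[Editorial note, not part of the patch: the hunks above are dominated by two mechanical LLVM/MLIR 21 call-site migrations: arith::ConstantFloatOp now takes the result type before the APFloat value, and empty yield operand lists are spelled ValueRange{} because std::nullopt no longer converts implicitly at these call sites. The sketch below illustrates both shapes. The op names in angle brackets are reconstructions, since this copy of the patch appears to have lost template arguments to tag stripping; the helper name migratedCallShapes is hypothetical. Assumes MLIR at llvmorg-21.1.0-rc2.]

#include "mlir/Dialect/Arith/IR/Arith.h"
#include "mlir/Dialect/SCF/IR/SCF.h"
#include "mlir/IR/Builders.h"
#include "llvm/ADT/APFloat.h"

using namespace mlir;

// Minimal sketch of the migrated call shapes; `b`, `loc`, and `cond` stand in
// for the pass-local builder, location, and an i1 condition value.
static void migratedCallShapes(OpBuilder &b, Location loc, Value cond) {
  FloatType f64Ty = b.getF64Type();
  // (1) Constant float: the result type now precedes the APFloat.
  //     Old order: b.create<arith::ConstantFloatOp>(loc, llvm::APFloat(1.0), f64Ty);
  Value one = b.create<arith::ConstantFloatOp>(loc, f64Ty, llvm::APFloat(1.0));
  (void)one;
  // (2) Empty yields: spell the empty operand range explicitly.
  //     Old spelling: nested.create<scf::YieldOp>(nestedLoc, std::nullopt);
  b.create<scf::IfOp>(loc, cond, [&](OpBuilder &nested, Location nestedLoc) {
    nested.create<scf::YieldOp>(nestedLoc, ValueRange{});
  });
}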
diff --git a/midend/lib/Conversion/LowerDIP/LowerDIPPass.cpp b/midend/lib/Conversion/LowerDIP/LowerDIPPass.cpp index 85bb870de5..044ea0da32 100644 --- a/midend/lib/Conversion/LowerDIP/LowerDIPPass.cpp +++ b/midend/lib/Conversion/LowerDIP/LowerDIPPass.cpp @@ -70,7 +70,7 @@ class DIPCorr2DOpLowering : public OpRewritePattern { dip::BoundaryOption boundaryOptionAttr = op.getBoundaryOption(); Value strideVal = rewriter.create(loc, stride); - auto inElemTy = input.getType().cast().getElementType(); + auto inElemTy = llvm::cast(input.getType()).getElementType(); dip::DIP_ERROR error = dip::checkDIPCommonTypes( op, {input, kernel, output, constantValue}); @@ -175,7 +175,7 @@ class DIPRotate2DOpLowering : public OpRewritePattern { Value angleVal = op->getOperand(1); Value output = op->getOperand(2); - auto inElemTy = input.getType().cast().getElementType(); + auto inElemTy = llvm::cast(input.getType()).getElementType(); dip::DIP_ERROR error = dip::checkDIPCommonTypes(op, {input, output}); @@ -234,7 +234,7 @@ class DIPRotate4DOpLowering : public OpRewritePattern { Value output = op->getOperand(2); auto imageFormatAttr = op.getImageFormat(); - auto inElemTy = input.getType().cast().getElementType(); + auto inElemTy = llvm::cast(input.getType()).getElementType(); dip::DIP_ERROR error = dip::checkDIPCommonTypes(op, {input, output}); @@ -302,7 +302,7 @@ class DIPResize2DOpLowering : public OpRewritePattern { auto interpolationAttr = op.getInterpolationType(); Value strideVal = rewriter.create(loc, stride); - auto inElemTy = input.getType().cast().getElementType(); + auto inElemTy = llvm::cast(input.getType()).getElementType(); dip::DIP_ERROR error = dip::checkDIPCommonTypes(op, {input, output}); @@ -427,7 +427,7 @@ class DIPResize4D_NHWCOpLowering auto interpolationAttr = op.getInterpolationType(); Value strideVal = rewriter.create(loc, stride); - auto inElemTy = input.getType().cast().getElementType(); + auto inElemTy = llvm::cast(input.getType()).getElementType(); dip::DIP_ERROR error = dip::checkDIPCommonTypes(op, {input, output}); @@ -568,7 +568,7 @@ class DIPResize4D_NCHWOpLowering auto interpolationAttr = op.getInterpolationType(); Value strideVal = rewriter.create(loc, stride); - auto inElemTy = input.getType().cast().getElementType(); + auto inElemTy = llvm::cast(input.getType()).getElementType(); dip::DIP_ERROR error = dip::checkDIPCommonTypes(op, {input, output}); @@ -712,7 +712,7 @@ class DIPErosion2DOpLowering : public OpRewritePattern { dip::BoundaryOption boundaryOptionAttr = op.getBoundaryOption(); Value strideVal = rewriter.create(loc, stride); - auto inElemTy = input.getType().cast().getElementType(); + auto inElemTy = llvm::cast(input.getType()).getElementType(); dip::DIP_ERROR error = dip::checkDIPCommonTypes( op, {input, kernel, output, copymemref, constantValue}); @@ -785,7 +785,7 @@ class DIPDilation2DOpLowering : public OpRewritePattern { dip::BoundaryOption boundaryOptionAttr = op.getBoundaryOption(); Value strideVal = rewriter.create(loc, stride); - auto inElemTy = input.getType().cast().getElementType(); + auto inElemTy = llvm::cast(input.getType()).getElementType(); dip::DIP_ERROR error = dip::checkDIPCommonTypes( op, {input, kernel, output, copymemref, constantValue}); @@ -858,7 +858,7 @@ class DIPOpening2DOpLowering : public OpRewritePattern { dip::BoundaryOption boundaryOptionAttr = op.getBoundaryOption(); Value strideVal = rewriter.create(loc, stride); - auto inElemTy = input.getType().cast().getElementType(); + auto inElemTy = 
llvm::cast(input.getType()).getElementType(); dip::DIP_ERROR error = dip::checkDIPCommonTypes( op, {input, kernel, output, output1, copymemref, copymemref1, constantValue}); @@ -953,7 +953,7 @@ class DIPClosing2DOpLowering : public OpRewritePattern { dip::BoundaryOption boundaryOptionAttr = op.getBoundaryOption(); Value strideVal = rewriter.create(loc, stride); - auto inElemTy = input.getType().cast().getElementType(); + auto inElemTy = llvm::cast(input.getType()).getElementType(); dip::DIP_ERROR error = dip::checkDIPCommonTypes( op, {input, kernel, output, output1, copymemref, copymemref1, constantValue}); @@ -1054,7 +1054,7 @@ class DIPTopHat2DOpLowering : public OpRewritePattern { dip::BoundaryOption boundaryOptionAttr = op.getBoundaryOption(); Value strideVal = rewriter.create(loc, stride); - auto inElemTy = input.getType().cast().getElementType(); + auto inElemTy = llvm::cast(input.getType()).getElementType(); auto bitWidth = inElemTy.getIntOrFloatBitWidth(); dip::DIP_ERROR error = dip::checkDIPCommonTypes( op, {input, kernel, output, output1, output2, input1, copymemref, @@ -1258,7 +1258,7 @@ class DIPBottomHat2DOpLowering : public OpRewritePattern { dip::BoundaryOption boundaryOptionAttr = op.getBoundaryOption(); Value strideVal = rewriter.create(loc, stride); - auto inElemTy = input.getType().cast().getElementType(); + auto inElemTy = llvm::cast(input.getType()).getElementType(); auto bitWidth = inElemTy.getIntOrFloatBitWidth(); dip::DIP_ERROR error = dip::checkDIPCommonTypes( op, {input, kernel, output, output1, output2, input1, copymemref, @@ -1456,7 +1456,7 @@ class DIPMorphGrad2DOpLowering : public OpRewritePattern { dip::BoundaryOption boundaryOptionAttr = op.getBoundaryOption(); Value strideVal = rewriter.create(loc, stride); - auto inElemTy = input.getType().cast().getElementType(); + auto inElemTy = llvm::cast(input.getType()).getElementType(); auto bitWidth = inElemTy.getIntOrFloatBitWidth(); dip::DIP_ERROR error = dip::checkDIPCommonTypes( op, {input, kernel, output, output1, output2, input1, copymemref, diff --git a/midend/lib/Conversion/LowerLinalgToGemmini/LowerLinalgToGemmini.cpp b/midend/lib/Conversion/LowerLinalgToGemmini/LowerLinalgToGemmini.cpp index bfee320cc4..31b0b1da93 100644 --- a/midend/lib/Conversion/LowerLinalgToGemmini/LowerLinalgToGemmini.cpp +++ b/midend/lib/Conversion/LowerLinalgToGemmini/LowerLinalgToGemmini.cpp @@ -233,7 +233,7 @@ class Conv2DNhwcHwcfLowering Value kernel = convOp.getInputs()[1]; Value output = convOp.getOutputs()[0]; Location loc = convOp.getLoc(); - MemRefType inputType = dyn_cast(input.getType()); + MemRefType inputType = llvm::cast(input.getType()); MemRefType kernelType = dyn_cast(kernel.getType()); MemRefType outputType = dyn_cast(output.getType()); Type kernelElemType = kernelType.getElementType(); diff --git a/midend/lib/Conversion/LowerVectorExp/LowerVectorExpPass.cpp b/midend/lib/Conversion/LowerVectorExp/LowerVectorExpPass.cpp index f2f3a9c683..e4221bf9fe 100644 --- a/midend/lib/Conversion/LowerVectorExp/LowerVectorExpPass.cpp +++ b/midend/lib/Conversion/LowerVectorExp/LowerVectorExpPass.cpp @@ -113,13 +113,13 @@ class VectorExpPredicationLowering } // Prepare the data pointer for the VP load operation. // - Call the `getStridedElementPtr` with above descriptor and indices. - Value dataPtr = this->getStridedElementPtr(loc, memRefTy, memDesc, - indices, rewriter); + Value dataPtr = this->getStridedElementPtr(rewriter, loc, memRefTy, memDesc, + indices); // Create VP load operation and replace the predication operation. 
// - Get the result type of the predication operation. // - Create VP load operation. // - Replace original predication operation. - VectorType resultType = op.getResult().getType().cast(); + VectorType resultType = llvm::cast(op.getResult().getType()); Value resultValue = rewriter.create( loc, resultType, dataPtr, op.getMask(), op.getVl()); rewriter.replaceOp(op, resultValue); @@ -146,8 +146,8 @@ class VectorExpPredicationLowering .getResult(0); indices.push_back(intIdx); } - Value dataPtr = this->getStridedElementPtr(loc, memRefTy, memDesc, - indices, rewriter); + Value dataPtr = this->getStridedElementPtr(rewriter, loc, memRefTy, memDesc, + indices); rewriter.create(loc, valueToStore, dataPtr, op.getMask(), op.getVl()); rewriter.eraseOp(op); diff --git a/midend/lib/Conversion/MatMulOptimization/BatchMatMulOptimize.cpp b/midend/lib/Conversion/MatMulOptimization/BatchMatMulOptimize.cpp index 318fd57524..55638f1589 100644 --- a/midend/lib/Conversion/MatMulOptimization/BatchMatMulOptimize.cpp +++ b/midend/lib/Conversion/MatMulOptimization/BatchMatMulOptimize.cpp @@ -77,7 +77,7 @@ class BatchMatMulOptimizePattern : public ConversionPattern { IntegerType i1 = IntegerType::get(ctx, 1); VectorType vectorMaskTy = mlir::VectorType::get({vecSize}, i1); // Acquire the element type of input tensors. - Type elementType = A.getType().cast().getElementType(); + Type elementType = llvm::cast(A.getType()).getElementType(); VectorType vectorTy = mlir::VectorType::get({vecSize}, elementType); const AffineExpr d0 = rewriter.getAffineDimExpr(0); diff --git a/midend/lib/Conversion/MatMulOptimization/BatchMatMulSCFOptimize.cpp b/midend/lib/Conversion/MatMulOptimization/BatchMatMulSCFOptimize.cpp index 30b7cc9420..1bb2400819 100644 --- a/midend/lib/Conversion/MatMulOptimization/BatchMatMulSCFOptimize.cpp +++ b/midend/lib/Conversion/MatMulOptimization/BatchMatMulSCFOptimize.cpp @@ -71,7 +71,7 @@ class BatchMatMuSCFOptimizePattern : public ConversionPattern { Value C = op->getOperand(2); // Acquire the element type of input tensors. - Type elementType = A.getType().cast().getElementType(); + Type elementType = llvm::cast(A.getType()).getElementType(); // Define constants. const Value c0 = diff --git a/midend/lib/Conversion/MatMulOptimization/BatchMatMulTileOptimize.cpp b/midend/lib/Conversion/MatMulOptimization/BatchMatMulTileOptimize.cpp index 91d10c6456..6582e85d00 100644 --- a/midend/lib/Conversion/MatMulOptimization/BatchMatMulTileOptimize.cpp +++ b/midend/lib/Conversion/MatMulOptimization/BatchMatMulTileOptimize.cpp @@ -75,8 +75,8 @@ class BatchMatMulTileOptimizePattern : public ConversionPattern { Value C = op->getOperand(2); // Acquire the element type of input tensors. - Type elementType = A.getType().cast().getElementType(); - ShapedType ATy = A.getType().cast(); + Type elementType = llvm::cast(A.getType()).getElementType(); + ShapedType ATy = llvm::cast(A.getType()); // Define constants. const Value c0 = diff --git a/midend/lib/Conversion/MatMulOptimization/BatchMatMulTransBVec.cpp b/midend/lib/Conversion/MatMulOptimization/BatchMatMulTransBVec.cpp index 8c2a8707de..274b22df96 100644 --- a/midend/lib/Conversion/MatMulOptimization/BatchMatMulTransBVec.cpp +++ b/midend/lib/Conversion/MatMulOptimization/BatchMatMulTransBVec.cpp @@ -77,7 +77,7 @@ class BatchMatMulTransVecPattern : public ConversionPattern { IntegerType i1 = IntegerType::get(ctx, 1); VectorType vectorMaskTy = mlir::VectorType::get({vecSize}, i1); // Acquire the element type of input tensors. 
-    Type elementType = A.getType().cast().getElementType();
+    Type elementType = llvm::cast(A.getType()).getElementType();
     VectorType vectorTy = mlir::VectorType::get({vecSize}, elementType);
     // Define constants.
diff --git a/midend/lib/Conversion/MatMulOptimization/MatMulOptimize.cpp b/midend/lib/Conversion/MatMulOptimization/MatMulOptimize.cpp
index 12aa552aec..f3adf091f6 100644
--- a/midend/lib/Conversion/MatMulOptimization/MatMulOptimize.cpp
+++ b/midend/lib/Conversion/MatMulOptimization/MatMulOptimize.cpp
@@ -56,9 +56,9 @@ class MatMulOptimizePattern : public ConversionPattern {
     Value B = op->getOperand(1);
     Value C = op->getOperand(2);
     // Get shape of input and output
-    ShapedType ATy = A.getType().cast();
-    // ShapedType BTy = B.getType().cast();
-    // ShapedType CTy = C.getType().cast();
+    ShapedType ATy = llvm::cast(A.getType());
+    // ShapedType BTy = llvm::cast(B.getType());
+    // ShapedType CTy = llvm::cast(C.getType());
     // Some constants.
     const Value c0 =
@@ -134,7 +134,7 @@ class MatMulOptimizePattern : public ConversionPattern {
           for (int i = 0; i < kernelM; ++i) {
             Value a = builder.create(
                 loc, vTy, aptrs[i], ValueRange{c0, ivK},
-                mapBroadcast);
+                /*padding=*/std::nullopt, mapBroadcast);
             as.push_back(a);
           }
           SmallVector ds;
@@ -144,7 +144,8 @@
               Value fixedIV = builder.create(
                   loc, AffineMap::get(1, 0, d0 + j * vecSize), ivJ);
               Value d = builder.create(
-                  loc, vTy, c, ValueRange{c0, fixedIV});
+                  loc, vTy, c, ValueRange{c0, fixedIV},
+                  /*padding=*/std::nullopt);
               ds.push_back(d);
             }
           }
@@ -155,7 +156,8 @@
                 loc, AffineMap::get(1, 0, d0 + i * vecSize), ivJ);
             }
             Value b = builder.create(
-                loc, vTy, B, ValueRange{ivK, fixedIV});
+                loc, vTy, B, ValueRange{ivK, fixedIV},
+                /*padding=*/std::nullopt);
             bs.push_back(b);
           }
diff --git a/midend/lib/Conversion/MatMulOptimization/MatMulParallelVectorization.cpp b/midend/lib/Conversion/MatMulOptimization/MatMulParallelVectorization.cpp
index 23d0ef4e7b..b34b39a7e9 100644
--- a/midend/lib/Conversion/MatMulOptimization/MatMulParallelVectorization.cpp
+++ b/midend/lib/Conversion/MatMulOptimization/MatMulParallelVectorization.cpp
@@ -72,7 +72,7 @@ class MatMulParallelVectorizationPattern : public ConversionPattern {
     Value C = op->getOperand(2);
     // Acquire the element type of input tensors.
-    Type elementType = A.getType().cast().getElementType();
+    Type elementType = llvm::cast(A.getType()).getElementType();
     // Define constants.
     const Value zeroIndex =
@@ -140,8 +140,8 @@ class MatMulParallelVectorizationPattern : public ConversionPattern {
         ArrayRef{aRow, bRow}, false, 3, true);
     // Compile time branch detection.
-    if (C.getType().cast().isDynamicDim(1) or
-        C.getType().cast().getDimSize(1) % affineVectorSize != 0) {
+    if (llvm::cast(C.getType()).isDynamicDim(1) or
+        llvm::cast(C.getType()).getDimSize(1) % affineVectorSize != 0) {
       // Depending on the position, use either full vectors or tail vectors.
       affine::AffineIfOp branchingOp = rewriter.create(
diff --git a/midend/lib/Conversion/MatMulOptimization/MatMulTransposeBVec.cpp b/midend/lib/Conversion/MatMulOptimization/MatMulTransposeBVec.cpp
index 91ef00d4d2..46a4d07a1e 100644
--- a/midend/lib/Conversion/MatMulOptimization/MatMulTransposeBVec.cpp
+++ b/midend/lib/Conversion/MatMulOptimization/MatMulTransposeBVec.cpp
@@ -62,7 +62,7 @@ class MatMulTransposeBVecPattern : public ConversionPattern {
     Value C = op->getOperand(2);
     // Get shape of input and output.
-    ShapedType ATy = A.getType().cast();
+    ShapedType ATy = llvm::cast(A.getType());
     Type eleTy = ATy.getElementType();
     // the element type for mask vector.
diff --git a/midend/lib/Conversion/MatMulOptimization/MatMulVectorization.cpp b/midend/lib/Conversion/MatMulOptimization/MatMulVectorization.cpp
index 4b2c87eb26..0cbf066e12 100644
--- a/midend/lib/Conversion/MatMulOptimization/MatMulVectorization.cpp
+++ b/midend/lib/Conversion/MatMulOptimization/MatMulVectorization.cpp
@@ -55,10 +55,10 @@ class MatMulVectorizationPattern : public ConversionPattern {
     Value B = op->getOperand(1);
     Value C = op->getOperand(2);
     // Get shape of input and output
-    ShapedType ATy = A.getType().cast();
+    ShapedType ATy = llvm::cast(A.getType());
     Type eleTy = ATy.getElementType();
-    // ShapedType BTy = B.getType().cast();
-    // ShapedType CTy = C.getType().cast();
+    // ShapedType BTy = llvm::cast(B.getType());
+    // ShapedType CTy = llvm::cast(C.getType());
     auto ctx = op->getContext();
     // Get i1 as the element type for mask vector.
@@ -96,7 +96,7 @@ class MatMulVectorizationPattern : public ConversionPattern {
       // Create loop based on vector size.
       builder.create(
           loc, ValueRange{c0}, builder.getDimIdentityMap(),
-          ValueRange{bCol}, vecTailMap, /*Step=*/1, std::nullopt,
+          ValueRange{bCol}, vecTailMap, /*Step=*/1, ValueRange{},
          [&](OpBuilder &nestedBuilder, Location nestedLoc, Value iv,
              ValueRange itrArgs) {
            // Load element and broadcast to vector.
diff --git a/midend/lib/Conversion/TransposeOptimization/BuiltinTransposeVectorization.cpp b/midend/lib/Conversion/TransposeOptimization/BuiltinTransposeVectorization.cpp
index 0cb8761a72..353f1e1085 100644
--- a/midend/lib/Conversion/TransposeOptimization/BuiltinTransposeVectorization.cpp
+++ b/midend/lib/Conversion/TransposeOptimization/BuiltinTransposeVectorization.cpp
@@ -52,6 +52,9 @@ using namespace affine;
 namespace {
 class TransposeOptimizationPattern : public ConversionPattern {
+private:
+  int64_t affineVectorSize;
+
 public:
   explicit TransposeOptimizationPattern(MLIRContext *context,
                                         int64_t affineVectorSizeParam)
@@ -63,8 +66,7 @@ class TransposeOptimizationPattern : public ConversionPattern {
   matchAndRewrite(Operation *op, ArrayRef /*operands*/,
                   ConversionPatternRewriter &rewriter) const override {
     auto permutationArrayAttr =
-        op->getAttr(rewriter.getStringAttr("permutation"))
-            .cast()
+        llvm::cast(op->getAttr(rewriter.getStringAttr("permutation")))
             .asArrayRef();
     // Retrieve input tensors A, B.
@@ -73,14 +75,14 @@
     // Only to rewrite the rank 2 tensor transpose.
     if (permutationArrayAttr[0] != 1 or permutationArrayAttr[1] != 0 or
-        A.getType().cast().getRank() != 2) {
+        llvm::cast(A.getType()).getRank() != 2) {
       return failure();
     }
     auto loc = op->getLoc();
     // Acquire the element type of input tensors.
-    Type elementType = A.getType().cast().getElementType();
+    Type elementType = llvm::cast(A.getType()).getElementType();
     // Define constants.
     const Value index0 =
@@ -204,8 +206,8 @@ class TransposeOptimizationPattern : public ConversionPattern {
         });
     // Compile time branch detection.
-    if (A.getType().cast().isDynamicDim(0) or
-        A.getType().cast().getDimSize(0) % affineVectorSize !=
+    if (llvm::cast(A.getType()).isDynamicDim(0) or
+        llvm::cast(A.getType()).getDimSize(0) % affineVectorSize !=
             0) {
       // Depending on the position, use either full vectors or tail
      // vectors.
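[Editorial note, not part of the patch: the cast churn in the hunks above and below follows MLIR's removal of the long-deprecated member-function casts on Type and Attribute, i.e. x.cast<T>(), in favor of the free functions from llvm/Support/Casting.h. A minimal before/after sketch; the ShapedType template argument is a reconstruction (angle brackets appear to have been stripped from this copy of the patch), and elementTypeOf is a hypothetical helper name.]

#include "mlir/IR/BuiltinTypes.h"
#include "mlir/IR/Value.h"
#include "llvm/Support/Casting.h"

using namespace mlir;

// Assuming `v` is a Value statically known to carry a ShapedType.
static Type elementTypeOf(Value v) {
  // Old (removed): v.getType().cast<ShapedType>().getElementType();
  // New: llvm::cast asserts on mismatch; use llvm::dyn_cast to get a null
  // result instead, or llvm::isa for a pure type test.
  return llvm::cast<ShapedType>(v.getType()).getElementType();
}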
@@ -264,8 +266,8 @@ class TransposeOptimizationPattern : public ConversionPattern { parallelColLoop.getRegion().push_back(loopBody); rewriter.setInsertionPointAfter(parallelColLoop); - if (A.getType().cast().isDynamicDim(1) or - A.getType().cast().getDimSize(1) % affineVectorSize != 0) { + if (llvm::cast(A.getType()).isDynamicDim(1) or + llvm::cast(A.getType()).getDimSize(1) % affineVectorSize != 0) { affine::AffineIfOp branchingColUnaligned = rewriter.create( @@ -324,8 +326,8 @@ class TransposeOptimizationPattern : public ConversionPattern { }); }); - if (A.getType().cast().isDynamicDim(0) or - A.getType().cast().getDimSize(0) % affineVectorSize != + if (llvm::cast(A.getType()).isDynamicDim(0) or + llvm::cast(A.getType()).getDimSize(0) % affineVectorSize != 0) { affine::AffineIfOp branchingRowColUnaligned = trueColUnalignedBranchBuilder.create( @@ -394,9 +396,6 @@ class TransposeOptimizationPattern : public ConversionPattern { rewriter.eraseOp(op); return success(); } - -private: - int64_t affineVectorSize; }; } // end anonymous namespace diff --git a/midend/lib/Dialect/GPU/TransformOps.cpp b/midend/lib/Dialect/GPU/TransformOps.cpp index 227b28ac70..2a0e4ea2aa 100644 --- a/midend/lib/Dialect/GPU/TransformOps.cpp +++ b/midend/lib/Dialect/GPU/TransformOps.cpp @@ -168,7 +168,6 @@ DiagnosedSilenceableFailure buddy::gpu::VectorToMMAConversionOp::applyToOne( MLIRContext *ctx = target->getContext(); mlir::transform::ErrorCheckingTrackingListener listener(state, *this); GreedyRewriteConfig config; - config.listener = &listener; // Unrolling to native vector size must have previously occurred. // TODO: Add pattern to propagate the extract through the scf.for @@ -177,7 +176,7 @@ DiagnosedSilenceableFailure buddy::gpu::VectorToMMAConversionOp::applyToOne( mlir::vector::populateCastAwayVectorLeadingOneDimPatterns(patterns); populatePrepareVectorToMMAPatterns(patterns, getUseMmaSync()); if (failed( - applyPatternsAndFoldGreedily(target, std::move(patterns), config))) { + applyPatternsGreedily(target, std::move(patterns), config))) { target->emitOpError("vector to mma preparation patterns failed to apply"); return emitDefaultDefiniteFailure(target); } @@ -198,7 +197,7 @@ DiagnosedSilenceableFailure buddy::gpu::VectorToMMAConversionOp::applyToOne( RewritePatternSet f32ToTF32patterns(funcOp.getContext()); nvgpu::populateMmaSyncF32ToTF32Patterns(f32ToTF32patterns, nvgpu::MmaSyncF32Lowering::TF32); - if (failed(applyPatternsAndFoldGreedily(funcOp, std::move(f32ToTF32patterns), + if (failed(applyPatternsGreedily(funcOp, std::move(f32ToTF32patterns), config))) return mlir::emitDefiniteFailure( target, "vector to mma F32ToTF32 patterns failed to apply"); diff --git a/midend/lib/Dialect/Gemmini/Transforms/LegalizeForLLVMExport.cpp b/midend/lib/Dialect/Gemmini/Transforms/LegalizeForLLVMExport.cpp index a6ef9a2885..2f53d207e3 100644 --- a/midend/lib/Dialect/Gemmini/Transforms/LegalizeForLLVMExport.cpp +++ b/midend/lib/Dialect/Gemmini/Transforms/LegalizeForLLVMExport.cpp @@ -1894,7 +1894,7 @@ class GemminiTileConvLowering : public ConvertOpToLLVMPattern { Value output = tileConvOp.getOutput(); Value weights = tileConvOp.getWeights(); Value bias = tileConvOp.getBias(); - MemRefType inputType = dyn_cast(input.getType()); + MemRefType inputType = dyn_cast(llvm::cast(input.getType())); MemRefType biasType = dyn_cast(bias.getType()); ArrayRef inputShape = inputType.getShape(); ArrayRef biasShape = biasType.getShape(); diff --git a/midend/lib/Dialect/RVV/Transforms/LegalizeForLLVMExport.cpp 
b/midend/lib/Dialect/RVV/Transforms/LegalizeForLLVMExport.cpp index 6ddb238f33..f242be9bbf 100644 --- a/midend/lib/Dialect/RVV/Transforms/LegalizeForLLVMExport.cpp +++ b/midend/lib/Dialect/RVV/Transforms/LegalizeForLLVMExport.cpp @@ -160,7 +160,7 @@ struct RVVLoadOpLowering : public ConvertOpToLLVMPattern { rewriter.create(loadOp.getLoc(), resultType); LLVM::LLVMPointerType llvmDataTypePtr = LLVM::LLVMPointerType::get(context); Value dataPtr = getStridedElementPtr( - loadOp.getLoc(), type, adaptor.getBase(), adaptor.getIndex(), rewriter); + rewriter, loadOp.getLoc(), type, adaptor.getBase(), adaptor.getIndex()); Value bitCastedPtr = rewriter.create( loadOp.getLoc(), llvmDataTypePtr, dataPtr); Value vl = loadOp.getOperand(2); @@ -191,8 +191,8 @@ struct RVVStoreOpLowering : public ConvertOpToLLVMPattern { auto context = storeOp.getContext(); LLVM::LLVMPointerType llvmDataTypePtr = LLVM::LLVMPointerType::get(context); Value dataPtr = - getStridedElementPtr(storeOp.getLoc(), type, adaptor.getBase(), - adaptor.getIndex(), rewriter); + getStridedElementPtr(rewriter, storeOp.getLoc(), type, adaptor.getBase(), + adaptor.getIndex()); Value bitCastedPtr = rewriter.create( storeOp.getLoc(), llvmDataTypePtr, dataPtr); Value vl = storeOp.getOperand(3); diff --git a/midend/lib/Utils/AffineTransformUtils.cpp b/midend/lib/Utils/AffineTransformUtils.cpp index 54f328c5a2..753b9e4a5a 100644 --- a/midend/lib/Utils/AffineTransformUtils.cpp +++ b/midend/lib/Utils/AffineTransformUtils.cpp @@ -252,7 +252,7 @@ void remapNearest2D(OpBuilder &builder, Location loc, MLIRContext *ctx, }, [&](OpBuilder &elseBuilder, Location elseLoc) { auto inElemTy = - input.getType().cast().getElementType(); + llvm::cast(input.getType()).getElementType(); Value pixel = insertZeroConstantOp(ctx, elseBuilder, elseLoc, inElemTy); elseBuilder.create(elseLoc, pixel, output, @@ -364,8 +364,7 @@ void remapNearest3D(OpBuilder &builder, Location loc, MLIRContext *ctx, thenBuilder.create(thenLoc); }, [&](OpBuilder &elseBuilder, Location elseLoc) { - auto inElemTy = input.getType() - .cast() + auto inElemTy = llvm::cast(input.getType()) .getElementType(); Value pixel = insertZeroConstantOp(ctx, elseBuilder, elseLoc, inElemTy); @@ -445,8 +444,7 @@ void remapNearest3D(OpBuilder &builder, Location loc, MLIRContext *ctx, thenBuilder.create(elseLoc); }, [&](OpBuilder &elseBuilder, Location elseLoc) { - auto inElemTy = input.getType() - .cast() + auto inElemTy = llvm::cast(input.getType()) .getElementType(); Value pixel = insertZeroConstantOp(ctx, elseBuilder, elseLoc, inElemTy); diff --git a/midend/lib/Utils/DIPUtils.cpp b/midend/lib/Utils/DIPUtils.cpp index 8cbbc2be61..efce95d120 100644 --- a/midend/lib/Utils/DIPUtils.cpp +++ b/midend/lib/Utils/DIPUtils.cpp @@ -96,7 +96,7 @@ DIP_ERROR checkDIPCommonTypes(DIPOP op, const std::vector &args) { const auto getType = [&](int argIndex) { return args[argIndex].getType(); }; const auto getElementType = [&](int argIndex) { - return getType(argIndex).template cast().getElementType(); + return llvm::cast(getType(argIndex)).getElementType(); }; // NB: we can infer element type for all related memrefs to be the same as @@ -330,10 +330,10 @@ std::vector standardRotate(OpBuilder &builder, Location loc, // Get center co-ordinates w.r.t given dimension. 
Value getCenter(OpBuilder &builder, Location loc, MLIRContext *ctx, Value dim) { Value dimF32 = indexToF32(builder, loc, dim); - Value c1f = builder.create(loc, (llvm::APFloat)1.0f, - builder.getF32Type()); - Value c2f = builder.create(loc, (llvm::APFloat)2.0f, - builder.getF32Type()); + Value c1f = builder.create(loc, builder.getF32Type(), + (llvm::APFloat)1.0f); + Value c2f = builder.create(loc, builder.getF32Type(), + (llvm::APFloat)2.0f); Value temp1 = builder.create(loc, dimF32, c1f); Value temp2 = builder.create(loc, temp1, c2f); @@ -455,8 +455,8 @@ void fillPixelsNearestNeighbour4D( // Calculate tan(angle / 2) where angle is a function parameter. Value customTanVal(OpBuilder &builder, Location loc, Value angleVal) { - Value c2F32 = builder.create(loc, (llvm::APFloat)2.0f, - builder.getF32Type()); + Value c2F32 = builder.create(loc, builder.getF32Type(), + (llvm::APFloat)2.0f); Value angleVal_2 = builder.create(loc, angleVal, c2F32); Value sinVal = builder.create(loc, angleVal_2); diff --git a/midend/lib/Utils/GPUUtils.cpp b/midend/lib/Utils/GPUUtils.cpp index 921fa29e15..b6215d345f 100644 --- a/midend/lib/Utils/GPUUtils.cpp +++ b/midend/lib/Utils/GPUUtils.cpp @@ -81,7 +81,7 @@ gpuMmaUnrollOrder(vector::ContractionOp contract) { llvm::SmallDenseSet dims; for (AffineExpr expr : contract.getIndexingMapsArray()[0].getResults()) { - dims.insert(expr.cast().getPosition()); + dims.insert(llvm::cast(expr).getPosition()); } // Then parallel dimensions that are part of Lhs as we want to re-use Lhs. for (auto [index, iter] : llvm::enumerate(contract.getIteratorTypes())) { @@ -115,7 +115,7 @@ static Value promoteElementToVector(Location loc, OpBuilder &builder, Value packVectorToSupportedWidth(Location loc, OpBuilder &builder, Value input) { LLVM_DEBUG({ - auto vecType = input.getType().cast(); + auto vecType = llvm::cast(input.getType()); Type elementType = vecType.getElementType(); assert(vecType.getDimSize(0) * elementType.getIntOrFloatBitWidth() == kShuffleBitWidth && diff --git a/midend/lib/Utils/Utils.cpp b/midend/lib/Utils/Utils.cpp index 4910d23197..2ab618e3e6 100644 --- a/midend/lib/Utils/Utils.cpp +++ b/midend/lib/Utils/Utils.cpp @@ -22,6 +22,11 @@ #ifndef UTILS_UTILS_DEF #define UTILS_UTILS_DEF +#include +#ifndef M_PI +#define M_PI 3.14159265358979323846 +#endif + #include "mlir/IR/BuiltinTypes.h" #include #include @@ -51,10 +56,10 @@ Value insertZeroConstantOp(MLIRContext *ctx, OpBuilder &builder, Location loc, ? 
static_cast(Float32Type::get(ctx)) : static_cast(Float64Type::get(ctx)); auto zero = APFloat::getZero(floatType.getFloatSemantics()); - op = builder.create(loc, zero, floatType); + op = builder.create(loc, floatType, zero); } else if (elemTy.isInteger(bitWidth)) { IntegerType type = IntegerType::get(ctx, bitWidth); - op = builder.create(loc, 0, type); + op = builder.create(loc, type, 0); } return op; @@ -302,7 +307,7 @@ void idft1DCooleyTukeyButterfly(OpBuilder &builder, Location loc, builder.create( loc, indexToF32(builder, loc, memRefLength))); Value pos2MPI = builder.create( - loc, (llvm::APFloat)(float)(2.0 * M_PI), builder.getF32Type()); + loc, builder.getF32Type(), (llvm::APFloat)(float)(2.0 * M_PI)); builder.create( loc, c0, upperBound, c1, ValueRange{subProbs, half}, @@ -326,9 +331,9 @@ void idft1DCooleyTukeyButterfly(OpBuilder &builder, Location loc, jBegin = builder.create(loc, iv1[0], subProbSize); jEnd = builder.create(loc, jBegin, outerIterVR[1]); wReal = builder.create( - loc, (llvm::APFloat)1.0f, builder.getF32Type()); + loc, builder.getF32Type(), (llvm::APFloat)1.0f); wImag = builder.create( - loc, (llvm::APFloat)0.0f, builder.getF32Type()); + loc, builder.getF32Type(), (llvm::APFloat)0.0f); wRealVec = builder.create(loc, vecType, wReal); @@ -442,7 +447,7 @@ void dft1DGentlemanSandeButterfly(OpBuilder &builder, Location loc, builder.create( loc, indexToF32(builder, loc, memRefLength))); Value neg2MPI = builder.create( - loc, (llvm::APFloat)(float)(-2.0 * M_PI), builder.getF32Type()); + loc, builder.getF32Type(), (llvm::APFloat)(float)(-2.0 * M_PI)); builder.create( loc, c0, upperBound, c1, ValueRange{subProbs, subProbSize}, @@ -467,9 +472,9 @@ void dft1DGentlemanSandeButterfly(OpBuilder &builder, Location loc, builder.create(loc, iv1[0], outerIterVR[1]); jEnd = builder.create(loc, jBegin, half); wReal = builder.create( - loc, (llvm::APFloat)1.0f, builder.getF32Type()); + loc, builder.getF32Type(), (llvm::APFloat)1.0f); wImag = builder.create( - loc, (llvm::APFloat)0.0f, builder.getF32Type()); + loc, builder.getF32Type(), (llvm::APFloat)0.0f); wRealVec = builder.create(loc, vecType, wReal); diff --git a/rope-b2fe557ff-llvm21rc2-lo-tosa.mlir b/rope-b2fe557ff-llvm21rc2-lo-tosa.mlir new file mode 100644 index 0000000000..58aa3f3e10 --- /dev/null +++ b/rope-b2fe557ff-llvm21rc2-lo-tosa.mlir @@ -0,0 +1,139 @@ +#map = affine_map<(d0, d1, d2) -> (d0, 0, d1, d2)> +#map1 = affine_map<(d0, d1, d2) -> (d0, d1, d2)> +#map2 = affine_map<(d0, d1) -> (0, d0, d1)> +#map3 = affine_map<(d0, d1) -> (d0, d1)> +#map4 = affine_map<(d0, d1, d2) -> (d0, d1)> +#map5 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> +#map6 = affine_map<(d0, d1, d2, d3) -> (d0, 0, d2, d3)> +module { + func.func private @rtclock() -> f64 + func.func @kernel(%arg0: tensor<1x40x4096xf32>, %arg1: tensor<1x40x4096xf32>, %arg2: tensor<1x40x4096xf32>, %arg3: tensor<1x1x2048x128xf32>, %arg4: tensor<1x1x2048x128xf32>, %arg5: tensor<1x40xi64>) { + %0 = call @rtclock() : () -> f64 + %1 = tosa.const_shape {values = dense<[1, 40, 32, 128]> : tensor<4xindex>} : () -> !tosa.shape<4> + %expanded = tensor.expand_shape %arg0 [[0], [1], [2, 3]] output_shape [1, 40, 32, 128] : tensor<1x40x4096xf32> into tensor<1x40x32x128xf32> + %2 = tensor.empty() : tensor<1x32x40x128xf32> + %transposed = linalg.transpose ins(%expanded : tensor<1x40x32x128xf32>) outs(%2 : tensor<1x32x40x128xf32>) permutation = [0, 2, 1, 3] + %3 = tosa.const_shape {values = dense<[1, 40, 32, 128]> : tensor<4xindex>} : () -> !tosa.shape<4> + %expanded_0 = 
tensor.expand_shape %arg1 [[0], [1], [2, 3]] output_shape [1, 40, 32, 128] : tensor<1x40x4096xf32> into tensor<1x40x32x128xf32> + %4 = tensor.empty() : tensor<1x32x40x128xf32> + %transposed_1 = linalg.transpose ins(%expanded_0 : tensor<1x40x32x128xf32>) outs(%4 : tensor<1x32x40x128xf32>) permutation = [0, 2, 1, 3] + %5 = tosa.const_shape {values = dense<[1, 40, 32, 128]> : tensor<4xindex>} : () -> !tosa.shape<4> + %expanded_2 = tensor.expand_shape %arg2 [[0], [1], [2, 3]] output_shape [1, 40, 32, 128] : tensor<1x40x4096xf32> into tensor<1x40x32x128xf32> + %6 = tensor.empty() : tensor<1x32x40x128xf32> + %transposed_3 = linalg.transpose ins(%expanded_2 : tensor<1x40x32x128xf32>) outs(%6 : tensor<1x32x40x128xf32>) permutation = [0, 2, 1, 3] + %extracted_slice = tensor.extract_slice %arg3[0, 0, 0, 0] [1, 1, 40, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x40x128xf32> + %extracted_slice_4 = tensor.extract_slice %arg4[0, 0, 0, 0] [1, 1, 40, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x40x128xf32> + %7 = tensor.empty() : tensor<1x40x128xf32> + %8 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice : tensor<1x1x40x128xf32>) outs(%7 : tensor<1x40x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x40x128xf32> + %9 = tensor.empty() : tensor<40x128xf32> + %10 = linalg.generic {indexing_maps = [#map2, #map3], iterator_types = ["parallel", "parallel"]} ins(%8 : tensor<1x40x128xf32>) outs(%9 : tensor<40x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<40x128xf32> + %11 = tensor.empty() : tensor<1x40x128xf32> + %12 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_4 : tensor<1x1x40x128xf32>) outs(%11 : tensor<1x40x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<1x40x128xf32> + %13 = tensor.empty() : tensor<40x128xf32> + %14 = linalg.generic {indexing_maps = [#map2, #map3], iterator_types = ["parallel", "parallel"]} ins(%12 : tensor<1x40x128xf32>) outs(%13 : tensor<40x128xf32>) { + ^bb0(%in: f32, %out: f32): + linalg.yield %in : f32 + } -> tensor<40x128xf32> + %15 = tensor.empty() : tensor<1x40x128xf32> + %16 = linalg.generic {indexing_maps = [#map4, #map1], iterator_types = ["parallel", "parallel", "parallel"]} ins(%arg5 : tensor<1x40xi64>) outs(%15 : tensor<1x40x128xf32>) { + ^bb0(%in: i64, %out: f32): + %37 = arith.index_cast %in : i64 to index + %38 = linalg.index 2 : index + %extracted = tensor.extract %10[%37, %38] : tensor<40x128xf32> + linalg.yield %extracted : f32 + } -> tensor<1x40x128xf32> + %17 = tosa.const_shape {values = dense<[1, 1, 40, 128]> : tensor<4xindex>} : () -> !tosa.shape<4> + %expanded_5 = tensor.expand_shape %16 [[0, 1], [2], [3]] output_shape [1, 1, 40, 128] : tensor<1x40x128xf32> into tensor<1x1x40x128xf32> + %18 = tensor.empty() : tensor<1x40x128xf32> + %19 = linalg.generic {indexing_maps = [#map4, #map1], iterator_types = ["parallel", "parallel", "parallel"]} ins(%arg5 : tensor<1x40xi64>) outs(%18 : tensor<1x40x128xf32>) { + ^bb0(%in: i64, %out: f32): + %37 = arith.index_cast %in : i64 to index + %38 = linalg.index 2 : index + %extracted = tensor.extract %14[%37, %38] : tensor<40x128xf32> + linalg.yield %extracted : f32 + } -> tensor<1x40x128xf32> + %20 = tosa.const_shape {values = dense<[1, 1, 40, 128]> : tensor<4xindex>} : () -> !tosa.shape<4> + %expanded_6 = tensor.expand_shape %19 [[0, 1], [2], [3]] 
output_shape [1, 1, 40, 128] : tensor<1x40x128xf32> into tensor<1x1x40x128xf32> + %cst = arith.constant dense<0> : tensor<1xi8> + %21 = tensor.empty() : tensor<1x32x40x128xf32> + %22 = linalg.generic {indexing_maps = [#map5, #map6, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%transposed, %expanded_5 : tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) outs(%21 : tensor<1x32x40x128xf32>) { + ^bb0(%in: f32, %in_19: f32, %out: f32): + %37 = arith.mulf %in, %in_19 : f32 + linalg.yield %37 : f32 + } -> tensor<1x32x40x128xf32> + %extracted_slice_7 = tensor.extract_slice %transposed[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32> + %extracted_slice_8 = tensor.extract_slice %transposed[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32> + %cst_9 = arith.constant dense<0.000000e+00> : tensor<1xf32> + %cst_10 = arith.constant dense<0.000000e+00> : tensor<1xf32> + %23 = tensor.empty() : tensor<1x32x40x64xf32> + %24 = linalg.generic {indexing_maps = [#map5, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_8 : tensor<1x32x40x64xf32>) outs(%23 : tensor<1x32x40x64xf32>) { + ^bb0(%in: f32, %out: f32): + %37 = arith.negf %in : f32 + linalg.yield %37 : f32 + } -> tensor<1x32x40x64xf32> + %25 = tensor.empty() : tensor<1x32x40x128xf32> + %inserted_slice = tensor.insert_slice %24 into %25[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32> + %inserted_slice_11 = tensor.insert_slice %extracted_slice_7 into %inserted_slice[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32> + %cst_12 = arith.constant dense<0> : tensor<1xi8> + %26 = tensor.empty() : tensor<1x32x40x128xf32> + %27 = linalg.generic {indexing_maps = [#map5, #map6, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%inserted_slice_11, %expanded_6 : tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) outs(%26 : tensor<1x32x40x128xf32>) { + ^bb0(%in: f32, %in_19: f32, %out: f32): + %37 = arith.mulf %in, %in_19 : f32 + linalg.yield %37 : f32 + } -> tensor<1x32x40x128xf32> + %28 = tensor.empty() : tensor<1x32x40x128xf32> + %29 = linalg.generic {indexing_maps = [#map5, #map5, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%22, %27 : tensor<1x32x40x128xf32>, tensor<1x32x40x128xf32>) outs(%28 : tensor<1x32x40x128xf32>) { + ^bb0(%in: f32, %in_19: f32, %out: f32): + %37 = arith.addf %in, %in_19 : f32 + linalg.yield %37 : f32 + } -> tensor<1x32x40x128xf32> + %30 = tensor.empty() : tensor<1x32x40x128xf32> + %31 = linalg.generic {indexing_maps = [#map5, #map6, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%transposed_1, %expanded_5 : tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) outs(%30 : tensor<1x32x40x128xf32>) { + ^bb0(%in: f32, %in_19: f32, %out: f32): + %37 = arith.mulf %in, %in_19 : f32 + linalg.yield %37 : f32 + } -> tensor<1x32x40x128xf32> + %extracted_slice_13 = tensor.extract_slice %transposed_1[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32> + %extracted_slice_14 = tensor.extract_slice %transposed_1[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32> + %32 = tensor.empty() : tensor<1x32x40x64xf32> + %33 = linalg.generic {indexing_maps = [#map5, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} 
+    %30 = tensor.empty() : tensor<1x32x40x128xf32>
+    %31 = linalg.generic {indexing_maps = [#map5, #map6, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%transposed_1, %expanded_5 : tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>) outs(%30 : tensor<1x32x40x128xf32>) {
+    ^bb0(%in: f32, %in_19: f32, %out: f32):
+      %37 = arith.mulf %in, %in_19 : f32
+      linalg.yield %37 : f32
+    } -> tensor<1x32x40x128xf32>
+    %extracted_slice_13 = tensor.extract_slice %transposed_1[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
+    %extracted_slice_14 = tensor.extract_slice %transposed_1[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
+    %32 = tensor.empty() : tensor<1x32x40x64xf32>
+    %33 = linalg.generic {indexing_maps = [#map5, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice_14 : tensor<1x32x40x64xf32>) outs(%32 : tensor<1x32x40x64xf32>) {
+    ^bb0(%in: f32, %out: f32):
+      %37 = arith.negf %in : f32
+      linalg.yield %37 : f32
+    } -> tensor<1x32x40x64xf32>
+    %34 = tensor.empty() : tensor<1x32x40x128xf32>
+    %inserted_slice_15 = tensor.insert_slice %33 into %34[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
+    %inserted_slice_16 = tensor.insert_slice %extracted_slice_13 into %inserted_slice_15[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
+    %35 = call @rtclock() : () -> f64
+    %36 = arith.subf %35, %0 : f64
+    %cast = tensor.cast %inserted_slice_16 : tensor<1x32x40x128xf32> to tensor<*xf32>
+    %cast_17 = tensor.cast %29 : tensor<1x32x40x128xf32> to tensor<*xf32>
+    %cast_18 = tensor.cast %31 : tensor<1x32x40x128xf32> to tensor<*xf32>
+    call @printMemrefF32(%cast) : (tensor<*xf32>) -> ()
+    call @printMemrefF32(%cast_17) : (tensor<*xf32>) -> ()
+    call @printMemrefF32(%cast_18) : (tensor<*xf32>) -> ()
+    vector.print %36 : f64
+    return
+  }
+  func.func @main() {
+    %cst = arith.constant dense<2.000000e+00> : tensor<1x40x4096xf32>
+    %cst_0 = arith.constant dense<3.000000e+00> : tensor<1x40x4096xf32>
+    %cst_1 = arith.constant dense<4.000000e+00> : tensor<1x40x4096xf32>
+    %cst_2 = arith.constant dense<5.000000e+00> : tensor<1x1x2048x128xf32>
+    %cst_3 = arith.constant dense<6.000000e+00> : tensor<1x1x2048x128xf32>
+    %cst_4 = arith.constant dense<7> : tensor<1x40xi64>
+    call @kernel(%cst, %cst_0, %cst_1, %cst_2, %cst_3, %cst_4) : (tensor<1x40x4096xf32>, tensor<1x40x4096xf32>, tensor<1x40x4096xf32>, tensor<1x1x2048x128xf32>, tensor<1x1x2048x128xf32>, tensor<1x40xi64>) -> ()
+    return
+  }
+  func.func private @printMemrefF32(tensor<*xf32>)
+}
+
diff --git a/rope-b2fe557ff-llvm21rc2.ll b/rope-b2fe557ff-llvm21rc2.ll
new file mode 100644
index 0000000000..4b8c2e9b63
--- /dev/null
+++ b/rope-b2fe557ff-llvm21rc2.ll
@@ -0,0 +1,1764 @@
+module {
+  llvm.func @memrefCopy(i64, !llvm.ptr, !llvm.ptr)
+  llvm.func @malloc(i64) -> !llvm.ptr
+  llvm.func @printNewline()
+  llvm.func @printF64(f64)
+  llvm.mlir.global private constant @__constant_1x40xi64(dense<7> : tensor<1x40xi64>) {addr_space = 0 : i32, alignment = 64 : i64} : !llvm.array<1 x array<40 x i64>>
+  llvm.mlir.global private constant @__constant_1x1x2048x128xf32_2(dense<6.000000e+00> : tensor<1x1x2048x128xf32>) {addr_space = 0 : i32, alignment = 64 : i64} : !llvm.array<1 x array<1 x array<2048 x array<128 x f32>>>>
+  llvm.mlir.global private constant @__constant_1x1x2048x128xf32(dense<5.000000e+00> : tensor<1x1x2048x128xf32>) {addr_space = 0 : i32, alignment = 64 : i64} : !llvm.array<1 x array<1 x array<2048 x array<128 x f32>>>>
+  llvm.mlir.global private constant @__constant_1x40x4096xf32_1(dense<4.000000e+00> : tensor<1x40x4096xf32>) {addr_space = 0 : i32, alignment = 64 : i64} : !llvm.array<1 x array<40 x array<4096 x f32>>>
+  llvm.mlir.global private constant @__constant_1x40x4096xf32_0(dense<3.000000e+00> : tensor<1x40x4096xf32>) {addr_space = 0 : i32, alignment = 64 : i64} : !llvm.array<1 x array<40 x array<4096 x f32>>>
+  llvm.mlir.global private constant @__constant_1x40x4096xf32(dense<2.000000e+00> : tensor<1x40x4096xf32>) {addr_space = 0 : i32, alignment = 64 : i64} : !llvm.array<1 x array<40 x array<4096 x f32>>>
+  llvm.func @rtclock() -> f64 attributes {sym_visibility = "private"}
+  llvm.func @kernel(%arg0: !llvm.ptr, %arg1: !llvm.ptr, %arg2: i64, %arg3: i64, %arg4: i64, %arg5: i64, %arg6: i64, %arg7: i64, %arg8: i64, %arg9: !llvm.ptr, %arg10: !llvm.ptr, %arg11: i64, %arg12: i64, %arg13: i64, %arg14: i64, %arg15: i64, %arg16: i64, %arg17: i64, %arg18: !llvm.ptr, %arg19: !llvm.ptr, %arg20: i64, %arg21: i64, %arg22: i64, %arg23: i64, %arg24: i64, %arg25: i64, %arg26: i64, %arg27: !llvm.ptr, %arg28: !llvm.ptr, %arg29: i64, %arg30: i64, %arg31: i64, %arg32: i64, %arg33: i64, %arg34: i64, %arg35: i64, %arg36: i64, %arg37: i64, %arg38: !llvm.ptr, %arg39: !llvm.ptr, %arg40: i64, %arg41: i64, %arg42: i64, %arg43: i64, %arg44: i64, %arg45: i64, %arg46: i64, %arg47: i64, %arg48: i64, %arg49: !llvm.ptr, %arg50: !llvm.ptr, %arg51: i64, %arg52: i64, %arg53: i64, %arg54: i64, %arg55: i64) {
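+    // Editor note: each tensor argument arrives as an expanded memref descriptor
+    // (allocated ptr, aligned ptr, offset, sizes[rank], strides[rank]); the
+    // llvm.insertvalue chains below just repack those scalars into structs.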
+    %0 = llvm.mlir.poison : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
+    %1 = llvm.insertvalue %arg49, %0[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
+    %2 = llvm.insertvalue %arg50, %1[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
+    %3 = llvm.insertvalue %arg51, %2[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
+    %4 = llvm.insertvalue %arg52, %3[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
+    %5 = llvm.insertvalue %arg54, %4[4, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
+    %6 = llvm.insertvalue %arg53, %5[3, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
+    %7 = llvm.insertvalue %arg55, %6[4, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
+    %8 = llvm.mlir.poison : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %9 = llvm.insertvalue %arg38, %8[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %10 = llvm.insertvalue %arg39, %9[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %11 = llvm.insertvalue %arg40, %10[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %12 = llvm.insertvalue %arg41, %11[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %13 = llvm.insertvalue %arg45, %12[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %14 = llvm.insertvalue %arg42, %13[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %15 = llvm.insertvalue %arg46, %14[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %16 = llvm.insertvalue %arg43, %15[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %17 = llvm.insertvalue %arg47, %16[4, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %18 = llvm.insertvalue %arg44, %17[3, 3] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %19 = llvm.insertvalue %arg48, %18[4, 3] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %20 = llvm.mlir.poison : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %21 = llvm.insertvalue %arg27, %20[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %22 = llvm.insertvalue %arg28, %21[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %23 = llvm.insertvalue %arg29, %22[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %24 = llvm.insertvalue %arg30, %23[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %25 = llvm.insertvalue %arg34, %24[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %26 = llvm.insertvalue %arg31, %25[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %27 = llvm.insertvalue %arg35, %26[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %28 = llvm.insertvalue %arg32, %27[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %29 = llvm.insertvalue %arg36, %28[4, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %30 = llvm.insertvalue %arg33, %29[3, 3] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %31 = llvm.insertvalue %arg37, %30[4, 3] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %32 = llvm.mlir.poison : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)>
+    %33 = llvm.insertvalue %arg9, %32[0] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)>
+    %34 = llvm.insertvalue %arg10, %33[1] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)>
+    %35 = llvm.insertvalue %arg11, %34[2] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)>
+    %36 = llvm.insertvalue %arg12, %35[3, 0] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)>
+    %37 = llvm.insertvalue %arg15, %36[4, 0] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)>
+    %38 = llvm.insertvalue %arg13, %37[3, 1] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)>
+    %39 = llvm.insertvalue %arg16, %38[4, 1] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)>
+    %40 = llvm.insertvalue %arg14, %39[3, 2] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)>
+    %41 = llvm.insertvalue %arg17, %40[4, 2] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)>
+    %42 = llvm.mlir.poison : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)>
+    %43 = llvm.insertvalue %arg0, %42[0] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)>
+    %44 = llvm.insertvalue %arg1, %43[1] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)>
+    %45 = llvm.insertvalue %arg2, %44[2] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)>
+    %46 = llvm.insertvalue %arg3, %45[3, 0] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)>
+    %47 = llvm.insertvalue %arg6, %46[4, 0] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)>
+    %48 = llvm.insertvalue %arg4, %47[3, 1] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)>
+    %49 = llvm.insertvalue %arg7, %48[4, 1] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)>
+    %50 = llvm.insertvalue %arg5, %49[3, 2] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)>
+    %51 = llvm.insertvalue %arg8, %50[4, 2] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)>
+    %52 = llvm.mlir.constant(0 : index) : i64
+    %53 = llvm.mlir.constant(1 : index) : i64
+    %54 = llvm.mlir.constant(1 : index) : i64
+    %55 = llvm.mlir.constant(1 : index) : i64
+    %56 = llvm.mlir.constant(1 : index) : i64
+    %57 = llvm.mlir.zero : !llvm.ptr
+    %58 = llvm.getelementptr %57[1] : (!llvm.ptr) -> !llvm.ptr, f32
+    %59 = llvm.ptrtoint %58 : !llvm.ptr to i64
+    %60 = llvm.call @malloc(%59) : (i64) -> !llvm.ptr
+    %61 = llvm.mlir.poison : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)>
+    %62 = llvm.insertvalue %60, %61[0] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)>
+    %63 = llvm.insertvalue %60, %62[1] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)>
+    %64 = llvm.mlir.constant(0 : index) : i64
+    %65 = llvm.insertvalue %64, %63[2] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)>
+    %66 = llvm.insertvalue %53, %65[3, 0] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)>
+    %67 = llvm.insertvalue %54, %66[3, 1] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)>
+    %68 = llvm.insertvalue %55, %67[3, 2] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)>
+    %69 = llvm.insertvalue %54, %68[4, 0] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)>
+    %70 = llvm.insertvalue %55, %69[4, 1] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)>
+    %71 = llvm.insertvalue %56, %70[4, 2] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)>
+    %72 = llvm.mlir.constant(1 : index) : i64
+    %73 = llvm.mlir.constant(1 : index) : i64
+    %74 = llvm.mlir.constant(1 : index) : i64
+    %75 = llvm.mlir.constant(1 : index) : i64
+    %76 = llvm.mlir.zero : !llvm.ptr
+    %77 = llvm.getelementptr %76[1] : (!llvm.ptr) -> !llvm.ptr, f32
+    %78 = llvm.ptrtoint %77 : !llvm.ptr to i64
+    %79 = llvm.call @malloc(%78) : (i64) -> !llvm.ptr
+    %80 = llvm.mlir.poison : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)>
+    %81 = llvm.insertvalue %79, %80[0] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)>
+    %82 = llvm.insertvalue %79, %81[1] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)>
+    %83 = llvm.mlir.constant(0 : index) : i64
+    %84 = llvm.insertvalue %83, %82[2] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)>
+    %85 = llvm.insertvalue %72, %84[3, 0] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)>
+    %86 = llvm.insertvalue %73, %85[3, 1] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)>
+    %87 = llvm.insertvalue %74, %86[3, 2] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)>
+    %88 = llvm.insertvalue %73, %87[4, 0] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)>
+    %89 = llvm.insertvalue %74, %88[4, 1] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)>
+    %90 = llvm.insertvalue %75, %89[4, 2] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)>
+    %91 = llvm.mlir.constant(1 : index) : i64
+    %92 = llvm.mlir.constant(1 : index) : i64
+    %93 = llvm.mlir.constant(1 : index) : i64
+    %94 = llvm.mlir.constant(1 : index) : i64
+    %95 = llvm.mlir.constant(1 : index) : i64
+    %96 = llvm.mlir.zero : !llvm.ptr
+    %97 = llvm.getelementptr %96[1] : (!llvm.ptr) -> !llvm.ptr, f32
+    %98 = llvm.ptrtoint %97 : !llvm.ptr to i64
+    %99 = llvm.call @malloc(%98) : (i64) -> !llvm.ptr
+    %100 = llvm.mlir.poison : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %101 = llvm.insertvalue %99, %100[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %102 = llvm.insertvalue %99, %101[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %103 = llvm.mlir.constant(0 : index) : i64
+    %104 = llvm.insertvalue %103, %102[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %105 = llvm.insertvalue %91, %104[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %106 = llvm.insertvalue %92, %105[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %107 = llvm.insertvalue %93, %106[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %108 = llvm.insertvalue %94, %107[3, 3] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %109 = llvm.insertvalue %92, %108[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %110 = llvm.insertvalue %93, %109[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %111 = llvm.insertvalue %94, %110[4, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %112 = llvm.insertvalue %95, %111[4, 3] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %113 = llvm.mlir.constant(1 : index) : i64
+    %114 = llvm.mlir.constant(1 : index) : i64
+    %115 = llvm.mlir.constant(1 : index) : i64
+    %116 = llvm.mlir.constant(1 : index) : i64
+    %117 = llvm.mlir.constant(1 : index) : i64
+    %118 = llvm.mlir.zero : !llvm.ptr
+    %119 = llvm.getelementptr %118[1] : (!llvm.ptr) -> !llvm.ptr, f32
+    %120 = llvm.ptrtoint %119 : !llvm.ptr to i64
+    %121 = llvm.call @malloc(%120) : (i64) -> !llvm.ptr
+    %122 = llvm.mlir.poison : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %123 = llvm.insertvalue %121, %122[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %124 = llvm.insertvalue %121, %123[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %125 = llvm.mlir.constant(0 : index) : i64
+    %126 = llvm.insertvalue %125, %124[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %127 = llvm.insertvalue %113, %126[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %128 = llvm.insertvalue %114, %127[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %129 = llvm.insertvalue %115, %128[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %130 = llvm.insertvalue %116, %129[3, 3] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %131 = llvm.insertvalue %114, %130[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %132 = llvm.insertvalue %115, %131[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %133 = llvm.insertvalue %116, %132[4, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %134 = llvm.insertvalue %117, %133[4, 3] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %135 = llvm.call @rtclock() : () -> f64
+    %136 = llvm.extractvalue %51[0] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)>
+    %137 = llvm.extractvalue %51[1] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)>
+    %138 = llvm.mlir.poison : !llvm.struct<(ptr, ptr, i64)>
+    %139 = llvm.insertvalue %136, %138[0] : !llvm.struct<(ptr, ptr, i64)>
+    %140 = llvm.insertvalue %137, %139[1] : !llvm.struct<(ptr, ptr, i64)>
+    %141 = llvm.mlir.constant(0 : index) : i64
+    %142 = llvm.insertvalue %141, %140[2] : !llvm.struct<(ptr, ptr, i64)>
+    %143 = llvm.extractvalue %51[2] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)>
+    %144 = llvm.extractvalue %51[3, 0] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)>
+    %145 = llvm.extractvalue %51[3, 1] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)>
+    %146 = llvm.extractvalue %51[3, 2] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)>
+    %147 = llvm.extractvalue %51[4, 0] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)>
+    %148 = llvm.extractvalue %51[4, 1] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)>
+    %149 = llvm.extractvalue %51[4, 2] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)>
+    %150 = llvm.mlir.constant(128 : index) : i64
+    %151 = llvm.mul %149, %150 overflow : i64
+    %152 = llvm.mlir.poison : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %153 = llvm.insertvalue %136, %152[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %154 = llvm.insertvalue %137, %153[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %155 = llvm.insertvalue %143, %154[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %156 = llvm.mlir.constant(1 : index) : i64
+    %157 = llvm.insertvalue %156, %155[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %158 = llvm.insertvalue %147, %157[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %159 = llvm.mlir.constant(40 : index) : i64
+    %160 = llvm.insertvalue %159, %158[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %161 = llvm.insertvalue %148, %160[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %162 = llvm.mlir.constant(32 : index) : i64
+    %163 = llvm.insertvalue %162, %161[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %164 = llvm.insertvalue %151, %163[4, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %165 = llvm.mlir.constant(128 : index) : i64
+    %166 = llvm.insertvalue %165, %164[3, 3] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %167 = llvm.insertvalue %149, %166[4, 3] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %168 = llvm.mlir.constant(1 : index) : i64
+    %169 = llvm.mlir.constant(32 : index) : i64
+    %170 = llvm.mlir.constant(40 : index) : i64
+    %171 = llvm.mlir.constant(128 : index) : i64
+    %172 = llvm.mlir.constant(1 : index) : i64
+    %173 = llvm.mlir.constant(5120 : index) : i64
+    %174 = llvm.mlir.constant(163840 : index) : i64
+    %175 = llvm.mlir.constant(163840 : index) : i64
+    %176 = llvm.mlir.zero : !llvm.ptr
+    %177 = llvm.getelementptr %176[163840] : (!llvm.ptr) -> !llvm.ptr, f32
+    %178 = llvm.ptrtoint %177 : !llvm.ptr to i64
+    %179 = llvm.mlir.constant(64 : index) : i64
+    %180 = llvm.add %178, %179 : i64
+    %181 = llvm.call @malloc(%180) : (i64) -> !llvm.ptr
+    %182 = llvm.ptrtoint %181 : !llvm.ptr to i64
+    %183 = llvm.mlir.constant(1 : index) : i64
+    %184 = llvm.sub %179, %183 : i64
+    %185 = llvm.add %182, %184 : i64
+    %186 = llvm.urem %185, %179 : i64
+    %187 = llvm.sub %185, %186 : i64
+    %188 = llvm.inttoptr %187 : i64 to !llvm.ptr
+    %189 = llvm.mlir.poison : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %190 = llvm.insertvalue %181, %189[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %191 = llvm.insertvalue %188, %190[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %192 = llvm.mlir.constant(0 : index) : i64
+    %193 = llvm.insertvalue %192, %191[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %194 = llvm.insertvalue %168, %193[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %195 = llvm.insertvalue %169, %194[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %196 = llvm.insertvalue %170, %195[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %197 = llvm.insertvalue %171, %196[3, 3] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %198 = llvm.insertvalue %174, %197[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %199 = llvm.insertvalue %173, %198[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %200 = llvm.insertvalue %171, %199[4, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %201 = llvm.insertvalue %172, %200[4, 3] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %202 = llvm.extractvalue %41[0] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)>
+    %203 = llvm.extractvalue %41[1] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)>
+    %204 = llvm.mlir.poison : !llvm.struct<(ptr, ptr, i64)>
+    %205 = llvm.insertvalue %202, %204[0] : !llvm.struct<(ptr, ptr, i64)>
+    %206 = llvm.insertvalue %203, %205[1] : !llvm.struct<(ptr, ptr, i64)>
+    %207 = llvm.mlir.constant(0 : index) : i64
+    %208 = llvm.insertvalue %207, %206[2] : !llvm.struct<(ptr, ptr, i64)>
+    %209 = llvm.extractvalue %41[2] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)>
+    %210 = llvm.extractvalue %41[3, 0] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)>
+    %211 = llvm.extractvalue %41[3, 1] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)>
+    %212 = llvm.extractvalue %41[3, 2] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)>
+    %213 = llvm.extractvalue %41[4, 0] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)>
+    %214 = llvm.extractvalue %41[4, 1] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)>
+    %215 = llvm.extractvalue %41[4, 2] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)>
+    %216 = llvm.mlir.constant(128 : index) : i64
+    %217 = llvm.mul %215, %216 overflow : i64
+    %218 = llvm.mlir.poison : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %219 = llvm.insertvalue %202, %218[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %220 = llvm.insertvalue %203, %219[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %221 = llvm.insertvalue %209, %220[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %222 = llvm.mlir.constant(1 : index) : i64
+    %223 = llvm.insertvalue %222, %221[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %224 = llvm.insertvalue %213, %223[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %225 = llvm.mlir.constant(40 : index) : i64
+    %226 = llvm.insertvalue %225, %224[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %227 = llvm.insertvalue %214, %226[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %228 = llvm.mlir.constant(32 : index) : i64
+    %229 = llvm.insertvalue %228, %227[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %230 = llvm.insertvalue %217, %229[4, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %231 = llvm.mlir.constant(128 : index) : i64
+    %232 = llvm.insertvalue %231, %230[3, 3] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %233 = llvm.insertvalue %215, %232[4, 3] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %234 = llvm.mlir.constant(1 : index) : i64
+    %235 = llvm.mlir.constant(32 : index) : i64
+    %236 = llvm.mlir.constant(40 : index) : i64
+    %237 = llvm.mlir.constant(128 : index) : i64
+    %238 = llvm.mlir.constant(1 : index) : i64
+    %239 = llvm.mlir.constant(5120 : index) : i64
+    %240 = llvm.mlir.constant(163840 : index) : i64
+    %241 = llvm.mlir.constant(163840 : index) : i64
+    %242 = llvm.mlir.zero : !llvm.ptr
+    %243 = llvm.getelementptr %242[163840] : (!llvm.ptr) -> !llvm.ptr, f32
+    %244 = llvm.ptrtoint %243 : !llvm.ptr to i64
+    %245 = llvm.mlir.constant(64 : index) : i64
+    %246 = llvm.add %244, %245 : i64
+    %247 = llvm.call @malloc(%246) : (i64) -> !llvm.ptr
+    %248 = llvm.ptrtoint %247 : !llvm.ptr to i64
+    %249 = llvm.mlir.constant(1 : index) : i64
+    %250 = llvm.sub %245, %249 : i64
+    %251 = llvm.add %248, %250 : i64
+    %252 = llvm.urem %251, %245 : i64
+    %253 = llvm.sub %251, %252 : i64
+    %254 = llvm.inttoptr %253 : i64 to !llvm.ptr
+    %255 = llvm.mlir.poison : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %256 = llvm.insertvalue %247, %255[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %257 = llvm.insertvalue %254, %256[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %258 = llvm.mlir.constant(0 : index) : i64
+    %259 = llvm.insertvalue %258, %257[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %260 = llvm.insertvalue %234, %259[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %261 = llvm.insertvalue %235, %260[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %262 = llvm.insertvalue %236, %261[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %263 = llvm.insertvalue %237, %262[3, 3] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %264 = llvm.insertvalue %240, %263[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %265 = llvm.insertvalue %239, %264[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %266 = llvm.insertvalue %237, %265[4, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %267 = llvm.insertvalue %238, %266[4, 3] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %268 = llvm.extractvalue %31[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %269 = llvm.extractvalue %31[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %270 = llvm.mlir.poison : !llvm.struct<(ptr, ptr, i64)>
+    %271 = llvm.insertvalue %268, %270[0] : !llvm.struct<(ptr, ptr, i64)>
+    %272 = llvm.insertvalue %269, %271[1] : !llvm.struct<(ptr, ptr, i64)>
+    %273 = llvm.mlir.constant(0 : index) : i64
+    %274 = llvm.insertvalue %273, %272[2] : !llvm.struct<(ptr, ptr, i64)>
+    %275 = llvm.extractvalue %31[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %276 = llvm.extractvalue %31[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %277 = llvm.extractvalue %31[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %278 = llvm.extractvalue %31[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %279 = llvm.extractvalue %31[3, 3] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %280 = llvm.extractvalue %31[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %281 = llvm.extractvalue %31[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %282 = llvm.extractvalue %31[4, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %283 = llvm.extractvalue %31[4, 3] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %284 = llvm.mlir.poison : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %285 = llvm.insertvalue %268, %284[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %286 = llvm.insertvalue %269, %285[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %287 = llvm.insertvalue %275, %286[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %288 = llvm.mlir.constant(1 : index) : i64
+    %289 = llvm.insertvalue %288, %287[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %290 = llvm.insertvalue %280, %289[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %291 = llvm.mlir.constant(1 : index) : i64
+    %292 = llvm.insertvalue %291, %290[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %293 = llvm.insertvalue %281, %292[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %294 = llvm.mlir.constant(40 : index) : i64
+    %295 = llvm.insertvalue %294, %293[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %296 = llvm.insertvalue %282, %295[4, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %297 = llvm.mlir.constant(128 : index) : i64
+    %298 = llvm.insertvalue %297, %296[3, 3] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %299 = llvm.insertvalue %283, %298[4, 3] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %300 = llvm.extractvalue %19[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %301 = llvm.extractvalue %19[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %302 = llvm.mlir.poison : !llvm.struct<(ptr, ptr, i64)>
+    %303 = llvm.insertvalue %300, %302[0] : !llvm.struct<(ptr, ptr, i64)>
+    %304 = llvm.insertvalue %301, %303[1] : !llvm.struct<(ptr, ptr, i64)>
+    %305 = llvm.mlir.constant(0 : index) : i64
+    %306 = llvm.insertvalue %305, %304[2] : !llvm.struct<(ptr, ptr, i64)>
+    %307 = llvm.extractvalue %19[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %308 = llvm.extractvalue %19[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %309 = llvm.extractvalue %19[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %310 = llvm.extractvalue %19[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %311 = llvm.extractvalue %19[3, 3] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %312 = llvm.extractvalue %19[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %313 = llvm.extractvalue %19[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %314 = llvm.extractvalue %19[4, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %315 = llvm.extractvalue %19[4, 3] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %316 = llvm.mlir.poison : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %317 = llvm.insertvalue %300, %316[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %318 = llvm.insertvalue %301, %317[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %319 = llvm.insertvalue %307, %318[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %320 = llvm.mlir.constant(1 : index) : i64
+    %321 = llvm.insertvalue %320, %319[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %322 = llvm.insertvalue %312, %321[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %323 = llvm.mlir.constant(1 : index) : i64
+    %324 = llvm.insertvalue %323, %322[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %325 = llvm.insertvalue %313, %324[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %326 = llvm.mlir.constant(40 : index) : i64
+    %327 = llvm.insertvalue %326, %325[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %328 = llvm.insertvalue %314, %327[4, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %329 = llvm.mlir.constant(128 : index) : i64
+    %330 = llvm.insertvalue %329, %328[3, 3] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %331 = llvm.insertvalue %315, %330[4, 3] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %332 = llvm.mlir.constant(40 : index) : i64
+    %333 = llvm.mlir.constant(128 : index) : i64
+    %334 = llvm.mlir.constant(1 : index) : i64
+    %335 = llvm.mlir.constant(5120 : index) : i64
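+    // Editor note: the malloc sequences below implement 64-byte-aligned
+    // allocation by hand: over-allocate by the alignment, then round the raw
+    // address up to the next 64-byte boundary for the aligned pointer (%348 here).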
+    %336 = llvm.mlir.zero : !llvm.ptr
+    %337 = llvm.getelementptr %336[5120] : (!llvm.ptr) -> !llvm.ptr, f32
+    %338 = llvm.ptrtoint %337 : !llvm.ptr to i64
+    %339 = llvm.mlir.constant(64 : index) : i64
+    %340 = llvm.add %338, %339 : i64
+    %341 = llvm.call @malloc(%340) : (i64) -> !llvm.ptr
+    %342 = llvm.ptrtoint %341 : !llvm.ptr to i64
+    %343 = llvm.mlir.constant(1 : index) : i64
+    %344 = llvm.sub %339, %343 : i64
+    %345 = llvm.add %342, %344 : i64
+    %346 = llvm.urem %345, %339 : i64
+    %347 = llvm.sub %345, %346 : i64
+    %348 = llvm.inttoptr %347 : i64 to !llvm.ptr
+    %349 = llvm.mlir.poison : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
+    %350 = llvm.insertvalue %341, %349[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
+    %351 = llvm.insertvalue %348, %350[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
+    %352 = llvm.mlir.constant(0 : index) : i64
+    %353 = llvm.insertvalue %352, %351[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
+    %354 = llvm.insertvalue %332, %353[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
+    %355 = llvm.insertvalue %333, %354[3, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
+    %356 = llvm.insertvalue %333, %355[4, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
+    %357 = llvm.insertvalue %334, %356[4, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
+    %358 = llvm.mlir.constant(0 : index) : i64
+    %359 = llvm.mlir.constant(40 : index) : i64
+    %360 = llvm.mlir.constant(1 : index) : i64
+    llvm.br ^bb1(%358 : i64)
+  ^bb1(%361: i64):  // 2 preds: ^bb0, ^bb5
+    %362 = llvm.icmp "slt" %361, %359 : i64
+    llvm.cond_br %362, ^bb2, ^bb6
+  ^bb2:  // pred: ^bb1
+    %363 = llvm.mlir.constant(0 : index) : i64
+    %364 = llvm.mlir.constant(128 : index) : i64
+    %365 = llvm.mlir.constant(1 : index) : i64
+    llvm.br ^bb3(%363 : i64)
+  ^bb3(%366: i64):  // 2 preds: ^bb2, ^bb4
+    %367 = llvm.icmp "slt" %366, %364 : i64
+    llvm.cond_br %367, ^bb4, ^bb5
+  ^bb4:  // pred: ^bb3
+    %368 = llvm.getelementptr %269[%275] : (!llvm.ptr, i64) -> !llvm.ptr, f32
+    %369 = llvm.mul %280, %52 overflow : i64
+    %370 = llvm.mul %281, %52 overflow : i64
+    %371 = llvm.add %369, %370 overflow : i64
+    %372 = llvm.mul %361, %282 overflow : i64
+    %373 = llvm.add %371, %372 overflow : i64
+    %374 = llvm.mul %366, %283 overflow : i64
+    %375 = llvm.add %373, %374 overflow : i64
+    %376 = llvm.getelementptr inbounds|nuw %368[%375] : (!llvm.ptr, i64) -> !llvm.ptr, f32
+    %377 = llvm.load %376 : !llvm.ptr -> f32
+    %378 = llvm.mlir.constant(0 : index) : i64
+    %379 = llvm.mlir.constant(0 : index) : i64
+    %380 = llvm.mlir.constant(0 : index) : i64
+    %381 = llvm.add %378, %379 overflow : i64
+    %382 = llvm.add %381, %380 overflow : i64
+    %383 = llvm.getelementptr inbounds|nuw %60[%382] : (!llvm.ptr, i64) -> !llvm.ptr, f32
+    llvm.store %377, %383 : f32, !llvm.ptr
+    %384 = llvm.mlir.constant(0 : index) : i64
+    %385 = llvm.mlir.constant(0 : index) : i64
+    %386 = llvm.mlir.constant(0 : index) : i64
+    %387 = llvm.add %384, %385 overflow : i64
+    %388 = llvm.add %387, %386 overflow : i64
+    %389 = llvm.getelementptr inbounds|nuw %60[%388] : (!llvm.ptr, i64) -> !llvm.ptr, f32
+    %390 = llvm.load %389 : !llvm.ptr -> f32
+    %391 = llvm.mlir.constant(128 : index) : i64
+    %392 = llvm.mul %361, %391 overflow : i64
+    %393 = llvm.add %392, %366 overflow : i64
+    %394 = llvm.getelementptr inbounds|nuw %348[%393] : (!llvm.ptr, i64) -> !llvm.ptr, f32
+    llvm.store %390, %394 : f32, !llvm.ptr
+    %395 = llvm.add %366, %365 : i64
+    llvm.br ^bb3(%395 : i64)
+  ^bb5:  // pred: ^bb3
+    %396 = llvm.add %361, %360 : i64
+    llvm.br ^bb1(%396 : i64)
+  ^bb6:  // pred: ^bb1
+    %397 = llvm.mlir.constant(40 : index) : i64
+    %398 = llvm.mlir.constant(128 : index) : i64
+    %399 = llvm.mlir.constant(1 : index) : i64
+    %400 = llvm.mlir.constant(5120 : index) : i64
+    %401 = llvm.mlir.zero : !llvm.ptr
+    %402 = llvm.getelementptr %401[5120] : (!llvm.ptr) -> !llvm.ptr, f32
+    %403 = llvm.ptrtoint %402 : !llvm.ptr to i64
+    %404 = llvm.mlir.constant(64 : index) : i64
+    %405 = llvm.add %403, %404 : i64
+    %406 = llvm.call @malloc(%405) : (i64) -> !llvm.ptr
+    %407 = llvm.ptrtoint %406 : !llvm.ptr to i64
+    %408 = llvm.mlir.constant(1 : index) : i64
+    %409 = llvm.sub %404, %408 : i64
+    %410 = llvm.add %407, %409 : i64
+    %411 = llvm.urem %410, %404 : i64
+    %412 = llvm.sub %410, %411 : i64
+    %413 = llvm.inttoptr %412 : i64 to !llvm.ptr
+    %414 = llvm.mlir.poison : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
+    %415 = llvm.insertvalue %406, %414[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
+    %416 = llvm.insertvalue %413, %415[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
+    %417 = llvm.mlir.constant(0 : index) : i64
+    %418 = llvm.insertvalue %417, %416[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
+    %419 = llvm.insertvalue %397, %418[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
+    %420 = llvm.insertvalue %398, %419[3, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
+    %421 = llvm.insertvalue %398, %420[4, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
+    %422 = llvm.insertvalue %399, %421[4, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
+    %423 = llvm.mlir.constant(0 : index) : i64
+    %424 = llvm.mlir.constant(40 : index) : i64
+    %425 = llvm.mlir.constant(1 : index) : i64
+    llvm.br ^bb7(%423 : i64)
+  ^bb7(%426: i64):  // 2 preds: ^bb6, ^bb11
+    %427 = llvm.icmp "slt" %426, %424 : i64
+    llvm.cond_br %427, ^bb8, ^bb12
+  ^bb8:  // pred: ^bb7
+    %428 = llvm.mlir.constant(0 : index) : i64
+    %429 = llvm.mlir.constant(128 : index) : i64
+    %430 = llvm.mlir.constant(1 : index) : i64
+    llvm.br ^bb9(%428 : i64)
+  ^bb9(%431: i64):  // 2 preds: ^bb8, ^bb10
+    %432 = llvm.icmp "slt" %431, %429 : i64
+    llvm.cond_br %432, ^bb10, ^bb11
+  ^bb10:  // pred: ^bb9
+    %433 = llvm.getelementptr %301[%307] : (!llvm.ptr, i64) -> !llvm.ptr, f32
+    %434 = llvm.mul %312, %52 overflow : i64
+    %435 = llvm.mul %313, %52 overflow : i64
+    %436 = llvm.add %434, %435 overflow : i64
+    %437 = llvm.mul %426, %314 overflow : i64
+    %438 = llvm.add %436, %437 overflow : i64
+    %439 = llvm.mul %431, %315 overflow : i64
+    %440 = llvm.add %438, %439 overflow : i64
+    %441 = llvm.getelementptr inbounds|nuw %433[%440] : (!llvm.ptr, i64) -> !llvm.ptr, f32
+    %442 = llvm.load %441 : !llvm.ptr -> f32
+    %443 = llvm.mlir.constant(0 : index) : i64
+    %444 = llvm.mlir.constant(0 : index) : i64
+    %445 = llvm.mlir.constant(0 : index) : i64
+    %446 = llvm.add %443, %444 overflow : i64
+    %447 = llvm.add %446, %445 overflow : i64
+    %448 = llvm.getelementptr inbounds|nuw %79[%447] : (!llvm.ptr, i64) -> !llvm.ptr, f32
+    llvm.store %442, %448 : f32, !llvm.ptr
+    %449 = llvm.mlir.constant(0 : index) : i64
+    %450 = llvm.mlir.constant(0 : index) : i64
+    %451 = llvm.mlir.constant(0 : index) : i64
+    %452 = llvm.add %449, %450 overflow : i64
+    %453 = llvm.add %452, %451 overflow : i64
+    %454 = llvm.getelementptr inbounds|nuw %79[%453] : (!llvm.ptr, i64) -> !llvm.ptr, f32
+    %455 = llvm.load %454 : !llvm.ptr -> f32
+    %456 = llvm.mlir.constant(128 : index) : i64
+    %457 = llvm.mul %426, %456 overflow : i64
+    %458 = llvm.add %457, %431 overflow : i64
+    %459 = llvm.getelementptr inbounds|nuw %413[%458] : (!llvm.ptr, i64) -> !llvm.ptr, f32
+    llvm.store %455, %459 : f32, !llvm.ptr
+    %460 = llvm.add %431, %430 : i64
+    llvm.br ^bb9(%460 : i64)
+  ^bb11:  // pred: ^bb9
+    %461 = llvm.add %426, %425 : i64
+    llvm.br ^bb7(%461 : i64)
+  ^bb12:  // pred: ^bb7
+    %462 = llvm.mlir.constant(1 : index) : i64
+    %463 = llvm.mlir.constant(40 : index) : i64
+    %464 = llvm.mlir.constant(128 : index) : i64
+    %465 = llvm.mlir.constant(1 : index) : i64
+    %466 = llvm.mlir.constant(5120 : index) : i64
+    %467 = llvm.mlir.constant(5120 : index) : i64
+    %468 = llvm.mlir.zero : !llvm.ptr
+    %469 = llvm.getelementptr %468[5120] : (!llvm.ptr) -> !llvm.ptr, f32
+    %470 = llvm.ptrtoint %469 : !llvm.ptr to i64
+    %471 = llvm.mlir.constant(64 : index) : i64
+    %472 = llvm.add %470, %471 : i64
+    %473 = llvm.call @malloc(%472) : (i64) -> !llvm.ptr
+    %474 = llvm.ptrtoint %473 : !llvm.ptr to i64
+    %475 = llvm.mlir.constant(1 : index) : i64
+    %476 = llvm.sub %471, %475 : i64
+    %477 = llvm.add %474, %476 : i64
+    %478 = llvm.urem %477, %471 : i64
+    %479 = llvm.sub %477, %478 : i64
+    %480 = llvm.inttoptr %479 : i64 to !llvm.ptr
+    %481 = llvm.mlir.poison : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)>
+    %482 = llvm.insertvalue %473, %481[0] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)>
+    %483 = llvm.insertvalue %480, %482[1] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)>
+    %484 = llvm.mlir.constant(0 : index) : i64
+    %485 = llvm.insertvalue %484, %483[2] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)>
+    %486 = llvm.insertvalue %462, %485[3, 0] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)>
+    %487 = llvm.insertvalue %463, %486[3, 1] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)>
+    %488 = llvm.insertvalue %464, %487[3, 2] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)>
+    %489 = llvm.insertvalue %466, %488[4, 0] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)>
+    %490 = llvm.insertvalue %464, %489[4, 1] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)>
+    %491 = llvm.insertvalue %465, %490[4, 2] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)>
+    %492 = llvm.mlir.constant(0 : index) : i64
+    %493 = llvm.mlir.constant(1 : index) : i64
+    %494 = llvm.mlir.constant(1 : index) : i64
+    llvm.br ^bb13(%492 : i64)
+  ^bb13(%495: i64):  // 2 preds: ^bb12, ^bb20
+    %496 = llvm.icmp "slt" %495, %493 : i64
+    llvm.cond_br %496, ^bb14, ^bb21
+  ^bb14:  // pred: ^bb13
+    %497 = llvm.mlir.constant(0 : index) : i64
+    %498 = llvm.mlir.constant(40 : index) : i64
+    %499 = llvm.mlir.constant(1 : index) : i64
+    llvm.br ^bb15(%497 : i64)
+  ^bb15(%500: i64):  // 2 preds: ^bb14, ^bb19
+    %501 = llvm.icmp "slt" %500, %498 : i64
+    llvm.cond_br %501, ^bb16, ^bb20
+  ^bb16:  // pred: ^bb15
+    %502 = llvm.mlir.constant(0 : index) : i64
+    %503 = llvm.mlir.constant(128 : index) : i64
+    %504 = llvm.mlir.constant(1 : index) : i64
+    llvm.br ^bb17(%502 : i64)
+  ^bb17(%505: i64):  // 2 preds: ^bb16, ^bb18
+    %506 = llvm.icmp "slt" %505, %503 : i64
+    llvm.cond_br %506, ^bb18, ^bb19
+  ^bb18:  // pred: ^bb17
+    %507 = llvm.extractvalue %7[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
+    %508 = llvm.extractvalue %7[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
+    %509 = llvm.getelementptr %507[%508] : (!llvm.ptr, i64) -> !llvm.ptr, i64
+    %510 = llvm.extractvalue %7[4, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
+    %511 = llvm.mul %495, %510 overflow : i64
+    %512 = llvm.extractvalue %7[4, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
+    %513 = llvm.mul %500, %512 overflow : i64
+    %514 = llvm.add %511, %513 overflow : i64
+    %515 = llvm.getelementptr inbounds|nuw %509[%514] : (!llvm.ptr, i64) -> !llvm.ptr, i64
+    %516 = llvm.load %515 : !llvm.ptr -> i64
+    %517 = llvm.mlir.constant(128 : index) : i64
+    %518 = llvm.mul %516, %517 overflow : i64
+    %519 = llvm.add %518, %505 overflow : i64
+    %520 = llvm.getelementptr inbounds|nuw %348[%519] : (!llvm.ptr, i64) -> !llvm.ptr, f32
+    %521 = llvm.load %520 : !llvm.ptr -> f32
+    %522 = llvm.mlir.constant(5120 : index) : i64
+    %523 = llvm.mul %495, %522 overflow : i64
+    %524 = llvm.mlir.constant(128 : index) : i64
+    %525 = llvm.mul %500, %524 overflow : i64
+    %526 = llvm.add %523, %525 overflow : i64
+    %527 = llvm.add %526, %505 overflow : i64
+    %528 = llvm.getelementptr inbounds|nuw %480[%527] : (!llvm.ptr, i64) -> !llvm.ptr, f32
+    llvm.store %521, %528 : f32, !llvm.ptr
+    %529 = llvm.add %505, %504 : i64
+    llvm.br ^bb17(%529 : i64)
+  ^bb19:  // pred: ^bb17
+    %530 = llvm.add %500, %499 : i64
+    llvm.br ^bb15(%530 : i64)
+  ^bb20:  // pred: ^bb15
+    %531 = llvm.add %495, %494 : i64
+    llvm.br ^bb13(%531 : i64)
+  ^bb21:  // pred: ^bb13
+    %532 = llvm.mlir.poison : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %533 = llvm.insertvalue %473, %532[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %534 = llvm.insertvalue %480, %533[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %535 = llvm.mlir.constant(0 : index) : i64
+    %536 = llvm.insertvalue %535, %534[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %537 = llvm.mlir.constant(1 : index) : i64
+    %538 = llvm.insertvalue %537, %536[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %539 = llvm.mlir.constant(5120 : index) : i64
+    %540 = llvm.insertvalue %539, %538[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %541 = llvm.mlir.constant(1 : index) : i64
+    %542 = llvm.insertvalue %541, %540[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %543 = llvm.mlir.constant(5120 : index) : i64
+    %544 = llvm.insertvalue %543, %542[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %545 = llvm.mlir.constant(40 : index) : i64
+    %546 = llvm.insertvalue %545, %544[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %547 = llvm.mlir.constant(128 : index) : i64
+    %548 = llvm.insertvalue %547, %546[4, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %549 = llvm.mlir.constant(128 : index) : i64
+    %550 = llvm.insertvalue %549, %548[3, 3] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %551 = llvm.mlir.constant(1 : index) : i64
+    %552 = llvm.insertvalue %551, %550[4, 3] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %553 = llvm.mlir.constant(1 : index) : i64
+    %554 = llvm.mlir.constant(40 : index) : i64
+    %555 = llvm.mlir.constant(128 : index) : i64
+    %556 = llvm.mlir.constant(1 : index) : i64
+    %557 = llvm.mlir.constant(5120 : index) : i64
+    %558 = llvm.mlir.constant(5120 : index) : i64
+    %559 = llvm.mlir.zero : !llvm.ptr
+    %560 = llvm.getelementptr %559[5120] : (!llvm.ptr) -> !llvm.ptr, f32
+    %561 = llvm.ptrtoint %560 : !llvm.ptr to i64
+    %562 = llvm.mlir.constant(64 : index) : i64
+    %563 = llvm.add %561, %562 : i64
+    %564 = llvm.call @malloc(%563) : (i64) -> !llvm.ptr
+    %565 = llvm.ptrtoint %564 : !llvm.ptr to i64
+    %566 = llvm.mlir.constant(1 : index) : i64
+    %567 = llvm.sub %562, %566 : i64
+    %568 = llvm.add %565, %567 : i64
+    %569 = llvm.urem %568, %562 : i64
+    %570 = llvm.sub %568, %569 : i64
+    %571 = llvm.inttoptr %570 : i64 to !llvm.ptr
+    %572 = llvm.mlir.poison : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)>
+    %573 = llvm.insertvalue %564, %572[0] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)>
+    %574 = llvm.insertvalue %571, %573[1] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)>
+    %575 = llvm.mlir.constant(0 : index) : i64
+    %576 = llvm.insertvalue %575, %574[2] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)>
+    %577 = llvm.insertvalue %553, %576[3, 0] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)>
+    %578 = llvm.insertvalue %554, %577[3, 1] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)>
+    %579 = llvm.insertvalue %555, %578[3, 2] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)>
+    %580 = llvm.insertvalue %557, %579[4, 0] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)>
+    %581 = llvm.insertvalue %555, %580[4, 1] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)>
+    %582 = llvm.insertvalue %556, %581[4, 2] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)>
+    %583 = llvm.mlir.constant(0 : index) : i64
+    %584 = llvm.mlir.constant(1 : index) : i64
+    %585 = llvm.mlir.constant(1 : index) : i64
+    llvm.br ^bb22(%583 : i64)
+  ^bb22(%586: i64):  // 2 preds: ^bb21, ^bb29
+    %587 = llvm.icmp "slt" %586, %584 : i64
+    llvm.cond_br %587, ^bb23, ^bb30
+  ^bb23:  // pred: ^bb22
+    %588 = llvm.mlir.constant(0 : index) : i64
+    %589 = llvm.mlir.constant(40 : index) : i64
+    %590 = llvm.mlir.constant(1 : index) : i64
+    llvm.br ^bb24(%588 : i64)
+  ^bb24(%591: i64):  // 2 preds: ^bb23, ^bb28
+    %592 = llvm.icmp "slt" %591, %589 : i64
+    llvm.cond_br %592, ^bb25, ^bb29
+  ^bb25:  // pred: ^bb24
+    %593 = llvm.mlir.constant(0 : index) : i64
+    %594 = llvm.mlir.constant(128 : index) : i64
+    %595 = llvm.mlir.constant(1 : index) : i64
+    llvm.br ^bb26(%593 : i64)
+  ^bb26(%596: i64):  // 2 preds: ^bb25, ^bb27
+    %597 = llvm.icmp "slt" %596, %594 : i64
+    llvm.cond_br %597, ^bb27, ^bb28
+  ^bb27:  // pred: ^bb26
+    %598 = llvm.extractvalue %7[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
+    %599 = llvm.extractvalue %7[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
+    %600 = llvm.getelementptr %598[%599] : (!llvm.ptr, i64) -> !llvm.ptr, i64
+    %601 = llvm.extractvalue %7[4, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
+    %602 = llvm.mul %586, %601 overflow : i64
+    %603 = llvm.extractvalue %7[4, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)>
+    %604 = llvm.mul %591, %603 overflow : i64
+    %605 = llvm.add %602, %604 overflow : i64
+    %606 = llvm.getelementptr inbounds|nuw %600[%605] : (!llvm.ptr, i64) -> !llvm.ptr, i64
+    %607 = llvm.load %606 : !llvm.ptr -> i64
+    %608 = llvm.mlir.constant(128 : index) : i64
+    %609 = llvm.mul %607, %608 overflow : i64
+    %610 = llvm.add %609, %596 overflow : i64
+    %611 = llvm.getelementptr inbounds|nuw %413[%610] : (!llvm.ptr, i64) -> !llvm.ptr, f32
+    %612 = llvm.load %611 : !llvm.ptr -> f32
+    %613 = llvm.mlir.constant(5120 : index) : i64
+    %614 = llvm.mul %586, %613 overflow : i64
+    %615 = llvm.mlir.constant(128 : index) : i64
+    %616 = llvm.mul %591, %615 overflow : i64
+    %617 = llvm.add %614, %616 overflow : i64
+    %618 = llvm.add %617, %596 overflow : i64
+    %619 = llvm.getelementptr inbounds|nuw %571[%618] : (!llvm.ptr, i64) -> !llvm.ptr, f32
+    llvm.store %612, %619 : f32, !llvm.ptr
+    %620 = llvm.add %596, %595 : i64
+    llvm.br ^bb26(%620 : i64)
+  ^bb28:  // pred: ^bb26
+    %621 = llvm.add %591, %590 : i64
+    llvm.br ^bb24(%621 : i64)
+  ^bb29:  // pred: ^bb24
+    %622 = llvm.add %586, %585 : i64
+    llvm.br ^bb22(%622 : i64)
+  ^bb30:  // pred: ^bb22
+    %623 = llvm.mlir.poison : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %624 = llvm.insertvalue %564, %623[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %625 = llvm.insertvalue %571, %624[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %626 = llvm.mlir.constant(0 : index) : i64
+    %627 = llvm.insertvalue %626, %625[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %628 = llvm.mlir.constant(1 : index) : i64
+    %629 = llvm.insertvalue %628, %627[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %630 = llvm.mlir.constant(5120 : index) : i64
+    %631 = llvm.insertvalue %630, %629[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %632 = llvm.mlir.constant(1 : index) : i64
+    %633 = llvm.insertvalue %632, %631[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %634 = llvm.mlir.constant(5120 : index) : i64
+    %635 = llvm.insertvalue %634, %633[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %636 = llvm.mlir.constant(40 : index) : i64
+    %637 = llvm.insertvalue %636, %635[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %638 = llvm.mlir.constant(128 : index) : i64
+    %639 = llvm.insertvalue %638, %637[4, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %640 = llvm.mlir.constant(128 : index) : i64
+    %641 = llvm.insertvalue %640, %639[3, 3] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %642 = llvm.mlir.constant(1 : index) : i64
+    %643 = llvm.insertvalue %642, %641[4, 3] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %644 = llvm.mlir.poison : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %645 = llvm.insertvalue %181, %644[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %646 = llvm.insertvalue %188, %645[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %647 = llvm.mlir.constant(0 : index) : i64
+    %648 = llvm.insertvalue %647, %646[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %649 = llvm.mlir.constant(1 : index) : i64
+    %650 = llvm.insertvalue %649, %648[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %651 = llvm.mlir.constant(163840 : index) : i64
+    %652 = llvm.insertvalue %651, %650[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %653 = llvm.mlir.constant(32 : index) : i64
+    %654 = llvm.insertvalue %653, %652[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %655 = llvm.mlir.constant(5120 : index) : i64
+    %656 = llvm.insertvalue %655, %654[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %657 = llvm.mlir.constant(40 : index) : i64
+    %658 = llvm.insertvalue %657, %656[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %659 = llvm.mlir.constant(128 : index) : i64
+    %660 = llvm.insertvalue %659, %658[4, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %661 = llvm.mlir.constant(64 : index) : i64
+    %662 = llvm.insertvalue %661, %660[3, 3] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %663 = llvm.mlir.constant(1 : index) : i64
+    %664 = llvm.insertvalue %663, %662[4, 3] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %665 = llvm.mlir.poison : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %666 = llvm.insertvalue %181, %665[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %667 = llvm.insertvalue %188, %666[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %668 = llvm.mlir.constant(64 : index) : i64
+    %669 = llvm.insertvalue %668, %667[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %670 = llvm.mlir.constant(1 : index) : i64
+    %671 = llvm.insertvalue %670, %669[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %672 = llvm.mlir.constant(163840 : index) : i64
+    %673 = llvm.insertvalue %672, %671[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %674 = llvm.mlir.constant(32 : index) : i64
+    %675 = llvm.insertvalue %674, %673[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %676 = llvm.mlir.constant(5120 : index) : i64
+    %677 = llvm.insertvalue %676, %675[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %678 = llvm.mlir.constant(40 : index) : i64
+    %679 = llvm.insertvalue %678, %677[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %680 = llvm.mlir.constant(128 : index) : i64
+    %681 = llvm.insertvalue %680, %679[4, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %682 = llvm.mlir.constant(64 : index) : i64
+    %683 = llvm.insertvalue %682, %681[3, 3] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %684 = llvm.mlir.constant(1 : index) : i64
+    %685 = llvm.insertvalue %684, %683[4, 3] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %686 = llvm.mlir.constant(1 : index) : i64
+    %687 = llvm.mlir.constant(32 : index) : i64
+    %688 = llvm.mlir.constant(40 : index) : i64
+    %689 = llvm.mlir.constant(64 : index) : i64
+    %690 = llvm.mlir.constant(1 : index) : i64
+    %691 = llvm.mlir.constant(2560 : index) : i64
+    %692 = llvm.mlir.constant(81920 : index) : i64
+    %693 = llvm.mlir.constant(81920 : index) : i64
+    %694 = llvm.mlir.zero : !llvm.ptr
+    %695 = llvm.getelementptr %694[81920] : (!llvm.ptr) -> !llvm.ptr, f32
+    %696 = llvm.ptrtoint %695 : !llvm.ptr to i64
+    %697 = llvm.mlir.constant(64 : index) : i64
+    %698 = llvm.add %696, %697 : i64
+    %699 = llvm.call @malloc(%698) : (i64) -> !llvm.ptr
+    %700 = llvm.ptrtoint %699 : !llvm.ptr to i64
+    %701 = llvm.mlir.constant(1 : index) : i64
+    %702 = llvm.sub %697, %701 : i64
+    %703 = llvm.add %700, %702 : i64
+    %704 = llvm.urem %703, %697 : i64
+    %705 = llvm.sub %703, %704 : i64
+    %706 = llvm.inttoptr %705 : i64 to !llvm.ptr
+    %707 = llvm.mlir.poison : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %708 = llvm.insertvalue %699, %707[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %709 = llvm.insertvalue %706, %708[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %710 = llvm.mlir.constant(0 : index) : i64
+    %711 = llvm.insertvalue %710, %709[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %712 = llvm.insertvalue %686, %711[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %713 = llvm.insertvalue %687, %712[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %714 = llvm.insertvalue %688, %713[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %715 = llvm.insertvalue %689, %714[3, 3] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %716 = llvm.insertvalue %692, %715[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %717 = llvm.insertvalue %691, %716[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %718 = llvm.insertvalue %689, %717[4, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %719 = llvm.insertvalue %690, %718[4, 3] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %720 = llvm.mlir.constant(0 : index) : i64
+    %721 = llvm.mlir.constant(1 : index) : i64
+    %722 = llvm.mlir.constant(1 : index) : i64
+    llvm.br ^bb31(%720 : i64)
+  ^bb31(%723: i64):  // 2 preds: ^bb30, ^bb41
+    %724 = llvm.icmp "slt" %723, %721 : i64
+    llvm.cond_br %724, ^bb32, ^bb42
+  ^bb32:  // pred: ^bb31
+    %725 = llvm.mlir.constant(0 : index) : i64
+    %726 = llvm.mlir.constant(32 : index) : i64
+    %727 = llvm.mlir.constant(1 : index) : i64
+    llvm.br ^bb33(%725 : i64)
+  ^bb33(%728: i64):  // 2 preds: ^bb32, ^bb40
+    %729 = llvm.icmp "slt" %728, %726 : i64
+    llvm.cond_br %729, ^bb34, ^bb41
+  ^bb34:  // pred: ^bb33
+    %730 = llvm.mlir.constant(0 : index) : i64
+    %731 = llvm.mlir.constant(40 : index) : i64
+    %732 = llvm.mlir.constant(1 : index) : i64
+    llvm.br ^bb35(%730 : i64)
+  ^bb35(%733: i64):  // 2 preds: ^bb34, ^bb39
+    %734 = llvm.icmp "slt" %733, %731 : i64
+    llvm.cond_br %734, ^bb36, ^bb40
+  ^bb36:  // pred: ^bb35
+    %735 = llvm.mlir.constant(0 : index) : i64
+    %736 = llvm.mlir.constant(64 : index) : i64
+    %737 = llvm.mlir.constant(1 : index) : i64
+    llvm.br ^bb37(%735 : i64)
+  ^bb37(%738: i64):  // 2 preds: ^bb36, ^bb38
+    %739 = llvm.icmp "slt" %738, %736 : i64
+    llvm.cond_br %739, ^bb38, ^bb39
+  ^bb38:  // pred: ^bb37
+    %740 = llvm.mlir.constant(64 : index) : i64
+    %741 = llvm.getelementptr %188[64] : (!llvm.ptr) -> !llvm.ptr, f32
+    %742 = llvm.mlir.constant(163840 : index) : i64
+    %743 = llvm.mul %723, %742 overflow : i64
+    %744 = llvm.mlir.constant(5120 : index) : i64
+    %745 = llvm.mul %728, %744 overflow : i64
+    %746 = llvm.add %743, %745 overflow : i64
+    %747 = llvm.mlir.constant(128 : index) : i64
+    %748 = llvm.mul %733, %747 overflow : i64
+    %749 = llvm.add %746, %748 overflow : i64
+    %750 = llvm.add %749, %738 overflow : i64
+    %751 = llvm.getelementptr inbounds|nuw %741[%750] : (!llvm.ptr, i64) -> !llvm.ptr, f32
+    %752 = llvm.load %751 : !llvm.ptr -> f32
+    %753 = llvm.fneg %752 : f32
+    %754 = llvm.mlir.constant(81920 : index) : i64
+    %755 = llvm.mul %723, %754 overflow : i64
+    %756 = llvm.mlir.constant(2560 : index) : i64
+    %757 = llvm.mul %728, %756 overflow : i64
+    %758 = llvm.add %755, %757 overflow : i64
+    %759 = llvm.mlir.constant(64 : index) : i64
+    %760 = llvm.mul %733, %759 overflow : i64
+    %761 = llvm.add %758, %760 overflow : i64
+    %762 = llvm.add %761, %738 overflow : i64
+    %763 = llvm.getelementptr inbounds|nuw %706[%762] : (!llvm.ptr, i64) -> !llvm.ptr, f32
+    llvm.store %753, %763 : f32, !llvm.ptr
+    %764 = llvm.add %738, %737 : i64
+    llvm.br ^bb37(%764 : i64)
+  ^bb39:  // pred: ^bb37
+    %765 = llvm.add %733, %732 : i64
+    llvm.br ^bb35(%765 : i64)
+  ^bb40:  // pred: ^bb35
+    %766 = llvm.add %728, %727 : i64
+    llvm.br ^bb33(%766 : i64)
+  ^bb41:  // pred: ^bb33
+    %767 = llvm.add %723, %722 : i64
+    llvm.br ^bb31(%767 : i64)
+  ^bb42:  // pred: ^bb31
+    %768 = llvm.mlir.constant(1 : index) : i64
+    %769 = llvm.mlir.constant(32 : index) : i64
+    %770 = llvm.mlir.constant(40 : index) : i64
+    %771 = llvm.mlir.constant(128 : index) : i64
+    %772 = llvm.mlir.constant(1 : index) : i64
+    %773 = llvm.mlir.constant(5120 : index) : i64
+    %774 = llvm.mlir.constant(163840 : index) : i64
+    %775 = llvm.mlir.constant(163840 : index) : i64
+    %776 = llvm.mlir.zero : !llvm.ptr
+    %777 = llvm.getelementptr %776[163840] : (!llvm.ptr) -> !llvm.ptr, f32
+    %778 = llvm.ptrtoint %777 : !llvm.ptr to i64
+    %779 = llvm.mlir.constant(64 : index) : i64
+    %780 = llvm.add %778, %779 : i64
+    %781 = llvm.call @malloc(%780) : (i64) -> !llvm.ptr
+    %782 = llvm.ptrtoint %781 : !llvm.ptr to i64
+    %783 = llvm.mlir.constant(1 : index) : i64
+    %784 = llvm.sub %779, %783 : i64
+    %785 = llvm.add %782, %784 : i64
+    %786 = llvm.urem %785, %779 : i64
+    %787 = llvm.sub %785, %786 : i64
+    %788 = llvm.inttoptr %787 : i64 to !llvm.ptr
+    %789 = llvm.mlir.poison : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %790 = llvm.insertvalue %781, %789[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %791 = llvm.insertvalue %788, %790[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %792 = llvm.mlir.constant(0 : index) : i64
+    %793 = llvm.insertvalue %792, %791[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %794 = llvm.insertvalue %768, %793[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %795 = llvm.insertvalue %769, %794[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %796 = llvm.insertvalue %770, %795[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %797 = llvm.insertvalue %771, %796[3, 3] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %798 = llvm.insertvalue %774, %797[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %799 = llvm.insertvalue %773, %798[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %800 = llvm.insertvalue %771, %799[4, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %801 = llvm.insertvalue %772, %800[4, 3] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %802 = llvm.mlir.poison : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %803 = llvm.insertvalue %781, %802[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %804 = llvm.insertvalue %788, %803[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %805 = llvm.mlir.constant(0 : index) : i64
+    %806 = llvm.insertvalue %805, %804[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %807 = llvm.mlir.constant(1 : index) : i64
+    %808 = llvm.insertvalue %807, %806[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %809 = llvm.mlir.constant(163840 : index) : i64
+    %810 = llvm.insertvalue %809, %808[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %811 = llvm.mlir.constant(32 : index) : i64
+    %812 = llvm.insertvalue %811, %810[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %813 = llvm.mlir.constant(5120 : index) : i64
+    %814 = llvm.insertvalue %813, %812[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %815 = llvm.mlir.constant(40 : index) : i64
+    %816 = llvm.insertvalue %815, %814[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>
+    %817 = llvm.mlir.constant(128 : index) : i64
+    %818 = llvm.insertvalue %817, %816[4,
2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %819 = llvm.mlir.constant(64 : index) : i64 + %820 = llvm.insertvalue %819, %818[3, 3] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %821 = llvm.mlir.constant(1 : index) : i64 + %822 = llvm.insertvalue %821, %820[4, 3] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %823 = llvm.intr.stacksave : !llvm.ptr + %824 = llvm.mlir.constant(4 : i64) : i64 + %825 = llvm.mlir.constant(1 : index) : i64 + %826 = llvm.alloca %825 x !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> : (i64) -> !llvm.ptr + llvm.store %719, %826 : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>, !llvm.ptr + %827 = llvm.mlir.poison : !llvm.struct<(i64, ptr)> + %828 = llvm.insertvalue %824, %827[0] : !llvm.struct<(i64, ptr)> + %829 = llvm.insertvalue %826, %828[1] : !llvm.struct<(i64, ptr)> + %830 = llvm.mlir.constant(4 : i64) : i64 + %831 = llvm.mlir.constant(1 : index) : i64 + %832 = llvm.alloca %831 x !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> : (i64) -> !llvm.ptr + llvm.store %822, %832 : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>, !llvm.ptr + %833 = llvm.mlir.poison : !llvm.struct<(i64, ptr)> + %834 = llvm.insertvalue %830, %833[0] : !llvm.struct<(i64, ptr)> + %835 = llvm.insertvalue %832, %834[1] : !llvm.struct<(i64, ptr)> + %836 = llvm.mlir.constant(1 : index) : i64 + %837 = llvm.alloca %836 x !llvm.struct<(i64, ptr)> : (i64) -> !llvm.ptr + llvm.store %829, %837 : !llvm.struct<(i64, ptr)>, !llvm.ptr + %838 = llvm.alloca %836 x !llvm.struct<(i64, ptr)> : (i64) -> !llvm.ptr + llvm.store %835, %838 : !llvm.struct<(i64, ptr)>, !llvm.ptr + %839 = llvm.mlir.zero : !llvm.ptr + %840 = llvm.getelementptr %839[1] : (!llvm.ptr) -> !llvm.ptr, f32 + %841 = llvm.ptrtoint %840 : !llvm.ptr to i64 + llvm.call @memrefCopy(%841, %837, %838) : (i64, !llvm.ptr, !llvm.ptr) -> () + llvm.intr.stackrestore %823 : !llvm.ptr + %842 = llvm.mlir.poison : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %843 = llvm.insertvalue %781, %842[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %844 = llvm.insertvalue %788, %843[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %845 = llvm.mlir.constant(64 : index) : i64 + %846 = llvm.insertvalue %845, %844[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %847 = llvm.mlir.constant(1 : index) : i64 + %848 = llvm.insertvalue %847, %846[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %849 = llvm.mlir.constant(163840 : index) : i64 + %850 = llvm.insertvalue %849, %848[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %851 = llvm.mlir.constant(32 : index) : i64 + %852 = llvm.insertvalue %851, %850[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %853 = llvm.mlir.constant(5120 : index) : i64 + %854 = llvm.insertvalue %853, %852[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %855 = llvm.mlir.constant(40 : index) : i64 + %856 = llvm.insertvalue %855, %854[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %857 = llvm.mlir.constant(128 : index) : i64 + %858 = llvm.insertvalue %857, %856[4, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %859 = llvm.mlir.constant(64 : index) : i64 + %860 = llvm.insertvalue %859, %858[3, 3] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %861 = llvm.mlir.constant(1 : index) : i64 + %862 = 
llvm.insertvalue %861, %860[4, 3] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %863 = llvm.intr.stacksave : !llvm.ptr + %864 = llvm.mlir.constant(4 : i64) : i64 + %865 = llvm.mlir.constant(1 : index) : i64 + %866 = llvm.alloca %865 x !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> : (i64) -> !llvm.ptr + llvm.store %664, %866 : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>, !llvm.ptr + %867 = llvm.mlir.poison : !llvm.struct<(i64, ptr)> + %868 = llvm.insertvalue %864, %867[0] : !llvm.struct<(i64, ptr)> + %869 = llvm.insertvalue %866, %868[1] : !llvm.struct<(i64, ptr)> + %870 = llvm.mlir.constant(4 : i64) : i64 + %871 = llvm.mlir.constant(1 : index) : i64 + %872 = llvm.alloca %871 x !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> : (i64) -> !llvm.ptr + llvm.store %862, %872 : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>, !llvm.ptr + %873 = llvm.mlir.poison : !llvm.struct<(i64, ptr)> + %874 = llvm.insertvalue %870, %873[0] : !llvm.struct<(i64, ptr)> + %875 = llvm.insertvalue %872, %874[1] : !llvm.struct<(i64, ptr)> + %876 = llvm.mlir.constant(1 : index) : i64 + %877 = llvm.alloca %876 x !llvm.struct<(i64, ptr)> : (i64) -> !llvm.ptr + llvm.store %869, %877 : !llvm.struct<(i64, ptr)>, !llvm.ptr + %878 = llvm.alloca %876 x !llvm.struct<(i64, ptr)> : (i64) -> !llvm.ptr + llvm.store %875, %878 : !llvm.struct<(i64, ptr)>, !llvm.ptr + %879 = llvm.mlir.zero : !llvm.ptr + %880 = llvm.getelementptr %879[1] : (!llvm.ptr) -> !llvm.ptr, f32 + %881 = llvm.ptrtoint %880 : !llvm.ptr to i64 + llvm.call @memrefCopy(%881, %877, %878) : (i64, !llvm.ptr, !llvm.ptr) -> () + llvm.intr.stackrestore %863 : !llvm.ptr + %882 = llvm.mlir.constant(1 : index) : i64 + %883 = llvm.mlir.constant(32 : index) : i64 + %884 = llvm.mlir.constant(40 : index) : i64 + %885 = llvm.mlir.constant(128 : index) : i64 + %886 = llvm.mlir.constant(1 : index) : i64 + %887 = llvm.mlir.constant(5120 : index) : i64 + %888 = llvm.mlir.constant(163840 : index) : i64 + %889 = llvm.mlir.constant(163840 : index) : i64 + %890 = llvm.mlir.zero : !llvm.ptr + %891 = llvm.getelementptr %890[163840] : (!llvm.ptr) -> !llvm.ptr, f32 + %892 = llvm.ptrtoint %891 : !llvm.ptr to i64 + %893 = llvm.mlir.constant(64 : index) : i64 + %894 = llvm.add %892, %893 : i64 + %895 = llvm.call @malloc(%894) : (i64) -> !llvm.ptr + %896 = llvm.ptrtoint %895 : !llvm.ptr to i64 + %897 = llvm.mlir.constant(1 : index) : i64 + %898 = llvm.sub %893, %897 : i64 + %899 = llvm.add %896, %898 : i64 + %900 = llvm.urem %899, %893 : i64 + %901 = llvm.sub %899, %900 : i64 + %902 = llvm.inttoptr %901 : i64 to !llvm.ptr + %903 = llvm.mlir.poison : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %904 = llvm.insertvalue %895, %903[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %905 = llvm.insertvalue %902, %904[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %906 = llvm.mlir.constant(0 : index) : i64 + %907 = llvm.insertvalue %906, %905[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %908 = llvm.insertvalue %882, %907[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %909 = llvm.insertvalue %883, %908[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %910 = llvm.insertvalue %884, %909[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %911 = llvm.insertvalue %885, %910[3, 3] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %912 = llvm.insertvalue %888, 
%911[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %913 = llvm.insertvalue %887, %912[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %914 = llvm.insertvalue %885, %913[4, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %915 = llvm.insertvalue %886, %914[4, 3] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %916 = llvm.mlir.constant(0 : index) : i64 + %917 = llvm.mlir.constant(1 : index) : i64 + %918 = llvm.mlir.constant(1 : index) : i64 + llvm.br ^bb43(%916 : i64) + ^bb43(%919: i64): // 2 preds: ^bb42, ^bb53 + %920 = llvm.icmp "slt" %919, %917 : i64 + llvm.cond_br %920, ^bb44, ^bb54 + ^bb44: // pred: ^bb43 + %921 = llvm.mlir.constant(0 : index) : i64 + %922 = llvm.mlir.constant(32 : index) : i64 + %923 = llvm.mlir.constant(1 : index) : i64 + llvm.br ^bb45(%921 : i64) + ^bb45(%924: i64): // 2 preds: ^bb44, ^bb52 + %925 = llvm.icmp "slt" %924, %922 : i64 + llvm.cond_br %925, ^bb46, ^bb53 + ^bb46: // pred: ^bb45 + %926 = llvm.mlir.constant(0 : index) : i64 + %927 = llvm.mlir.constant(40 : index) : i64 + %928 = llvm.mlir.constant(1 : index) : i64 + llvm.br ^bb47(%926 : i64) + ^bb47(%929: i64): // 2 preds: ^bb46, ^bb51 + %930 = llvm.icmp "slt" %929, %927 : i64 + llvm.cond_br %930, ^bb48, ^bb52 + ^bb48: // pred: ^bb47 + %931 = llvm.mlir.constant(0 : index) : i64 + %932 = llvm.mlir.constant(128 : index) : i64 + %933 = llvm.mlir.constant(1 : index) : i64 + llvm.br ^bb49(%931 : i64) + ^bb49(%934: i64): // 2 preds: ^bb48, ^bb50 + %935 = llvm.icmp "slt" %934, %932 : i64 + llvm.cond_br %935, ^bb50, ^bb51 + ^bb50: // pred: ^bb49 + %936 = llvm.getelementptr %137[%143] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %937 = llvm.mul %147, %52 overflow : i64 + %938 = llvm.mul %929, %148 overflow : i64 + %939 = llvm.add %937, %938 overflow : i64 + %940 = llvm.mul %924, %151 overflow : i64 + %941 = llvm.add %939, %940 overflow : i64 + %942 = llvm.mul %934, %149 overflow : i64 + %943 = llvm.add %941, %942 overflow : i64 + %944 = llvm.getelementptr inbounds|nuw %936[%943] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %945 = llvm.load %944 : !llvm.ptr -> f32 + %946 = llvm.mlir.constant(163840 : index) : i64 + %947 = llvm.mul %52, %946 overflow : i64 + %948 = llvm.mlir.constant(5120 : index) : i64 + %949 = llvm.mul %924, %948 overflow : i64 + %950 = llvm.add %947, %949 overflow : i64 + %951 = llvm.mlir.constant(128 : index) : i64 + %952 = llvm.mul %929, %951 overflow : i64 + %953 = llvm.add %950, %952 overflow : i64 + %954 = llvm.add %953, %934 overflow : i64 + %955 = llvm.getelementptr inbounds|nuw %188[%954] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + llvm.store %945, %955 : f32, !llvm.ptr + %956 = llvm.mlir.constant(163840 : index) : i64 + %957 = llvm.mul %52, %956 overflow : i64 + %958 = llvm.mlir.constant(5120 : index) : i64 + %959 = llvm.mul %924, %958 overflow : i64 + %960 = llvm.add %957, %959 overflow : i64 + %961 = llvm.mlir.constant(128 : index) : i64 + %962 = llvm.mul %929, %961 overflow : i64 + %963 = llvm.add %960, %962 overflow : i64 + %964 = llvm.add %963, %934 overflow : i64 + %965 = llvm.getelementptr inbounds|nuw %188[%964] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %966 = llvm.load %965 : !llvm.ptr -> f32 + %967 = llvm.mlir.constant(5120 : index) : i64 + %968 = llvm.mul %52, %967 overflow : i64 + %969 = llvm.mlir.constant(5120 : index) : i64 + %970 = llvm.mul %52, %969 overflow : i64 + %971 = llvm.add %968, %970 overflow : i64 + %972 = llvm.mlir.constant(128 : index) : i64 + %973 = llvm.mul %929, %972 overflow : i64 + %974 = 
llvm.add %971, %973 overflow : i64 + %975 = llvm.add %974, %934 overflow : i64 + %976 = llvm.getelementptr inbounds|nuw %480[%975] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %977 = llvm.load %976 : !llvm.ptr -> f32 + %978 = llvm.fmul %966, %977 : f32 + %979 = llvm.mlir.constant(0 : index) : i64 + %980 = llvm.mlir.constant(0 : index) : i64 + %981 = llvm.mlir.constant(0 : index) : i64 + %982 = llvm.mlir.constant(0 : index) : i64 + %983 = llvm.add %979, %980 overflow : i64 + %984 = llvm.add %983, %981 overflow : i64 + %985 = llvm.add %984, %982 overflow : i64 + %986 = llvm.getelementptr inbounds|nuw %99[%985] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + llvm.store %978, %986 : f32, !llvm.ptr + %987 = llvm.mlir.constant(163840 : index) : i64 + %988 = llvm.mul %52, %987 overflow : i64 + %989 = llvm.mlir.constant(5120 : index) : i64 + %990 = llvm.mul %924, %989 overflow : i64 + %991 = llvm.add %988, %990 overflow : i64 + %992 = llvm.mlir.constant(128 : index) : i64 + %993 = llvm.mul %929, %992 overflow : i64 + %994 = llvm.add %991, %993 overflow : i64 + %995 = llvm.add %994, %934 overflow : i64 + %996 = llvm.getelementptr inbounds|nuw %788[%995] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %997 = llvm.load %996 : !llvm.ptr -> f32 + %998 = llvm.mlir.constant(5120 : index) : i64 + %999 = llvm.mul %52, %998 overflow : i64 + %1000 = llvm.mlir.constant(5120 : index) : i64 + %1001 = llvm.mul %52, %1000 overflow : i64 + %1002 = llvm.add %999, %1001 overflow : i64 + %1003 = llvm.mlir.constant(128 : index) : i64 + %1004 = llvm.mul %929, %1003 overflow : i64 + %1005 = llvm.add %1002, %1004 overflow : i64 + %1006 = llvm.add %1005, %934 overflow : i64 + %1007 = llvm.getelementptr inbounds|nuw %571[%1006] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %1008 = llvm.load %1007 : !llvm.ptr -> f32 + %1009 = llvm.fmul %997, %1008 : f32 + %1010 = llvm.mlir.constant(0 : index) : i64 + %1011 = llvm.mlir.constant(0 : index) : i64 + %1012 = llvm.mlir.constant(0 : index) : i64 + %1013 = llvm.mlir.constant(0 : index) : i64 + %1014 = llvm.add %1010, %1011 overflow : i64 + %1015 = llvm.add %1014, %1012 overflow : i64 + %1016 = llvm.add %1015, %1013 overflow : i64 + %1017 = llvm.getelementptr inbounds|nuw %121[%1016] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + llvm.store %1009, %1017 : f32, !llvm.ptr + %1018 = llvm.mlir.constant(0 : index) : i64 + %1019 = llvm.mlir.constant(0 : index) : i64 + %1020 = llvm.mlir.constant(0 : index) : i64 + %1021 = llvm.add %919, %1018 overflow : i64 + %1022 = llvm.add %1021, %1019 overflow : i64 + %1023 = llvm.add %1022, %1020 overflow : i64 + %1024 = llvm.getelementptr inbounds|nuw %99[%1023] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %1025 = llvm.load %1024 : !llvm.ptr -> f32 + %1026 = llvm.mlir.constant(0 : index) : i64 + %1027 = llvm.mlir.constant(0 : index) : i64 + %1028 = llvm.mlir.constant(0 : index) : i64 + %1029 = llvm.add %919, %1026 overflow : i64 + %1030 = llvm.add %1029, %1027 overflow : i64 + %1031 = llvm.add %1030, %1028 overflow : i64 + %1032 = llvm.getelementptr inbounds|nuw %121[%1031] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %1033 = llvm.load %1032 : !llvm.ptr -> f32 + %1034 = llvm.fadd %1025, %1033 : f32 + %1035 = llvm.mlir.constant(163840 : index) : i64 + %1036 = llvm.mul %919, %1035 overflow : i64 + %1037 = llvm.mlir.constant(5120 : index) : i64 + %1038 = llvm.mul %924, %1037 overflow : i64 + %1039 = llvm.add %1036, %1038 overflow : i64 + %1040 = llvm.mlir.constant(128 : index) : i64 + %1041 = llvm.mul %929, %1040 overflow : i64 + %1042 = llvm.add %1039, %1041 overflow : i64 + %1043 = llvm.add %1042, 
%934 overflow : i64 + %1044 = llvm.getelementptr inbounds|nuw %902[%1043] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + llvm.store %1034, %1044 : f32, !llvm.ptr + %1045 = llvm.add %934, %933 : i64 + llvm.br ^bb49(%1045 : i64) + ^bb51: // pred: ^bb49 + %1046 = llvm.add %929, %928 : i64 + llvm.br ^bb47(%1046 : i64) + ^bb52: // pred: ^bb47 + %1047 = llvm.add %924, %923 : i64 + llvm.br ^bb45(%1047 : i64) + ^bb53: // pred: ^bb45 + %1048 = llvm.add %919, %918 : i64 + llvm.br ^bb43(%1048 : i64) + ^bb54: // pred: ^bb43 + %1049 = llvm.mlir.constant(1 : index) : i64 + %1050 = llvm.mlir.constant(32 : index) : i64 + %1051 = llvm.mlir.constant(40 : index) : i64 + %1052 = llvm.mlir.constant(128 : index) : i64 + %1053 = llvm.mlir.constant(1 : index) : i64 + %1054 = llvm.mlir.constant(5120 : index) : i64 + %1055 = llvm.mlir.constant(163840 : index) : i64 + %1056 = llvm.mlir.constant(163840 : index) : i64 + %1057 = llvm.mlir.zero : !llvm.ptr + %1058 = llvm.getelementptr %1057[163840] : (!llvm.ptr) -> !llvm.ptr, f32 + %1059 = llvm.ptrtoint %1058 : !llvm.ptr to i64 + %1060 = llvm.mlir.constant(64 : index) : i64 + %1061 = llvm.add %1059, %1060 : i64 + %1062 = llvm.call @malloc(%1061) : (i64) -> !llvm.ptr + %1063 = llvm.ptrtoint %1062 : !llvm.ptr to i64 + %1064 = llvm.mlir.constant(1 : index) : i64 + %1065 = llvm.sub %1060, %1064 : i64 + %1066 = llvm.add %1063, %1065 : i64 + %1067 = llvm.urem %1066, %1060 : i64 + %1068 = llvm.sub %1066, %1067 : i64 + %1069 = llvm.inttoptr %1068 : i64 to !llvm.ptr + %1070 = llvm.mlir.poison : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %1071 = llvm.insertvalue %1062, %1070[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %1072 = llvm.insertvalue %1069, %1071[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %1073 = llvm.mlir.constant(0 : index) : i64 + %1074 = llvm.insertvalue %1073, %1072[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %1075 = llvm.insertvalue %1049, %1074[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %1076 = llvm.insertvalue %1050, %1075[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %1077 = llvm.insertvalue %1051, %1076[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %1078 = llvm.insertvalue %1052, %1077[3, 3] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %1079 = llvm.insertvalue %1055, %1078[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %1080 = llvm.insertvalue %1054, %1079[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %1081 = llvm.insertvalue %1052, %1080[4, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %1082 = llvm.insertvalue %1053, %1081[4, 3] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %1083 = llvm.mlir.constant(0 : index) : i64 + %1084 = llvm.mlir.constant(1 : index) : i64 + %1085 = llvm.mlir.constant(1 : index) : i64 + llvm.br ^bb55(%1083 : i64) + ^bb55(%1086: i64): // 2 preds: ^bb54, ^bb65 + %1087 = llvm.icmp "slt" %1086, %1084 : i64 + llvm.cond_br %1087, ^bb56, ^bb66 + ^bb56: // pred: ^bb55 + %1088 = llvm.mlir.constant(0 : index) : i64 + %1089 = llvm.mlir.constant(32 : index) : i64 + %1090 = llvm.mlir.constant(1 : index) : i64 + llvm.br ^bb57(%1088 : i64) + ^bb57(%1091: i64): // 2 preds: ^bb56, ^bb64 + %1092 = llvm.icmp "slt" %1091, %1089 : i64 + llvm.cond_br %1092, ^bb58, ^bb65 + ^bb58: // pred: ^bb57 + %1093 = llvm.mlir.constant(0 : index) : i64 + %1094 = llvm.mlir.constant(40 : index) 
: i64 + %1095 = llvm.mlir.constant(1 : index) : i64 + llvm.br ^bb59(%1093 : i64) + ^bb59(%1096: i64): // 2 preds: ^bb58, ^bb63 + %1097 = llvm.icmp "slt" %1096, %1094 : i64 + llvm.cond_br %1097, ^bb60, ^bb64 + ^bb60: // pred: ^bb59 + %1098 = llvm.mlir.constant(0 : index) : i64 + %1099 = llvm.mlir.constant(128 : index) : i64 + %1100 = llvm.mlir.constant(1 : index) : i64 + llvm.br ^bb61(%1098 : i64) + ^bb61(%1101: i64): // 2 preds: ^bb60, ^bb62 + %1102 = llvm.icmp "slt" %1101, %1099 : i64 + llvm.cond_br %1102, ^bb62, ^bb63 + ^bb62: // pred: ^bb61 + %1103 = llvm.getelementptr %203[%209] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %1104 = llvm.mul %213, %52 overflow : i64 + %1105 = llvm.mul %1096, %214 overflow : i64 + %1106 = llvm.add %1104, %1105 overflow : i64 + %1107 = llvm.mul %1091, %217 overflow : i64 + %1108 = llvm.add %1106, %1107 overflow : i64 + %1109 = llvm.mul %1101, %215 overflow : i64 + %1110 = llvm.add %1108, %1109 overflow : i64 + %1111 = llvm.getelementptr inbounds|nuw %1103[%1110] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %1112 = llvm.load %1111 : !llvm.ptr -> f32 + %1113 = llvm.mlir.constant(163840 : index) : i64 + %1114 = llvm.mul %52, %1113 overflow : i64 + %1115 = llvm.mlir.constant(5120 : index) : i64 + %1116 = llvm.mul %1091, %1115 overflow : i64 + %1117 = llvm.add %1114, %1116 overflow : i64 + %1118 = llvm.mlir.constant(128 : index) : i64 + %1119 = llvm.mul %1096, %1118 overflow : i64 + %1120 = llvm.add %1117, %1119 overflow : i64 + %1121 = llvm.add %1120, %1101 overflow : i64 + %1122 = llvm.getelementptr inbounds|nuw %254[%1121] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + llvm.store %1112, %1122 : f32, !llvm.ptr + %1123 = llvm.mlir.constant(163840 : index) : i64 + %1124 = llvm.mul %1086, %1123 overflow : i64 + %1125 = llvm.mlir.constant(5120 : index) : i64 + %1126 = llvm.mul %1091, %1125 overflow : i64 + %1127 = llvm.add %1124, %1126 overflow : i64 + %1128 = llvm.mlir.constant(128 : index) : i64 + %1129 = llvm.mul %1096, %1128 overflow : i64 + %1130 = llvm.add %1127, %1129 overflow : i64 + %1131 = llvm.add %1130, %1101 overflow : i64 + %1132 = llvm.getelementptr inbounds|nuw %254[%1131] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %1133 = llvm.load %1132 : !llvm.ptr -> f32 + %1134 = llvm.mlir.constant(5120 : index) : i64 + %1135 = llvm.mul %1086, %1134 overflow : i64 + %1136 = llvm.mlir.constant(5120 : index) : i64 + %1137 = llvm.mul %52, %1136 overflow : i64 + %1138 = llvm.add %1135, %1137 overflow : i64 + %1139 = llvm.mlir.constant(128 : index) : i64 + %1140 = llvm.mul %1096, %1139 overflow : i64 + %1141 = llvm.add %1138, %1140 overflow : i64 + %1142 = llvm.add %1141, %1101 overflow : i64 + %1143 = llvm.getelementptr inbounds|nuw %480[%1142] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %1144 = llvm.load %1143 : !llvm.ptr -> f32 + %1145 = llvm.fmul %1133, %1144 : f32 + %1146 = llvm.mlir.constant(163840 : index) : i64 + %1147 = llvm.mul %1086, %1146 overflow : i64 + %1148 = llvm.mlir.constant(5120 : index) : i64 + %1149 = llvm.mul %1091, %1148 overflow : i64 + %1150 = llvm.add %1147, %1149 overflow : i64 + %1151 = llvm.mlir.constant(128 : index) : i64 + %1152 = llvm.mul %1096, %1151 overflow : i64 + %1153 = llvm.add %1150, %1152 overflow : i64 + %1154 = llvm.add %1153, %1101 overflow : i64 + %1155 = llvm.getelementptr inbounds|nuw %1069[%1154] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + llvm.store %1145, %1155 : f32, !llvm.ptr + %1156 = llvm.add %1101, %1100 : i64 + llvm.br ^bb61(%1156 : i64) + ^bb63: // pred: ^bb61 + %1157 = llvm.add %1096, %1095 : i64 + llvm.br ^bb59(%1157 : i64) + ^bb64: // 
pred: ^bb59 + %1158 = llvm.add %1091, %1090 : i64 + llvm.br ^bb57(%1158 : i64) + ^bb65: // pred: ^bb57 + %1159 = llvm.add %1086, %1085 : i64 + llvm.br ^bb55(%1159 : i64) + ^bb66: // pred: ^bb55 + %1160 = llvm.mlir.poison : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %1161 = llvm.insertvalue %247, %1160[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %1162 = llvm.insertvalue %254, %1161[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %1163 = llvm.mlir.constant(0 : index) : i64 + %1164 = llvm.insertvalue %1163, %1162[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %1165 = llvm.mlir.constant(1 : index) : i64 + %1166 = llvm.insertvalue %1165, %1164[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %1167 = llvm.mlir.constant(163840 : index) : i64 + %1168 = llvm.insertvalue %1167, %1166[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %1169 = llvm.mlir.constant(32 : index) : i64 + %1170 = llvm.insertvalue %1169, %1168[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %1171 = llvm.mlir.constant(5120 : index) : i64 + %1172 = llvm.insertvalue %1171, %1170[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %1173 = llvm.mlir.constant(40 : index) : i64 + %1174 = llvm.insertvalue %1173, %1172[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %1175 = llvm.mlir.constant(128 : index) : i64 + %1176 = llvm.insertvalue %1175, %1174[4, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %1177 = llvm.mlir.constant(64 : index) : i64 + %1178 = llvm.insertvalue %1177, %1176[3, 3] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %1179 = llvm.mlir.constant(1 : index) : i64 + %1180 = llvm.insertvalue %1179, %1178[4, 3] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %1181 = llvm.mlir.poison : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %1182 = llvm.insertvalue %247, %1181[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %1183 = llvm.insertvalue %254, %1182[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %1184 = llvm.mlir.constant(64 : index) : i64 + %1185 = llvm.insertvalue %1184, %1183[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %1186 = llvm.mlir.constant(1 : index) : i64 + %1187 = llvm.insertvalue %1186, %1185[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %1188 = llvm.mlir.constant(163840 : index) : i64 + %1189 = llvm.insertvalue %1188, %1187[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %1190 = llvm.mlir.constant(32 : index) : i64 + %1191 = llvm.insertvalue %1190, %1189[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %1192 = llvm.mlir.constant(5120 : index) : i64 + %1193 = llvm.insertvalue %1192, %1191[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %1194 = llvm.mlir.constant(40 : index) : i64 + %1195 = llvm.insertvalue %1194, %1193[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %1196 = llvm.mlir.constant(128 : index) : i64 + %1197 = llvm.insertvalue %1196, %1195[4, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %1198 = llvm.mlir.constant(64 : index) : i64 + %1199 = llvm.insertvalue %1198, %1197[3, 3] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %1200 = llvm.mlir.constant(1 : index) : i64 + %1201 = llvm.insertvalue %1200, 
%1199[4, 3] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %1202 = llvm.mlir.constant(1 : index) : i64 + %1203 = llvm.mlir.constant(32 : index) : i64 + %1204 = llvm.mlir.constant(40 : index) : i64 + %1205 = llvm.mlir.constant(64 : index) : i64 + %1206 = llvm.mlir.constant(1 : index) : i64 + %1207 = llvm.mlir.constant(2560 : index) : i64 + %1208 = llvm.mlir.constant(81920 : index) : i64 + %1209 = llvm.mlir.constant(81920 : index) : i64 + %1210 = llvm.mlir.zero : !llvm.ptr + %1211 = llvm.getelementptr %1210[81920] : (!llvm.ptr) -> !llvm.ptr, f32 + %1212 = llvm.ptrtoint %1211 : !llvm.ptr to i64 + %1213 = llvm.mlir.constant(64 : index) : i64 + %1214 = llvm.add %1212, %1213 : i64 + %1215 = llvm.call @malloc(%1214) : (i64) -> !llvm.ptr + %1216 = llvm.ptrtoint %1215 : !llvm.ptr to i64 + %1217 = llvm.mlir.constant(1 : index) : i64 + %1218 = llvm.sub %1213, %1217 : i64 + %1219 = llvm.add %1216, %1218 : i64 + %1220 = llvm.urem %1219, %1213 : i64 + %1221 = llvm.sub %1219, %1220 : i64 + %1222 = llvm.inttoptr %1221 : i64 to !llvm.ptr + %1223 = llvm.mlir.poison : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %1224 = llvm.insertvalue %1215, %1223[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %1225 = llvm.insertvalue %1222, %1224[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %1226 = llvm.mlir.constant(0 : index) : i64 + %1227 = llvm.insertvalue %1226, %1225[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %1228 = llvm.insertvalue %1202, %1227[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %1229 = llvm.insertvalue %1203, %1228[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %1230 = llvm.insertvalue %1204, %1229[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %1231 = llvm.insertvalue %1205, %1230[3, 3] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %1232 = llvm.insertvalue %1208, %1231[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %1233 = llvm.insertvalue %1207, %1232[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %1234 = llvm.insertvalue %1205, %1233[4, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %1235 = llvm.insertvalue %1206, %1234[4, 3] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %1236 = llvm.mlir.constant(0 : index) : i64 + %1237 = llvm.mlir.constant(1 : index) : i64 + %1238 = llvm.mlir.constant(1 : index) : i64 + llvm.br ^bb67(%1236 : i64) + ^bb67(%1239: i64): // 2 preds: ^bb66, ^bb77 + %1240 = llvm.icmp "slt" %1239, %1237 : i64 + llvm.cond_br %1240, ^bb68, ^bb78 + ^bb68: // pred: ^bb67 + %1241 = llvm.mlir.constant(0 : index) : i64 + %1242 = llvm.mlir.constant(32 : index) : i64 + %1243 = llvm.mlir.constant(1 : index) : i64 + llvm.br ^bb69(%1241 : i64) + ^bb69(%1244: i64): // 2 preds: ^bb68, ^bb76 + %1245 = llvm.icmp "slt" %1244, %1242 : i64 + llvm.cond_br %1245, ^bb70, ^bb77 + ^bb70: // pred: ^bb69 + %1246 = llvm.mlir.constant(0 : index) : i64 + %1247 = llvm.mlir.constant(40 : index) : i64 + %1248 = llvm.mlir.constant(1 : index) : i64 + llvm.br ^bb71(%1246 : i64) + ^bb71(%1249: i64): // 2 preds: ^bb70, ^bb75 + %1250 = llvm.icmp "slt" %1249, %1247 : i64 + llvm.cond_br %1250, ^bb72, ^bb76 + ^bb72: // pred: ^bb71 + %1251 = llvm.mlir.constant(0 : index) : i64 + %1252 = llvm.mlir.constant(64 : index) : i64 + %1253 = llvm.mlir.constant(1 : index) : i64 + llvm.br ^bb73(%1251 : i64) + ^bb73(%1254: i64): // 2 preds: ^bb72, 
^bb74 + %1255 = llvm.icmp "slt" %1254, %1252 : i64 + llvm.cond_br %1255, ^bb74, ^bb75 + ^bb74: // pred: ^bb73 + %1256 = llvm.mlir.constant(64 : index) : i64 + %1257 = llvm.getelementptr %254[64] : (!llvm.ptr) -> !llvm.ptr, f32 + %1258 = llvm.mlir.constant(163840 : index) : i64 + %1259 = llvm.mul %1239, %1258 overflow : i64 + %1260 = llvm.mlir.constant(5120 : index) : i64 + %1261 = llvm.mul %1244, %1260 overflow : i64 + %1262 = llvm.add %1259, %1261 overflow : i64 + %1263 = llvm.mlir.constant(128 : index) : i64 + %1264 = llvm.mul %1249, %1263 overflow : i64 + %1265 = llvm.add %1262, %1264 overflow : i64 + %1266 = llvm.add %1265, %1254 overflow : i64 + %1267 = llvm.getelementptr inbounds|nuw %1257[%1266] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + %1268 = llvm.load %1267 : !llvm.ptr -> f32 + %1269 = llvm.fneg %1268 : f32 + %1270 = llvm.mlir.constant(81920 : index) : i64 + %1271 = llvm.mul %1239, %1270 overflow : i64 + %1272 = llvm.mlir.constant(2560 : index) : i64 + %1273 = llvm.mul %1244, %1272 overflow : i64 + %1274 = llvm.add %1271, %1273 overflow : i64 + %1275 = llvm.mlir.constant(64 : index) : i64 + %1276 = llvm.mul %1249, %1275 overflow : i64 + %1277 = llvm.add %1274, %1276 overflow : i64 + %1278 = llvm.add %1277, %1254 overflow : i64 + %1279 = llvm.getelementptr inbounds|nuw %1222[%1278] : (!llvm.ptr, i64) -> !llvm.ptr, f32 + llvm.store %1269, %1279 : f32, !llvm.ptr + %1280 = llvm.add %1254, %1253 : i64 + llvm.br ^bb73(%1280 : i64) + ^bb75: // pred: ^bb73 + %1281 = llvm.add %1249, %1248 : i64 + llvm.br ^bb71(%1281 : i64) + ^bb76: // pred: ^bb71 + %1282 = llvm.add %1244, %1243 : i64 + llvm.br ^bb69(%1282 : i64) + ^bb77: // pred: ^bb69 + %1283 = llvm.add %1239, %1238 : i64 + llvm.br ^bb67(%1283 : i64) + ^bb78: // pred: ^bb67 + %1284 = llvm.mlir.constant(1 : index) : i64 + %1285 = llvm.mlir.constant(32 : index) : i64 + %1286 = llvm.mlir.constant(40 : index) : i64 + %1287 = llvm.mlir.constant(128 : index) : i64 + %1288 = llvm.mlir.constant(1 : index) : i64 + %1289 = llvm.mlir.constant(5120 : index) : i64 + %1290 = llvm.mlir.constant(163840 : index) : i64 + %1291 = llvm.mlir.constant(163840 : index) : i64 + %1292 = llvm.mlir.zero : !llvm.ptr + %1293 = llvm.getelementptr %1292[163840] : (!llvm.ptr) -> !llvm.ptr, f32 + %1294 = llvm.ptrtoint %1293 : !llvm.ptr to i64 + %1295 = llvm.mlir.constant(64 : index) : i64 + %1296 = llvm.add %1294, %1295 : i64 + %1297 = llvm.call @malloc(%1296) : (i64) -> !llvm.ptr + %1298 = llvm.ptrtoint %1297 : !llvm.ptr to i64 + %1299 = llvm.mlir.constant(1 : index) : i64 + %1300 = llvm.sub %1295, %1299 : i64 + %1301 = llvm.add %1298, %1300 : i64 + %1302 = llvm.urem %1301, %1295 : i64 + %1303 = llvm.sub %1301, %1302 : i64 + %1304 = llvm.inttoptr %1303 : i64 to !llvm.ptr + %1305 = llvm.mlir.poison : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %1306 = llvm.insertvalue %1297, %1305[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %1307 = llvm.insertvalue %1304, %1306[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %1308 = llvm.mlir.constant(0 : index) : i64 + %1309 = llvm.insertvalue %1308, %1307[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %1310 = llvm.insertvalue %1284, %1309[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %1311 = llvm.insertvalue %1285, %1310[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %1312 = llvm.insertvalue %1286, %1311[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %1313 = llvm.insertvalue 
%1287, %1312[3, 3] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %1314 = llvm.insertvalue %1290, %1313[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %1315 = llvm.insertvalue %1289, %1314[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %1316 = llvm.insertvalue %1287, %1315[4, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %1317 = llvm.insertvalue %1288, %1316[4, 3] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %1318 = llvm.mlir.poison : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %1319 = llvm.insertvalue %1297, %1318[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %1320 = llvm.insertvalue %1304, %1319[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %1321 = llvm.mlir.constant(0 : index) : i64 + %1322 = llvm.insertvalue %1321, %1320[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %1323 = llvm.mlir.constant(1 : index) : i64 + %1324 = llvm.insertvalue %1323, %1322[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %1325 = llvm.mlir.constant(163840 : index) : i64 + %1326 = llvm.insertvalue %1325, %1324[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %1327 = llvm.mlir.constant(32 : index) : i64 + %1328 = llvm.insertvalue %1327, %1326[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %1329 = llvm.mlir.constant(5120 : index) : i64 + %1330 = llvm.insertvalue %1329, %1328[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %1331 = llvm.mlir.constant(40 : index) : i64 + %1332 = llvm.insertvalue %1331, %1330[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %1333 = llvm.mlir.constant(128 : index) : i64 + %1334 = llvm.insertvalue %1333, %1332[4, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %1335 = llvm.mlir.constant(64 : index) : i64 + %1336 = llvm.insertvalue %1335, %1334[3, 3] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %1337 = llvm.mlir.constant(1 : index) : i64 + %1338 = llvm.insertvalue %1337, %1336[4, 3] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %1339 = llvm.intr.stacksave : !llvm.ptr + %1340 = llvm.mlir.constant(4 : i64) : i64 + %1341 = llvm.mlir.constant(1 : index) : i64 + %1342 = llvm.alloca %1341 x !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> : (i64) -> !llvm.ptr + llvm.store %1235, %1342 : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>, !llvm.ptr + %1343 = llvm.mlir.poison : !llvm.struct<(i64, ptr)> + %1344 = llvm.insertvalue %1340, %1343[0] : !llvm.struct<(i64, ptr)> + %1345 = llvm.insertvalue %1342, %1344[1] : !llvm.struct<(i64, ptr)> + %1346 = llvm.mlir.constant(4 : i64) : i64 + %1347 = llvm.mlir.constant(1 : index) : i64 + %1348 = llvm.alloca %1347 x !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> : (i64) -> !llvm.ptr + llvm.store %1338, %1348 : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>, !llvm.ptr + %1349 = llvm.mlir.poison : !llvm.struct<(i64, ptr)> + %1350 = llvm.insertvalue %1346, %1349[0] : !llvm.struct<(i64, ptr)> + %1351 = llvm.insertvalue %1348, %1350[1] : !llvm.struct<(i64, ptr)> + %1352 = llvm.mlir.constant(1 : index) : i64 + %1353 = llvm.alloca %1352 x !llvm.struct<(i64, ptr)> : (i64) -> !llvm.ptr + llvm.store %1345, %1353 : !llvm.struct<(i64, ptr)>, !llvm.ptr + %1354 = llvm.alloca %1352 x !llvm.struct<(i64, ptr)> : (i64) -> !llvm.ptr + llvm.store 
%1351, %1354 : !llvm.struct<(i64, ptr)>, !llvm.ptr + %1355 = llvm.mlir.zero : !llvm.ptr + %1356 = llvm.getelementptr %1355[1] : (!llvm.ptr) -> !llvm.ptr, f32 + %1357 = llvm.ptrtoint %1356 : !llvm.ptr to i64 + llvm.call @memrefCopy(%1357, %1353, %1354) : (i64, !llvm.ptr, !llvm.ptr) -> () + llvm.intr.stackrestore %1339 : !llvm.ptr + %1358 = llvm.mlir.poison : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %1359 = llvm.insertvalue %1297, %1358[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %1360 = llvm.insertvalue %1304, %1359[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %1361 = llvm.mlir.constant(64 : index) : i64 + %1362 = llvm.insertvalue %1361, %1360[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %1363 = llvm.mlir.constant(1 : index) : i64 + %1364 = llvm.insertvalue %1363, %1362[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %1365 = llvm.mlir.constant(163840 : index) : i64 + %1366 = llvm.insertvalue %1365, %1364[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %1367 = llvm.mlir.constant(32 : index) : i64 + %1368 = llvm.insertvalue %1367, %1366[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %1369 = llvm.mlir.constant(5120 : index) : i64 + %1370 = llvm.insertvalue %1369, %1368[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %1371 = llvm.mlir.constant(40 : index) : i64 + %1372 = llvm.insertvalue %1371, %1370[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %1373 = llvm.mlir.constant(128 : index) : i64 + %1374 = llvm.insertvalue %1373, %1372[4, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %1375 = llvm.mlir.constant(64 : index) : i64 + %1376 = llvm.insertvalue %1375, %1374[3, 3] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %1377 = llvm.mlir.constant(1 : index) : i64 + %1378 = llvm.insertvalue %1377, %1376[4, 3] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %1379 = llvm.intr.stacksave : !llvm.ptr + %1380 = llvm.mlir.constant(4 : i64) : i64 + %1381 = llvm.mlir.constant(1 : index) : i64 + %1382 = llvm.alloca %1381 x !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> : (i64) -> !llvm.ptr + llvm.store %1180, %1382 : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>, !llvm.ptr + %1383 = llvm.mlir.poison : !llvm.struct<(i64, ptr)> + %1384 = llvm.insertvalue %1380, %1383[0] : !llvm.struct<(i64, ptr)> + %1385 = llvm.insertvalue %1382, %1384[1] : !llvm.struct<(i64, ptr)> + %1386 = llvm.mlir.constant(4 : i64) : i64 + %1387 = llvm.mlir.constant(1 : index) : i64 + %1388 = llvm.alloca %1387 x !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> : (i64) -> !llvm.ptr + llvm.store %1378, %1388 : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>, !llvm.ptr + %1389 = llvm.mlir.poison : !llvm.struct<(i64, ptr)> + %1390 = llvm.insertvalue %1386, %1389[0] : !llvm.struct<(i64, ptr)> + %1391 = llvm.insertvalue %1388, %1390[1] : !llvm.struct<(i64, ptr)> + %1392 = llvm.mlir.constant(1 : index) : i64 + %1393 = llvm.alloca %1392 x !llvm.struct<(i64, ptr)> : (i64) -> !llvm.ptr + llvm.store %1385, %1393 : !llvm.struct<(i64, ptr)>, !llvm.ptr + %1394 = llvm.alloca %1392 x !llvm.struct<(i64, ptr)> : (i64) -> !llvm.ptr + llvm.store %1391, %1394 : !llvm.struct<(i64, ptr)>, !llvm.ptr + %1395 = llvm.mlir.zero : !llvm.ptr + %1396 = llvm.getelementptr %1395[1] : (!llvm.ptr) -> !llvm.ptr, f32 + %1397 = llvm.ptrtoint %1396 : 
!llvm.ptr to i64 + llvm.call @memrefCopy(%1397, %1393, %1394) : (i64, !llvm.ptr, !llvm.ptr) -> () + llvm.intr.stackrestore %1379 : !llvm.ptr + %1398 = llvm.call @rtclock() : () -> f64 + %1399 = llvm.fsub %1398, %135 : f64 + %1400 = llvm.mlir.constant(1 : index) : i64 + %1401 = llvm.alloca %1400 x !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> : (i64) -> !llvm.ptr + llvm.store %1317, %1401 : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>, !llvm.ptr + %1402 = llvm.mlir.constant(4 : index) : i64 + %1403 = llvm.mlir.poison : !llvm.struct<(i64, ptr)> + %1404 = llvm.insertvalue %1402, %1403[0] : !llvm.struct<(i64, ptr)> + %1405 = llvm.insertvalue %1401, %1404[1] : !llvm.struct<(i64, ptr)> + %1406 = llvm.mlir.constant(1 : index) : i64 + %1407 = llvm.alloca %1406 x !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> : (i64) -> !llvm.ptr + llvm.store %915, %1407 : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>, !llvm.ptr + %1408 = llvm.mlir.constant(4 : index) : i64 + %1409 = llvm.mlir.poison : !llvm.struct<(i64, ptr)> + %1410 = llvm.insertvalue %1408, %1409[0] : !llvm.struct<(i64, ptr)> + %1411 = llvm.insertvalue %1407, %1410[1] : !llvm.struct<(i64, ptr)> + %1412 = llvm.mlir.constant(1 : index) : i64 + %1413 = llvm.alloca %1412 x !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> : (i64) -> !llvm.ptr + llvm.store %1082, %1413 : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)>, !llvm.ptr + %1414 = llvm.mlir.constant(4 : index) : i64 + %1415 = llvm.mlir.poison : !llvm.struct<(i64, ptr)> + %1416 = llvm.insertvalue %1414, %1415[0] : !llvm.struct<(i64, ptr)> + %1417 = llvm.insertvalue %1413, %1416[1] : !llvm.struct<(i64, ptr)> + llvm.call @printMemrefF32(%1402, %1401) : (i64, !llvm.ptr) -> () + llvm.call @printMemrefF32(%1408, %1407) : (i64, !llvm.ptr) -> () + llvm.call @printMemrefF32(%1414, %1413) : (i64, !llvm.ptr) -> () + llvm.call @printF64(%1399) : (f64) -> () + llvm.call @printNewline() : () -> () + llvm.return + } + llvm.func @main() { + %0 = llvm.mlir.constant(1 : index) : i64 + %1 = llvm.mlir.constant(40 : index) : i64 + %2 = llvm.mlir.constant(4096 : index) : i64 + %3 = llvm.mlir.constant(1 : index) : i64 + %4 = llvm.mlir.constant(163840 : index) : i64 + %5 = llvm.mlir.constant(163840 : index) : i64 + %6 = llvm.mlir.zero : !llvm.ptr + %7 = llvm.getelementptr %6[163840] : (!llvm.ptr) -> !llvm.ptr, f32 + %8 = llvm.ptrtoint %7 : !llvm.ptr to i64 + %9 = llvm.mlir.addressof @__constant_1x40x4096xf32 : !llvm.ptr + %10 = llvm.getelementptr %9[0, 0, 0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<1 x array<40 x array<4096 x f32>>> + %11 = llvm.mlir.constant(3735928559 : index) : i64 + %12 = llvm.inttoptr %11 : i64 to !llvm.ptr + %13 = llvm.mlir.poison : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)> + %14 = llvm.insertvalue %12, %13[0] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)> + %15 = llvm.insertvalue %10, %14[1] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)> + %16 = llvm.mlir.constant(0 : index) : i64 + %17 = llvm.insertvalue %16, %15[2] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)> + %18 = llvm.insertvalue %0, %17[3, 0] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)> + %19 = llvm.insertvalue %1, %18[3, 1] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)> + %20 = llvm.insertvalue %2, %19[3, 2] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)> + %21 = llvm.insertvalue %4, %20[4, 0] : !llvm.struct<(ptr, ptr, 
i64, array<3 x i64>, array<3 x i64>)> + %22 = llvm.insertvalue %2, %21[4, 1] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)> + %23 = llvm.insertvalue %3, %22[4, 2] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)> + %24 = llvm.mlir.constant(1 : index) : i64 + %25 = llvm.mlir.constant(40 : index) : i64 + %26 = llvm.mlir.constant(4096 : index) : i64 + %27 = llvm.mlir.constant(1 : index) : i64 + %28 = llvm.mlir.constant(163840 : index) : i64 + %29 = llvm.mlir.constant(163840 : index) : i64 + %30 = llvm.mlir.zero : !llvm.ptr + %31 = llvm.getelementptr %30[163840] : (!llvm.ptr) -> !llvm.ptr, f32 + %32 = llvm.ptrtoint %31 : !llvm.ptr to i64 + %33 = llvm.mlir.addressof @__constant_1x40x4096xf32_0 : !llvm.ptr + %34 = llvm.getelementptr %33[0, 0, 0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<1 x array<40 x array<4096 x f32>>> + %35 = llvm.mlir.constant(3735928559 : index) : i64 + %36 = llvm.inttoptr %35 : i64 to !llvm.ptr + %37 = llvm.mlir.poison : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)> + %38 = llvm.insertvalue %36, %37[0] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)> + %39 = llvm.insertvalue %34, %38[1] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)> + %40 = llvm.mlir.constant(0 : index) : i64 + %41 = llvm.insertvalue %40, %39[2] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)> + %42 = llvm.insertvalue %24, %41[3, 0] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)> + %43 = llvm.insertvalue %25, %42[3, 1] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)> + %44 = llvm.insertvalue %26, %43[3, 2] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)> + %45 = llvm.insertvalue %28, %44[4, 0] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)> + %46 = llvm.insertvalue %26, %45[4, 1] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)> + %47 = llvm.insertvalue %27, %46[4, 2] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)> + %48 = llvm.mlir.constant(1 : index) : i64 + %49 = llvm.mlir.constant(40 : index) : i64 + %50 = llvm.mlir.constant(4096 : index) : i64 + %51 = llvm.mlir.constant(1 : index) : i64 + %52 = llvm.mlir.constant(163840 : index) : i64 + %53 = llvm.mlir.constant(163840 : index) : i64 + %54 = llvm.mlir.zero : !llvm.ptr + %55 = llvm.getelementptr %54[163840] : (!llvm.ptr) -> !llvm.ptr, f32 + %56 = llvm.ptrtoint %55 : !llvm.ptr to i64 + %57 = llvm.mlir.addressof @__constant_1x40x4096xf32_1 : !llvm.ptr + %58 = llvm.getelementptr %57[0, 0, 0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<1 x array<40 x array<4096 x f32>>> + %59 = llvm.mlir.constant(3735928559 : index) : i64 + %60 = llvm.inttoptr %59 : i64 to !llvm.ptr + %61 = llvm.mlir.poison : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)> + %62 = llvm.insertvalue %60, %61[0] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)> + %63 = llvm.insertvalue %58, %62[1] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)> + %64 = llvm.mlir.constant(0 : index) : i64 + %65 = llvm.insertvalue %64, %63[2] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)> + %66 = llvm.insertvalue %48, %65[3, 0] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)> + %67 = llvm.insertvalue %49, %66[3, 1] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)> + %68 = llvm.insertvalue %50, %67[3, 2] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)> + %69 = llvm.insertvalue %52, %68[4, 0] : !llvm.struct<(ptr, ptr, i64, 
array<3 x i64>, array<3 x i64>)> + %70 = llvm.insertvalue %50, %69[4, 1] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)> + %71 = llvm.insertvalue %51, %70[4, 2] : !llvm.struct<(ptr, ptr, i64, array<3 x i64>, array<3 x i64>)> + %72 = llvm.mlir.constant(1 : index) : i64 + %73 = llvm.mlir.constant(1 : index) : i64 + %74 = llvm.mlir.constant(2048 : index) : i64 + %75 = llvm.mlir.constant(128 : index) : i64 + %76 = llvm.mlir.constant(1 : index) : i64 + %77 = llvm.mlir.constant(262144 : index) : i64 + %78 = llvm.mlir.constant(262144 : index) : i64 + %79 = llvm.mlir.constant(262144 : index) : i64 + %80 = llvm.mlir.zero : !llvm.ptr + %81 = llvm.getelementptr %80[262144] : (!llvm.ptr) -> !llvm.ptr, f32 + %82 = llvm.ptrtoint %81 : !llvm.ptr to i64 + %83 = llvm.mlir.addressof @__constant_1x1x2048x128xf32 : !llvm.ptr + %84 = llvm.getelementptr %83[0, 0, 0, 0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<1 x array<1 x array<2048 x array<128 x f32>>>> + %85 = llvm.mlir.constant(3735928559 : index) : i64 + %86 = llvm.inttoptr %85 : i64 to !llvm.ptr + %87 = llvm.mlir.poison : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %88 = llvm.insertvalue %86, %87[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %89 = llvm.insertvalue %84, %88[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %90 = llvm.mlir.constant(0 : index) : i64 + %91 = llvm.insertvalue %90, %89[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %92 = llvm.insertvalue %72, %91[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %93 = llvm.insertvalue %73, %92[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %94 = llvm.insertvalue %74, %93[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %95 = llvm.insertvalue %75, %94[3, 3] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %96 = llvm.insertvalue %78, %95[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %97 = llvm.insertvalue %77, %96[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %98 = llvm.insertvalue %75, %97[4, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %99 = llvm.insertvalue %76, %98[4, 3] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %100 = llvm.mlir.constant(1 : index) : i64 + %101 = llvm.mlir.constant(1 : index) : i64 + %102 = llvm.mlir.constant(2048 : index) : i64 + %103 = llvm.mlir.constant(128 : index) : i64 + %104 = llvm.mlir.constant(1 : index) : i64 + %105 = llvm.mlir.constant(262144 : index) : i64 + %106 = llvm.mlir.constant(262144 : index) : i64 + %107 = llvm.mlir.constant(262144 : index) : i64 + %108 = llvm.mlir.zero : !llvm.ptr + %109 = llvm.getelementptr %108[262144] : (!llvm.ptr) -> !llvm.ptr, f32 + %110 = llvm.ptrtoint %109 : !llvm.ptr to i64 + %111 = llvm.mlir.addressof @__constant_1x1x2048x128xf32_2 : !llvm.ptr + %112 = llvm.getelementptr %111[0, 0, 0, 0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<1 x array<1 x array<2048 x array<128 x f32>>>> + %113 = llvm.mlir.constant(3735928559 : index) : i64 + %114 = llvm.inttoptr %113 : i64 to !llvm.ptr + %115 = llvm.mlir.poison : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %116 = llvm.insertvalue %114, %115[0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %117 = llvm.insertvalue %112, %116[1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %118 = llvm.mlir.constant(0 : index) : i64 + %119 = llvm.insertvalue %118, 
%117[2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %120 = llvm.insertvalue %100, %119[3, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %121 = llvm.insertvalue %101, %120[3, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %122 = llvm.insertvalue %102, %121[3, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %123 = llvm.insertvalue %103, %122[3, 3] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %124 = llvm.insertvalue %106, %123[4, 0] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %125 = llvm.insertvalue %105, %124[4, 1] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %126 = llvm.insertvalue %103, %125[4, 2] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %127 = llvm.insertvalue %104, %126[4, 3] : !llvm.struct<(ptr, ptr, i64, array<4 x i64>, array<4 x i64>)> + %128 = llvm.mlir.constant(1 : index) : i64 + %129 = llvm.mlir.constant(40 : index) : i64 + %130 = llvm.mlir.constant(1 : index) : i64 + %131 = llvm.mlir.constant(40 : index) : i64 + %132 = llvm.mlir.zero : !llvm.ptr + %133 = llvm.getelementptr %132[40] : (!llvm.ptr) -> !llvm.ptr, i64 + %134 = llvm.ptrtoint %133 : !llvm.ptr to i64 + %135 = llvm.mlir.addressof @__constant_1x40xi64 : !llvm.ptr + %136 = llvm.getelementptr %135[0, 0, 0] : (!llvm.ptr) -> !llvm.ptr, !llvm.array<1 x array<40 x i64>> + %137 = llvm.mlir.constant(3735928559 : index) : i64 + %138 = llvm.inttoptr %137 : i64 to !llvm.ptr + %139 = llvm.mlir.poison : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %140 = llvm.insertvalue %138, %139[0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %141 = llvm.insertvalue %136, %140[1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %142 = llvm.mlir.constant(0 : index) : i64 + %143 = llvm.insertvalue %142, %141[2] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %144 = llvm.insertvalue %128, %143[3, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %145 = llvm.insertvalue %129, %144[3, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %146 = llvm.insertvalue %129, %145[4, 0] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + %147 = llvm.insertvalue %130, %146[4, 1] : !llvm.struct<(ptr, ptr, i64, array<2 x i64>, array<2 x i64>)> + llvm.call @kernel(%12, %10, %16, %0, %1, %2, %4, %2, %3, %36, %34, %40, %24, %25, %26, %28, %26, %27, %60, %58, %64, %48, %49, %50, %52, %50, %51, %86, %84, %90, %72, %73, %74, %75, %78, %77, %75, %76, %114, %112, %118, %100, %101, %102, %103, %106, %105, %103, %104, %138, %136, %142, %128, %129, %129, %130) : (!llvm.ptr, !llvm.ptr, i64, i64, i64, i64, i64, i64, i64, !llvm.ptr, !llvm.ptr, i64, i64, i64, i64, i64, i64, i64, !llvm.ptr, !llvm.ptr, i64, i64, i64, i64, i64, i64, i64, !llvm.ptr, !llvm.ptr, i64, i64, i64, i64, i64, i64, i64, i64, i64, !llvm.ptr, !llvm.ptr, i64, i64, i64, i64, i64, i64, i64, i64, i64, !llvm.ptr, !llvm.ptr, i64, i64, i64, i64, i64) -> () + llvm.return + } + llvm.func @printMemrefF32(i64, !llvm.ptr) attributes {sym_visibility = "private"} +} + diff --git a/rope-b2fe557ff-llvm21rc2.mlir b/rope-b2fe557ff-llvm21rc2.mlir new file mode 100644 index 0000000000..42aa7344a6 --- /dev/null +++ b/rope-b2fe557ff-llvm21rc2.mlir @@ -0,0 +1,184 @@ +// RUN: buddy-opt %s \ +// RUN: -pass-pipeline 
"builtin.module(func.func(tosa-to-linalg-named),func.func(tosa-to-linalg),func.func(tosa-to-tensor),func.func(tosa-to-arith),func.func(lower-affine))" \ +// RUN: | buddy-opt \ +// RUN: -arith-expand \ +// RUN: -eliminate-empty-tensors \ +// RUN: -empty-tensor-to-alloc-tensor \ +// RUN: -one-shot-bufferize="bufferize-function-boundaries" \ +// RUN: -convert-linalg-to-affine-loops \ +// RUN: -affine-loop-fusion \ +// RUN: -affine-parallelize \ +// RUN: -convert-vector-to-scf \ +// RUN: -expand-strided-metadata \ +// RUN: -convert-vector-to-llvm \ +// RUN: -memref-expand \ +// RUN: -arith-expand \ +// RUN: -lower-affine \ +// RUN: -convert-arith-to-llvm \ +// RUN: -finalize-memref-to-llvm \ +// RUN: -convert-scf-to-cf \ +// RUN: -convert-cf-to-llvm \ +// RUN: -convert-openmp-to-llvm \ +// RUN: -convert-arith-to-llvm \ +// RUN: -convert-math-to-llvm \ +// RUN: -convert-math-to-libm \ +// RUN: -convert-func-to-llvm \ +// RUN: -reconcile-unrealized-casts \ +// RUN: | mlir-runner -e main -entry-point-result=void \ +// RUN: -shared-libs=%mlir_runner_utils_dir/libmlir_runner_utils%shlibext \ +// RUN: -shared-libs=%mlir_runner_utils_dir/libmlir_c_runner_utils%shlibext \ +// RUN: | FileCheck %s + +func.func private @rtclock() -> f64 + +#map = affine_map<(d0, d1, d2) -> (d1)> +#map1 = affine_map<(d0, d1, d2) -> (d0, d2)> +#map2 = affine_map<(d0, d1, d2) -> (d0, d1)> +#map3 = affine_map<(d0, d1) -> (d0, d1)> +#map4 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)> +#map5 = affine_map<(d0, d1, d2) -> (d0, d1, d2)> +#map6 = affine_map<(d0, d1, d2) -> (d0, 0, d1, d2)> +#map7 = affine_map<(d0, d1) -> (0, d0, d1)> + +func.func @kernel(%arg0 : tensor<1x40x4096xf32>, %arg1 : tensor<1x40x4096xf32>, %arg2 : tensor<1x40x4096xf32>, %arg3 : tensor<1x1x2048x128xf32>, %arg4 : tensor<1x1x2048x128xf32>, %arg5 : tensor<1x40xi64>) { + %t_start = call @rtclock() : () -> f64 + + %1 = tosa.const_shape {values = dense<[1, 40, 32, 128]> : tensor<4xindex>} : () -> !tosa.shape<4> + %57 = tosa.reshape %arg0, %1 : (tensor<1x40x4096xf32>, !tosa.shape<4>) -> tensor<1x40x32x128xf32> + %59 = tosa.transpose %57 {perms = array} : (tensor<1x40x32x128xf32>) -> tensor<1x32x40x128xf32> + + %4 = tosa.const_shape {values = dense<[1, 40, 32, 128]> : tensor<4xindex>} : () -> !tosa.shape<4> + %60 = tosa.reshape %arg1, %4 : (tensor<1x40x4096xf32>, !tosa.shape<4>) -> tensor<1x40x32x128xf32> + %62 = tosa.transpose %60 {perms = array} : (tensor<1x40x32x128xf32>) -> tensor<1x32x40x128xf32> + + %7 = tosa.const_shape {values = dense<[1, 40, 32, 128]> : tensor<4xindex>} : () -> !tosa.shape<4> + %63 = tosa.reshape %arg2, %7 : (tensor<1x40x4096xf32>, !tosa.shape<4>) -> tensor<1x40x32x128xf32> + %65 = tosa.transpose %63 {perms = array} : (tensor<1x40x32x128xf32>) -> tensor<1x32x40x128xf32> + + %extracted_slice_9 = tensor.extract_slice %arg3[0, 0, 0, 0] [1, 1, 2048, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x2048x128xf32> + %extracted_slice_10 = tensor.extract_slice %extracted_slice_9[0, 0, 0, 0] [1, 1, 2048, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x2048x128xf32> + %extracted_slice_11 = tensor.extract_slice %extracted_slice_10[0, 0, 0, 0] [1, 1, 40, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x40x128xf32> + %extracted_slice_12 = tensor.extract_slice %arg4[0, 0, 0, 0] [1, 1, 2048, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x2048x128xf32> + %extracted_slice_13 = tensor.extract_slice %extracted_slice_12[0, 0, 0, 0] [1, 1, 2048, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to 
+  %extracted_slice_14 = tensor.extract_slice %extracted_slice_13[0, 0, 0, 0] [1, 1, 40, 128] [1, 1, 1, 1] : tensor<1x1x2048x128xf32> to tensor<1x1x40x128xf32>
+  %66 = tensor.empty() : tensor<1x40x128xf32>
+  %67 = linalg.generic {indexing_maps = [#map6, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_11 : tensor<1x1x40x128xf32>) outs(%66 : tensor<1x40x128xf32>) {
+  ^bb0(%in: f32, %out: f32):
+    linalg.yield %in : f32
+  } -> tensor<1x40x128xf32>
+  %68 = tensor.empty() : tensor<40x128xf32>
+  %69 = linalg.generic {indexing_maps = [#map7, #map3], iterator_types = ["parallel", "parallel"]} ins(%67 : tensor<1x40x128xf32>) outs(%68 : tensor<40x128xf32>) {
+  ^bb0(%in: f32, %out: f32):
+    linalg.yield %in : f32
+  } -> tensor<40x128xf32>
+  %70 = tensor.empty() : tensor<1x40x128xf32>
+  %71 = linalg.generic {indexing_maps = [#map6, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_14 : tensor<1x1x40x128xf32>) outs(%70 : tensor<1x40x128xf32>) {
+  ^bb0(%in: f32, %out: f32):
+    linalg.yield %in : f32
+  } -> tensor<1x40x128xf32>
+  %72 = tensor.empty() : tensor<40x128xf32>
+  %73 = linalg.generic {indexing_maps = [#map7, #map3], iterator_types = ["parallel", "parallel"]} ins(%71 : tensor<1x40x128xf32>) outs(%72 : tensor<40x128xf32>) {
+  ^bb0(%in: f32, %out: f32):
+    linalg.yield %in : f32
+  } -> tensor<40x128xf32>
+  // The precompute_theta_pos_frequencies function, which is used to calculate
+  // the special values of RoPE; see: https://hyper.ai/wiki/29220
+  %74 = tensor.empty() : tensor<1x40x128xf32>
+  %75 = linalg.generic {indexing_maps = [#map2, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%arg5 : tensor<1x40xi64>) outs(%74 : tensor<1x40x128xf32>) {
+  ^bb0(%in: i64, %out: f32):
+    %4175 = arith.index_cast %in : i64 to index
+    %4176 = linalg.index 2 : index
+    %extracted = tensor.extract %69[%4175, %4176] : tensor<40x128xf32>
+    linalg.yield %extracted : f32
+  } -> tensor<1x40x128xf32>
+
+  %20 = tosa.const_shape {values = dense<[1, 1, 40, 128]> : tensor<4xindex>} : () -> !tosa.shape<4>
+  %76 = tosa.reshape %75, %20 : (tensor<1x40x128xf32>, !tosa.shape<4>) -> tensor<1x1x40x128xf32>
+  %77 = tensor.empty() : tensor<1x40x128xf32>
+  %78 = linalg.generic {indexing_maps = [#map2, #map5], iterator_types = ["parallel", "parallel", "parallel"]} ins(%arg5 : tensor<1x40xi64>) outs(%77 : tensor<1x40x128xf32>) {
+  ^bb0(%in: i64, %out: f32):
+    %4175 = arith.index_cast %in : i64 to index
+    %4176 = linalg.index 2 : index
+    %extracted = tensor.extract %73[%4175, %4176] : tensor<40x128xf32>
+    linalg.yield %extracted : f32
+  } -> tensor<1x40x128xf32>
+  %24 = tosa.const_shape {values = dense<[1, 1, 40, 128]> : tensor<4xindex>} : () -> !tosa.shape<4>
+  %79 = tosa.reshape %78, %24 : (tensor<1x40x128xf32>, !tosa.shape<4>) -> tensor<1x1x40x128xf32>
+
+  %26 = "tosa.const"() <{values = dense<0> : tensor<1xi8>}> : () -> tensor<1xi8>
+  %80 = tosa.mul %59, %76, %26 : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>, tensor<1xi8>) -> tensor<1x32x40x128xf32>
+  %extracted_slice_15 = tensor.extract_slice %59[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
+  %extracted_slice_16 = tensor.extract_slice %59[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
+  %37 = "tosa.const"() <{values = dense<0.000000e+00> : tensor<1xf32>}> : () -> tensor<1xf32>
+  %38 = "tosa.const"() <{values = dense<0.000000e+00> : tensor<1xf32>}> : () -> tensor<1xf32>
+  %81 = tosa.negate %extracted_slice_16, %37, %38 : (tensor<1x32x40x64xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor<1x32x40x64xf32>
+  %82 = tensor.empty() : tensor<1x32x40x128xf32>
+  %inserted_slice = tensor.insert_slice %81 into %82[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
+  %inserted_slice_17 = tensor.insert_slice %extracted_slice_15 into %inserted_slice[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
+  %32 = "tosa.const"() <{values = dense<0> : tensor<1xi8>}> : () -> tensor<1xi8>
+  %83 = tosa.mul %inserted_slice_17, %79, %32 : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>, tensor<1xi8>) -> tensor<1x32x40x128xf32>
+  %84 = tosa.add %80, %83 : (tensor<1x32x40x128xf32>, tensor<1x32x40x128xf32>) -> tensor<1x32x40x128xf32>
+  %85 = tosa.mul %62, %76, %32 : (tensor<1x32x40x128xf32>, tensor<1x1x40x128xf32>, tensor<1xi8>) -> tensor<1x32x40x128xf32>
+  %extracted_slice_18 = tensor.extract_slice %62[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
+  %extracted_slice_19 = tensor.extract_slice %62[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x128xf32> to tensor<1x32x40x64xf32>
+  %86 = tosa.negate %extracted_slice_19, %37, %38 : (tensor<1x32x40x64xf32>, tensor<1xf32>, tensor<1xf32>) -> tensor<1x32x40x64xf32>
+  %87 = tensor.empty() : tensor<1x32x40x128xf32>
+  %inserted_slice_20 = tensor.insert_slice %86 into %87[0, 0, 0, 0] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
+  %inserted_slice_21 = tensor.insert_slice %extracted_slice_18 into %inserted_slice_20[0, 0, 0, 64] [1, 32, 40, 64] [1, 1, 1, 1] : tensor<1x32x40x64xf32> into tensor<1x32x40x128xf32>
+
+  %t_end = call @rtclock() : () -> f64
+  %time = arith.subf %t_end, %t_start : f64
+
+  %tensor_unranked = tensor.cast %inserted_slice_21 : tensor<1x32x40x128xf32> to tensor<*xf32>
+
+  // Every innermost row of the MemRef holds the same pattern,
+  // so only the first row is checked to verify correctness.
+  // CHECK: Unranked Memref base@ = {{.*}} rank = 4 offset = 0 sizes = [1, 32, 40, 128] strides = [163840, 5120, 128, 1] data =
+  // CHECK-NEXT: [
+  // CHECK-SAME: [
+  // CHECK-SAME: [
+  // CHECK-SAME: [-3{{(, [-]?3)*}}],
+
+  %tensor_unranked_1 = tensor.cast %84 : tensor<1x32x40x128xf32> to tensor<*xf32>
+
+  // Every innermost row of the MemRef holds the same pattern,
+  // so only the first row is checked to verify correctness.
+  // CHECK: Unranked Memref base@ = {{.*}} rank = 4 offset = 0 sizes = [1, 32, 40, 128] strides = [163840, 5120, 128, 1] data =
+  // CHECK-NEXT: [
+  // CHECK-SAME: [
+  // CHECK-SAME: [
+  // CHECK-SAME: [-2{{(, -2)*}}{{(, 22)*}}],
+
+  %tensor_unranked_2 = tensor.cast %85 : tensor<1x32x40x128xf32> to tensor<*xf32>
+
+  // Every innermost row of the MemRef holds the same pattern,
+  // so only the first row is checked to verify correctness.
+  // CHECK: Unranked Memref base@ = {{.*}} rank = 4 offset = 0 sizes = [1, 32, 40, 128] strides = [163840, 5120, 128, 1] data =
+  // CHECK-NEXT: [
+  // CHECK-SAME: [
+  // CHECK-SAME: [
+  // CHECK-SAME: [15{{(, 15)*}}],
+
+  // Print results.
+  call @printMemrefF32(%tensor_unranked) : (tensor<*xf32>) -> ()
+  call @printMemrefF32(%tensor_unranked_1) : (tensor<*xf32>) -> ()
+  call @printMemrefF32(%tensor_unranked_2) : (tensor<*xf32>) -> ()
+  // Print timings.
+  vector.print %time : f64
+
+  return
+}
+
+func.func @main() {
+
+  %c2 = arith.constant dense<2.0> : tensor<1x40x4096xf32>
+  %c3 = arith.constant dense<3.0> : tensor<1x40x4096xf32>
+  %c4 = arith.constant dense<4.0> : tensor<1x40x4096xf32>
+  %c5 = arith.constant dense<5.0> : tensor<1x1x2048x128xf32>
+  %c6 = arith.constant dense<6.0> : tensor<1x1x2048x128xf32>
+  %c7 = arith.constant dense<7> : tensor<1x40xi64>
+
+  call @kernel(%c2, %c3, %c4, %c5, %c6, %c7) : (tensor<1x40x4096xf32>, tensor<1x40x4096xf32>, tensor<1x40x4096xf32>, tensor<1x1x2048x128xf32>, tensor<1x1x2048x128xf32>, tensor<1x40xi64>) -> ()
+
+  return
+}
+func.func private @printMemrefF32(%ptr : tensor<*xf32>)
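
For reference, the FileCheck values in the new test follow directly from the splat inputs in @main and the rotate-half RoPE formula the kernel lowers (out = x * cos + rotate_half(x) * sin, where rotate_half negates the second half of the head dimension and swaps the halves). Below is a minimal scalar sketch of that arithmetic, not part of the patch; the variable names are illustrative, and the constants mirror @main (q = 2.0 from %arg0, k = 3.0 from %arg1, cos table = 5.0 from %arg3, sin table = 6.0 from %arg4):

    # Hypothetical scalar model of the rotate-half RoPE math in @kernel.
    # For a splat input x, rotate_half(x) contributes -x in the first half
    # of the head dimension and +x in the second half.
    q, k, cos, sin = 2.0, 3.0, 5.0, 6.0

    q_rope_first = q * cos + (-q) * sin   # -2.0 -> CHECK value for %84, first 64 lanes
    q_rope_second = q * cos + q * sin     # 22.0 -> CHECK value for %84, last 64 lanes
    k_cos = k * cos                       # 15.0 -> CHECK value for %85
    rot_k = (-k, k)                       # (-3.0, 3.0) -> CHECK values for %inserted_slice_21

    print(q_rope_first, q_rope_second, k_cos, rot_k)

Running the sketch prints -2.0 22.0 15.0 (-3.0, 3.0), matching the first rows asserted by the three CHECK blocks above.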