diff --git a/cmake/llvm-version-imex.txt b/cmake/llvm-version-imex.txt
index 33000613b..04e84c095 100644
--- a/cmake/llvm-version-imex.txt
+++ b/cmake/llvm-version-imex.txt
@@ -1 +1 @@
-add6b2f35f2bcf1f59a2ab2d5b3dab124fe0895a
+7842374103b26933d71a8fe354cd4d8715d55b1c
diff --git a/cmake/llvm-version.txt b/cmake/llvm-version.txt
index 33000613b..ccc906743 100644
--- a/cmake/llvm-version.txt
+++ b/cmake/llvm-version.txt
@@ -1 +1 @@
-add6b2f35f2bcf1f59a2ab2d5b3dab124fe0895a
+3ae0f3047b5a0de8ef98c167610f6018f615b7ea
\ No newline at end of file
diff --git a/include/gc/Conversion/Passes.h b/include/gc/Conversion/Passes.h
index 3eb6c09b9..1e36e798f 100644
--- a/include/gc/Conversion/Passes.h
+++ b/include/gc/Conversion/Passes.h
@@ -14,6 +14,9 @@
 
 namespace mlir {
 
+#define GEN_PASS_DECL
+#include "gc/Conversion/Passes.h.inc"
+
 /// Generate the code for registering conversion passes.
 #define GEN_PASS_REGISTRATION
 #include "gc/Conversion/Passes.h.inc"
diff --git a/include/gc/Conversion/Passes.td b/include/gc/Conversion/Passes.td
index 91cc04745..7998c8e70 100644
--- a/include/gc/Conversion/Passes.td
+++ b/include/gc/Conversion/Passes.td
@@ -22,4 +22,20 @@ def ConvertXeVMToLLVMPass : Pass<"convert-xevm-to-llvm"> {
   ];
 }
 
+//===----------------------------------------------------------------------===//
+// XeGPUToXeVM
+//===----------------------------------------------------------------------===//
+
+def ConvertXeGPUToXeVMPass : Pass<"convert-xegpu-to-xevm"> {
+  let summary = "Convert XeGPU to XeVM dialect";
+  let dependentDialects = [
+    "xegpu::XeGPUDialect",
+    "xevm::XeVMDialect",
+    "vector::VectorDialect",
+    "memref::MemRefDialect",
+    "arith::ArithDialect",
+  ];
+}
+
+
 #endif // GC_CONVERSION_PASSES
diff --git a/include/gc/Conversion/XeGPUToXeVM/XeGPUToXeVM.h b/include/gc/Conversion/XeGPUToXeVM/XeGPUToXeVM.h
new file mode 100644
index 000000000..be50bd6d8
--- /dev/null
+++ b/include/gc/Conversion/XeGPUToXeVM/XeGPUToXeVM.h
@@ -0,0 +1,28 @@
+//===-- XeGPUToXeVM.h - XeGPU to XeVM dialect conversion ---------*- C++
+//-*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +#ifndef MLIR_CONVERSION_XEGPUTOXEVM_XEGPUTOXEVMPASS_H_ +#define MLIR_CONVERSION_XEGPUTOXEVM_XEGPUTOXEVMPASS_H_ + +#include + +namespace mlir { +class DialectRegistry; +class LLVMTypeConverter; +class RewritePatternSet; +class Pass; + +#define GEN_PASS_DECL_CONVERTXEGPUTOXEVMPASS +#include "gc/Conversion/Passes.h.inc" + +void populateXeGPUToXeVMConversionPatterns(RewritePatternSet &patterns, + LLVMTypeConverter &typeConverter); + +} // namespace mlir + +#endif // MLIR_CONVERSION_XEGPUTOXEVM_XEGPUTOXEVMPASS_H_ diff --git a/include/gc/Dialect/LLVMIR/XeVMDialect.h b/include/gc/Dialect/LLVMIR/XeVMDialect.h index 80f68248f..b8e9a1c5b 100644 --- a/include/gc/Dialect/LLVMIR/XeVMDialect.h +++ b/include/gc/Dialect/LLVMIR/XeVMDialect.h @@ -24,4 +24,17 @@ #include "gc/Dialect/LLVMIR/XeVMOpsDialect.h.inc" +namespace mlir::xevm { +/// XeVM memory space identifiers following SPIRV storage class convention +/// https://github.com/KhronosGroup/SPIRV-LLVM-Translator/blob/main/docs/SPIRVRepresentationInLLVM.rst#address-spaces +/// +enum class XeVMMemorySpace : uint32_t { + kFunction = 0, // OpenCL workitem address space + kCrossWorkgroup = 1, // OpenCL Global memory + kUniformConstant = 2, // OpenCL Constant memory + kWorkgroup = 3, // OpenCL Local memory + kGeneric = 4 // OpenCL Generic memory +}; + +} // namespace mlir::xevm #endif /* MLIR_DIALECT_LLVMIR_XEVMDIALECT_H_ */ diff --git a/include/gc/Dialect/LLVMIR/XeVMOps.td b/include/gc/Dialect/LLVMIR/XeVMOps.td index 4b4ee6814..8653b8ac5 100644 --- a/include/gc/Dialect/LLVMIR/XeVMOps.td +++ b/include/gc/Dialect/LLVMIR/XeVMOps.td @@ -41,36 +41,44 @@ class XeVM_Op traits = []> : def XeVM_ElemType : AnyTypeOf<[AnyI8, AnyI16, AnyI32, F32, F16, BF16]>; -class XeVM_LoadCacheControl : I32EnumAttr, - I32EnumAttrCase<"UC", 1, !strconcat(cacheMnemonic, "UC")>, // uncached - I32EnumAttrCase<"C", 2, !strconcat(cacheMnemonic, "C")>, // cached - I32EnumAttrCase<"S", 3, !strconcat(cacheMnemonic, "S")>, // streaming - I32EnumAttrCase<"IAR", 4, !strconcat(cacheMnemonic, "IAR")>, // invalidate-after-read + I32EnumAttrCase<"UC", 1, "UC">, // uncached + I32EnumAttrCase<"C", 2, "C">, // cached + I32EnumAttrCase<"S", 3, "S">, // streaming + I32EnumAttrCase<"IAR", 4, "IAR">, // invalidate-after-read ]> { let cppNamespace = "::mlir::xevm"; + let genSpecializedAttr = 0; } -def XeVM_L1LoadCacheControl : XeVM_LoadCacheControl<"L1">; -def XeVM_L3LoadCacheControl : XeVM_LoadCacheControl<"L3">; +def XeVM_LoadCacheControlAttr: + EnumAttr { + let summary = [{ }]; + let assemblyFormat = "$value"; +} -class XeVM_StoreCacheControl : I32EnumAttr, - I32EnumAttrCase<"UC", 1, !strconcat(cacheMnemonic, "UC")>, // uncached - I32EnumAttrCase<"WT", 2, !strconcat(cacheMnemonic, "WT")>, // write-through - I32EnumAttrCase<"S", 3, !strconcat(cacheMnemonic, "S")>, // streaming - I32EnumAttrCase<"WB", 4, !strconcat(cacheMnemonic, "WB")>, // write back + I32EnumAttrCase<"UC", 1, "UC">, // uncached + I32EnumAttrCase<"WT", 2, "WT">, // write-through + I32EnumAttrCase<"S", 3, "S">, // streaming + I32EnumAttrCase<"WB", 4, "WB">, // write back ]> { let cppNamespace = "::mlir::xevm"; + let genSpecializedAttr = 0; } -def XeVM_L1StoreCacheControl : XeVM_StoreCacheControl<"L1">; -def XeVM_L3StoreCacheControl : XeVM_StoreCacheControl<"L3">; +def XeVM_StoreCacheControlAttr: + EnumAttr { + let summary = [{ }]; + let assemblyFormat = "$value"; +} def XeVM_BlockLoad2dOp : 
XeVM_Op<"blockload2d">, - Results<(outs FixedVectorOf<[XeVM_ElemType]>:$res)>, + Results<(outs FixedVectorOfRankAndType<[1,2,3], [XeVM_ElemType]>:$res)>, Arguments<(ins Arg:$ptr, I32:$base_width, @@ -84,8 +92,8 @@ def XeVM_BlockLoad2dOp : XeVM_Op<"blockload2d">, I32Attr:$v_blocks, I1Attr:$transpose, I1Attr:$vnni_transform, - DefaultValuedAttr:$l1_cache_control, - DefaultValuedAttr:$l3_cache_control + DefaultValuedAttr:$l1_cache_control, + DefaultValuedAttr:$l3_cache_control )> { let summary = "2D block load"; @@ -137,9 +145,9 @@ def XeVM_BlockStore2dOp : XeVM_Op<"blockstore2d">, I32Attr:$tile_width, I32Attr:$tile_height, I32Attr:$v_blocks, - FixedVectorOf<[XeVM_ElemType]>:$stored_val, - DefaultValuedAttr:$l1_cache_control, - DefaultValuedAttr:$l3_cache_control + FixedVectorOfRankAndType<[1, 2, 3], [XeVM_ElemType]>:$stored_val, + DefaultValuedAttr:$l1_cache_control, + DefaultValuedAttr:$l3_cache_control )> { let summary = "2D block store"; @@ -174,6 +182,86 @@ def XeVM_BlockStore2dOp : XeVM_Op<"blockstore2d">, let hasVerifier = 1; } +def XeVM_MemoryScope : I32EnumAttr<"MemoryScope", "Memory scope for memory operations", + [ + I32EnumAttrCase<"WORKGROUP", 0, "workgroup">, + I32EnumAttrCase<"CLUSTER", 1, "cluster">, + I32EnumAttrCase<"GPU", 2, "gpu">, + I32EnumAttrCase<"SYSTEM", 3, "system"> + ]>{ + let cppNamespace = "mlir::xevm"; + let genSpecializedAttr = 0; +} + +def XeVM_MemoryScopeAttr: + EnumAttr { + let summary = [{Describes the memory visibility scope: + "workgroup" - All work-items in the same work-group. + "cluster" - All work-items in the same cluster (a group of workgroups sharing SLM). + "gpu" - All work-items in the global NDrange. + "system" - All work-items in the global NDrange and the host program. }]; + let assemblyFormat = "$value"; +} + +def XeVM_AddrSpace : I32EnumAttr<"AddrSpace", "Address spaces", + [ + I32EnumAttrCase<"SHARED", 0, "shared">, + I32EnumAttrCase<"GLOBAL", 1, "global">, + I32EnumAttrCase<"GENERIC", 2, "generic"> + ]>{ + let cppNamespace = "mlir::xevm"; + let genSpecializedAttr = 0; +} + +def XeVM_AddrSpaceAttr: + EnumAttr { + let summary = [{Specifies the address space for memory operations affected by a fence: + "shared" - workgroup (SLM). + "global" - GPU. + "generic" - both "shared" and "global".}]; + let assemblyFormat = "$value"; +} + +def XeVM_MemfenceOp : XeVM_Op<"memfence">, + Arguments<(ins + XeVM_MemoryScopeAttr:$scope, + DefaultValuedAttr :$addrspace + )> { + let summary = "Work-item's memory fence."; + let description = [{ + This operation ensures that all prior memory accesses of this + work-item to `addrspace` are visible to all other work-items in `scope`. + Parameters description: + $scope - specify the memory scope at which all other work-items should observe + memory operations prior to the fence. + $addrspace - specify the address space of work-item's memory accesses + to be affected by the fence. + }]; + let assemblyFormat = [{`addrspace` `=` `` $addrspace `,` `scope` `=` `` $scope attr-dict}]; +} + +def XeVM_PrefetchOp : XeVM_Op<"prefetch">, + Arguments<(ins + Arg:$ptr, + XeVM_AddrSpaceAttr:$addrspace, + DefaultValuedAttr:$l1_cache_control, + DefaultValuedAttr:$l3_cache_control + )> { + let summary = "Prefetch data into a cache subsystem."; + let description = [{ + Work-item issues a prefetch from global memory to L1/L3 cache: + $ptr - memory pointer. + $addrspace - address space of a pointer, must be generic or global. + $cache_control - specify caching options (e.g., L1c, L3uc). 
+ }]; + let assemblyFormat = [{ + operands ` ` `{` `addrspace` `=` $addrspace `,` `l1_cc` `=` $l1_cache_control `,` `l3_cc` `=` $l3_cache_control `}` + attr-dict `:` `(` type(operands) `)` + }]; + + // let hasVerifier = 1; +} + def XeVM_BlockPrefetch2dOp : XeVM_Op<"blockprefetch2d">, Arguments<(ins Arg:$ptr, @@ -186,8 +274,8 @@ def XeVM_BlockPrefetch2dOp : XeVM_Op<"blockprefetch2d">, I32Attr:$tile_width, I32Attr:$tile_height, I32Attr:$v_blocks, - DefaultValuedAttr:$l1_cache_control, - DefaultValuedAttr:$l3_cache_control + DefaultValuedAttr:$l1_cache_control, + DefaultValuedAttr:$l3_cache_control )> { let summary = "2D block prefetch"; @@ -242,8 +330,8 @@ def XeVM_PrecisionTypeAttr : I32EnumAttr<"PrecisionType", let cppNamespace = "::mlir::xevm"; } -def XeVM_DPASOp : XeVM_Op<"dpas">, - Results<(outs FixedVectorOf<[XeVM_MatrixElemType]>:$d)>, +def XeVM_DpasOp : XeVM_Op<"dpas">, + Results<(outs FixedVectorOfRankAndType<[1], [XeVM_MatrixElemType]>:$d)>, Arguments<(ins FixedVectorOfRankAndType<[1], [XeVM_MatrixElemType]>:$c, FixedVectorOfRankAndType<[1], [XeVM_MatrixElemType]>:$a, diff --git a/include/gc/Transforms/Microkernel/BrgemmRuntimeUtils.h b/include/gc/Transforms/Microkernel/BrgemmRuntimeUtils.h index 0c92458ed..9c0ba87db 100644 --- a/include/gc/Transforms/Microkernel/BrgemmRuntimeUtils.h +++ b/include/gc/Transforms/Microkernel/BrgemmRuntimeUtils.h @@ -27,13 +27,13 @@ static inline int64_t getDnnlDataTypeVal(RewriterBase &rewriter, auto context = rewriter.getContext(); auto tattr = dyn_cast_or_null(attr); assert(tattr); - if (tattr == TypeAttr::get(FloatType::getF32(context))) { + if (tattr == TypeAttr::get(Float32Type::get(context))) { return static_cast(dnnl_f32); - } else if (tattr == TypeAttr::get(FloatType::getF64(context))) { + } else if (tattr == TypeAttr::get(Float64Type::get(context))) { return static_cast(dnnl_f64); - } else if (tattr == TypeAttr::get(FloatType::getBF16(context))) { + } else if (tattr == TypeAttr::get(BFloat16Type::get(context))) { return static_cast(dnnl_bf16); - } else if (tattr == TypeAttr::get(FloatType::getF16(context))) { + } else if (tattr == TypeAttr::get(Float16Type::get(context))) { return static_cast(dnnl_f16); } else if (tattr == TypeAttr::get( IntegerType::get(context, 32, IntegerType::Signed))) { diff --git a/include/gc/Transforms/Utils/StructuredOpMatcher.h b/include/gc/Transforms/Utils/StructuredOpMatcher.h index 66d398474..131888b1b 100644 --- a/include/gc/Transforms/Utils/StructuredOpMatcher.h +++ b/include/gc/Transforms/Utils/StructuredOpMatcher.h @@ -163,7 +163,7 @@ struct HasStaticStrides { SmallVector strides; if (auto memRefType = dyn_cast_or_null(operandType)) { int64_t offset; - if (failed(getStridesAndOffset(memRefType, strides, offset))) + if (failed(memRefType.getStridesAndOffset(strides, offset))) return false; if (llvm::any_of(strides, [](int64_t stride) { return stride == ShapedType::kDynamic; @@ -244,7 +244,8 @@ struct NumDpsInits { // Callable object to validate number of input operands for `op`. 
struct NumDpsInputs { NumDpsInputs() = delete; - explicit NumDpsInputs(std::function fun) : fun(std::move(fun)){}; + explicit NumDpsInputs(std::function fun) + : fun(std::move(fun)){}; bool operator()(Operation *op) { if (auto linalgOp = dyn_cast_or_null(op)) diff --git a/lib/gc/Conversion/CMakeLists.txt b/lib/gc/Conversion/CMakeLists.txt index 30ebfa286..39fb38945 100644 --- a/lib/gc/Conversion/CMakeLists.txt +++ b/lib/gc/Conversion/CMakeLists.txt @@ -1 +1,2 @@ add_subdirectory(XeVMToLLVM) +add_subdirectory(XeGPUToXeVM) diff --git a/lib/gc/Conversion/XeGPUToXeVM/CMakeLists.txt b/lib/gc/Conversion/XeGPUToXeVM/CMakeLists.txt new file mode 100644 index 000000000..b3a9a456b --- /dev/null +++ b/lib/gc/Conversion/XeGPUToXeVM/CMakeLists.txt @@ -0,0 +1,24 @@ +gc_add_mlir_conversion_library(MLIRXeGPUToXeVM + XeGPUToXeVM.cpp + + ADDITIONAL_HEADER_DIRS + ${PROJECT_SOURCE_DIR}/include/gc/Conversion/XeGPUToXeVM + + DEPENDS + GCConversionPassIncGen + + LINK_COMPONENTS + Core + + LINK_LIBS PUBLIC + MLIRFuncDialect + MLIRGPUDialect + MLIRLLVMCommonConversion + MLIRLLVMDialect + MLIRXeVMDialect + MLIRVectorDialect + MLIRArithDialect + MLIRXeGPUDialect + MLIRPass + MLIRTransforms +) \ No newline at end of file diff --git a/lib/gc/Conversion/XeGPUToXeVM/XeGPUToXeVM.cpp b/lib/gc/Conversion/XeGPUToXeVM/XeGPUToXeVM.cpp new file mode 100644 index 000000000..6fff77c81 --- /dev/null +++ b/lib/gc/Conversion/XeGPUToXeVM/XeGPUToXeVM.cpp @@ -0,0 +1,649 @@ +//===-- XeVMToLLVM.cpp - XeVM to LLVM dialect conversion --------*- C++ -*-===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "gc/Conversion/XeGPUToXeVM/XeGPUToXeVM.h" +#include "gc/Dialect/LLVMIR/XeVMDialect.h" + +#include "mlir/Conversion/LLVMCommon/Pattern.h" +#include "mlir/Dialect/Arith/IR/Arith.h" +#include "mlir/Dialect/GPU/IR/GPUDialect.h" +#include "mlir/Dialect/LLVMIR/FunctionCallUtils.h" +#include "mlir/Dialect/LLVMIR/LLVMDialect.h" +#include "mlir/Dialect/MemRef/IR/MemRef.h" +#include "mlir/Dialect/XeGPU/IR/XeGPU.h" +#include "mlir/Pass/Pass.h" +#include "mlir/Support/LLVM.h" +#include "llvm/Support/FormatVariadic.h" + +#include "mlir/IR/BuiltinTypes.h" +#include "mlir/IR/Types.h" + +#include "llvm/ADT/TypeSwitch.h" + +#define DEBUG_TYPE "xegpu-to-xevm" + +namespace mlir { +#define GEN_PASS_DEF_CONVERTXEGPUTOXEVMPASS +#include "gc/Conversion/Passes.h.inc" +} // namespace mlir + +using namespace mlir; +using namespace xegpu; + +namespace { + +enum class NdDescI32Layout : uint32_t { + BasePtr = 0, + BaseShapeW = 2, + BaseShapeH = 3, + TensorOffsetW = 4, + TensorOffsetH = 5 +}; + +static int32_t getNumericXeVMAddrSpace(xegpu::MemorySpace xeGpuMemspace) { + switch (xeGpuMemspace) { + case xegpu::MemorySpace::Global: + return static_cast(mlir::xevm::XeVMMemorySpace::kCrossWorkgroup); + case xegpu::MemorySpace::SLM: + return static_cast(mlir::xevm::XeVMMemorySpace::kWorkgroup); + } + llvm_unreachable("Unknown XeGPU memory space."); +} + +template +std::tuple checkAllLinear(SmallVector denseAttr) { + assert(!denseAttr.empty()); + const int32_t intercept{static_cast(denseAttr[0])}; + if (denseAttr.size() < 2) + return {true, 0, intercept}; + const T slope{denseAttr[1] - denseAttr[0]}; + for (size_t i = 1; i < denseAttr.size(); ++i) + if (denseAttr[i] - denseAttr[i - 1] != slope) + return {false, 0, 0}; + 
return {true, static_cast(slope), intercept}; +} + +mlir::VectorType encodeVectorTypeTo(mlir::VectorType currentVecType, + mlir::Type toElemType) { + auto elemType = currentVecType.getElementType(); + auto currentBitWidth = elemType.getIntOrFloatBitWidth(); + auto newBitWidth = toElemType.getIntOrFloatBitWidth(); + const int size = + currentVecType.getNumElements() * currentBitWidth / newBitWidth; + return mlir::VectorType::get(size, toElemType); +} + +xevm::LoadCacheControl +translateLoadXeGPUCacheHint(std::optional hint) { + auto hintVal = hint.has_value() ? hint.value() : xegpu::CachePolicy::UNCACHED; + switch (hintVal) { + case xegpu::CachePolicy::CACHED: + return xevm::LoadCacheControl::C; + case xegpu::CachePolicy::UNCACHED: + return xevm::LoadCacheControl::UC; + case xegpu::CachePolicy::STREAMING: + return xevm::LoadCacheControl::S; + case xegpu::CachePolicy::READ_INVALIDATE: + return xevm::LoadCacheControl::IAR; + llvm_unreachable("Unsupported cache control."); + } +}; + +xevm::StoreCacheControl +translateStoreXeGPUCacheHint(std::optional hint) { + auto hintVal = hint.has_value() ? hint.value() : xegpu::CachePolicy::UNCACHED; + switch (hintVal) { + case xegpu::CachePolicy::UNCACHED: + return xevm::StoreCacheControl::UC; + case xegpu::CachePolicy::STREAMING: + return xevm::StoreCacheControl::S; + case xegpu::CachePolicy::WRITE_BACK: + return xevm::StoreCacheControl::WB; + case xegpu::CachePolicy::WRITE_THROUGH: + return xevm::StoreCacheControl::WT; + llvm_unreachable("Unsupported cache control."); + } +}; + +class CreateNdDescToXeVMPattern : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + LogicalResult + matchAndRewrite(CreateNdDescOp op, CreateNdDescOp::Adaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto loc = op.getLoc(); + auto ctxt = rewriter.getContext(); + auto resultDesc = cast(op.getResult().getType()); + auto sgMap = resultDesc.getSGMapAttr(); + if (!sgMap) { + op.emitError() << "XeVM expects SGMap attribute to be present for tensor " + "descriptors"; + return mlir::failure(); + } + auto source = op.getSource(); + Type payloadElemTy = rewriter.getI32Type(); + Type i64Ty = rewriter.getI64Type(); + VectorType payloadTy = VectorType::get(8, payloadElemTy); + VectorType payloadI64Ty = VectorType::get(4, i64Ty); + Value payload = rewriter.create( + loc, + DenseElementsAttr::get(payloadTy, IntegerAttr::get(payloadElemTy, 0))); + + Value baseAddr; + Value baseShapeW; + Value baseShapeH; + Value offsetW; + Value offsetH; + + if (auto sourceTy = source.getType(); isa(sourceTy)) { + baseAddr = + rewriter.create(loc, source); + auto sourceMemrefTy = cast(sourceTy); + if (!sourceMemrefTy.hasStaticShape()) { + op.emitError() << "Expected static memref shape."; + return mlir::failure(); + } + auto rank = sourceMemrefTy.getRank(); + if (rank != 2) { + op.emitError() << "Expected a 2D memref."; + return mlir::failure(); + } + auto createOffset = [&](unsigned idx) -> Value { + Value val; + OpFoldResult ofr = op.getMixedOffsets()[idx]; + if (auto v = llvm::dyn_cast_if_present(ofr)) { + val = + rewriter.create(loc, i64Ty, ofr.get()); + val = rewriter.create(loc, payloadElemTy, val); + } else { + int32_t off = llvm::cast(ofr.get()).getInt(); + val = rewriter.create(loc, off, payloadElemTy); + } + return val; + }; + offsetW = createOffset(rank - 1); + offsetH = createOffset(rank - 2); + baseShapeW = rewriter.create( + loc, sourceMemrefTy.getDimSize(rank - 1), payloadElemTy); + baseShapeH = rewriter.create( + loc, 
sourceMemrefTy.getDimSize(rank - 2), payloadElemTy); + } else if (isa(sourceTy)) { + op.emitError() + << "Integer as source are currently not supported by the pass."; + return mlir::failure(); + } else { + op.emitError() << "Unknown source type."; + return mlir::failure(); + } + + baseAddr = rewriter.create(loc, i64Ty, baseAddr); + Value payLoadAsI64 = + rewriter.create(loc, payloadI64Ty, payload); + payLoadAsI64 = rewriter.create( + loc, baseAddr, payLoadAsI64, + static_cast(NdDescI32Layout::BasePtr)); + payload = rewriter.create(loc, payloadTy, payLoadAsI64); + payload = rewriter.create( + loc, baseShapeW, payload, + static_cast(NdDescI32Layout::BaseShapeW)); + payload = rewriter.create( + loc, baseShapeH, payload, + static_cast(NdDescI32Layout::BaseShapeH)); + payload = rewriter.create( + loc, offsetW, payload, + static_cast(NdDescI32Layout::TensorOffsetW)); + payload = rewriter.create( + loc, offsetH, payload, + static_cast(NdDescI32Layout::TensorOffsetH)); + rewriter.replaceOp(op, payload); + return success(); + } +}; + +class UpdateNdOffsetToXeVMPattern + : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + LogicalResult + matchAndRewrite(UpdateNdOffsetOp op, UpdateNdOffsetOp::Adaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto loc = op.getLoc(); + auto ctxt = rewriter.getContext(); + auto offsets = op.getOffsets(); + auto tdesc = adaptor.getTensorDesc(); + for (size_t offsetDim = 0; offsetDim < offsets.size(); offsetDim++) { + auto offset = offsets[offsetDim]; + if (auto cst = + dyn_cast_if_present(offset.getDefiningOp())) + if (auto attr = dyn_cast_if_present(cst.getValue()); + attr && !attr.getInt()) + continue; + const int offsetPos = + static_cast(offsetDim ? NdDescI32Layout::TensorOffsetW + : NdDescI32Layout::TensorOffsetH); + auto oldOffset = + rewriter.create(loc, tdesc, offsetPos); + offset = rewriter.create(loc, rewriter.getI32Type(), + offset); + auto newOffset = rewriter.create(loc, oldOffset, offset); + tdesc = + rewriter.create(loc, newOffset, tdesc, offsetPos); + } + rewriter.replaceOp(op, tdesc); + return success(); + } +}; + +template ::value>> +class LoadStorePrefetchNdToXeVMPattern : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + LogicalResult + matchAndRewrite(OpType op, typename OpType::Adaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto loc = op.getLoc(); + auto ctxt = rewriter.getContext(); + + auto tdesc = adaptor.getTensorDesc(); + auto tdescTy = op.getTensorDescType(); + + VectorType payloadI64Ty = VectorType::get(4, rewriter.getI64Type()); + VectorType payloadI32Ty = VectorType::get(8, rewriter.getI32Type()); + Value payLoadAsI64 = + rewriter.create(loc, payloadI64Ty, tdesc); + Value basePtr = rewriter.create( + loc, payLoadAsI64, static_cast(NdDescI32Layout::BasePtr)); + + Value baseShapeW = rewriter.create( + loc, tdesc, static_cast(NdDescI32Layout::BaseShapeW)); + Value baseShapeH = rewriter.create( + loc, tdesc, static_cast(NdDescI32Layout::BaseShapeH)); + Value offsetW = rewriter.create( + loc, tdesc, static_cast(NdDescI32Layout::TensorOffsetW)); + Value offsetH = rewriter.create( + loc, tdesc, static_cast(NdDescI32Layout::TensorOffsetH)); + auto ptrTypeLLVM = LLVM::LLVMPointerType::get( + ctxt, getNumericXeVMAddrSpace(tdescTy.getMemorySpace())); + Value basePtrLLVM = + rewriter.create(loc, ptrTypeLLVM, basePtr); + auto elemType = tdescTy.getElementType(); + const uint32_t elemBitSize = elemType.getIntOrFloatBitWidth(); + Value 
elemByteSize = rewriter.create( + loc, elemBitSize / 8, rewriter.getI32Type()); + Value surfaceW = + rewriter.create(loc, baseShapeW, elemByteSize); + + auto tileW = tdescTy.getDimSize(1); + auto tileH = tdescTy.getDimSize(0); + int32_t vblocks = 1; + if (elemBitSize == 16) { + vblocks = (tileW + 16 - 1) / 16; + tileW = 16; + } + + if constexpr (std::is_same_v) { + VectorType srcVecTy = cast(op.getValue().getType()); + auto l1 = translateStoreXeGPUCacheHint(op.getL1Hint()); + auto l3 = translateStoreXeGPUCacheHint(op.getL3Hint()); + VectorType srcFlatVecTy = + VectorType::get(srcVecTy.getNumElements(), srcVecTy.getElementType()); + Value srcFlatVec = rewriter.create(loc, srcFlatVecTy, + op.getValue()); + srcFlatVecTy = encodeVectorTypeTo(srcFlatVecTy, + rewriter.getIntegerType(elemBitSize)); + srcFlatVec = + rewriter.create(loc, srcFlatVecTy, srcFlatVec); + rewriter.create( + loc, basePtrLLVM, surfaceW, baseShapeH, surfaceW, offsetW, offsetH, + elemBitSize, tileW, tileH, vblocks, srcFlatVec, l1, l3); + rewriter.eraseOp(op); + } else { + auto l1 = translateLoadXeGPUCacheHint(op.getL1Hint()); + auto l3 = translateLoadXeGPUCacheHint(op.getL3Hint()); + if constexpr (std::is_same_v) { + rewriter.create( + loc, basePtrLLVM, surfaceW, baseShapeH, surfaceW, offsetW, offsetH, + elemBitSize, tileW, tileH, vblocks, l1, l3); + rewriter.eraseOp(op); + } else { + VectorType dstVecTy = cast(op.getValue().getType()); + const bool vnni = op.getPacked().value_or(false); + auto transposeValue = op.getTranspose(); + bool transpose = + transposeValue.has_value() && transposeValue.value()[0] == 1; + VectorType loadedTy = encodeVectorTypeTo( + dstVecTy, vnni ? rewriter.getI32Type() + : rewriter.getIntegerType(elemBitSize)); + + Value resultFlatVec = rewriter.create( + loc, loadedTy, basePtrLLVM, surfaceW, baseShapeH, surfaceW, offsetW, + offsetH, elemBitSize, tileW, tileH, vblocks, transpose, vnni, l1, + l3); + resultFlatVec = rewriter.create( + loc, encodeVectorTypeTo(loadedTy, dstVecTy.getElementType()), + resultFlatVec); + auto newOp = + rewriter.create(loc, dstVecTy, resultFlatVec); + rewriter.replaceOp(op, newOp); + } + } + return success(); + } +}; + +class CreateDescToXeVMPattern : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + LogicalResult + matchAndRewrite(CreateDescOp op, CreateDescOp::Adaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto loc = op.getLoc(); + auto ctxt = rewriter.getContext(); + auto offsets = op.getOffsets(); + bool allLinear{false}; + int32_t slope{0}; + int32_t intercept{0}; + if (auto cstOp = dyn_cast(offsets.getDefiningOp())) { + if (auto denseAttr = cstOp->getAttrOfType( + cstOp.getValueAttrName())) { + SmallVector intValues; + for (APInt val : denseAttr.getValues()) + intValues.push_back(static_cast(val.getSExtValue())); + std::tie(allLinear, slope, intercept) = checkAllLinear(intValues); + } else { + op.emitError() << "Unknown offsets source, expected a dense array."; + return failure(); + } + } else { + op.emitError() + << "Unknown offsets source, must be a compile-time constant array."; + return failure(); + } + if (!allLinear) { + op.emitError() << "Expected linear offsets pattern."; + return failure(); + } + + auto memrefTy = cast(op.getSource().getType()); + Value subGroupAddr = + rewriter.create(loc, + op.getSource()); + Value elemByteWidth = rewriter.create( + loc, memrefTy.getElementTypeBitWidth() / 8); + Value offsetIntercept = + rewriter.create(loc, intercept); + offsetIntercept = + rewriter.create(loc, 
elemByteWidth, offsetIntercept); + Value offsetSlope = rewriter.create(loc, slope); + offsetSlope = + rewriter.create(loc, elemByteWidth, offsetSlope); + Value sgSize = rewriter.create( + loc, 16); // LaneIdOp doesn't work in llvm-spv + Value threadId = rewriter.create(loc, gpu::Dimension::x); + Value laneId = rewriter.create(loc, threadId, sgSize); + Value laneOffset = rewriter.create(loc, laneId, offsetSlope); + laneOffset = + rewriter.create(loc, laneOffset, offsetIntercept); + auto laneAddr = + rewriter.create(loc, subGroupAddr, laneOffset); + rewriter.replaceOp(op, laneAddr); + return success(); + } +}; + +class UpdateOffsetToXeVMPattern : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + LogicalResult + matchAndRewrite(UpdateOffsetOp op, UpdateOffsetOp::Adaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto loc = op.getLoc(); + auto ctxt = rewriter.getContext(); + auto elemByteSize = + op.getTensorDesc().getType().getElementType().getIntOrFloatBitWidth() / + 8; + // Value laneId = rewriter.create(loc, + // /*upperBound=*/nullptr); + Value sgSize = rewriter.create(loc, 16); + Value threadId = rewriter.create(loc, gpu::Dimension::x); + Value laneId = rewriter.create(loc, threadId, sgSize); + Value offsetForLane = + rewriter.create(loc, adaptor.getOffsets(), laneId); + Value factor = rewriter.create(loc, elemByteSize); + offsetForLane = rewriter.create( + loc, rewriter.getIndexType(), offsetForLane); + offsetForLane = rewriter.create(loc, factor, offsetForLane); + Value newOffsetForLane = rewriter.create( + loc, adaptor.getTensorDesc(), offsetForLane); + rewriter.replaceOp(op, newOffsetForLane); + return success(); + } +}; + +template ::value>> +class LoadStoreToXeVMPattern : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + LogicalResult + matchAndRewrite(OpType op, typename OpType::Adaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto loc = op.getLoc(); + auto ctxt = rewriter.getContext(); + auto tdesc = op.getTensorDescType(); + auto ptrTypeLLVM = LLVM::LLVMPointerType::get( + ctxt, getNumericXeVMAddrSpace(tdesc.getMemorySpace())); + Value basePtrI64 = rewriter.create( + loc, rewriter.getI64Type(), adaptor.getTensorDesc()); + Value basePtrLLVM = + rewriter.create(loc, ptrTypeLLVM, basePtrI64); + VectorType srcOrDstVecTy = cast(op.getValue().getType()); + VectorType srcOrDstFlatVecTy = VectorType::get( + srcOrDstVecTy.getNumElements(), srcOrDstVecTy.getElementType()); + if constexpr (std::is_same_v) { + Value loaded = + rewriter.create(loc, srcOrDstFlatVecTy, basePtrLLVM); + auto newOp = + rewriter.create(loc, srcOrDstVecTy, loaded); + rewriter.replaceOp(op, newOp); + } else { + Value srcFlatVec = rewriter.create( + loc, srcOrDstFlatVecTy, op.getValue()); + rewriter.create(loc, srcFlatVec, basePtrLLVM); + rewriter.eraseOp(op); + } + return success(); + } +}; + +class PrefetchToXeVMPattern : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + LogicalResult + matchAndRewrite(PrefetchOp op, PrefetchOp::Adaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto loc = op.getLoc(); + auto ctxt = rewriter.getContext(); + auto tdescTy = op.getTensorDescType(); + auto ptrTypeLLVM = LLVM::LLVMPointerType::get( + ctxt, getNumericXeVMAddrSpace(tdescTy.getMemorySpace())); + Value basePtrI64 = rewriter.create( + loc, rewriter.getI64Type(), adaptor.getTensorDesc()); + Value ptrLLVM = + rewriter.create(loc, ptrTypeLLVM, basePtrI64); + 
rewriter.create( + loc, ptrLLVM, xevm::AddrSpace::GLOBAL, + translateLoadXeGPUCacheHint(op.getL1Hint()), + translateLoadXeGPUCacheHint(op.getL3Hint())); + } +}; +class FenceToXeVMPattern : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + LogicalResult + matchAndRewrite(FenceOp op, FenceOp::Adaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto loc = op.getLoc(); + auto ctxt = rewriter.getContext(); + xevm::MemoryScope memScope; + switch (op.getFenceScope()) { + case xegpu::FenceScope::Workgroup: + memScope = xevm::MemoryScope::WORKGROUP; + break; + case xegpu::FenceScope::GPU: + memScope = xevm::MemoryScope::GPU; + break; + llvm_unreachable("Unknown XeGPU fence scope."); + } + xevm::AddrSpace addrSpace; + switch (op.getMemoryKind()) { + case xegpu::MemorySpace::Global: + addrSpace = xevm::AddrSpace::GLOBAL; + break; + case xegpu::MemorySpace::SLM: + addrSpace = xevm::AddrSpace::SHARED; + break; + llvm_unreachable("Unknown XeGPU fence scope."); + } + rewriter.create(loc, memScope, addrSpace); + rewriter.eraseOp(op); + return success(); + } +}; + +class DpasToXeVMPattern : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + LogicalResult + matchAndRewrite(DpasOp op, DpasOp::Adaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto loc = op.getLoc(); + auto ctxt = rewriter.getContext(); + auto aTy = mlir::cast(op.getLhs().getType()); + auto bTy = mlir::cast(op.getRhs().getType()); + auto resultType = mlir::cast(op.getResultType()); + + auto encodePrecision = [&](Type type) -> xevm::PrecisionType { + if (type == rewriter.getBF16Type()) + return xevm::PrecisionType::BF16; + else if (type == rewriter.getF16Type()) + return xevm::PrecisionType::FP16; + else if (type == rewriter.getTF32Type()) + return xevm::PrecisionType::TF32; + else if (type.isInteger(8)) { + if (type.isUnsignedInteger()) + return xevm::PrecisionType::U8; + return xevm::PrecisionType::S8; + } + llvm_unreachable("add more support for PrecisionType"); + return xevm::PrecisionType::UNUSED; + }; + xevm::PrecisionType precATy = encodePrecision(aTy.getElementType()); + xevm::PrecisionType precBTy = encodePrecision(bTy.getElementType()); + auto precA = xevm::PrecisionTypeAttr::get(ctxt, precATy); + auto precB = xevm::PrecisionTypeAttr::get(ctxt, precBTy); + Value c = op.getAcc(); + if (!c) { + auto elementTy = resultType.getElementType(); + Attribute initValueAttr; + if (isa(elementTy)) + initValueAttr = FloatAttr::get(elementTy, 0.0); + else + initValueAttr = IntegerAttr::get(elementTy, 0); + c = rewriter.create( + loc, DenseElementsAttr::get(resultType, initValueAttr)); + } + auto rc = IntegerAttr::get(rewriter.getI32Type(), 8); + + VectorType aNty = + VectorType::get(aTy.getNumElements(), aTy.getElementType()); + Value aVec = rewriter.create(loc, aNty, op.getLhs()); + + VectorType bNty = + VectorType::get(bTy.getNumElements(), bTy.getElementType()); + Value bVec = rewriter.create(loc, bNty, op.getRhs()); + + auto cvecty = cast(c.getType()); + VectorType cNty = + VectorType::get(cvecty.getNumElements(), cvecty.getElementType()); + if (cvecty != cNty) + c = rewriter.create(loc, cNty, c); + Value dpasRes = rewriter.create(loc, cNty, c, aVec, bVec, + precA, precB, rc); + if (cvecty != cNty) + dpasRes = rewriter.create(loc, resultType, dpasRes); + rewriter.replaceOp(op, dpasRes); + return success(); + } +}; + +//===----------------------------------------------------------------------===// +// Pass Definition 
+//===----------------------------------------------------------------------===// + +struct ConvertXeGPUToXeVMPass + : public impl::ConvertXeGPUToXeVMPassBase { + using Base::Base; + + void getDependentDialects(DialectRegistry ®istry) const override { + registry.insert(); + } + + void runOnOperation() override { + LLVMTypeConverter typeConverter(&getContext()); + typeConverter.addConversion([&](IndexType type) -> Type { return type; }); + typeConverter.addConversion([&](VectorType type) -> Type { + unsigned rank = type.getRank(); + auto elemType = type.getElementType(); + if (llvm::isa(elemType)) + elemType = mlir::IntegerType::get(&getContext(), 64); + if (rank < 1 || type.getNumElements() == 1) + return elemType; + unsigned sum = 1; + for (unsigned i = 0; i < rank; i++) { + sum *= type.getShape()[i]; + } + return VectorType::get(sum, elemType); + }); + typeConverter.addConversion([&](xegpu::TensorDescType type) -> Type { + if (type.isScattered()) { + return IndexType::get(&getContext()); + } + auto i32Type = IntegerType::get(&getContext(), 32); + return VectorType::get(8, i32Type); + }); + + ConversionTarget target(getContext()); + target.addLegalDialect(); + target.addIllegalDialect(); + + RewritePatternSet patterns(&getContext()); + populateXeGPUToXeVMConversionPatterns(patterns, typeConverter); + if (failed(applyPartialConversion(getOperation(), target, + std::move(patterns)))) + signalPassFailure(); + } +}; +} // namespace + +//===----------------------------------------------------------------------===// +// Pattern Population +//===----------------------------------------------------------------------===// +namespace mlir { +void populateXeGPUToXeVMConversionPatterns(RewritePatternSet &patterns, + LLVMTypeConverter &typeConverter) { + patterns.add, + LoadStorePrefetchNdToXeVMPattern, + LoadStorePrefetchNdToXeVMPattern>( + typeConverter, patterns.getContext()); + patterns.add, + LoadStoreToXeVMPattern>( + typeConverter, patterns.getContext()); + patterns.add( + typeConverter, patterns.getContext()); +} +} // namespace mlir diff --git a/lib/gc/Conversion/XeVMToLLVM/XeVMToLLVM.cpp b/lib/gc/Conversion/XeVMToLLVM/XeVMToLLVM.cpp index 6d20b7c3d..b44af2f5a 100644 --- a/lib/gc/Conversion/XeVMToLLVM/XeVMToLLVM.cpp +++ b/lib/gc/Conversion/XeVMToLLVM/XeVMToLLVM.cpp @@ -107,24 +107,18 @@ std::string mangle(StringRef baseName, ArrayRef types, return os.str(); } -template +template static std::optional getCacheControlMetadata(ConversionPatternRewriter &rewriter, OpType op, - const bool isLoad, const std::string &chip) { - if ((op.getL1CacheControlAttr() == - L1StoreCacheControlAttr::get(rewriter.getContext(), - L1StoreCacheControl::DEFAULT) && - op.getL3CacheControlAttr() == - L3StoreCacheControlAttr::get(rewriter.getContext(), - L3StoreCacheControl::DEFAULT)) || - - (op.getL1CacheControlAttr() == - L1LoadCacheControlAttr::get(rewriter.getContext(), - L1LoadCacheControl::DEFAULT) && - op.getL3CacheControlAttr() == - L3LoadCacheControlAttr::get(rewriter.getContext(), - L3LoadCacheControl::DEFAULT))) { - return {}; + const std::string &chip) { + if constexpr (isLoad) { + if (op.getL1CacheControl() == LoadCacheControl::DEFAULT && + op.getL3CacheControl() == LoadCacheControl::DEFAULT) + return {}; + } else { + if (op.getL1CacheControl() == StoreCacheControl::DEFAULT && + op.getL3CacheControl() == StoreCacheControl::DEFAULT) + return {}; } constexpr int32_t decorationCacheControlArity{4}; constexpr int32_t loadCacheControlKey{6442}; @@ -155,8 +149,10 @@ static LLVM::CallOp createDeviceFunctionCall( 
MLIRContext *ctx = rewriter.getContext(); Location loc = UnknownLoc::get(ctx); - LLVM::LLVMFuncOp funcOp = + auto funcOpRes = LLVM::lookupOrCreateFn(moduleOp, funcName, argTypes, retType); + assert(!failed(funcOpRes)); + LLVM::LLVMFuncOp funcOp = funcOpRes.value(); funcOp.setCConv(LLVM::cconv::CConv::SPIR_FUNC); funcOp.setConvergent(funcAttributeOptions.isConvergent); funcOp.setNoUnwind(funcAttributeOptions.isNoUnwind); @@ -174,10 +170,10 @@ static LLVM::CallOp createDeviceFunctionCall( return callOp; } -class DPASToOCLPattern : public OpConversionPattern { +class DpasToOCLPattern : public OpConversionPattern { using OpConversionPattern::OpConversionPattern; LogicalResult - matchAndRewrite(xevm::DPASOp op, xevm::DPASOp::Adaptor adaptor, + matchAndRewrite(xevm::DpasOp op, xevm::DpasOp::Adaptor adaptor, ConversionPatternRewriter &rewriter) const override { constexpr uint32_t bitWidthPackedA{16}; constexpr uint32_t bitWidthPackedB{32}; @@ -264,6 +260,83 @@ class DPASToOCLPattern : public OpConversionPattern { } }; +class PrefetchToOCLPattern : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + LogicalResult + matchAndRewrite(PrefetchOp op, PrefetchOp::Adaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto loc = op.getLoc(); + const std::string fnName{"_Z8prefetchPU3AS1Kcm"}; + Value one = rewriter.create( + loc, rewriter.getI64Type(), rewriter.getI64IntegerAttr(1)); + SmallVector args{op.getPtr(), one}; + SmallVector argTypes; + for (auto arg : args) + argTypes.push_back(arg.getType()); + auto funcAttr = noUnwindAttrs; + auto memAttr = rewriter.getAttr( + /*other=*/LLVM::ModRefInfo::NoModRef, + /*argMem=*/LLVM::ModRefInfo::Ref, + /*inaccessibleMem=*/LLVM::ModRefInfo::NoModRef); + funcAttr.memEffectsAttr = memAttr; + + const std::string chip{"pvc"}; + LLVM::CallOp call = createDeviceFunctionCall( + rewriter, fnName, LLVM::LLVMVoidType::get(rewriter.getContext()), + argTypes, args, {}, funcAttr); + if (std::optional optCacheControls = + getCacheControlMetadata(rewriter, op, chip)) + call->setAttr(XeVMDialect::getCacheControlsAttrName(), *optCacheControls); + rewriter.eraseOp(op); + } +}; + +class MemfenceToOCLPattern : public OpConversionPattern { + using OpConversionPattern::OpConversionPattern; + LogicalResult + matchAndRewrite(MemfenceOp op, MemfenceOp::Adaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + auto loc = op.getLoc(); + const std::string fnName{"atomic_work_item_fence"}; + int memScopeOcl, addrSpaceOcl; + switch (op.getAddrspace()) { + case xevm::AddrSpace::SHARED: + addrSpaceOcl = 1; + break; + case xevm::AddrSpace::GLOBAL: + addrSpaceOcl = 2; + break; + default: + // GENERIC is not supported in OpenCL + llvm_unreachable("unsupported xevm::FenceAddrSpace"); + } + switch (op.getScope()) { + case xevm::MemoryScope::WORKGROUP: + memScopeOcl = 1; + break; + case xevm::MemoryScope::GPU: + memScopeOcl = 2; + break; + default: + // CLUSTER and SYSTEM are not supported in OpenCL + llvm_unreachable("unsupported xevm::MemoryScope"); + } + Value acqRel = rewriter.create( + loc, rewriter.getI32Type(), rewriter.getI32IntegerAttr(4)); + Value memScope = rewriter.create( + loc, rewriter.getI32Type(), rewriter.getI32IntegerAttr(memScopeOcl)); + Value addrSpace = rewriter.create( + loc, rewriter.getI32Type(), rewriter.getI32IntegerAttr(addrSpaceOcl)); + SmallVector args{addrSpace, acqRel, memScope}; + SmallVector argTypes{3, rewriter.getI32Type()}; + LLVM::CallOp call = + createDeviceFunctionCall(rewriter, mangle(fnName, 
argTypes), + LLVM::LLVMVoidType::get(rewriter.getContext()), + argTypes, args, {}, noUnwindAttrs); + rewriter.eraseOp(op); + return success(); + } +}; template class LoadStorePrefetchToOCLPattern : public OpConversionPattern { using OpConversionPattern::OpConversionPattern; @@ -311,8 +384,8 @@ class LoadStorePrefetchToOCLPattern : public OpConversionPattern { /*other=*/LLVM::ModRefInfo::NoModRef, /*argMem=*/LLVM::ModRefInfo::Ref, /*inaccessibleMem=*/LLVM::ModRefInfo::NoModRef); - auto funcAttrs = noUnwindAttrs; - funcAttrs.memEffectsAttr = memAttr; + funcAttr = noUnwindAttrs; + funcAttr.memEffectsAttr = memAttr; } else { auto vecElemType = vecType.getElementType(); auto vecElemBitWidth = vecElemType.getIntOrFloatBitWidth(); @@ -369,7 +442,8 @@ class LoadStorePrefetchToOCLPattern : public OpConversionPattern { // TODO: extract chip from the attached target const std::string chip{"pvc"}; if (std::optional optCacheControls = - getCacheControlMetadata(rewriter, op, isLoad || isPrefetch, chip)) { + getCacheControlMetadata < isLoad || + isPrefetch > (rewriter, op, chip)) { call->setAttr(XeVMDialect::getCacheControlsAttrName(), *optCacheControls); } if constexpr (isLoad) @@ -411,11 +485,11 @@ struct ConvertXeVMToLLVMPass //===----------------------------------------------------------------------===// void mlir::populateXeVMToLLVMConversionPatterns(RewritePatternSet &patterns) { - patterns - .add, - LoadStorePrefetchToOCLPattern, - LoadStorePrefetchToOCLPattern, DPASToOCLPattern>( - patterns.getContext()); + patterns.add, + LoadStorePrefetchToOCLPattern, + LoadStorePrefetchToOCLPattern, + DpasToOCLPattern, MemfenceToOCLPattern, PrefetchToOCLPattern>( + patterns.getContext()); } //===----------------------------------------------------------------------===// diff --git a/lib/gc/Dialect/Linalgx/Utils.cpp b/lib/gc/Dialect/Linalgx/Utils.cpp index fe9096fe7..73a1c9f93 100644 --- a/lib/gc/Dialect/Linalgx/Utils.cpp +++ b/lib/gc/Dialect/Linalgx/Utils.cpp @@ -385,7 +385,7 @@ bool isGenericAttrEquivalent(linalg::GenericOp op, ShapedType shapeA, DenseMap replaceMap; std::map iterMap; // get shape-to-loop map - AffineMap inverse = inversePermutation(concatAffineMaps(inMaps)); + AffineMap inverse = inversePermutation(concatAffineMaps(inMaps, context)); assert(inverse && "shape-to-loops map to be non-null"); assert(dimSize == inverse.getResults().size()); // renumber the dim id based on shape-to-loop map @@ -492,8 +492,10 @@ bool isGenericPackedMatmulOpImpl(linalg::GenericOp genericOp, return false; } // Check for packing - ValueRange inputs = genericOp.getDpsInputs(); - ValueRange outputs = genericOp.getDpsInits(); + auto inputsVec = genericOp.getDpsInputs(); + ValueRange inputs = inputsVec; + auto outputsVec = genericOp.getDpsInits(); + ValueRange outputs = outputsVec; auto shapeA = cast(inputs.front().getType()); auto shapeB = cast(inputs.back().getType()); auto shapeC = cast(outputs.back().getType()); diff --git a/lib/gc/Dialect/Microkernel/MicrokernelOps.cpp b/lib/gc/Dialect/Microkernel/MicrokernelOps.cpp index 785a5bc03..f8fc07bee 100644 --- a/lib/gc/Dialect/Microkernel/MicrokernelOps.cpp +++ b/lib/gc/Dialect/Microkernel/MicrokernelOps.cpp @@ -551,11 +551,11 @@ static LogicalResult verifyBrgemmDataTypes(ArrayAttr dtypes, auto context = op.getContext(); -#define FTAttr(t) TypeAttr::get(FloatType::get##t(context)) +#define FTAttr(t) TypeAttr::get(t::get(context)) #define ITAttr(s, w) TypeAttr::get(IntegerType::get(context, w, IntegerType::s)) SmallVector> validDataTypes = { - {FTAttr(F32), FTAttr(F32)}, - 
{FTAttr(BF16), FTAttr(BF16)}, + {FTAttr(Float32Type), FTAttr(Float32Type)}, + {FTAttr(BFloat16Type), FTAttr(BFloat16Type)}, {ITAttr(Unsigned, 8), ITAttr(Signed, 8)}, {ITAttr(Signed, 8), ITAttr(Unsigned, 8)}, {ITAttr(Unsigned, 8), ITAttr(Unsigned, 8)}, diff --git a/lib/gc/ExecutionEngine/GPURuntime/ocl/GpuOclRuntime.cpp b/lib/gc/ExecutionEngine/GPURuntime/ocl/GpuOclRuntime.cpp index 2c48c214e..dfdf366d9 100644 --- a/lib/gc/ExecutionEngine/GPURuntime/ocl/GpuOclRuntime.cpp +++ b/lib/gc/ExecutionEngine/GPURuntime/ocl/GpuOclRuntime.cpp @@ -718,7 +718,7 @@ StringRef createStaticMain(OpBuilder &builder, ModuleOp &module, auto offsetPtr = constArgs.end(); constArgs.emplace_back(0); constArgs.append(shape.begin(), shape.end()); - if (failed(getStridesAndOffset(type, constArgs, *offsetPtr))) { + if (failed(type.getStridesAndOffset(constArgs, *offsetPtr))) { gcLogD("Failed to get strides and offset of arg", i, " of the function ", funcName.begin()); return {}; @@ -929,8 +929,9 @@ OclModuleBuilder::build(const OclRuntime::Ext &ext) { builder.getI64IntegerAttr(static_cast(wgSize))); TargetDeviceSpecInterface devSpec = TargetDeviceSpecAttr::get(ctx, dltiAttrs); - auto sysSpec = - TargetSystemSpecAttr::get(ctx, ArrayRef(std::pair(devStr, devSpec))); + DataLayoutEntryInterface dl = + DataLayoutEntryAttr::get(ctx, devStr, devSpec); + auto sysSpec = TargetSystemSpecAttr::get(ctx, ArrayRef(dl)); mod = mlirModule.clone(); mod.getOperation()->setAttr("#dlti.sys_spec", sysSpec); PassManager pm{ctx}; diff --git a/lib/gc/Transforms/DecomposeAggregatedOps.cpp b/lib/gc/Transforms/DecomposeAggregatedOps.cpp index a9cf889a9..3f84a8b3a 100644 --- a/lib/gc/Transforms/DecomposeAggregatedOps.cpp +++ b/lib/gc/Transforms/DecomposeAggregatedOps.cpp @@ -42,7 +42,7 @@ struct DecomposeAggregatedOps void runOnOperation() override { RewritePatternSet patterns(getOperation().getContext()); patterns.add(patterns.getContext()); - (void)applyPatternsAndFoldGreedily(getOperation(), std::move(patterns)); + (void)applyPatternsGreedily(getOperation(), std::move(patterns)); } }; diff --git a/lib/gc/Transforms/DecomposeTensorOperation.cpp b/lib/gc/Transforms/DecomposeTensorOperation.cpp index 3f4f4ecf9..758d97717 100644 --- a/lib/gc/Transforms/DecomposeTensorOperation.cpp +++ b/lib/gc/Transforms/DecomposeTensorOperation.cpp @@ -170,8 +170,7 @@ struct DecomposeTensorOperationPass patterns.add(patterns.getContext()); tensor::populateDecomposeTensorConcatPatterns(patterns); - if (failed(applyPatternsAndFoldGreedily(getOperation(), - std::move(patterns)))) { + if (failed(applyPatternsGreedily(getOperation(), std::move(patterns)))) { return signalPassFailure(); } } diff --git a/lib/gc/Transforms/DeepTileContractionOp.cpp b/lib/gc/Transforms/DeepTileContractionOp.cpp index 21de7b778..c53138f44 100644 --- a/lib/gc/Transforms/DeepTileContractionOp.cpp +++ b/lib/gc/Transforms/DeepTileContractionOp.cpp @@ -405,7 +405,7 @@ generateOuterLoop(RewriterBase &b, linalg::LinalgOp linalgOp, // the extra copy generated by bufferization. So remove the dummy loop // at this early stage. 
if (!isDummyLoop(tilingResult->loops.back())) { - b.replaceOp(currentOp, tilingResult->replacements); + b.replaceOp(currentOp, tilingResult->mergeResult.replacements); currentOp = dyn_cast(tilingResult->tiledOps.back()); if (iteratorTypes[d] == mlir::utils::IteratorType::reduction) result.reductionLoops.push_back(tilingResult->loops.back()); @@ -477,7 +477,7 @@ generateOuterLoop(RewriterBase &b, linalg::LinalgOp linalgOp, b, cast(currentOp.getOperation()), tileOption); if (failed(tilingResult)) return failure(); - b.replaceOp(currentOp, tilingResult->replacements); + b.replaceOp(currentOp, tilingResult->mergeResult.replacements); currentOp = dyn_cast(tilingResult->tiledOps.back()); } } @@ -1029,8 +1029,7 @@ struct DeepTileContractionOp dialect->getCanonicalizationPatterns(patterns); for (RegisteredOperationName op : ctx.getRegisteredOperations()) op.getCanonicalizationPatterns(patterns, &ctx); - if (failed( - applyPatternsAndFoldGreedily(getOperation(), std::move(patterns)))) + if (failed(applyPatternsGreedily(getOperation(), std::move(patterns)))) return signalPassFailure(); } }; diff --git a/lib/gc/Transforms/FoldTensorOperation.cpp b/lib/gc/Transforms/FoldTensorOperation.cpp index e0bf23abb..abd84ab16 100644 --- a/lib/gc/Transforms/FoldTensorOperation.cpp +++ b/lib/gc/Transforms/FoldTensorOperation.cpp @@ -44,8 +44,7 @@ struct FoldTensorOperationPass // Use to remove useless tensor operation like extract or // insert slice. config.strictMode = GreedyRewriteStrictness::ExistingOps; - (void)applyPatternsAndFoldGreedily(getOperation(), std::move(pattern), - config); + (void)applyPatternsGreedily(getOperation(), std::move(pattern), config); } }; } // namespace diff --git a/lib/gc/Transforms/GPU/AllocsToSLM.cpp b/lib/gc/Transforms/GPU/AllocsToSLM.cpp index 46ec2a4ad..06c4dce6b 100644 --- a/lib/gc/Transforms/GPU/AllocsToSLM.cpp +++ b/lib/gc/Transforms/GPU/AllocsToSLM.cpp @@ -152,7 +152,7 @@ struct AllocsToSLM : public gc::impl::AllocsToSLMBase { RewritePatternSet patterns(ctx); patterns.add(patterns.getContext()); - (void)applyPatternsAndFoldGreedily(getOperation(), std::move(patterns)); + (void)applyPatternsGreedily(getOperation(), std::move(patterns)); } }; diff --git a/lib/gc/Transforms/GPU/IMEX/LinalgToXeGPU.cpp b/lib/gc/Transforms/GPU/IMEX/LinalgToXeGPU.cpp index bc78fe937..8edeca784 100644 --- a/lib/gc/Transforms/GPU/IMEX/LinalgToXeGPU.cpp +++ b/lib/gc/Transforms/GPU/IMEX/LinalgToXeGPU.cpp @@ -2124,17 +2124,17 @@ struct LinalgToXeGPU : public gc::impl::LinalgToXeGPUBase { // Run GEMM pattern first to allow fusion with its consumers. RewritePatternSet gemmPatterns(&getContext()); populateLinalgGemmToXeGPUPatterns(gemmPatterns, options); - (void)applyPatternsAndFoldGreedily(getOperation(), std::move(gemmPatterns)); + (void)applyPatternsGreedily(getOperation(), std::move(gemmPatterns)); // Convert memory fill ops. RewritePatternSet fillPatterns(&getContext()); populateLinalgMemoryFillToXeGPUPatterns(fillPatterns, options); - (void)applyPatternsAndFoldGreedily(getOperation(), std::move(fillPatterns)); + (void)applyPatternsGreedily(getOperation(), std::move(fillPatterns)); // Convert other remaining ops. 
RewritePatternSet patterns(&getContext()); populateLinalgEltwiseToXeGPUPatterns(patterns, options); - (void)applyPatternsAndFoldGreedily(getOperation(), std::move(patterns)); + (void)applyPatternsGreedily(getOperation(), std::move(patterns)); } }; diff --git a/lib/gc/Transforms/GPU/Pipeline.cpp b/lib/gc/Transforms/GPU/Pipeline.cpp index 5386fbf38..f90d9f562 100644 --- a/lib/gc/Transforms/GPU/Pipeline.cpp +++ b/lib/gc/Transforms/GPU/Pipeline.cpp @@ -154,7 +154,8 @@ void populateGPUPipeline(OpPassManager &pm, pm.addPass(createGpuKernelOutliningPass()); pm.addPass(createConvertXeVMToLLVMPass()); pm.addPass(createGpuXeVMAttachTarget()); - pm.addNestedPass(createConvertGpuOpsToLLVMSPVOps()); + pm.addNestedPass( + createConvertGpuOpsToLLVMSPVOps({.use64bitIndex = true})); pm.addNestedPass(createConvertIndexToLLVMPass()); pm.addNestedPass(createArithToLLVMConversionPass()); pm.addPass(createReconcileUnrealizedCastsPass()); diff --git a/lib/gc/Transforms/IterativeTilingAndFusion.cpp b/lib/gc/Transforms/IterativeTilingAndFusion.cpp index a486c29b0..d492e01e2 100644 --- a/lib/gc/Transforms/IterativeTilingAndFusion.cpp +++ b/lib/gc/Transforms/IterativeTilingAndFusion.cpp @@ -813,7 +813,7 @@ void iterativeTilingAndFusionUntilExhaustion( defaultTilingOfType(rewriter, op, isaOpTy, cfg); if (succeeded(tilingResult)) { tiledOps.insert(tilingResult->tiledOps[0]); - rewriter.replaceOp(op, tilingResult->replacements); + rewriter.replaceOp(op, tilingResult->mergeResult.replacements); break; } } diff --git a/lib/gc/Transforms/LowerToTileVector.cpp b/lib/gc/Transforms/LowerToTileVector.cpp index d105eaeb8..9690b2461 100644 --- a/lib/gc/Transforms/LowerToTileVector.cpp +++ b/lib/gc/Transforms/LowerToTileVector.cpp @@ -614,8 +614,7 @@ struct LowerToTileVectorPass // Init patterns use to remove useless tensor operation like extract or // insert slice. configInit.strictMode = GreedyRewriteStrictness::ExistingOps; - (void)applyPatternsAndFoldGreedily(funcOp, std::move(patternsInit), - configInit); + (void)applyPatternsGreedily(funcOp, std::move(patternsInit), configInit); RewritePatternSet firstPatterns(ctx); // All the dynamic shape will reject to lower. @@ -623,8 +622,8 @@ struct LowerToTileVectorPass GreedyRewriteConfig configFirstPn; // We only apply the lowering pattern on existing operations configFirstPn.strictMode = GreedyRewriteStrictness::ExistingOps; - (void)applyPatternsAndFoldGreedily(funcOp, std::move(firstPatterns), - configFirstPn); + (void)applyPatternsGreedily(funcOp, std::move(firstPatterns), + configFirstPn); // Error case: // ``` // linalg.copy : <1x32xf32> @@ -649,10 +648,10 @@ struct LowerToTileVectorPass vector::populateVectorTransferPermutationMapLoweringPatterns(secondPattern); // Remove unnessary broadcast operation vector::populateSinkVectorOpsPatterns(secondPattern); - // Second fold (with the help of the `applyPatternsAndFoldGreedily` + // Second fold (with the help of the `applyPatternsGreedily` // function) can help us to eliminate redundant operation like consecutive // read and write. 
- (void)applyPatternsAndFoldGreedily(funcOp, std::move(secondPattern)); + (void)applyPatternsGreedily(funcOp, std::move(secondPattern)); // may need other patterns to reduce redundant operations } }; diff --git a/lib/gc/Transforms/MemRefToCPURuntime.cpp b/lib/gc/Transforms/MemRefToCPURuntime.cpp index d18506e54..2498ad83a 100644 --- a/lib/gc/Transforms/MemRefToCPURuntime.cpp +++ b/lib/gc/Transforms/MemRefToCPURuntime.cpp @@ -51,7 +51,7 @@ uint64_t getMemRefSizeInBytes(MemRefType memrefType) { if (!layout.isIdentity()) { int64_t offset; SmallVector strides; - if (failed(getStridesAndOffset(memrefType, strides, offset))) { + if (failed(memrefType.getStridesAndOffset(strides, offset))) { return UINT64_MAX; } diff --git a/lib/gc/Transforms/MergeNestedForall.cpp b/lib/gc/Transforms/MergeNestedForall.cpp index 07eb5ffbf..bd35e2e9d 100644 --- a/lib/gc/Transforms/MergeNestedForall.cpp +++ b/lib/gc/Transforms/MergeNestedForall.cpp @@ -82,8 +82,7 @@ struct MergeNestedForall patterns.add(patterns.getContext()); - if (failed( - applyPatternsAndFoldGreedily(getOperation(), std::move(patterns)))) + if (failed(applyPatternsGreedily(getOperation(), std::move(patterns)))) return signalPassFailure(); } }; diff --git a/lib/gc/Transforms/Microkernel/ConvertLinalgToMicrokernel.cpp b/lib/gc/Transforms/Microkernel/ConvertLinalgToMicrokernel.cpp index 0eabd6e1b..c312abe6f 100644 --- a/lib/gc/Transforms/Microkernel/ConvertLinalgToMicrokernel.cpp +++ b/lib/gc/Transforms/Microkernel/ConvertLinalgToMicrokernel.cpp @@ -391,7 +391,7 @@ class ConvertLinalgToMicrokernel patterns.add>( &getContext()); FrozenRewritePatternSet patternSet(std::move(patterns)); - if (failed(applyPatternsAndFoldGreedily(getOperation(), patternSet))) + if (failed(applyPatternsGreedily(getOperation(), patternSet))) signalPassFailure(); } }; diff --git a/lib/gc/Transforms/Microkernel/ConvertMicrokernelToDnnlFunc.cpp b/lib/gc/Transforms/Microkernel/ConvertMicrokernelToDnnlFunc.cpp index 647d8f784..8a5d97f0a 100644 --- a/lib/gc/Transforms/Microkernel/ConvertMicrokernelToDnnlFunc.cpp +++ b/lib/gc/Transforms/Microkernel/ConvertMicrokernelToDnnlFunc.cpp @@ -63,7 +63,7 @@ class ConvertBrgemmDispatchOpRewriter SmallVector operands; SmallVector operandTypes; IntegerType integer64 = IntegerType::get(rewriter.getContext(), 64); - FloatType float32 = FloatType::getF32(rewriter.getContext()); + FloatType float32 = Float32Type::get(rewriter.getContext()); // M, N, K, LDA, LDB, LDC, stride_a, stride_b // they are in the same order with BrgemmDispatchOp inputs @@ -215,7 +215,7 @@ class ConvertMicrokernelToDnnlFunc &getContext()); FrozenRewritePatternSet patternSet(std::move(patterns)); - if (failed(applyPatternsAndFoldGreedily(getOperation(), patternSet))) + if (failed(applyPatternsGreedily(getOperation(), patternSet))) signalPassFailure(); } }; diff --git a/lib/gc/Transforms/Microkernel/EarlyDispatchMicrokernel.cpp b/lib/gc/Transforms/Microkernel/EarlyDispatchMicrokernel.cpp index 2f66feee4..058d55357 100644 --- a/lib/gc/Transforms/Microkernel/EarlyDispatchMicrokernel.cpp +++ b/lib/gc/Transforms/Microkernel/EarlyDispatchMicrokernel.cpp @@ -205,8 +205,7 @@ class EarlyDispatchMicrokernel // Ignore newly created Ops GreedyRewriteConfig config; config.strictMode = GreedyRewriteStrictness::ExistingOps; - if (failed( - applyPatternsAndFoldGreedily(getOperation(), patternSet, config))) + if (failed(applyPatternsGreedily(getOperation(), patternSet, config))) signalPassFailure(); } }; diff --git a/lib/gc/Transforms/Microkernel/ExpandMicrokernel.cpp 
b/lib/gc/Transforms/Microkernel/ExpandMicrokernel.cpp index 9e58a76cf..164edb609 100644 --- a/lib/gc/Transforms/Microkernel/ExpandMicrokernel.cpp +++ b/lib/gc/Transforms/Microkernel/ExpandMicrokernel.cpp @@ -275,7 +275,7 @@ class ExpandMicrokernel patterns.add(&getContext()); FrozenRewritePatternSet patternSet(std::move(patterns)); - if (failed(applyPatternsAndFoldGreedily(getOperation(), patternSet))) + if (failed(applyPatternsGreedily(getOperation(), patternSet))) signalPassFailure(); } }; diff --git a/lib/gc/Transforms/Microkernel/MergeBranchMicrokernelContext.cpp b/lib/gc/Transforms/Microkernel/MergeBranchMicrokernelContext.cpp index 9865f5220..59554ef67 100644 --- a/lib/gc/Transforms/Microkernel/MergeBranchMicrokernelContext.cpp +++ b/lib/gc/Transforms/Microkernel/MergeBranchMicrokernelContext.cpp @@ -296,7 +296,7 @@ class MergeBranchMicrokernelContext patterns.add(&getContext(), dispatchAnalysis); FrozenRewritePatternSet patternSet(std::move(patterns)); - if (failed(applyPatternsAndFoldGreedily(getOperation(), patternSet))) { + if (failed(applyPatternsGreedily(getOperation(), patternSet))) { signalPassFailure(); } } diff --git a/lib/gc/Transforms/Microkernel/MicrokernelInvariantCodeMotion.cpp b/lib/gc/Transforms/Microkernel/MicrokernelInvariantCodeMotion.cpp index ad8a0631f..4363795ca 100644 --- a/lib/gc/Transforms/Microkernel/MicrokernelInvariantCodeMotion.cpp +++ b/lib/gc/Transforms/Microkernel/MicrokernelInvariantCodeMotion.cpp @@ -421,8 +421,7 @@ class MicrokernelInvariantCodeMotion // Ignore newly created Ops GreedyRewriteConfig config; config.strictMode = GreedyRewriteStrictness::ExistingOps; - if (failed( - applyPatternsAndFoldGreedily(getOperation(), patternSet, config))) { + if (failed(applyPatternsGreedily(getOperation(), patternSet, config))) { signalPassFailure(); } } diff --git a/lib/gc/Transforms/OneDNNGraphToLinalg.cpp b/lib/gc/Transforms/OneDNNGraphToLinalg.cpp index 5a75c37cd..138d3176d 100644 --- a/lib/gc/Transforms/OneDNNGraphToLinalg.cpp +++ b/lib/gc/Transforms/OneDNNGraphToLinalg.cpp @@ -515,8 +515,7 @@ struct ConvertOneDNNGraphToLinalg MatMulOpBatchFlatten // clang-format on >(ctx); - if (failed(applyPatternsAndFoldGreedily(getOperation(), - std::move(patternsPre)))) { + if (failed(applyPatternsGreedily(getOperation(), std::move(patternsPre)))) { signalPassFailure(); } // ========================================== diff --git a/lib/gc/Transforms/Utils/ValueUtils.cpp b/lib/gc/Transforms/Utils/ValueUtils.cpp index c6285df18..6db2fa5df 100644 --- a/lib/gc/Transforms/Utils/ValueUtils.cpp +++ b/lib/gc/Transforms/Utils/ValueUtils.cpp @@ -110,7 +110,7 @@ FailureOr> getStrides(Value value) { auto memrefType = cast(valueType); SmallVector strides; int64_t offset; - if (failed(getStridesAndOffset(memrefType, strides, offset))) + if (failed(memrefType.getStridesAndOffset(strides, offset))) return failure(); return strides; } diff --git a/src/dnnl/JsonParser.h b/src/dnnl/JsonParser.h index 6d9bc2893..9615219d8 100644 --- a/src/dnnl/JsonParser.h +++ b/src/dnnl/JsonParser.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2024 Intel Corporation + * Copyright (C) 2025 Intel Corporation * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -12,7 +12,6 @@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions * and limitations under the License. 
- * * SPDX-License-Identifier: Apache-2.0 */ @@ -179,8 +178,8 @@ class JsonParser { GC_DTYPE("u8", b.getIntegerType(8, true)), GC_DTYPE("f64", b.getF64Type()), GC_DTYPE("boolean", b.getI1Type()), - GC_DTYPE("f8_e5m2", b.getFloat8E5M2Type()), - GC_DTYPE("f8_e4m3", b.getFloat8E4M3FNType()), + GC_DTYPE("f8_e5m2", mlir::Float8E5M2Type::get(b.getContext())), + GC_DTYPE("f8_e4m3", mlir::Float8E4M3Type::get(b.getContext())), GC_DTYPE("s4", b.getIntegerType(4, false)), GC_DTYPE("u4", b.getIntegerType(4, true)), }; diff --git a/test/benchgc/src/benchgc/mlir/util.py b/test/benchgc/src/benchgc/mlir/util.py index 9ff5b8f45..26c2c1e50 100644 --- a/test/benchgc/src/benchgc/mlir/util.py +++ b/test/benchgc/src/benchgc/mlir/util.py @@ -187,12 +187,12 @@ def attach_dlti(flags: argparse.Namespace, module: ir.Module): dlti_template = f""" module attributes {{ dlti.target_system_spec = #dlti.target_system_spec< - "CPU": #dlti.target_device_spec< - #dlti.dl_entry<"L1_cache_size_in_bytes", {l1_data_cache_size} : ui32>, - #dlti.dl_entry<"L2_cache_size_in_bytes", {l2_cache_size} : ui64>, - #dlti.dl_entry<"L3_cache_size_in_bytes", {l3_cache_size} : ui64>, - #dlti.dl_entry<"num_threads", {num_threads} : i32>, - #dlti.dl_entry<"max_vector_width", {max_vector_width} : i64>> + "CPU" = #dlti.target_device_spec< + "L1_cache_size_in_bytes" = {l1_data_cache_size} : ui32, + "L2_cache_size_in_bytes" = {l2_cache_size} : ui64, + "L3_cache_size_in_bytes" = {l3_cache_size} : ui64, + "num_threads" = {num_threads} : i32, + "max_vector_width" = {max_vector_width} : i64> >}} {{}} """ with module.context: diff --git a/test/mlir/test/gc/Conversion/GPU/XeGPUToXeVM/fence.mlir b/test/mlir/test/gc/Conversion/GPU/XeGPUToXeVM/fence.mlir new file mode 100644 index 000000000..692764539 --- /dev/null +++ b/test/mlir/test/gc/Conversion/GPU/XeGPUToXeVM/fence.mlir @@ -0,0 +1,14 @@ +// RUN: gc-opt -convert-xegpu-to-xevm -split-input-file %s | FileCheck %s + +gpu.module @fence_check { + gpu.func @fence(%dst: memref<8x16xf32, 1>) kernel { + %tid_x = gpu.thread_id x + %tid_x_i32 = arith.index_cast %tid_x : index to i32 + %tid_x_f32 = arith.sitofp %tid_x_i32 : i32 to f32 + // CHECK: xevm.memfence addrspace = global, scope = workgroup + xegpu.fence memory_kind = global, fence_scope = workgroup + %c0 = arith.constant 0 : index + memref.store %tid_x_f32, %dst[%c0, %c0] : memref<8x16xf32, 1> + gpu.return + } +} diff --git a/test/mlir/test/gc/Conversion/GPU/XeGPUToXeVM/loadstore_nd.mlir b/test/mlir/test/gc/Conversion/GPU/XeGPUToXeVM/loadstore_nd.mlir new file mode 100644 index 000000000..a03c77a05 --- /dev/null +++ b/test/mlir/test/gc/Conversion/GPU/XeGPUToXeVM/loadstore_nd.mlir @@ -0,0 +1,64 @@ +// RUN: gc-opt -convert-xegpu-to-xevm -split-input-file %s | FileCheck %s + +gpu.module @fence_check { + gpu.func @fence(%src: memref<8x16xf32, 1>, %dst: memref<8x16xf32, 1>) kernel { + %srcce = memref.memory_space_cast %src : memref<8x16xf32, 1> to memref<8x16xf32> + %dstte = memref.memory_space_cast %dst : memref<8x16xf32, 1> to memref<8x16xf32> + + // CHECK: %[[LD_PTR_AS_I64:.*]] = arith.index_castui {{.*}} : index to i64 + // CHECK: %[[LD_CREATE_DESC_I64:.*]] = vector.bitcast {{.*}} : vector<8xi32> to vector<4xi64> + // CHECK: %[[LD_DESC_0:.*]] = vector.insert %[[LD_PTR_AS_I64]], %[[LD_CREATE_DESC_I64]] [0] : i64 into vector<4xi64> + // CHECK: %[[LD_DESC_1:.*]] = vector.bitcast %[[LD_DESC_0]] : vector<4xi64> to vector<8xi32> + // CHECK: %[[LD_DESC_2:.*]] = vector.insert {{.*}}, %[[LD_DESC_1]] [2] : i32 into vector<8xi32> + // CHECK: %[[LD_DESC_3:.*]] =
vector.insert {{.*}}, %[[LD_DESC_2]] [3] : i32 into vector<8xi32> + // CHECK: %[[LD_DESC_4:.*]] = vector.insert {{.*}}, %[[LD_DESC_3]] [4] : i32 into vector<8xi32> + // CHECK: %[[LD_DESC:.*]] = vector.insert {{.*}}, %[[LD_DESC_4]] [5] : i32 into vector<8xi32> + %src_tdesc = xegpu.create_nd_tdesc %srcce[0, 0] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr, #xegpu.sg_map> + + //CHECK: %[[LD_DESC_I64:.*]] = vector.bitcast %[[LD_DESC]] : vector<8xi32> to vector<4xi64> + //CHECK: %[[LD_INTPTR:.*]] = vector.extract %[[LD_DESC_I64]][0] : i64 from vector<4xi64> + //CHECK: %[[LD_BASE_W:.*]] = vector.extract %[[LD_DESC]][2] : i32 from vector<8xi32> + //CHECK: %[[LD_BASE_H:.*]] = vector.extract %[[LD_DESC]][3] : i32 from vector<8xi32> + //CHECK: %[[LD_TILE_W:.*]] = vector.extract %[[LD_DESC]][4] : i32 from vector<8xi32> + //CHECK: %[[LD_TILE_H:.*]] = vector.extract %[[LD_DESC]][5] : i32 from vector<8xi32> + //CHECK: %[[LD_LLVMPTR:.*]] = llvm.inttoptr %[[LD_INTPTR]] : i64 to !llvm.ptr<1> + //CHECK: %[[LD_SIZEOF_F32:.*]] = arith.constant 4 : i32 + //CHECK: %[[LD_BASE_ROW_IN_BYTES:.*]] = arith.muli %[[LD_BASE_W]], %[[LD_SIZEOF_F32]] : i32 + //CHECK: %[[LD_LOADED_I32:.*]] = xevm.blockload2d %[[LD_LLVMPTR]], %[[LD_BASE_ROW_IN_BYTES]], %[[LD_BASE_H]], %[[LD_BASE_ROW_IN_BYTES]], %[[LD_TILE_W]], %[[LD_TILE_H]] {elem_size_in_bits = 32, tile_width = 16, tile_height = 8, v_blocks = 1, transpose = false, vnni_transform = false, l1_cache_control = C, l3_cache_control = UC} : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> vector<8xi32> + %loaded = xegpu.load_nd %src_tdesc <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr, #xegpu.sg_map> -> vector<8x1xf32> + //CHECK: %[[LD_LOADED_F32:.*]] = vector.bitcast %[[LD_LOADED_I32]] : vector<8xi32> to vector<8xf32> + //CHECK: %[[LD_LOADED_F32_DISTRIBUTED:.*]] = vector.shape_cast %[[LD_LOADED_F32]] : vector<8xf32> to vector<8x1xf32> + + %tid_x = gpu.thread_id x + %tid_x_i32 = arith.index_cast %tid_x : index to i32 + %tid_x_f32 = arith.sitofp %tid_x_i32 : i32 to f32 + //CHECK: %[[LOADED_F32_DISTRIBUTED_MODIFIED:.*]] = vector.insert %{{.*}}, %[[LD_LOADED_F32_DISTRIBUTED]] [0, 0] : f32 into vector<8x1xf32> + %loaded_modified = vector.insert %tid_x_f32, %loaded[0, 0] : f32 into vector<8x1xf32> + + // CHECK: %[[PTR_AS_I64:.*]] = arith.index_castui {{.*}} : index to i64 + // CHECK: %[[CREATE_DESC_I64:.*]] = vector.bitcast {{.*}} : vector<8xi32> to vector<4xi64> + // CHECK: %[[DESC_0:.*]] = vector.insert %[[PTR_AS_I64]], %[[CREATE_DESC_I64]] [0] : i64 into vector<4xi64> + // CHECK: %[[DESC_1:.*]] = vector.bitcast %[[DESC_0]] : vector<4xi64> to vector<8xi32> + // CHECK: %[[DESC_2:.*]] = vector.insert {{.*}}, %[[DESC_1]] [2] : i32 into vector<8xi32> + // CHECK: %[[DESC_3:.*]] = vector.insert {{.*}}, %[[DESC_2]] [3] : i32 into vector<8xi32> + // CHECK: %[[DESC_4:.*]] = vector.insert {{.*}}, %[[DESC_3]] [4] : i32 into vector<8xi32> + // CHECK: %[[DESC:.*]] = vector.insert {{.*}}, %[[DESC_4]] [5] : i32 into vector<8xi32> + %dst_tdesc = xegpu.create_nd_tdesc %dstte[0, 0] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr, #xegpu.sg_map> + + //CHECK: %[[DESC_I64:.*]] = vector.bitcast %[[DESC]] : vector<8xi32> to vector<4xi64> + //CHECK: %[[INTPTR:.*]] = vector.extract %[[DESC_I64]][0] : i64 from vector<4xi64> + //CHECK: %[[BASE_W:.*]] = vector.extract %[[DESC]][2] : i32 from vector<8xi32> + //CHECK: %[[BASE_H:.*]] = vector.extract %[[DESC]][3] : i32 from vector<8xi32> + 
//CHECK: %[[TILE_W:.*]] = vector.extract %[[DESC]][4] : i32 from vector<8xi32> + //CHECK: %[[TILE_H:.*]] = vector.extract %[[DESC]][5] : i32 from vector<8xi32> + //CHECK: %[[LLVMPTR:.*]] = llvm.inttoptr %[[INTPTR]] : i64 to !llvm.ptr<1> + //CHECK: %[[SIZEOF_F32:.*]] = arith.constant 4 : i32 + //CHECK: %[[BASE_ROW_IN_BYTES:.*]] = arith.muli %[[BASE_W]], %[[SIZEOF_F32]] : i32 + //CHECK: %[[FLAT_VALUE:.*]] = vector.shape_cast %[[LOADED_F32_DISTRIBUTED_MODIFIED]] : vector<8x1xf32> to vector<8xf32> + //CHECK: %[[FLAT_VALUE_I32:.*]] = vector.bitcast %[[FLAT_VALUE]] : vector<8xf32> to vector<8xi32> + //CHECK: xevm.blockstore2d %[[LLVMPTR]], %[[BASE_ROW_IN_BYTES]], %[[BASE_H]], %[[BASE_ROW_IN_BYTES]], %[[TILE_W]], %[[TILE_H]], %[[FLAT_VALUE_I32]] {elem_size_in_bits = 32, tile_width = 16, tile_height = 8, v_blocks = 1, l1_cache_control = WB, l3_cache_control = UC} : (!llvm.ptr<1>, i32, i32, i32, i32, i32, vector<8xi32>) + xegpu.store_nd %loaded_modified, %dst_tdesc <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}>: vector<8x1xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr, #xegpu.sg_map> + gpu.return + } +} + diff --git a/test/mlir/test/gc/Conversion/GPU/XeGPUToXeVM/prefetch_nd.mlir b/test/mlir/test/gc/Conversion/GPU/XeGPUToXeVM/prefetch_nd.mlir new file mode 100644 index 000000000..8b88a70e1 --- /dev/null +++ b/test/mlir/test/gc/Conversion/GPU/XeGPUToXeVM/prefetch_nd.mlir @@ -0,0 +1,34 @@ +// RUN: gc-opt -convert-xegpu-to-xevm -split-input-file %s | FileCheck %s + +gpu.module @fence_check { + gpu.func @fence(%src: memref<8x16xf32, 1>, %dst: memref<8x16xf32, 1>) kernel { + %srcce = memref.memory_space_cast %src : memref<8x16xf32, 1> to memref<8x16xf32> + %dstte = memref.memory_space_cast %dst : memref<8x16xf32, 1> to memref<8x16xf32> + + // CHECK: %[[LD_PTR_AS_I64:.*]] = arith.index_castui {{.*}} : index to i64 + // CHECK: %[[LD_CREATE_DESC_I64:.*]] = vector.bitcast {{.*}} : vector<8xi32> to vector<4xi64> + // CHECK: %[[LD_DESC_0:.*]] = vector.insert %[[LD_PTR_AS_I64]], %[[LD_CREATE_DESC_I64]] [0] : i64 into vector<4xi64> + // CHECK: %[[LD_DESC_1:.*]] = vector.bitcast %[[LD_DESC_0]] : vector<4xi64> to vector<8xi32> + // CHECK: %[[LD_DESC_2:.*]] = vector.insert {{.*}}, %[[LD_DESC_1]] [2] : i32 into vector<8xi32> + // CHECK: %[[LD_DESC_3:.*]] = vector.insert {{.*}}, %[[LD_DESC_2]] [3] : i32 into vector<8xi32> + // CHECK: %[[LD_DESC_4:.*]] = vector.insert {{.*}}, %[[LD_DESC_3]] [4] : i32 into vector<8xi32> + // CHECK: %[[LD_DESC:.*]] = vector.insert {{.*}}, %[[LD_DESC_4]] [5] : i32 into vector<8xi32> + %src_tdesc = xegpu.create_nd_tdesc %srcce[0, 0] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr, #xegpu.sg_map> + + //CHECK: %[[LD_DESC_I64:.*]] = vector.bitcast %[[LD_DESC]] : vector<8xi32> to vector<4xi64> + //CHECK: %[[PREF_INTPTR:.*]] = vector.extract %[[LD_DESC_I64]][0] : i64 from vector<4xi64> + //CHECK: %[[PREF_BASE_W:.*]] = vector.extract %[[LD_DESC]][2] : i32 from vector<8xi32> + //CHECK: %[[PREF_BASE_H:.*]] = vector.extract %[[LD_DESC]][3] : i32 from vector<8xi32> + //CHECK: %[[PREF_TILE_W:.*]] = vector.extract %[[LD_DESC]][4] : i32 from vector<8xi32> + //CHECK: %[[PREF_TILE_H:.*]] = vector.extract %[[LD_DESC]][5] : i32 from vector<8xi32> + //CHECK: %[[PREF_LLVMPTR:.*]] = llvm.inttoptr %[[PREF_INTPTR]] : i64 to !llvm.ptr<1> + //CHECK: %[[PREF_SIZEOF_F32:.*]] = arith.constant 4 : i32 + //CHECK: %[[PREF_BASE_ROW_IN_BYTES:.*]] = arith.muli %[[PREF_BASE_W]], %[[PREF_SIZEOF_F32]] : i32 + //CHECK: xevm.blockprefetch2d %[[PREF_LLVMPTR]], 
%[[PREF_BASE_ROW_IN_BYTES]], %[[PREF_BASE_H]], %[[PREF_BASE_ROW_IN_BYTES]], %[[PREF_TILE_W]], %[[PREF_TILE_H]] {elem_size_in_bits = 32, tile_width = 16, tile_height = 8, v_blocks = 1, l1_cache_control = C, l3_cache_control = UC} : (!llvm.ptr<1>, i32, i32, i32, i32, i32) + xegpu.prefetch_nd %src_tdesc<{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr, #xegpu.sg_map> + + gpu.return + } +} + +// /home/jovyan/graph-compiler/build/bin/gc-opt /home/jovyan/graph-compiler/test/mlir/test/gc/Conversion/GPU/XeGPUToXeVM/prefetch_nd.mlir --convert-xegpu-to-xevm \ No newline at end of file diff --git a/test/mlir/test/gc/Conversion/GPU/XeVMToLLVM/blockload2d.mlir b/test/mlir/test/gc/Conversion/GPU/XeVMToLLVM/blockload2d.mlir index 65278adaa..7e3e59660 100644 --- a/test/mlir/test/gc/Conversion/GPU/XeVMToLLVM/blockload2d.mlir +++ b/test/mlir/test/gc/Conversion/GPU/XeVMToLLVM/blockload2d.mlir @@ -374,7 +374,7 @@ llvm.func @xevm.blockload2d(%ptr : !llvm.ptr<1>, %base_width : i32, %base_height // CHECK: llvm.func @xevm.blockload2d( // CHECK: llvm.call spir_funccc @_Z40intel_sub_group_2d_block_read_8b_8r32x2cPU3AS1viiiDv2_iPt( // CHECK: xevm.DecorationCacheControl = {{\[\[}}6442 : i32, 0 : i32, 1 : i32, 0 : i32{{\]}}, {{\[}}6442 : i32, 1 : i32, 1 : i32, 0 : i32{{\]\]}} - %0 = xevm.blockload2d %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=8, tile_width=32, tile_height=8, v_blocks=2, transpose=false, vnni_transform=false, l1_cache_control=L1UC, l3_cache_control=L3UC} : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> vector<16xi16> + %0 = xevm.blockload2d %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=8, tile_width=32, tile_height=8, v_blocks=2, transpose=false, vnni_transform=false, l1_cache_control=UC, l3_cache_control=UC} : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> vector<16xi16> llvm.return } } @@ -386,7 +386,7 @@ llvm.func @xevm.blockload2d(%ptr : !llvm.ptr<1>, %base_width : i32, %base_height // CHECK: llvm.func @xevm.blockload2d( // CHECK: llvm.call spir_funccc @_Z40intel_sub_group_2d_block_read_8b_8r32x2cPU3AS1viiiDv2_iPt( // CHECK: xevm.DecorationCacheControl = {{\[\[}}6442 : i32, 0 : i32, 1 : i32, 0 : i32{{\]}}, {{\[}}6442 : i32, 1 : i32, 2 : i32, 0 : i32{{\]\]}} - %0 = xevm.blockload2d %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=8, tile_width=32, tile_height=8, v_blocks=2, transpose=false, vnni_transform=false, l1_cache_control=L1UC, l3_cache_control=L3C} : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> vector<16xi16> + %0 = xevm.blockload2d %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=8, tile_width=32, tile_height=8, v_blocks=2, transpose=false, vnni_transform=false, l1_cache_control=UC, l3_cache_control=C} : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> vector<16xi16> llvm.return } } @@ -398,7 +398,7 @@ llvm.func @xevm.blockload2d(%ptr : !llvm.ptr<1>, %base_width : i32, %base_height // CHECK: llvm.func @xevm.blockload2d( // CHECK: llvm.call spir_funccc @_Z40intel_sub_group_2d_block_read_8b_8r32x2cPU3AS1viiiDv2_iPt( // CHECK: xevm.DecorationCacheControl = {{\[\[}}6442 : i32, 0 : i32, 2 : i32, 0 : i32{{\]}}, {{\[}}6442 : i32, 1 : i32, 1 : i32, 0 : i32{{\]\]}} - %0 = xevm.blockload2d %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=8, tile_width=32, tile_height=8, v_blocks=2, transpose=false, vnni_transform=false, l1_cache_control=L1C, l3_cache_control=L3UC} : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> vector<16xi16> + %0 = 
xevm.blockload2d %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=8, tile_width=32, tile_height=8, v_blocks=2, transpose=false, vnni_transform=false, l1_cache_control=C, l3_cache_control=UC} : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> vector<16xi16> llvm.return } } @@ -410,7 +410,7 @@ llvm.func @xevm.blockload2d(%ptr : !llvm.ptr<1>, %base_width : i32, %base_height // CHECK: llvm.func @xevm.blockload2d( // CHECK: llvm.call spir_funccc @_Z40intel_sub_group_2d_block_read_8b_8r32x2cPU3AS1viiiDv2_iPt( // CHECK: xevm.DecorationCacheControl = {{\[\[}}6442 : i32, 0 : i32, 2 : i32, 0 : i32{{\]}}, {{\[}}6442 : i32, 1 : i32, 2 : i32, 0 : i32{{\]\]}} - %0 = xevm.blockload2d %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=8, tile_width=32, tile_height=8, v_blocks=2, transpose=false, vnni_transform=false, l1_cache_control=L1C, l3_cache_control=L3C} : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> vector<16xi16> + %0 = xevm.blockload2d %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=8, tile_width=32, tile_height=8, v_blocks=2, transpose=false, vnni_transform=false, l1_cache_control=C, l3_cache_control=C} : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> vector<16xi16> llvm.return } } @@ -422,7 +422,7 @@ llvm.func @xevm.blockload2d(%ptr : !llvm.ptr<1>, %base_width : i32, %base_height // CHECK: llvm.func @xevm.blockload2d( // CHECK: llvm.call spir_funccc @_Z40intel_sub_group_2d_block_read_8b_8r32x2cPU3AS1viiiDv2_iPt( // CHECK: xevm.DecorationCacheControl = {{\[\[}}6442 : i32, 0 : i32, 3 : i32, 0 : i32{{\]}}, {{\[}}6442 : i32, 1 : i32, 1 : i32, 0 : i32{{\]\]}} - %0 = xevm.blockload2d %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=8, tile_width=32, tile_height=8, v_blocks=2, transpose=false, vnni_transform=false, l1_cache_control=L1S, l3_cache_control=L3UC} : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> vector<16xi16> + %0 = xevm.blockload2d %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=8, tile_width=32, tile_height=8, v_blocks=2, transpose=false, vnni_transform=false, l1_cache_control=S, l3_cache_control=UC} : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> vector<16xi16> llvm.return } } @@ -434,7 +434,7 @@ llvm.func @xevm.blockload2d(%ptr : !llvm.ptr<1>, %base_width : i32, %base_height // CHECK: llvm.func @xevm.blockload2d( // CHECK: llvm.call spir_funccc @_Z40intel_sub_group_2d_block_read_8b_8r32x2cPU3AS1viiiDv2_iPt( // CHECK: xevm.DecorationCacheControl = {{\[\[}}6442 : i32, 0 : i32, 3 : i32, 0 : i32{{\]}}, {{\[}}6442 : i32, 1 : i32, 2 : i32, 0 : i32{{\]\]}} - %0 = xevm.blockload2d %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=8, tile_width=32, tile_height=8, v_blocks=2, transpose=false, vnni_transform=false, l1_cache_control=L1S, l3_cache_control=L3C} : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> vector<16xi16> + %0 = xevm.blockload2d %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=8, tile_width=32, tile_height=8, v_blocks=2, transpose=false, vnni_transform=false, l1_cache_control=S, l3_cache_control=C} : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> vector<16xi16> llvm.return } } @@ -446,7 +446,7 @@ llvm.func @xevm.blockload2d(%ptr : !llvm.ptr<1>, %base_width : i32, %base_height // CHECK: llvm.func @xevm.blockload2d( // CHECK: llvm.call spir_funccc @_Z40intel_sub_group_2d_block_read_8b_8r32x2cPU3AS1viiiDv2_iPt( // CHECK: xevm.DecorationCacheControl = {{\[\[}}6442 : i32, 0 : i32, 4 : i32, 0 : i32{{\]}}, {{\[}}6442 : i32, 1 : i32, 2 : i32, 0 : 
i32{{\]\]}} - %0 = xevm.blockload2d %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=8, tile_width=32, tile_height=8, v_blocks=2, transpose=false, vnni_transform=false, l1_cache_control=L1IAR, l3_cache_control=L3C} : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> vector<16xi16> + %0 = xevm.blockload2d %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=8, tile_width=32, tile_height=8, v_blocks=2, transpose=false, vnni_transform=false, l1_cache_control=IAR, l3_cache_control=C} : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> vector<16xi16> llvm.return } } diff --git a/test/mlir/test/gc/Conversion/GPU/XeVMToLLVM/blockprefetch2d.mlir b/test/mlir/test/gc/Conversion/GPU/XeVMToLLVM/blockprefetch2d.mlir index 63c158bbf..f45a51a01 100644 --- a/test/mlir/test/gc/Conversion/GPU/XeVMToLLVM/blockprefetch2d.mlir +++ b/test/mlir/test/gc/Conversion/GPU/XeVMToLLVM/blockprefetch2d.mlir @@ -11,7 +11,7 @@ llvm.func @xevm.blockprefetch2d(%ptr : !llvm.ptr<1>, %base_width : i32, %base_he // CHECK-NEXT: llvm.call spir_funccc @_Z44intel_sub_group_2d_block_prefetch_8b_8r32x1cPU3AS1viiiDv2_i(%arg0, %arg1, %arg2, %arg3, [[COORD1]]) // CHECK: xevm.DecorationCacheControl = {{\[\[}}6442 : i32, 0 : i32, 1 : i32, 0 : i32{{\]}}, {{\[}}6442 : i32, 1 : i32, 1 : i32, 0 : i32{{\]\]}} // CHECK: (!llvm.ptr<1>, i32, i32, i32, vector<2xi32>) -> () - xevm.blockprefetch2d %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=8, tile_width=32, tile_height=8, v_blocks=1, l1_cache_control=L1UC, l3_cache_control=L3UC} : (!llvm.ptr<1>, i32, i32, i32, i32, i32) + xevm.blockprefetch2d %ptr, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=8, tile_width=32, tile_height=8, v_blocks=1, l1_cache_control=UC, l3_cache_control=UC} : (!llvm.ptr<1>, i32, i32, i32, i32, i32) llvm.return } } diff --git a/test/mlir/test/gc/Conversion/GPU/XeVMToLLVM/blockstore2d.mlir b/test/mlir/test/gc/Conversion/GPU/XeVMToLLVM/blockstore2d.mlir index ccb06f029..5bbe5e0dc 100644 --- a/test/mlir/test/gc/Conversion/GPU/XeVMToLLVM/blockstore2d.mlir +++ b/test/mlir/test/gc/Conversion/GPU/XeVMToLLVM/blockstore2d.mlir @@ -15,7 +15,7 @@ llvm.func @xevm.blockstore2d(%ptr : !llvm.ptr<1>, %base_width : i32, %base_heigh // CHECK-NEXT: llvm.call spir_funccc @_Z41intel_sub_group_2d_block_write_8b_8r16x1cPU3AS1viiiDv2_iPh(%arg0, %arg1, %arg2, %arg3, [[COORD1]], [[STOREVALPTR]]) // CHECK: xevm.DecorationCacheControl = {{\[\[}}6443 : i32, 0 : i32, 1 : i32, 0 : i32{{\]}}, {{\[}}6443 : i32, 1 : i32, 1 : i32, 0 : i32{{\]\]}} // CHECK: : (!llvm.ptr<1>, i32, i32, i32, vector<2xi32>, !llvm.ptr) -> () - xevm.blockstore2d %ptr, %base_width, %base_height, %base_pitch, %x, %y, %stored_val {elem_size_in_bits=8, tile_width=16, tile_height=8, v_blocks=1, l1_cache_control=L1UC, l3_cache_control=L3UC} : (!llvm.ptr<1>, i32, i32, i32, i32, i32, vector<8xi8>) + xevm.blockstore2d %ptr, %base_width, %base_height, %base_pitch, %x, %y, %stored_val {elem_size_in_bits=8, tile_width=16, tile_height=8, v_blocks=1, l1_cache_control=UC, l3_cache_control=UC} : (!llvm.ptr<1>, i32, i32, i32, i32, i32, vector<8xi8>) llvm.return } } diff --git a/test/mlir/test/gc/Dialect/CPURuntime/cpu-runner/allocators.mlir b/test/mlir/test/gc/Dialect/CPURuntime/cpu-runner/allocators.mlir index 399467290..35666487a 100644 --- a/test/mlir/test/gc/Dialect/CPURuntime/cpu-runner/allocators.mlir +++ b/test/mlir/test/gc/Dialect/CPURuntime/cpu-runner/allocators.mlir @@ -1,3 +1,4 @@ +// UNSUPPORTED: target={{.*}} // RUN: gc-opt %s --finalize-memref-to-llvm 
--convert-scf-to-cf --convert-cpuruntime-to-llvm --convert-func-to-llvm --reconcile-unrealized-casts | gc-cpu-runner -e main -entry-point-result=void -shared-libs=%mlir_runner_utils,%mlir_c_runner_utils | FileCheck %s module { diff --git a/test/mlir/test/gc/Dialect/CPURuntime/memref-to-cpuruntime.mlir b/test/mlir/test/gc/Dialect/CPURuntime/memref-to-cpuruntime.mlir index c32cb618e..b25562fc0 100644 --- a/test/mlir/test/gc/Dialect/CPURuntime/memref-to-cpuruntime.mlir +++ b/test/mlir/test/gc/Dialect/CPURuntime/memref-to-cpuruntime.mlir @@ -1,3 +1,4 @@ +// UNSUPPORTED: target={{.*}} // RUN: gc-opt --split-input-file --convert-memref-to-cpuruntime %s -verify-diagnostics | FileCheck %s func.func @alloca() { diff --git a/test/mlir/test/gc/Transforms/GPU/module-to-binary-xevm.mlir b/test/mlir/test/gc/Transforms/GPU/module-to-binary-xevm.mlir index 3b3f4a26e..444edcda4 100644 --- a/test/mlir/test/gc/Transforms/GPU/module-to-binary-xevm.mlir +++ b/test/mlir/test/gc/Transforms/GPU/module-to-binary-xevm.mlir @@ -1,4 +1,4 @@ -// RUN: gc-opt %s --gpu-to-llvm --convert-gpu-to-llvm-spv --gpu-module-to-binary | FileCheck %s +// RUN: gc-opt %s --gpu-to-llvm --convert-gpu-to-llvm-spv='use-64bit-index=true' --gpu-module-to-binary | FileCheck %s module attributes {gpu.container_module} { // CHECK-LABEL:gpu.binary @entry_kernel diff --git a/test/mlir/test/gc/Transforms/deepTileContractionNamedOp.mlir b/test/mlir/test/gc/Transforms/deepTileContractionNamedOp.mlir index 61848dcb7..ccb9ca418 100644 --- a/test/mlir/test/gc/Transforms/deepTileContractionNamedOp.mlir +++ b/test/mlir/test/gc/Transforms/deepTileContractionNamedOp.mlir @@ -150,12 +150,12 @@ func.func @matmul_2Dx4D_bf16(%arg0: tensor<4096x4096xbf16>, %arg1: tensor<128x12 module attributes { dlti.target_system_spec = #dlti.target_system_spec< - "CPU": #dlti.target_device_spec< - #dlti.dl_entry<"L1_cache_size_in_bytes", 49152 : i32>, - #dlti.dl_entry<"L2_cache_size_in_bytes", 2097152 : i32>, - #dlti.dl_entry<"L3_cache_size_in_bytes", 110100480 : i32>, - #dlti.dl_entry<"num_threads", 56 : i32>, - #dlti.dl_entry<"max_vector_width", 512 : i32>> + "CPU" = #dlti.target_device_spec< + "L1_cache_size_in_bytes" = 49152 : i32, + "L2_cache_size_in_bytes" = 2097152 : i32, + "L3_cache_size_in_bytes" = 110100480 : i32, + "num_threads" = 56 : i32, + "max_vector_width" = 512 : i32> >} { // CHECK: #[[mapA:.+]] = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d3 * 2 + d4)> // CHECK: #[[mapB:.+]] = affine_map<(d0, d1, d2, d3, d4) -> (d0, d3, d2, d4)> diff --git a/test/mlir/test/gc/cpu-runner/GPU/XeGPUToXeVM/loadstore_nd.mlir b/test/mlir/test/gc/cpu-runner/GPU/XeGPUToXeVM/loadstore_nd.mlir new file mode 100644 index 000000000..625dbc360 --- /dev/null +++ b/test/mlir/test/gc/cpu-runner/GPU/XeGPUToXeVM/loadstore_nd.mlir @@ -0,0 +1,69 @@ +// RUN: gc-opt %s --convert-xegpu-to-xevm --convert-xevm-to-llvm --xevm-attach-target --convert-scf-to-cf --convert-cf-to-llvm --convert-vector-to-llvm --convert-arith-to-llvm --convert-gpu-to-llvm-spv='use-64bit-index=true' --gpu-to-llvm --reconcile-unrealized-casts --cse --gpu-module-to-binary | gc-cpu-runner -e main -entry-point-result=void --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime | FileCheck %s + +module @gemm attributes {gpu.container_module} { + gpu.module @kernel { + gpu.func @load_store_2d(%src: memref<8x16xf32, 1>, %dst: memref<8x16xf32, 1>) kernel { + %srcce = memref.memory_space_cast %src : memref<8x16xf32, 1> to memref<8x16xf32> + %dstte = memref.memory_space_cast %dst : memref<8x16xf32, 1> to 
memref<8x16xf32> + + %src_tdesc = xegpu.create_nd_tdesc %srcce[0, 0] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr, #xegpu.sg_map> + %dst_tdesc = xegpu.create_nd_tdesc %dstte[0, 0] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr, #xegpu.sg_map> + %loaded = xegpu.load_nd %src_tdesc <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr, #xegpu.sg_map> -> vector<8x1xf32> + + %tid_x = gpu.thread_id x + %tid_x_i32 = arith.index_cast %tid_x : index to i32 + %tid_x_f32 = arith.sitofp %tid_x_i32 : i32 to f32 + + %loaded_modified = vector.insert %tid_x_f32, %loaded[0, 0] : f32 into vector<8x1xf32> + + xegpu.store_nd %loaded_modified, %dst_tdesc <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}>: vector<8x1xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr, #xegpu.sg_map> + gpu.return + } + } + + func.func @test(%src : memref<8x16xf32>) -> memref<8x16xf32> attributes {llvm.emit_c_interface} { + %c1 = arith.constant 1 : index + %c16 = arith.constant 16 : index + %memref_src = gpu.alloc host_shared () : memref<8x16xf32> + memref.copy %src, %memref_src : memref<8x16xf32> to memref<8x16xf32> + %memref_dst = gpu.alloc host_shared () : memref<8x16xf32> + %srcc = memref.memory_space_cast %memref_src : memref<8x16xf32> to memref<8x16xf32, 1> + %dstt = memref.memory_space_cast %memref_dst : memref<8x16xf32> to memref<8x16xf32, 1> + + gpu.launch_func @kernel::@load_store_2d blocks in (%c1, %c1, %c1) threads in (%c16, %c1, %c1) args(%srcc : memref<8x16xf32, 1>, %dstt : memref<8x16xf32, 1>) + return %memref_dst : memref<8x16xf32> + } + + func.func @main() attributes {llvm.emit_c_interface} { + %A = memref.alloc() : memref<8x16xf32> + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c8 = arith.constant 8 : index + %c16 = arith.constant 16 : index + %c11_f32 = arith.constant 11.11 : f32 + scf.for %i = %c0 to %c8 step %c1 { + scf.for %j = %c0 to %c16 step %c1 { + memref.store %c11_f32, %A[%i, %j] : memref<8x16xf32> + } + } + %B = call @test(%A) : (memref<8x16xf32>) -> memref<8x16xf32> + %B_cast = memref.cast %B : memref<8x16xf32> to memref<*xf32> + %A_cast = memref.cast %A : memref<8x16xf32> to memref<*xf32> + call @printMemrefF32(%A_cast) : (memref<*xf32>) -> () + call @printMemrefF32(%B_cast) : (memref<*xf32>) -> () + + // CHECK: Unranked Memref base@ = 0x{{[0-9a-f]+}} + // CHECK-NEXT: [11.11{{.*}}] + // CHECK-COUNT-96: 11.11 + // CHECK-NEXT: [11.11{{.*}}] + + // CHECK-NEXT: Unranked Memref base@ = 0x{{[0-9a-f]+}} + // CHECK: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + // CHECK-COUNT-96: 11.11 + // CHECK-NEXT: [11.11{{.*}}] + + memref.dealloc %A : memref<8x16xf32> + return + } + func.func private @printMemrefF32(%ptr : memref<*xf32>) +} diff --git a/test/mlir/test/gc/cpu-runner/GPU/XeGPUToXeVM/loadstore_nd_dpas.mlir b/test/mlir/test/gc/cpu-runner/GPU/XeGPUToXeVM/loadstore_nd_dpas.mlir new file mode 100644 index 000000000..a6dd0119a --- /dev/null +++ b/test/mlir/test/gc/cpu-runner/GPU/XeGPUToXeVM/loadstore_nd_dpas.mlir @@ -0,0 +1,101 @@ +// RUN: gc-opt %s --convert-xegpu-to-xevm --convert-xevm-to-llvm --xevm-attach-target --convert-scf-to-cf --convert-cf-to-llvm --convert-vector-to-llvm --convert-arith-to-llvm --convert-gpu-to-llvm-spv='use-64bit-index=true' --gpu-to-llvm --reconcile-unrealized-casts --cse --gpu-module-to-binary | gc-cpu-runner -e main -entry-point-result=void --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime | 
FileCheck %s + +module @gemm attributes {gpu.container_module} { + gpu.module @kernel { + gpu.func @load_store_2d_dpas(%a: memref<8x16xf16, 1>, %b: memref<16x16xf16, 1>, %c: memref<8x16xf32, 1>) kernel { + %a_tdesc_memref = memref.memory_space_cast %a : memref<8x16xf16, 1> to memref<8x16xf16> + %b_tdesc_memref = memref.memory_space_cast %b : memref<16x16xf16, 1> to memref<16x16xf16> + %c_tdesc_memref = memref.memory_space_cast %c : memref<8x16xf32, 1> to memref<8x16xf32> + + %a_tdesc = xegpu.create_nd_tdesc %a_tdesc_memref[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.block_tdesc_attr, #xegpu.sg_map> + %b_tdesc = xegpu.create_nd_tdesc %b_tdesc_memref[0, 0] : memref<16x16xf16> -> !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr, #xegpu.sg_map> + %c_tdesc = xegpu.create_nd_tdesc %c_tdesc_memref[0, 0] : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr, #xegpu.sg_map> + + %a_loaded = xegpu.load_nd %a_tdesc <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<8x16xf16, #xegpu.block_tdesc_attr, #xegpu.sg_map> -> vector<8x1xf16> + %b_loaded = xegpu.load_nd %b_tdesc <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<16x16xf16, #xegpu.block_tdesc_attr, #xegpu.sg_map> -> vector<16x1xf16> + %c_loaded = xegpu.load_nd %c_tdesc <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr, #xegpu.sg_map> -> vector<8x1xf32> + + %b_loaded_d = vector.shape_cast %b_loaded : vector<16x1xf16> to vector<1x16xf16> + %d = xegpu.dpas %a_loaded, %b_loaded_d, %c_loaded : vector<8x1xf16>, vector<1x16xf16>, vector<8x1xf32> -> vector<8x1xf32> + + xegpu.store_nd %d, %c_tdesc <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}>: vector<8x1xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr, #xegpu.sg_map> + gpu.return + } + } + + func.func @test(%a : memref<8x16xf16>, %b : memref<16x16xf16>, %c : memref<8x16xf32>) -> memref<8x16xf32> attributes {llvm.emit_c_interface} { + %c1 = arith.constant 1 : index + %c16 = arith.constant 16 : index + %memref_a = gpu.alloc host_shared () : memref<8x16xf16> + memref.copy %a, %memref_a : memref<8x16xf16> to memref<8x16xf16> + %memref_b = gpu.alloc host_shared () : memref<16x16xf16> + memref.copy %b, %memref_b : memref<16x16xf16> to memref<16x16xf16> + %memref_c = gpu.alloc host_shared () : memref<8x16xf32> + memref.copy %c, %memref_c : memref<8x16xf32> to memref<8x16xf32> + + + %a_gpu = memref.memory_space_cast %memref_a : memref<8x16xf16> to memref<8x16xf16, 1> + %b_gpu = memref.memory_space_cast %memref_b : memref<16x16xf16> to memref<16x16xf16, 1> + %c_gpu = memref.memory_space_cast %memref_c : memref<8x16xf32> to memref<8x16xf32, 1> + + gpu.launch_func @kernel::@load_store_2d_dpas blocks in (%c1, %c1, %c1) threads in (%c16, %c1, %c1) args(%a_gpu : memref<8x16xf16, 1>, %b_gpu : memref<16x16xf16, 1>, %c_gpu : memref<8x16xf32, 1>) + return %memref_c : memref<8x16xf32> + } + + func.func @main() attributes {llvm.emit_c_interface} { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c8 = arith.constant 8 : index + %c16 = arith.constant 16 : index + %c11_f32 = arith.constant 11.11 : f16 + + %A = memref.alloc() : memref<8x16xf16> + scf.for %i = %c0 to %c8 step %c1 { + scf.for %j = %c0 to %c16 step %c1 { + %row_idx = arith.index_cast %i : index to i32 + %row = arith.sitofp %row_idx : i32 to f16 + memref.store %row, %A[%i, %j] : memref<8x16xf16> + } + } + %B = memref.alloc() : memref<16x16xf16> + 
scf.for %i = %c0 to %c16 step %c1 { + scf.for %j = %c0 to %c16 step %c1 { + %col_idx = arith.index_cast %j : index to i32 + %col = arith.sitofp %col_idx : i32 to f16 + memref.store %col, %B[%i, %j] : memref<16x16xf16> + } + } + + %C = memref.alloc() : memref<8x16xf32> + %c0_f16 = arith.constant 0.0 : f32 + scf.for %i = %c0 to %c8 step %c1 { + scf.for %j = %c0 to %c16 step %c1 { + memref.store %c0_f16, %C[%i, %j] : memref<8x16xf32> + } + } + + %C_res = call @test(%A, %B, %C) : (memref<8x16xf16>, memref<16x16xf16>, memref<8x16xf32>) -> memref<8x16xf32> + %C_res_cast = memref.cast %C_res : memref<8x16xf32> to memref<*xf32> + // %A_cast = memref.cast %A : memref<8x16xf16> to memref<*xf16> + // call @printMemrefF16(%A_cast) : (memref<*xf16>) -> () + // %B_cast = memref.cast %B : memref<16x16xf16> to memref<*xf16> + // call @printMemrefF16(%B_cast) : (memref<*xf16>) -> () + + call @printMemrefF32(%C_res_cast) : (memref<*xf32>) -> () + // CHECK: Unranked Memref base@ = 0x{{[0-9a-f]+}} + // CHECK-NEXT: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] + // CHECK-NEXT: [0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240] + // CHECK-NEXT: [0, 32, 64, 96, 128, 160, 192, 224, 256, 288, 320, 352, 384, 416, 448, 480] + // CHECK-NEXT: [0, 48, 96, 144, 192, 240, 288, 336, 384, 432, 480, 528, 576, 624, 672, 720] + // CHECK-NEXT: [0, 64, 128, 192, 256, 320, 384, 448, 512, 576, 640, 704, 768, 832, 896, 960] + // CHECK-NEXT: [0, 80, 160, 240, 320, 400, 480, 560, 640, 720, 800, 880, 960, 1040, 1120, 1200] + // CHECK-NEXT: [0, 96, 192, 288, 384, 480, 576, 672, 768, 864, 960, 1056, 1152, 1248, 1344, 1440] + // CHECK-NEXT: [0, 112, 224, 336, 448, 560, 672, 784, 896, 1008, 1120, 1232, 1344, 1456, 1568, 1680] + + + memref.dealloc %A : memref<8x16xf16> + return + } + func.func private @printMemrefF32(%ptr : memref<*xf32>) + func.func private @printMemrefF16(%ptr : memref<*xf16>) attributes {llvm.emit_c_interface} +} diff --git a/test/mlir/test/gc/cpu-runner/GPU/XeGPUToXeVM/loadstore_nd_prefetch.mlir b/test/mlir/test/gc/cpu-runner/GPU/XeGPUToXeVM/loadstore_nd_prefetch.mlir new file mode 100644 index 000000000..066eaa44b --- /dev/null +++ b/test/mlir/test/gc/cpu-runner/GPU/XeGPUToXeVM/loadstore_nd_prefetch.mlir @@ -0,0 +1,72 @@ +// RUN: gc-opt %s --convert-xegpu-to-xevm --convert-xevm-to-llvm --xevm-attach-target --convert-scf-to-cf --convert-cf-to-llvm --convert-vector-to-llvm --convert-arith-to-llvm --convert-gpu-to-llvm-spv='use-64bit-index=true' --gpu-to-llvm --reconcile-unrealized-casts --cse --gpu-module-to-binary | gc-cpu-runner -e main -entry-point-result=void --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime | FileCheck %s + +module @gemm attributes {gpu.container_module} { + gpu.module @kernel { + gpu.func @load_store_2d(%src: memref<8x16xf16, 1>, %dst: memref<8x16xf16, 1>) kernel { + %srcce = memref.memory_space_cast %src : memref<8x16xf16, 1> to memref<8x16xf16> + %dstte = memref.memory_space_cast %dst : memref<8x16xf16, 1> to memref<8x16xf16> + + %src_tdesc = xegpu.create_nd_tdesc %srcce[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.block_tdesc_attr, #xegpu.sg_map> + %dst_tdesc = xegpu.create_nd_tdesc %dstte[0, 0] : memref<8x16xf16> -> !xegpu.tensor_desc<8x16xf16, #xegpu.block_tdesc_attr, #xegpu.sg_map> + + xegpu.prefetch_nd %src_tdesc<{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<8x16xf16, #xegpu.block_tdesc_attr, #xegpu.sg_map> + + %loaded = xegpu.load_nd %src_tdesc <{l1_hint = #xegpu.cache_hint, l2_hint = 
#xegpu.cache_hint}> : !xegpu.tensor_desc<8x16xf16, #xegpu.block_tdesc_attr, #xegpu.sg_map> -> vector<8x1xf16> + + %tid_x = gpu.thread_id x + %tid_x_i32 = arith.index_cast %tid_x : index to i32 + %tid_x_f32 = arith.sitofp %tid_x_i32 : i32 to f16 + + %loaded_modified = vector.insert %tid_x_f32, %loaded[0, 0] : f16 into vector<8x1xf16> + + xegpu.store_nd %loaded_modified, %dst_tdesc <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}>: vector<8x1xf16>, !xegpu.tensor_desc<8x16xf16, #xegpu.block_tdesc_attr, #xegpu.sg_map> + gpu.return + } + } + + func.func @test(%src : memref<8x16xf16>) -> memref<8x16xf16> attributes {llvm.emit_c_interface} { + %c1 = arith.constant 1 : index + %c16 = arith.constant 16 : index + %memref_src = gpu.alloc host_shared () : memref<8x16xf16> + memref.copy %src, %memref_src : memref<8x16xf16> to memref<8x16xf16> + %memref_dst = gpu.alloc host_shared () : memref<8x16xf16> + %srcc = memref.memory_space_cast %memref_src : memref<8x16xf16> to memref<8x16xf16, 1> + %dstt = memref.memory_space_cast %memref_dst : memref<8x16xf16> to memref<8x16xf16, 1> + + gpu.launch_func @kernel::@load_store_2d blocks in (%c1, %c1, %c1) threads in (%c16, %c1, %c1) args(%srcc : memref<8x16xf16, 1>, %dstt : memref<8x16xf16, 1>) + return %memref_dst : memref<8x16xf16> + } + + func.func @main() attributes {llvm.emit_c_interface} { + %A = memref.alloc() : memref<8x16xf16> + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c8 = arith.constant 8 : index + %c16 = arith.constant 16 : index + %c11_f32 = arith.constant 11.0 : f16 + scf.for %i = %c0 to %c8 step %c1 { + scf.for %j = %c0 to %c16 step %c1 { + memref.store %c11_f32, %A[%i, %j] : memref<8x16xf16> + } + } + %B = call @test(%A) : (memref<8x16xf16>) -> memref<8x16xf16> + %B_cast = memref.cast %B : memref<8x16xf16> to memref<*xf16> + %A_cast = memref.cast %A : memref<8x16xf16> to memref<*xf16> + call @printMemrefF16(%A_cast) : (memref<*xf16>) -> () + call @printMemrefF16(%B_cast) : (memref<*xf16>) -> () + + // CHECK: Unranked Memref base@ = 0x{{[0-9a-f]+}} + // CHECK-NEXT: [11{{.*}}] + // CHECK-COUNT-96: 11 + // CHECK-NEXT: [11{{.*}}] + + // CHECK-NEXT: Unranked Memref base@ = 0x{{[0-9a-f]+}} + // CHECK: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + // CHECK-COUNT-96: 11 + // CHECK-NEXT: [11{{.*}}] + + memref.dealloc %A : memref<8x16xf16> + return + } + func.func private @printMemrefF16(%ptr : memref<*xf16>) attributes {llvm.emit_c_interface} +} diff --git a/test/mlir/test/gc/cpu-runner/GPU/XeGPUToXeVM/loadstore_nd_update_offset.mlir b/test/mlir/test/gc/cpu-runner/GPU/XeGPUToXeVM/loadstore_nd_update_offset.mlir new file mode 100644 index 000000000..b85ab0778 --- /dev/null +++ b/test/mlir/test/gc/cpu-runner/GPU/XeGPUToXeVM/loadstore_nd_update_offset.mlir @@ -0,0 +1,77 @@ +// RUN: gc-opt %s --convert-xegpu-to-xevm --convert-xevm-to-llvm --xevm-attach-target --convert-scf-to-cf --convert-cf-to-llvm --convert-vector-to-llvm --convert-arith-to-llvm --convert-gpu-to-llvm-spv='use-64bit-index=true' --gpu-to-llvm --reconcile-unrealized-casts --cse --gpu-module-to-binary | gc-cpu-runner -e main -entry-point-result=void --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime | FileCheck %s + +module @gemm attributes {gpu.container_module} { + gpu.module @kernel { + gpu.func @load_store_2d(%src: memref<16x16xf32, 1>, %dst: memref<16x16xf32, 1>) kernel { + %srcce = memref.memory_space_cast %src : memref<16x16xf32, 1> to memref<16x16xf32> + %dstte = memref.memory_space_cast %dst : memref<16x16xf32, 1> to 
memref<16x16xf32> + + %src_tdesc = xegpu.create_nd_tdesc %srcce[0, 0] : memref<16x16xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr, #xegpu.sg_map> + %dst_tdesc = xegpu.create_nd_tdesc %dstte[0, 0] : memref<16x16xf32> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr, #xegpu.sg_map> + %loaded = xegpu.load_nd %src_tdesc <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr, #xegpu.sg_map> -> vector<8x1xf32> + + %tid_x = gpu.thread_id x + %tid_x_i32 = arith.index_cast %tid_x : index to i32 + %tid_x_f32 = arith.sitofp %tid_x_i32 : i32 to f32 + + %loaded_modified = vector.insert %tid_x_f32, %loaded[0, 0] : f32 into vector<8x1xf32> + + xegpu.store_nd %loaded_modified, %dst_tdesc <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}>: vector<8x1xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr, #xegpu.sg_map> + + %c8 = arith.constant 8 : index + %c0 = arith.constant 0 : index + %dst_tdesc_new = xegpu.update_nd_offset %dst_tdesc, [%c8, %c0] : !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr, #xegpu.sg_map> + xegpu.store_nd %loaded_modified, %dst_tdesc_new <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}>: vector<8x1xf32>, !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr, #xegpu.sg_map> + gpu.return + } + } + + func.func @test(%src : memref<16x16xf32>) -> memref<16x16xf32> attributes {llvm.emit_c_interface} { + %c1 = arith.constant 1 : index + %c16 = arith.constant 16 : index + %memref_src = gpu.alloc host_shared () : memref<16x16xf32> + memref.copy %src, %memref_src : memref<16x16xf32> to memref<16x16xf32> + %memref_dst = gpu.alloc host_shared () : memref<16x16xf32> + %srcc = memref.memory_space_cast %memref_src : memref<16x16xf32> to memref<16x16xf32, 1> + %dstt = memref.memory_space_cast %memref_dst : memref<16x16xf32> to memref<16x16xf32, 1> + + gpu.launch_func @kernel::@load_store_2d blocks in (%c1, %c1, %c1) threads in (%c16, %c1, %c1) args(%srcc : memref<16x16xf32, 1>, %dstt : memref<16x16xf32, 1>) + return %memref_dst : memref<16x16xf32> + } + + func.func @main() attributes {llvm.emit_c_interface} { + %A = memref.alloc() : memref<16x16xf32> + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c8 = arith.constant 16 : index + %c16 = arith.constant 16 : index + %c11_f32 = arith.constant 11.11 : f32 + scf.for %i = %c0 to %c8 step %c1 { + scf.for %j = %c0 to %c16 step %c1 { + memref.store %c11_f32, %A[%i, %j] : memref<16x16xf32> + } + } + %B = call @test(%A) : (memref<16x16xf32>) -> memref<16x16xf32> + %B_cast = memref.cast %B : memref<16x16xf32> to memref<*xf32> + %A_cast = memref.cast %A : memref<16x16xf32> to memref<*xf32> + call @printMemrefF32(%A_cast) : (memref<*xf32>) -> () + call @printMemrefF32(%B_cast) : (memref<*xf32>) -> () + + // CHECK: Unranked Memref base@ = 0x{{[0-9a-f]+}} + // CHECK-NEXT: [11.11{{.*}}] + // CHECK-COUNT-224: 11.11 + // CHECK-NEXT: [11.11{{.*}}] + + // CHECK-NEXT: Unranked Memref base@ = 0x{{[0-9a-f]+}} + // CHECK: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + // CHECK-COUNT-96: 11.11 + // CHECK-NEXT: [11.11{{.*}}] + // CHECK: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + // CHECK-COUNT-96: 11.11 + // CHECK-NEXT: [11.11{{.*}}] + + memref.dealloc %A : memref<16x16xf32> + return + } + func.func private @printMemrefF32(%ptr : memref<*xf32>) +} diff --git a/test/mlir/test/gc/cpu-runner/GPU/XeGPUToXeVM/loadstore_scatter_chunk_size_1.mlir b/test/mlir/test/gc/cpu-runner/GPU/XeGPUToXeVM/loadstore_scatter_chunk_size_1.mlir 
new file mode 100644 index 000000000..9251db2ba --- /dev/null +++ b/test/mlir/test/gc/cpu-runner/GPU/XeGPUToXeVM/loadstore_scatter_chunk_size_1.mlir @@ -0,0 +1,75 @@ +// RUN: gc-opt %s --convert-xegpu-to-xevm --convert-xevm-to-llvm --xevm-attach-target --convert-scf-to-cf --convert-cf-to-llvm --convert-vector-to-llvm --convert-arith-to-llvm --finalize-memref-to-llvm --convert-gpu-to-llvm-spv='use-64bit-index=true' --gpu-to-llvm --reconcile-unrealized-casts --cse --gpu-module-to-binary | gc-cpu-runner -e main -entry-point-result=void --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime | FileCheck %s + +#sg_map = #xegpu.sg_map +module @gemm attributes {gpu.container_module} { + gpu.module @kernel { + gpu.func @load_store_2d(%src: memref<128xf32, 1>, %dst: memref<128xf32, 1>) kernel { + %srcce = memref.memory_space_cast %src : memref<128xf32, 1> to memref<128xf32> + %dstte = memref.memory_space_cast %dst : memref<128xf32, 1> to memref<128xf32> + + %offsets = arith.constant dense<[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]> : vector<16xindex> + %src_tdesc = xegpu.create_tdesc %srcce, %offsets : memref<128xf32>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr, #sg_map> + %dst_tdesc = xegpu.create_tdesc %dstte, %offsets : memref<128xf32>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr, #sg_map> + + %mask = arith.constant dense<[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]> : vector<16xi1> + %loaded = xegpu.load %src_tdesc, %mask <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, transpose}> : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr, #sg_map>, vector<16xi1> -> vector<1xf32> + + %tid_x = gpu.thread_id x + %tid_x_i32 = arith.index_cast %tid_x : index to i32 + %tid_x_f32 = arith.sitofp %tid_x_i32 : i32 to f32 + %loaded_modified = vector.insert %tid_x_f32, %loaded[0] : f32 into vector<1xf32> + + xegpu.store %loaded_modified, %dst_tdesc, %mask <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, transpose}> : vector<1xf32>, !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr, #sg_map>, vector<16xi1> + gpu.return + } + } + + func.func @test(%src : memref<8x16xf32>) -> memref<8x16xf32> attributes {llvm.emit_c_interface} { + %c1 = arith.constant 1 : index + %c16 = arith.constant 16 : index + %memref_src = gpu.alloc host_shared () : memref<8x16xf32> + memref.copy %src, %memref_src : memref<8x16xf32> to memref<8x16xf32> + %memref_dst = gpu.alloc host_shared () : memref<8x16xf32> + %srcc = memref.memory_space_cast %memref_src : memref<8x16xf32> to memref<8x16xf32, 1> + %dstt = memref.memory_space_cast %memref_dst : memref<8x16xf32> to memref<8x16xf32, 1> + %srccc = memref.reinterpret_cast %srcc to offset: [0], sizes: [128], + strides: [1] : memref<8x16xf32, 1> to memref<128xf32, 1> + %dstte = memref.reinterpret_cast %dstt to offset: [0], sizes: [128], + strides: [1] : memref<8x16xf32, 1> to memref<128xf32, 1> + gpu.launch_func @kernel::@load_store_2d blocks in (%c1, %c1, %c1) threads in (%c16, %c1, %c1) args(%srccc : memref<128xf32, 1>, %dstte : memref<128xf32, 1>) + return %memref_dst : memref<8x16xf32> + } + + func.func @main() attributes {llvm.emit_c_interface} { + %A = memref.alloc() : memref<8x16xf32> + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c8 = arith.constant 8 : index + %c16 = arith.constant 16 : index + %c11_f32 = arith.constant 11.11 : f32 + scf.for %i = %c0 to %c8 step %c1 { + scf.for %j = %c0 to %c16 step %c1 { + memref.store %c11_f32, %A[%i, %j] : memref<8x16xf32> + } + } + %B = call 
@test(%A) : (memref<8x16xf32>) -> memref<8x16xf32> + %B_cast = memref.cast %B : memref<8x16xf32> to memref<*xf32> + %A_cast = memref.cast %A : memref<8x16xf32> to memref<*xf32> + call @printMemrefF32(%A_cast) : (memref<*xf32>) -> () + call @printMemrefF32(%B_cast) : (memref<*xf32>) -> () + + // CHECK: Unranked Memref base@ = 0x{{[0-9a-f]+}} + // CHECK-NEXT: [11.11{{.*}}] + // CHECK-COUNT-96: 11.11 + // CHECK-NEXT: [11.11{{.*}}] + + // CHECK-NEXT: Unranked Memref base@ = 0x{{[0-9a-f]+}} + // CHECK: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + // CHECK-COUNT-96: 0 + // CHECK-NEXT: [0{{.*}}] + + memref.dealloc %A : memref<8x16xf32> + return + } + func.func private @printMemrefF32(%ptr : memref<*xf32>) +} diff --git a/test/mlir/test/gc/cpu-runner/GPU/XeGPUToXeVM/loadstore_scatter_chunk_size_1_non_contig_offsets.mlir b/test/mlir/test/gc/cpu-runner/GPU/XeGPUToXeVM/loadstore_scatter_chunk_size_1_non_contig_offsets.mlir new file mode 100644 index 000000000..76c12be46 --- /dev/null +++ b/test/mlir/test/gc/cpu-runner/GPU/XeGPUToXeVM/loadstore_scatter_chunk_size_1_non_contig_offsets.mlir @@ -0,0 +1,78 @@ +// RUN: gc-opt %s --convert-xegpu-to-xevm --convert-xevm-to-llvm --xevm-attach-target --convert-scf-to-cf --convert-cf-to-llvm --convert-vector-to-llvm --convert-arith-to-llvm --finalize-memref-to-llvm --convert-gpu-to-llvm-spv='use-64bit-index=true' --gpu-to-llvm --reconcile-unrealized-casts --cse --gpu-module-to-binary | gc-cpu-runner -e main -entry-point-result=void --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime | FileCheck %s + +#sg_map_a_bf16 = #xegpu.sg_map +module @gemm attributes {gpu.container_module} { + gpu.module @kernel { + gpu.func @load_store_2d(%src: memref<128xf32, 1>, %dst: memref<128xf32, 1>) kernel { + %srcce = memref.memory_space_cast %src : memref<128xf32, 1> to memref<128xf32> + %dstte = memref.memory_space_cast %dst : memref<128xf32, 1> to memref<128xf32> + + %offsets = arith.constant dense<[0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30]> : vector<16xindex> + %src_tdesc = xegpu.create_tdesc %srcce, %offsets : memref<128xf32>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr, #sg_map_a_bf16> + %dst_tdesc = xegpu.create_tdesc %dstte, %offsets : memref<128xf32>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr, #sg_map_a_bf16> + + %mask = arith.constant dense<[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]> : vector<16xi1> + %loaded = xegpu.load %src_tdesc, %mask <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, transpose}> : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr, #sg_map_a_bf16>, vector<16xi1> -> vector<1xf32> + + %tid_x = gpu.thread_id x + %tid_x_i32 = arith.index_cast %tid_x : index to i32 + %tid_x_f32 = arith.sitofp %tid_x_i32 : i32 to f32 + + %c0 = arith.constant 0 : i32 + %loaded_modified = vector.insert %tid_x_f32, %loaded[0] : f32 into vector<1xf32> + + xegpu.store %loaded_modified, %dst_tdesc, %mask <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, transpose}> : vector<1xf32>, !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr, #sg_map_a_bf16>, vector<16xi1> + gpu.return + } + } + + func.func @test(%src : memref<8x16xf32>) -> memref<8x16xf32> attributes {llvm.emit_c_interface} { + %c1 = arith.constant 1 : index + %c16 = arith.constant 16 : index + %memref_src = gpu.alloc host_shared () : memref<8x16xf32> + memref.copy %src, %memref_src : memref<8x16xf32> to memref<8x16xf32> + %memref_dst = gpu.alloc host_shared () : memref<8x16xf32> + %srcc = memref.memory_space_cast 
%memref_src : memref<8x16xf32> to memref<8x16xf32, 1> + %dstt = memref.memory_space_cast %memref_dst : memref<8x16xf32> to memref<8x16xf32, 1> + %srccc = memref.reinterpret_cast %srcc to offset: [0], sizes: [128], + strides: [1] : memref<8x16xf32, 1> to memref<128xf32, 1> + %dstte = memref.reinterpret_cast %dstt to offset: [0], sizes: [128], + strides: [1] : memref<8x16xf32, 1> to memref<128xf32, 1> + gpu.launch_func @kernel::@load_store_2d blocks in (%c1, %c1, %c1) threads in (%c16, %c1, %c1) args(%srccc : memref<128xf32, 1>, %dstte : memref<128xf32, 1>) + return %memref_dst : memref<8x16xf32> + } + + func.func @main() attributes {llvm.emit_c_interface} { + %A = memref.alloc() : memref<8x16xf32> + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c8 = arith.constant 8 : index + %c16 = arith.constant 16 : index + %c11_f32 = arith.constant 11.11 : f32 + scf.for %i = %c0 to %c8 step %c1 { + scf.for %j = %c0 to %c16 step %c1 { + memref.store %c11_f32, %A[%i, %j] : memref<8x16xf32> + } + } + %B = call @test(%A) : (memref<8x16xf32>) -> memref<8x16xf32> + %B_cast = memref.cast %B : memref<8x16xf32> to memref<*xf32> + %A_cast = memref.cast %A : memref<8x16xf32> to memref<*xf32> + call @printMemrefF32(%A_cast) : (memref<*xf32>) -> () + call @printMemrefF32(%B_cast) : (memref<*xf32>) -> () + + // CHECK: Unranked Memref base@ = 0x{{[0-9a-f]+}} + // CHECK-NEXT: [11.11{{.*}}] + // CHECK-COUNT-96: 11.11 + // CHECK-NEXT: [11.11{{.*}}] + + // CHECK-NEXT: Unranked Memref base@ = 0x{{[0-9a-f]+}} + // CHECK: 0, 0, 1, 0, 2, 0, 3, 0, 4, 0, 5, 0, 6, 0, 7, 0 + // CHECK: 8, 0, 9, 0, 10, 0, 11, 0, 12, 0, 13, 0, 14, 0, 15, 0 + // CHECK-COUNT-80: 0 + // CHECK-NEXT: [0{{.*}}] + + memref.dealloc %A : memref<8x16xf32> + return + } + func.func private @printMemrefF32(%ptr : memref<*xf32>) +} diff --git a/test/mlir/test/gc/cpu-runner/GPU/XeGPUToXeVM/loadstore_scatter_chunk_size_1_update_offset.mlir b/test/mlir/test/gc/cpu-runner/GPU/XeGPUToXeVM/loadstore_scatter_chunk_size_1_update_offset.mlir new file mode 100644 index 000000000..c78ccbebb --- /dev/null +++ b/test/mlir/test/gc/cpu-runner/GPU/XeGPUToXeVM/loadstore_scatter_chunk_size_1_update_offset.mlir @@ -0,0 +1,80 @@ +// RUN: gc-opt %s --convert-xegpu-to-xevm --convert-xevm-to-llvm --xevm-attach-target --convert-scf-to-cf --convert-cf-to-llvm --convert-vector-to-llvm --convert-arith-to-llvm --finalize-memref-to-llvm --convert-gpu-to-llvm-spv='use-64bit-index=true' --gpu-to-llvm --reconcile-unrealized-casts --cse --gpu-module-to-binary | gc-cpu-runner -e main -entry-point-result=void --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime | FileCheck %s + +#sg_map = #xegpu.sg_map +module @gemm attributes {gpu.container_module} { + gpu.module @kernel { + gpu.func @load_store_2d(%src: memref<128xf32, 1>, %dst: memref<128xf32, 1>) kernel { + %srcce = memref.memory_space_cast %src : memref<128xf32, 1> to memref<128xf32> + %dstte = memref.memory_space_cast %dst : memref<128xf32, 1> to memref<128xf32> + + %offsets = arith.constant dense<[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]> : vector<16xindex> + %src_tdesc = xegpu.create_tdesc %srcce, %offsets : memref<128xf32>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr, #sg_map> + %dst_tdesc = xegpu.create_tdesc %dstte, %offsets : memref<128xf32>, vector<16xindex> -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr, #sg_map> + + %mask = arith.constant dense<[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]> : vector<16xi1> + %loaded = xegpu.load %src_tdesc, %mask <{l1_hint = 
#xegpu.cache_hint, l2_hint = #xegpu.cache_hint, transpose}> : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr, #sg_map>, vector<16xi1> -> vector<1xf32> + + %tid_x = gpu.thread_id x + %tid_x_i32 = arith.index_cast %tid_x : index to i32 + %tid_x_f32 = arith.sitofp %tid_x_i32 : i32 to f32 + %loaded_modified = vector.insert %tid_x_f32, %loaded[0] : f32 into vector<1xf32> + + xegpu.store %loaded_modified, %dst_tdesc, %mask <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, transpose}> : vector<1xf32>, !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr, #sg_map>, vector<16xi1> + + %update_offset = arith.constant dense<[16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]> : vector<16xindex> + %dst_tdesc_new = xegpu.update_offset %dst_tdesc, %update_offset : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr, #sg_map>, vector<16xindex> + xegpu.store %loaded_modified, %dst_tdesc_new, %mask <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, transpose}> : vector<1xf32>, !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr, #sg_map>, vector<16xi1> + gpu.return + } + } + + func.func @test(%src : memref<8x16xf32>) -> memref<8x16xf32> attributes {llvm.emit_c_interface} { + %c1 = arith.constant 1 : index + %c16 = arith.constant 16 : index + %memref_src = gpu.alloc host_shared () : memref<8x16xf32> + memref.copy %src, %memref_src : memref<8x16xf32> to memref<8x16xf32> + %memref_dst = gpu.alloc host_shared () : memref<8x16xf32> + %srcc = memref.memory_space_cast %memref_src : memref<8x16xf32> to memref<8x16xf32, 1> + %dstt = memref.memory_space_cast %memref_dst : memref<8x16xf32> to memref<8x16xf32, 1> + %srccc = memref.reinterpret_cast %srcc to offset: [0], sizes: [128], + strides: [1] : memref<8x16xf32, 1> to memref<128xf32, 1> + %dstte = memref.reinterpret_cast %dstt to offset: [0], sizes: [128], + strides: [1] : memref<8x16xf32, 1> to memref<128xf32, 1> + gpu.launch_func @kernel::@load_store_2d blocks in (%c1, %c1, %c1) threads in (%c16, %c1, %c1) args(%srccc : memref<128xf32, 1>, %dstte : memref<128xf32, 1>) + return %memref_dst : memref<8x16xf32> + } + + func.func @main() attributes {llvm.emit_c_interface} { + %A = memref.alloc() : memref<8x16xf32> + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c8 = arith.constant 8 : index + %c16 = arith.constant 16 : index + %c11_f32 = arith.constant 11.11 : f32 + scf.for %i = %c0 to %c8 step %c1 { + scf.for %j = %c0 to %c16 step %c1 { + memref.store %c11_f32, %A[%i, %j] : memref<8x16xf32> + } + } + %B = call @test(%A) : (memref<8x16xf32>) -> memref<8x16xf32> + %B_cast = memref.cast %B : memref<8x16xf32> to memref<*xf32> + %A_cast = memref.cast %A : memref<8x16xf32> to memref<*xf32> + call @printMemrefF32(%A_cast) : (memref<*xf32>) -> () + call @printMemrefF32(%B_cast) : (memref<*xf32>) -> () + + // CHECK: Unranked Memref base@ = 0x{{[0-9a-f]+}} + // CHECK-NEXT: [11.11{{.*}}] + // CHECK-COUNT-96: 11.11 + // CHECK-NEXT: [11.11{{.*}}] + + // CHECK-NEXT: Unranked Memref base@ = 0x{{[0-9a-f]+}} + // CHECK: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + // CHECK-NEXT: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + // CHECK-COUNT-80: 0 + // CHECK-NEXT: [0{{.*}}] + + memref.dealloc %A : memref<8x16xf32> + return + } + func.func private @printMemrefF32(%ptr : memref<*xf32>) +} diff --git a/test/mlir/test/gc/cpu-runner/GPU/XeGPUToXeVM/loadstore_scatter_chunk_size_2.mlir b/test/mlir/test/gc/cpu-runner/GPU/XeGPUToXeVM/loadstore_scatter_chunk_size_2.mlir new file mode 100644 index 000000000..4666487c0 --- /dev/null 
+++ b/test/mlir/test/gc/cpu-runner/GPU/XeGPUToXeVM/loadstore_scatter_chunk_size_2.mlir @@ -0,0 +1,76 @@ +// RUN: gc-opt %s --convert-xegpu-to-xevm --convert-xevm-to-llvm --xevm-attach-target --convert-scf-to-cf --convert-cf-to-llvm --convert-vector-to-llvm --convert-arith-to-llvm --finalize-memref-to-llvm --convert-gpu-to-llvm-spv='use-64bit-index=true' --gpu-to-llvm --reconcile-unrealized-casts --cse --gpu-module-to-binary | gc-cpu-runner -e main -entry-point-result=void --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime | FileCheck %s + +#sg_map_a_bf16 = #xegpu.sg_map +module @gemm attributes {gpu.container_module} { + gpu.module @kernel { + gpu.func @load_store_2d(%src: memref<128xf32, 1>, %dst: memref<128xf32, 1>) kernel { + %srcce = memref.memory_space_cast %src : memref<128xf32, 1> to memref<128xf32> + %dstte = memref.memory_space_cast %dst : memref<128xf32, 1> to memref<128xf32> + + %offsets = arith.constant dense<[0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30]> : vector<16xindex> + %src_tdesc = xegpu.create_tdesc %srcce, %offsets : memref<128xf32>, vector<16xindex> -> !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr, #sg_map_a_bf16> + %dst_tdesc = xegpu.create_tdesc %dstte, %offsets : memref<128xf32>, vector<16xindex> -> !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr, #sg_map_a_bf16> + + %mask = arith.constant dense<[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]> : vector<16xi1> + %loaded = xegpu.load %src_tdesc, %mask <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, transpose}> : !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr, #sg_map_a_bf16>, vector<16xi1> -> vector<2x1xf32> + + %tid_x = gpu.thread_id x + %tid_x_i32 = arith.index_cast %tid_x : index to i32 + %tid_x_f32 = arith.sitofp %tid_x_i32 : i32 to f32 + %loaded_modified = vector.insert %tid_x_f32, %loaded[0,0] : f32 into vector<2x1xf32> + + xegpu.store %loaded_modified, %dst_tdesc, %mask <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, transpose}> : vector<2x1xf32>, !xegpu.tensor_desc<16x2xf32, #xegpu.scatter_tdesc_attr, #sg_map_a_bf16>, vector<16xi1> + gpu.return + } + } + + func.func @test(%src : memref<8x16xf32>) -> memref<8x16xf32> attributes {llvm.emit_c_interface} { + %c1 = arith.constant 1 : index + %c16 = arith.constant 16 : index + %memref_src = gpu.alloc host_shared () : memref<8x16xf32> + memref.copy %src, %memref_src : memref<8x16xf32> to memref<8x16xf32> + %memref_dst = gpu.alloc host_shared () : memref<8x16xf32> + %srcc = memref.memory_space_cast %memref_src : memref<8x16xf32> to memref<8x16xf32, 1> + %dstt = memref.memory_space_cast %memref_dst : memref<8x16xf32> to memref<8x16xf32, 1> + %srccc = memref.reinterpret_cast %srcc to offset: [0], sizes: [128], + strides: [1] : memref<8x16xf32, 1> to memref<128xf32, 1> + %dstte = memref.reinterpret_cast %dstt to offset: [0], sizes: [128], + strides: [1] : memref<8x16xf32, 1> to memref<128xf32, 1> + gpu.launch_func @kernel::@load_store_2d blocks in (%c1, %c1, %c1) threads in (%c16, %c1, %c1) args(%srccc : memref<128xf32, 1>, %dstte : memref<128xf32, 1>) + return %memref_dst : memref<8x16xf32> + } + + func.func @main() attributes {llvm.emit_c_interface} { + %A = memref.alloc() : memref<8x16xf32> + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c8 = arith.constant 8 : index + %c16 = arith.constant 16 : index + %c11_f32 = arith.constant 11.11 : f32 + scf.for %i = %c0 to %c8 step %c1 { + scf.for %j = %c0 to %c16 step %c1 { + memref.store %c11_f32, %A[%i, %j] : memref<8x16xf32> + } + } + %B = call 
@test(%A) : (memref<8x16xf32>) -> memref<8x16xf32> + %B_cast = memref.cast %B : memref<8x16xf32> to memref<*xf32> + %A_cast = memref.cast %A : memref<8x16xf32> to memref<*xf32> + call @printMemrefF32(%A_cast) : (memref<*xf32>) -> () + call @printMemrefF32(%B_cast) : (memref<*xf32>) -> () + + // CHECK: Unranked Memref base@ = 0x{{[0-9a-f]+}} + // CHECK-NEXT: [11.11{{.*}}] + // CHECK-COUNT-96: 11.11 + // CHECK-NEXT: [11.11{{.*}}] + + // CHECK-NEXT: Unranked Memref base@ = 0x{{[0-9a-f]+}} + // CHECK: 0, 11.11, 1, 11.11, 2, 11.11, 3, 11.11, 4, 11.11, 5, 11.11, 6, 11.11, 7, 11.11 + // CHECK: 8, 11.11, 9, 11.11, 10, 11.11, 11, 11.11, 12, 11.11, 13, 11.11, 14, 11.11, 15, 11.11 + // CHECK-COUNT-80: 0 + // CHECK-NEXT: [0{{.*}}] + + memref.dealloc %A : memref<8x16xf32> + return + } + func.func private @printMemrefF32(%ptr : memref<*xf32>) +} diff --git a/test/mlir/test/gc/cpu-runner/GPU/xevm_block_dpas.mlir b/test/mlir/test/gc/cpu-runner/GPU/xevm_block_dpas.mlir index 3f28e68bc..282277d71 100644 --- a/test/mlir/test/gc/cpu-runner/GPU/xevm_block_dpas.mlir +++ b/test/mlir/test/gc/cpu-runner/GPU/xevm_block_dpas.mlir @@ -1,4 +1,4 @@ -// RUN: gc-opt %s --convert-xevm-to-llvm --xevm-attach-target --convert-scf-to-cf --convert-cf-to-llvm --convert-arith-to-llvm --convert-gpu-to-llvm-spv --gpu-to-llvm --reconcile-unrealized-casts --cse --gpu-module-to-binary | gc-cpu-runner -e main -entry-point-result=void --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime | FileCheck %s +// RUN: gc-opt %s --convert-xevm-to-llvm --xevm-attach-target --convert-scf-to-cf --convert-cf-to-llvm --convert-arith-to-llvm --convert-gpu-to-llvm-spv='use-64bit-index=true' --gpu-to-llvm --reconcile-unrealized-casts --cse --gpu-module-to-binary | gc-cpu-runner -e main -entry-point-result=void --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime | FileCheck %s module @gemm attributes {gpu.container_module} { gpu.module @kernel { diff --git a/test/mlir/test/gc/cpu-runner/GPU/xevm_block_load_store.mlir b/test/mlir/test/gc/cpu-runner/GPU/xevm_block_load_store.mlir index f4bb29f2a..23939ebf2 100644 --- a/test/mlir/test/gc/cpu-runner/GPU/xevm_block_load_store.mlir +++ b/test/mlir/test/gc/cpu-runner/GPU/xevm_block_load_store.mlir @@ -1,4 +1,4 @@ -// RUN: gc-opt %s --convert-xevm-to-llvm --xevm-attach-target --convert-scf-to-cf --convert-cf-to-llvm --convert-arith-to-llvm --convert-gpu-to-llvm-spv --gpu-to-llvm --reconcile-unrealized-casts --cse --gpu-module-to-binary | gc-cpu-runner -e main -entry-point-result=void --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime | FileCheck %s +// RUN: gc-opt %s --convert-xevm-to-llvm --xevm-attach-target --convert-scf-to-cf --convert-cf-to-llvm --convert-arith-to-llvm --convert-gpu-to-llvm-spv='use-64bit-index=true' --gpu-to-llvm --reconcile-unrealized-casts --cse --gpu-module-to-binary | gc-cpu-runner -e main -entry-point-result=void --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime | FileCheck %s module @gemm attributes {gpu.container_module} { @@ -24,7 +24,8 @@ module @gemm attributes {gpu.container_module} { %loaded_f32 = vector.bitcast %loaded : vector<8xi32> to vector<8xf32> %c0 = arith.constant 0 : i32 %thread_x = gpu.thread_id x - %thread_x_i32 = arith.index_cast %thread_x : index to i32 + %thread_x_i64 = arith.index_cast %thread_x : index to i64 + %thread_x_i32 = llvm.trunc %thread_x_i64 : i64 to i32 %thread_x_f32 = arith.sitofp %thread_x_i32 : i32 to f32 %loaded_f32_modified = vector.insertelement %thread_x_f32, 
%loaded_f32[%c0 : i32] : vector<8xf32> %loaded_modified = vector.bitcast %loaded_f32_modified : vector<8xf32> to vector<8xi32> diff --git a/test/mlir/test/gc/cpu-runner/GPU/xevm_block_load_store_transpose.mlir b/test/mlir/test/gc/cpu-runner/GPU/xevm_block_load_store_transpose.mlir new file mode 100644 index 000000000..8fdc64997 --- /dev/null +++ b/test/mlir/test/gc/cpu-runner/GPU/xevm_block_load_store_transpose.mlir @@ -0,0 +1,113 @@ +// RUN: gc-opt %s --convert-xevm-to-llvm --xevm-attach-target --convert-scf-to-cf --convert-cf-to-llvm --convert-vector-to-llvm --convert-arith-to-llvm --convert-gpu-to-llvm-spv='use-64bit-index=true' --gpu-to-llvm --reconcile-unrealized-casts --cse --gpu-module-to-binary | gc-cpu-runner -e main -entry-point-result=void --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime | FileCheck %s + +module @gemm attributes {gpu.container_module} { + gpu.module @kernel { + gpu.func @block_load_store(%src: !llvm.ptr<1>, %dst: !llvm.ptr<1>) kernel { + %base_width = arith.constant 32 : i32 // bytewidth of the block + %base_height = arith.constant 16 : i32 // number of rows + %base_pitch = arith.constant 32 : i32 // bytewidth of the base row + %x = arith.constant 0 : i32 + %y = arith.constant 0 : i32 + // Normally a work-item loads a vertical slice (↓), but with *transpose* a work-item loads a horizontal slice (→). + // The tile dimension we slice along must evenly divide the sub-group size: e.g., to slice rows (→) we need SG_SIZE % tile_height == 0. + %loaded = xevm.blockload2d %src, %base_width, %base_height, %base_pitch, %x, %y {elem_size_in_bits=32, tile_width=8, tile_height=16, v_blocks=1, transpose=true, vnni_transform=false, l1_cache_control=Default, l3_cache_control=Default} : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> vector<8xi32> + %loaded_f32 = vector.bitcast %loaded : vector<8xi32> to vector<8xf32> + + %c0 = arith.constant 0 : i32 + %thread_x = gpu.thread_id x + %thread_x_i64 = arith.index_cast %thread_x : index to i64 + %thread_x_i32 = llvm.trunc %thread_x_i64 : i64 to i32 + %thread_x_f32 = arith.sitofp %thread_x_i32 : i32 to f32 + %loaded_f32_modified = vector.insert %thread_x_f32, %loaded_f32[7] : f32 into vector<8xf32> // Insert the thread id; swap %loaded_f32_modified_1 for %loaded in the store below to see where each thread id ends up + %loaded_f32_modified_1 = vector.bitcast %loaded_f32_modified : vector<8xf32> to vector<8xi32> + + %base_height_store = arith.constant 8 : i32 // number of rows + %base_width_store = arith.constant 64 : i32 // bytewidth of the block + %base_pitch_store = arith.constant 64 : i32 // bytewidth of the base row + // "Transposed" stores are not available, meaning a work-item can only store its vector as a vertical slice (↓).
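+      // A sketch of the block geometry, derived only from the constants above (no new functionality):
+      //   load : 16 rows x 8 cols of f32 -> base_width = base_pitch = 8 * 4 = 32 bytes, base_height = 16
+      //   store:  8 rows x 16 cols of f32 -> base_width_store = base_pitch_store = 16 * 4 = 64 bytes, base_height_store = 8
+      // So the 16x8 source block is written back as its 8x16 transpose, each work-item storing one column (↓) of 8 values.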
+ xevm.blockstore2d %dst, %base_width_store, %base_height_store, %base_pitch_store, %x, %y, %loaded {elem_size_in_bits=32, tile_width=16, tile_height=8, v_blocks=1, l1_cache_control=Default, l3_cache_control=Default} : (!llvm.ptr<1>, i32, i32, i32, i32, i32, vector<8xi32>) + gpu.return + } + } + + + func.func @test(%src : memref<16x8xf32>) -> memref<8x16xf32> attributes {llvm.emit_c_interface} { + %c1 = arith.constant 1 : index + %c16 = arith.constant 16 : index // Multiple of the *maximum sub-group size* (see `intel_reqd_sub_group_size`) + %memref_src = gpu.alloc host_shared () : memref<16x8xf32> + memref.copy %src, %memref_src : memref<16x8xf32> to memref<16x8xf32> + %src_ptr_as_idx = memref.extract_aligned_pointer_as_index %memref_src : memref<16x8xf32> -> index + %src_ptr_as_i64 = arith.index_cast %src_ptr_as_idx : index to i64 + %src_ptr = llvm.inttoptr %src_ptr_as_i64 : i64 to !llvm.ptr + %src_ptr_casted = llvm.addrspacecast %src_ptr : !llvm.ptr to !llvm.ptr<1> + + %memref_dst = gpu.alloc host_shared () : memref<8x16xf32> + %dst_ptr_as_idx = memref.extract_aligned_pointer_as_index %memref_dst : memref<8x16xf32> -> index + %dst_ptr_as_i64 = arith.index_cast %dst_ptr_as_idx : index to i64 + %dst_ptr = llvm.inttoptr %dst_ptr_as_i64 : i64 to !llvm.ptr + %dst_ptr_casted = llvm.addrspacecast %dst_ptr : !llvm.ptr to !llvm.ptr<1> + + gpu.launch_func @kernel::@block_load_store blocks in (%c1, %c1, %c1) threads in (%c16, %c1, %c1) args(%src_ptr_casted : !llvm.ptr<1>, %dst_ptr_casted : !llvm.ptr<1>) + return %memref_dst : memref<8x16xf32> + } + + func.func @main() attributes {llvm.emit_c_interface} { + %A = memref.alloc() : memref<16x8xf32> + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c8 = arith.constant 8 : index + %c16 = arith.constant 16 : index + %c11_f32 = arith.constant 11.11 : f16 + scf.for %i = %c0 to %c16 step %c1 { + scf.for %j = %c0 to %c8 step %c1 { + %c_10_f = arith.constant 10.0 : f32 + %j_i64 = arith.index_cast %j : index to i64 + %j_i32 = llvm.trunc %j_i64 : i64 to i32 + %j_f32 = arith.sitofp %j_i32 : i32 to f32 + %jj = arith.divf %j_f32, %c_10_f : f32 + + %i_i64 = arith.index_cast %i : index to i64 + %i_i32 = llvm.trunc %i_i64 : i64 to i32 + %i_f32 = arith.sitofp %i_i32 : i32 to f32 + %ii = arith.addf %i_f32, %jj : f32 + memref.store %ii, %A[%i, %j] : memref<16x8xf32> + } + } + %B = call @test(%A) : (memref<16x8xf32>) -> memref<8x16xf32> + %A_cast = memref.cast %A : memref<16x8xf32> to memref<*xf32> + %B_cast = memref.cast %B : memref<8x16xf32> to memref<*xf32> + call @printMemrefF32(%A_cast) : (memref<*xf32>) -> () + // CHECK: Unranked Memref base@ = 0x{{[0-9a-f]+}} + // CHECK-NEXT: [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7], + // CHECK-NEXT: [1, 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7], + // CHECK-NEXT: [2, 2.1, 2.2, 2.3, 2.4, 2.5, 2.6, 2.7], + // CHECK-NEXT: [3, 3.1, 3.2, 3.3, 3.4, 3.5, 3.6, 3.7], + // CHECK-NEXT: [4, 4.1, 4.2, 4.3, 4.4, 4.5, 4.6, 4.7], + // CHECK-NEXT: [5, 5.1, 5.2, 5.3, 5.4, 5.5, 5.6, 5.7], + // CHECK-NEXT: [6, 6.1, 6.2, 6.3, 6.4, 6.5, 6.6, 6.7], + // CHECK-NEXT: [7, 7.1, 7.2, 7.3, 7.4, 7.5, 7.6, 7.7], + // CHECK-NEXT: [8, 8.1, 8.2, 8.3, 8.4, 8.5, 8.6, 8.7], + // CHECK-NEXT: [9, 9.1, 9.2, 9.3, 9.4, 9.5, 9.6, 9.7], + // CHECK-NEXT: [10, 10.1, 10.2, 10.3, 10.4, 10.5, 10.6, 10.7], + // CHECK-NEXT: [11, 11.1, 11.2, 11.3, 11.4, 11.5, 11.6, 11.7], + // CHECK-NEXT: [12, 12.1, 12.2, 12.3, 12.4, 12.5, 12.6, 12.7], + // CHECK-NEXT: [13, 13.1, 13.2, 13.3, 13.4, 13.5, 13.6, 13.7], + // CHECK-NEXT: [14, 14.1, 14.2, 14.3, 14.4, 14.5, 14.6, 14.7], + // 
CHECK-NEXT: [15, 15.1, 15.2, 15.3, 15.4, 15.5, 15.6, 15.7] + + call @printMemrefF32(%B_cast) : (memref<*xf32>) -> () + // CHECK: Unranked Memref base@ = 0x{{[0-9a-f]+}} + // CHECK-NEXT: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], + // CHECK-NEXT: [0.1, 1.1, 2.1, 3.1, 4.1, 5.1, 6.1, 7.1, 8.1, 9.1, 10.1, 11.1, 12.1, 13.1, 14.1, 15.1], + // CHECK-NEXT: [0.2, 1.2, 2.2, 3.2, 4.2, 5.2, 6.2, 7.2, 8.2, 9.2, 10.2, 11.2, 12.2, 13.2, 14.2, 15.2], + // CHECK-NEXT: [0.3, 1.3, 2.3, 3.3, 4.3, 5.3, 6.3, 7.3, 8.3, 9.3, 10.3, 11.3, 12.3, 13.3, 14.3, 15.3], + // CHECK-NEXT: [0.4, 1.4, 2.4, 3.4, 4.4, 5.4, 6.4, 7.4, 8.4, 9.4, 10.4, 11.4, 12.4, 13.4, 14.4, 15.4], + // CHECK-NEXT: [0.5, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5, 8.5, 9.5, 10.5, 11.5, 12.5, 13.5, 14.5, 15.5], + // CHECK-NEXT: [0.6, 1.6, 2.6, 3.6, 4.6, 5.6, 6.6, 7.6, 8.6, 9.6, 10.6, 11.6, 12.6, 13.6, 14.6, 15.6], + // CHECK-NEXT: [0.7, 1.7, 2.7, 3.7, 4.7, 5.7, 6.7, 7.7, 8.7, 9.7, 10.7, 11.7, 12.7, 13.7, 14.7, 15.7] + + memref.dealloc %A : memref<16x8xf32> + return + } + func.func private @printMemrefF32(%ptr : memref<*xf32>) attributes { llvm.emit_c_interface } +} diff --git a/test/mlir/test/gc/cpu-runner/GPU/xevm_block_load_store_vnni.mlir b/test/mlir/test/gc/cpu-runner/GPU/xevm_block_load_store_vnni.mlir new file mode 100644 index 000000000..b3f04a212 --- /dev/null +++ b/test/mlir/test/gc/cpu-runner/GPU/xevm_block_load_store_vnni.mlir @@ -0,0 +1,105 @@ +// RUN: gc-opt %s --convert-xevm-to-llvm --xevm-attach-target --convert-scf-to-cf --convert-cf-to-llvm --convert-vector-to-llvm --convert-arith-to-llvm --convert-gpu-to-llvm-spv='use-64bit-index=true' --gpu-to-llvm --reconcile-unrealized-casts --cse --gpu-module-to-binary | gc-cpu-runner -e main -entry-point-result=void --shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%opencl_runtime | FileCheck %s + +module @gemm attributes {gpu.container_module} { + gpu.module @kernel { + gpu.func @block_load_store(%src: !llvm.ptr<1>, %dst: !llvm.ptr<1>) kernel { + %base_width = arith.constant 32 : i32 // bytewidth of the block + %base_height_load = arith.constant 16 : i32 // number of rows + %base_pitch = arith.constant 32 : i32 // bytewidth of the base row + %x = arith.constant 0 : i32 + %y = arith.constant 0 : i32 + + // Consider the following two loads: + // Normal load: + %loaded = xevm.blockload2d %src, %base_width, %base_height_load, %base_pitch, %x, %y {elem_size_in_bits=16, tile_width=16, tile_height=16, v_blocks=1, transpose=false, vnni_transform=false, l1_cache_control=Default, l3_cache_control=Default} : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> vector<16xi16> + %loaded_f16_flat = vector.bitcast %loaded : vector<16xi16> to vector<16xf16> + %loaded_f16 = vector.shape_cast %loaded_f16_flat : vector<16xf16> to vector<8x1x2xf16> + + // VNNI load: + %loaded_vnni = xevm.blockload2d %src, %base_width, %base_height_load, %base_pitch, %x, %y {elem_size_in_bits=16, tile_width=16, tile_height=16, v_blocks=1, transpose=false, vnni_transform=true, l1_cache_control=Default, l3_cache_control=Default} : (!llvm.ptr<1>, i32, i32, i32, i32, i32) -> vector<8xi32> + %loaded_vnni_f16_flat = vector.bitcast %loaded_vnni : vector<8xi32> to vector<16xf16> + %loaded_vnni_f16 = vector.shape_cast %loaded_vnni_f16_flat : vector<16xf16> to vector<8x1x2xf16> + // Both can be represented the same way in code as vector<16xf16>. + // A normal load pads a value to a dword (e.g., 32-bit) when loaded to a register. 
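+      // (Concrete sketch for the 16x16 f16 tile above: each work-item owns one column of 16 values; the normal
+      //  load returns them as vector<16xi16>, one dword-padded register slot per f16, i.e. 16 slots for 16 values.)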
+ // VNNI load "packs" multiple sub-dword values along the column (↓), allowing a single register to hold multiple values. + // In SIMT, a work-item reads values along the column (↓), hence a sequence of values loaded using VNNI is logically equivalent to the sequence of values loaded using a normal load. + // The load results of both methods can have the same logical representation, but are expected to differ in physical layout and register efficiency. + + %thread_x = gpu.thread_id x + %thread_x_i64 = arith.index_cast %thread_x : index to i64 + %thread_x_i32 = llvm.trunc %thread_x_i64 : i64 to i32 + %thread_x_f16 = arith.sitofp %thread_x_i32 : i32 to f16 + %loaded_f16_modified = vector.insert %thread_x_f16, %loaded_vnni_f16 [0,0,1] : f16 into vector<8x1x2xf16> // Both loaded_vnni_f16 and loaded_f16 can be used here + // We can only store [1,2,4,8]x[16] shapes for f16, so we have to do 2 stores + %loaded_f16_modified_slice_0 = vector.extract_strided_slice %loaded_f16_modified {offsets = [0, 0, 0], sizes = [4, 1, 2], strides = [1, 1, 1]} : vector<8x1x2xf16> to vector<4x1x2xf16> + %loaded_f16_modified_slice_0_flat = vector.shape_cast %loaded_f16_modified_slice_0 : vector<4x1x2xf16> to vector<8xf16> + %base_height_store = arith.constant 8 : i32 // number of rows + %base_width_store = arith.constant 32 : i32 // bytewidth of the block + %base_pitch_store = arith.constant 32 : i32 // bytewidth of the base row + xevm.blockstore2d %dst, %base_width_store, %base_height_store, %base_pitch_store, %x, %y, %loaded_f16_modified_slice_0_flat {elem_size_in_bits=16, tile_width=16, tile_height=8, v_blocks=1, l1_cache_control=Default, l3_cache_control=Default} : (!llvm.ptr<1>, i32, i32, i32, i32, i32, vector<8xf16>) + + %loaded_f16_modified_slice_1 = vector.extract_strided_slice %loaded_f16_modified {offsets = [4, 0, 0], sizes = [4, 1, 2], strides = [1, 1, 1]} : vector<8x1x2xf16> to vector<4x1x2xf16> + %loaded_f16_modified_slice_1_flat = vector.shape_cast %loaded_f16_modified_slice_1 : vector<4x1x2xf16> to vector<8xf16> + + %second_half_offset = arith.muli %base_pitch_store, %base_height_store : i32 + %second_half_ptr = llvm.getelementptr %dst[%second_half_offset] : (!llvm.ptr<1>, i32) -> !llvm.ptr<1>, i8 + xevm.blockstore2d %second_half_ptr, %base_width_store, %base_height_store, %base_pitch_store, %x, %y, %loaded_f16_modified_slice_1_flat {elem_size_in_bits=16, tile_width=16, tile_height=8, v_blocks=1, l1_cache_control=Default, l3_cache_control=Default} : (!llvm.ptr<1>, i32, i32, i32, i32, i32, vector<8xf16>) + gpu.return + } + } + + + func.func @test(%src : memref<16x16xf16>) -> memref<16x16xf16> attributes {llvm.emit_c_interface} { + %c1 = arith.constant 1 : index + %c16 = arith.constant 16 : index // Multiple of the *maximum sub-group size* (see `intel_reqd_sub_group_size`) + %memref_src = gpu.alloc host_shared () : memref<16x16xf16> + memref.copy %src, %memref_src : memref<16x16xf16> to memref<16x16xf16> + %src_ptr_as_idx = memref.extract_aligned_pointer_as_index %memref_src : memref<16x16xf16> -> index + %src_ptr_as_i64 = arith.index_cast %src_ptr_as_idx : index to i64 + %src_ptr = llvm.inttoptr %src_ptr_as_i64 : i64 to !llvm.ptr + %src_ptr_casted = llvm.addrspacecast %src_ptr : !llvm.ptr to !llvm.ptr<1> + + %memref_dst = gpu.alloc host_shared () : memref<16x16xf16> + %dst_ptr_as_idx = memref.extract_aligned_pointer_as_index %memref_dst : memref<16x16xf16> -> index + %dst_ptr_as_i64 = arith.index_cast %dst_ptr_as_idx : index to i64 + %dst_ptr = llvm.inttoptr %dst_ptr_as_i64 : i64 to !llvm.ptr + 
%dst_ptr_casted = llvm.addrspacecast %dst_ptr : !llvm.ptr to !llvm.ptr<1> + + gpu.launch_func @kernel::@block_load_store blocks in (%c1, %c1, %c1) threads in (%c16, %c1, %c1) args(%src_ptr_casted : !llvm.ptr<1>, %dst_ptr_casted : !llvm.ptr<1>) + return %memref_dst : memref<16x16xf16> + } + + func.func @main() attributes {llvm.emit_c_interface} { + %A = memref.alloc() : memref<16x16xf16> + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c8 = arith.constant 16 : index + %c16 = arith.constant 16 : index + %c11_f32 = arith.constant 11.1 : f16 + scf.for %i = %c0 to %c8 step %c1 { + scf.for %j = %c0 to %c16 step %c1 { + memref.store %c11_f32, %A[%i, %j] : memref<16x16xf16> + } + } + %B = call @test(%A) : (memref<16x16xf16>) -> memref<16x16xf16> + %B_cast = memref.cast %B : memref<16x16xf16> to memref<*xf16> + %A_cast = memref.cast %A : memref<16x16xf16> to memref<*xf16> + call @printMemrefF16(%A_cast) : (memref<*xf16>) -> () + call @printMemrefF16(%B_cast) : (memref<*xf16>) -> () + + // CHECK: Unranked Memref base@ = 0x{{[0-9a-f]+}} + // CHECK-NEXT: [11.1{{.*}}] + // CHECK-COUNT-224: 11.1 + // CHECK-NEXT: [11.1{{.*}}] + + // CHECK-NEXT: Unranked Memref base@ = 0x{{[0-9a-f]+}} + // CHECK-NEXT: [11.1{{.*}}] + // CHECK: 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 + // CHECK-COUNT-208: 11.1 + // CHECK-NEXT: [11.1{{.*}}] + + memref.dealloc %A : memref<16x16xf16> + return + } + func.func private @printMemrefF16(%ptr : memref<*xf16>) attributes { llvm.emit_c_interface } +} diff --git a/test/mlir/test/gc/cpu-runner/tid.mlir b/test/mlir/test/gc/cpu-runner/tid.mlir index aedcc0a20..ff0fcd451 100644 --- a/test/mlir/test/gc/cpu-runner/tid.mlir +++ b/test/mlir/test/gc/cpu-runner/tid.mlir @@ -1,3 +1,4 @@ +// UNSUPPORTED: target={{.*}} // RUN: gc-opt %s --convert-cpuruntime-to-llvm --convert-openmp-to-llvm --convert-func-to-llvm --convert-arith-to-llvm --convert-cf-to-llvm --reconcile-unrealized-casts | gc-cpu-runner -e main -entry-point-result=void | FileCheck %s module { func.func private @omp_get_thread_num() -> i32 diff --git a/test/mlir/unittests/Analysis/TargetDescriptionAnalysisTest.cpp b/test/mlir/unittests/Analysis/TargetDescriptionAnalysisTest.cpp index a3ba8261b..518c50526 100644 --- a/test/mlir/unittests/Analysis/TargetDescriptionAnalysisTest.cpp +++ b/test/mlir/unittests/Analysis/TargetDescriptionAnalysisTest.cpp @@ -26,12 +26,12 @@ using namespace mlir; static const char code1[] = R"mlir( module attributes { dlti.target_system_spec = #dlti.target_system_spec< -"CPU": #dlti.target_device_spec< - #dlti.dl_entry<"L1_cache_size_in_bytes", 49152 : ui32>, - #dlti.dl_entry<"L2_cache_size_in_bytes", 2097152 : ui64>, - #dlti.dl_entry<"L3_cache_size_in_bytes", "110100480">, - #dlti.dl_entry<"num_threads", 56 : i32>, - #dlti.dl_entry<"max_vector_width", 512 : i64>> +"CPU" = #dlti.target_device_spec< + "L1_cache_size_in_bytes" = 49152 : ui32, + "L2_cache_size_in_bytes" = 2097152 : ui64, + "L3_cache_size_in_bytes" = "110100480", + "num_threads" = 56 : i32, + "max_vector_width" = 512 : i64> >} {} )mlir"; @@ -56,9 +56,9 @@ TEST(TargetDescriptionAnalysis, CPUNormal) { static const char code2[] = R"mlir( module attributes { dlti.target_system_spec = #dlti.target_system_spec< -"CPU": #dlti.target_device_spec< - #dlti.dl_entry<"L1_cache_size_in_bytes", 49152 : ui32>, - #dlti.dl_entry<"L2_cache_size_in_bytes", 2097152 : ui32>> +"CPU" = #dlti.target_device_spec< + "L1_cache_size_in_bytes" = 49152 : ui32, + "L2_cache_size_in_bytes" = 2097152 : ui32> >} {} )mlir"; diff --git
a/test/mlir/unittests/ExecutionEngine/IMEX/IMEXGpuOclRuntimeTest.cpp b/test/mlir/unittests/ExecutionEngine/IMEX/IMEXGpuOclRuntimeTest.cpp index d2d15d8a4..ba92536e7 100644 --- a/test/mlir/unittests/ExecutionEngine/IMEX/IMEXGpuOclRuntimeTest.cpp +++ b/test/mlir/unittests/ExecutionEngine/IMEX/IMEXGpuOclRuntimeTest.cpp @@ -62,7 +62,7 @@ module @test { )mlir"; constexpr char matmulAddStatic[] = R"mlir( -module @fragment_name attributes {"#dlti.sys_spec" = #dlti.target_system_spec<"GPU" : #dlti.target_device_spec<#dlti.dl_entry<"max_work_group_size", 16 : i64>>>} { +module @fragment_name attributes {"#dlti.sys_spec" = #dlti.target_system_spec<"GPU" = #dlti.target_device_spec<"max_work_group_size" = 16 : i64>>} { func.func @entry(%arg0: memref<128x256xf16>, %arg1: memref<256x256xf16>, %arg2: memref<128x256xf16>) { %0 = bufferization.to_tensor %arg0 restrict : memref<128x256xf16> %1 = bufferization.to_tensor %arg1 restrict : memref<256x256xf16>