From 22d4193fdec06afe6f3a3518480fe6cd32e4f0dc Mon Sep 17 00:00:00 2001 From: Chao Chen Date: Wed, 20 Aug 2025 22:09:11 +0000 Subject: [PATCH 1/6] add unroll pattern and unit test for load_matrix and store_matrix --- .../mlir/Dialect/XeGPU/Transforms/Passes.td | 4 +- .../mlir/Dialect/XeGPU/Utils/XeGPUUtils.h | 16 ++++ .../XeGPU/Transforms/XeGPUBlocking.cpp | 12 +-- .../Dialect/XeGPU/Transforms/XeGPUUnroll.cpp | 87 +++++++++++++++++-- mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp | 46 ++++++++++ mlir/test/Dialect/XeGPU/xegpu-blocking.mlir | 23 +++++ 6 files changed, 176 insertions(+), 12 deletions(-) diff --git a/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td b/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td index 3a88dae041dd1..ddf6b4ac85a90 100644 --- a/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td +++ b/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td @@ -67,8 +67,8 @@ def XeGPUBlocking: Pass<"xegpu-blocking"> { to a hardware instruction. }]; let dependentDialects = [ - "memref::MemRefDialect", "xegpu::XeGPUDialect", "vector::VectorDialect" - ]; + "memref::MemRefDialect", "xegpu::XeGPUDialect", "vector::VectorDialect", + "index::IndexDialect"]; } #endif // MLIR_DIALECT_XEGPU_TRANSFORMS_PASSES_TD diff --git a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h index db8608c6d20b8..a40dc74edb200 100644 --- a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h +++ b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h @@ -10,6 +10,7 @@ #define MLIR_DIALECT_XEGPU_UTILS_XEGPUUTILS_H_ #include "mlir/IR/BuiltinTypes.h" +#include "mlir/IR/OpDefinition.h" namespace mlir { class VectorType; @@ -18,6 +19,7 @@ class OpResult; class OpBuilder; class ValueRange; class TypeConverter; +class OpFoldResult; namespace xegpu { class LayoutAttr; @@ -128,6 +130,20 @@ void doSCFStructuralTypeConversionWithTensorType(Operation *op, /// if no GPU module parent or XeVM target attribute exists. 
std::optional getChipStr(Operation *op); +/// Generates element-wise addition ops of two arrays with automatic alignment. +/// When the input arrays have different sizes, the shorter array is +/// right-aligned with the longer array, and the unmatched leading elements from +/// the longer array are preserved unchanged. This is commonly used for offset +/// computation where higher-dimensional offsets need to be added to +/// lower-dimensional adjustments. +/// +/// Example: +/// lhs = [l1, l2, l3], rhs = [r1, r2] +/// Result: [l1, l2+r1, l3+r2] +SmallVector addWithRightAligned(OpBuilder &builder, Location loc, + ArrayRef lhs, + ArrayRef rhs); + } // namespace xegpu } // namespace mlir diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp index d82c541f31359..b11f5fe87559b 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp @@ -8,6 +8,7 @@ #include "mlir/Dialect/XeGPU/Transforms/Passes.h" +#include "mlir/Dialect/Index/IR/IndexDialect.h" #include "mlir/Dialect/Vector/Transforms/VectorTransforms.h" #include "mlir/Dialect/XeGPU/IR/XeGPU.h" #include "mlir/Dialect/XeGPU/Transforms/Transforms.h" @@ -155,10 +156,10 @@ XeGPUBlockingPass::getTileShape(const T &operandOrResult) const { std::optional> XeGPUBlockingPass::getTileShape(Operation *op) const { if (isa(op)) + xegpu::UpdateOffsetOp, xegpu::LoadMatrixOp>(op)) return getTileShape(op->getOpResult(0)); if (isa(op)) + xegpu::LoadGatherOp, xegpu::StoreMatrixOp>(op)) return getTileShape(op->getOpOperand(0)); if (isa(op)) return getTileShape(op->getOpOperand(1)); @@ -202,17 +203,18 @@ XeGPUBlockingPass::getTileShape(Operation *op) const { bool XeGPUBlockingPass::needsUnroll(Operation *op) const { // skip the op if any of its operands or results has workgroup level layouts - bool hasWgLayoutOperands = + bool hasSgLayoutOperands = llvm::any_of(op->getOpOperands(), [](OpOperand &opr) {
xegpu::LayoutAttr layout = xegpu::getLayoutAttr(opr); return layout && layout.isWgLayout(); }); - bool hasWgLayoutResults = + bool hasSgLayoutResults = llvm::any_of(op->getOpResults(), [](OpResult result) { xegpu::LayoutAttr layout = xegpu::getLayoutAttr(result); return layout && layout.isWgLayout(); }); - if (hasWgLayoutOperands || hasWgLayoutResults) { + + if (hasSgLayoutOperands || hasSgLayoutResults) { LDBG() << "skip unrolling for op with workgroup level layout: " << *op; return false; } diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp index c793b71639e86..219e4e6f44618 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp @@ -682,13 +682,90 @@ struct UnrollUpdateOffsetOp : public UnrollPattern { } }; +struct UnrollLoadMatrixOp : public UnrollPattern { + using UnrollPattern::UnrollPattern; + LogicalResult matchAndRewrite(xegpu::LoadMatrixOp op, + PatternRewriter &rewriter) const override { + std::optional> targetShape = getTargetShape(op); + if (!targetShape) + return failure(); + + Location loc = op.getLoc(); + VectorType valueTy = op.getType(); + Type elemTy = valueTy.getElementType(); + ArrayRef shape = valueTy.getShape(); + auto layout = dyn_cast(op.getLayoutAttr()); + + VectorType newValueTy = valueTy.cloneWith(*targetShape, elemTy); + + SmallVector mixedOffsets = op.getMixedOffsets(); + SmallVector> offsetsList; + for (SmallVector offsets : + StaticTileOffsetRange(shape, *targetShape)) { + auto adds = xegpu::addWithRightAligned( + rewriter, loc, mixedOffsets, + getAsIndexOpFoldResult(op.getContext(), offsets)); + offsetsList.push_back(adds); + } + + SmallVector newOps; + for (SmallVector offsets : offsetsList) { + auto newOp = rewriter.create( + op.getLoc(), newValueTy, op.getMemDesc(), offsets, + layout.dropInstData()); + newOps.push_back(newOp); + } + Value castOp = unpack(newOps, op.getType(), *targetShape, loc, rewriter); 
+ rewriter.replaceOp(op, castOp); + return success(); + } +}; + +struct UnrollStoreMatrixOp : public UnrollPattern { + using UnrollPattern::UnrollPattern; + LogicalResult matchAndRewrite(xegpu::StoreMatrixOp op, + PatternRewriter &rewriter) const override { + std::optional> targetShape = getTargetShape(op); + if (!targetShape) + return failure(); + + Location loc = op.getLoc(); + VectorType valueTy = op.getData().getType(); + ArrayRef shape = valueTy.getShape(); + auto layout = dyn_cast(op.getLayoutAttr()); + + SmallVector convertedValTypes = + getUnrolledTypes(valueTy, *targetShape); + SmallVector convertedValues = + pack(op.getData(), convertedValTypes, *targetShape, loc, rewriter); + + SmallVector mixedOffsets = op.getMixedOffsets(); + SmallVector> offsetsList; + for (SmallVector offsets : + StaticTileOffsetRange(shape, *targetShape)) { + auto adds = xegpu::addWithRightAligned( + rewriter, loc, mixedOffsets, + getAsIndexOpFoldResult(op.getContext(), offsets)); + offsetsList.push_back(adds); + } + + for (auto [v, offsets] : llvm::zip_equal(convertedValues, offsetsList)) + rewriter.create(loc, v, op.getMemDesc(), offsets, + layout.dropInstData()); + + rewriter.eraseOp(op); + return success(); + } +}; + } // namespace void mlir::xegpu::populateXeGPUUnrollPatterns( RewritePatternSet &patterns, const xegpu::UnrollOptions &options) { - patterns.add(patterns.getContext(), - options); + patterns + .add( + patterns.getContext(), options); } diff --git a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp index 19eedbac0f76b..088e8a8c497d9 100644 --- a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp +++ b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp @@ -12,6 +12,7 @@ #include "mlir/Dialect/XeGPU/Utils/XeGPUUtils.h" #include "mlir/Dialect/GPU/IR/GPUDialect.h" +#include "mlir/Dialect/Index/IR/IndexOps.h" #include "mlir/Dialect/LLVMIR/XeVMDialect.h" #include "mlir/Dialect/SCF/Transforms/Patterns.h" #include 
"mlir/Dialect/Utils/IndexingUtils.h" @@ -133,6 +134,14 @@ xegpu::LayoutAttr xegpu::getLayoutAttr(const Value value) { if (auto loadNd = dyn_cast(defOp)) return getLayoutAttr(loadNd.getTensorDesc()); + // for LoadMatrixOp, the layout is attached to the property of the op + if (auto loadOp = dyn_cast(defOp)) + return dyn_cast_if_present(loadOp.getLayoutAttr()); + + // for StoreMatrixOp, the layout is attached to the property of the op + if (auto storeOp = dyn_cast(defOp)) + return dyn_cast_if_present(storeOp.getLayoutAttr()); + std::string layoutName = getLayoutName(result); if (defOp->hasAttr(layoutName)) return defOp->getAttrOfType(layoutName); @@ -152,6 +161,13 @@ xegpu::LayoutAttr xegpu::getLayoutAttr(const Value value) { xegpu::LayoutAttr xegpu::getLayoutAttr(const OpOperand &opr) { Operation *op = opr.getOwner(); + + if (auto loadOp = dyn_cast(op)) + return dyn_cast_if_present(loadOp.getLayoutAttr()); + + if (auto storeOp = dyn_cast(op)) + return dyn_cast_if_present(storeOp.getLayoutAttr()); + std::string layoutName = xegpu::getLayoutName(opr); if (op->hasAttr(layoutName)) return op->getAttrOfType(layoutName); @@ -179,6 +195,8 @@ xegpu::setLayoutAttr(const mlir::OpOperand &operand, void xegpu::setLayoutAttrs(Operation *op, function_ref getLayoutImpl) { op->walk([&](Operation *nestOp) { + if (isa(nestOp)) + return; for (OpOperand &opr : nestOp->getOpOperands()) { auto layout = getLayoutImpl(opr.get()); setLayoutAttr(opr, layout); @@ -424,3 +442,31 @@ std::optional xegpu::getChipStr(Operation *op) { return std::nullopt; } + +/// Generates element-wise addition ops of two arrays with automatic alignment. +/// When the input arrays have different sizes, the shorter array is +/// right-aligned with the longer array, and the unmatched leading elements from +/// the longer array are preserved unchanged. This is commonly used for offset +/// computation where higher-dimensional offsets need to be added to +/// lower-dimensional adjustments. 
+/// +/// Example: +/// lhs = [l1, l2, l3], rhs = [r1, r2] +/// Result: [l1, l2+r1, l3+r2] +SmallVector +xegpu::addWithRightAligned(OpBuilder &builder, Location loc, + ArrayRef lhs, + ArrayRef rhs) { + // ensure a is longer than b + ArrayRef a = lhs.size() >= rhs.size() ? lhs : rhs; + ArrayRef b = lhs.size() >= rhs.size() ? rhs : lhs; + SmallVector results(a.take_front(a.size() - b.size())); + a = a.slice(a.size() - b.size()); + for (auto [l, r] : llvm::zip(a, b)) { + auto lval = getValueOrCreateConstantIndexOp(builder, loc, l); + auto rval = getValueOrCreateConstantIndexOp(builder, loc, r); + results.push_back(builder.createOrFold(loc, lval, rval)); + } + return results; + return {}; +} diff --git a/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir b/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir index d986e5bd1cfb4..9d63c2ddd4895 100644 --- a/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir +++ b/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir @@ -561,3 +561,26 @@ gpu.module @test_kernel { gpu.return %e : vector<8x32x2xf16> } } + +// ----- +gpu.module @test_kernel { + //CHECK-LABEL: unroll_load_matrix + gpu.func @unroll_load_matrix(%arg0: memref<4096xi8, 3>) -> vector<32x32xf32> { + %0 = xegpu.create_mem_desc %arg0 : memref<4096xi8, 3> -> !xegpu.mem_desc<32x32xf32> + //CHECK-COUNT-8: xegpu.load_matrix {{.*}} : !xegpu.mem_desc<32x32xf32>, index, index -> vector<8x16xf32> + //CHECK-COUNT-8: vector.insert_strided_slice {{.*}} : vector<8x16xf32> into vector<32x32xf32> + %1 = xegpu.load_matrix %0[0, 0] <{layout = #xegpu.layout}>: !xegpu.mem_desc<32x32xf32> -> vector<32x32xf32> + gpu.return %1: vector<32x32xf32> + } +} + +// ----- +gpu.module @test_kernel { + // CHECK-LABEL: unroll_store_matrix + gpu.func @unroll_store_matrix(%value: vector<32x32xf32>, %arg0 : memref<32768xi8, 3>) { + %mdesc = xegpu.create_mem_desc %arg0 : memref<32768xi8, 3> -> !xegpu.mem_desc<64x128xf32> + // CHECK-COUNT-8: xegpu.store_matrix {{.*}} : vector<8x16xf32>, !xegpu.mem_desc<64x128xf32>, index, index +
xegpu.store_matrix %value, %mdesc[0, 0] {layout = #xegpu.layout} : vector<32x32xf32>, !xegpu.mem_desc<64x128xf32> + gpu.return + } +} From d20da858449bfd926df69de3f2b777cae4ee2f24 Mon Sep 17 00:00:00 2001 From: Chao Chen Date: Wed, 20 Aug 2025 22:09:11 +0000 Subject: [PATCH 2/6] add unroll pattern and unit test for load_matrix and store_matrix --- .../mlir/Dialect/XeGPU/Transforms/Passes.td | 4 +- .../mlir/Dialect/XeGPU/Utils/XeGPUUtils.h | 1 + .../XeGPU/Transforms/XeGPUBlocking.cpp | 12 +-- .../Dialect/XeGPU/Transforms/XeGPUUnroll.cpp | 87 +++++++++++++++++-- mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp | 17 ++++ mlir/test/Dialect/XeGPU/xegpu-blocking.mlir | 23 +++++ 6 files changed, 132 insertions(+), 12 deletions(-) diff --git a/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td b/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td index 3a88dae041dd1..ddf6b4ac85a90 100644 --- a/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td +++ b/mlir/include/mlir/Dialect/XeGPU/Transforms/Passes.td @@ -67,8 +67,8 @@ def XeGPUBlocking: Pass<"xegpu-blocking"> { to a hardware instruction. 
}]; let dependentDialects = [ - "memref::MemRefDialect", "xegpu::XeGPUDialect", "vector::VectorDialect" - ]; + "memref::MemRefDialect", "xegpu::XeGPUDialect", "vector::VectorDialect", + "index::IndexDialect"]; } #endif // MLIR_DIALECT_XEGPU_TRANSFORMS_PASSES_TD diff --git a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h index b2b2d3ab85231..a40dc74edb200 100644 --- a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h +++ b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h @@ -19,6 +19,7 @@ class OpResult; class OpBuilder; class ValueRange; class TypeConverter; +class OpFoldResult; namespace xegpu { class LayoutAttr; diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp index b3144e4c1e55d..fb4f00b21f2b9 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp @@ -8,6 +8,7 @@ #include "mlir/Dialect/XeGPU/Transforms/Passes.h" +#include "mlir/Dialect/Index/IR/IndexDialect.h" #include "mlir/Dialect/Vector/Transforms/VectorTransforms.h" #include "mlir/Dialect/XeGPU/IR/XeGPU.h" #include "mlir/Dialect/XeGPU/Transforms/Transforms.h" @@ -155,10 +156,10 @@ XeGPUBlockingPass::getTileShape(const T &operandOrResult) const { std::optional> XeGPUBlockingPass::getTileShape(Operation *op) const { if (isa(op)) + xegpu::UpdateOffsetOp, xegpu::LoadMatrixOp>(op)) return getTileShape(op->getOpResult(0)); if (isa(op)) + xegpu::LoadGatherOp, xegpu::StoreMatrixOp>(op)) return getTileShape(op->getOpOperand(0)); if (isa(op)) return getTileShape(op->getOpOperand(1)); @@ -202,17 +203,18 @@ XeGPUBlockingPass::getTileShape(Operation *op) const { bool XeGPUBlockingPass::needsUnroll(Operation *op) const { // skip the op if any of its operands or results has workgroup level layouts - bool hasWgLayoutOperands = + bool hasSgLayoutOperands = llvm::any_of(op->getOpOperands(), [](OpOperand &opr) { xegpu::LayoutAttr 
layout = xegpu::getLayoutAttr(opr); return layout && layout.isForWorkgroup(); }); - bool hasWgLayoutResults = + bool hasSgLayoutResults = llvm::any_of(op->getOpResults(), [](OpResult result) { xegpu::LayoutAttr layout = xegpu::getLayoutAttr(result); return layout && layout.isForWorkgroup(); }); - if (hasWgLayoutOperands || hasWgLayoutResults) { + + if (hasSgLayoutOperands || hasSgLayoutResults) { LDBG() << "skip unrolling for op with workgroup level layout: " << *op; return false; } diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp index c793b71639e86..219e4e6f44618 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp @@ -682,13 +682,90 @@ struct UnrollUpdateOffsetOp : public UnrollPattern { } }; +struct UnrollLoadMatrixOp : public UnrollPattern { + using UnrollPattern::UnrollPattern; + LogicalResult matchAndRewrite(xegpu::LoadMatrixOp op, + PatternRewriter &rewriter) const override { + std::optional> targetShape = getTargetShape(op); + if (!targetShape) + return failure(); + + Location loc = op.getLoc(); + VectorType valueTy = op.getType(); + Type elemTy = valueTy.getElementType(); + ArrayRef shape = valueTy.getShape(); + auto layout = dyn_cast(op.getLayoutAttr()); + + VectorType newValueTy = valueTy.cloneWith(*targetShape, elemTy); + + SmallVector mixedOffsets = op.getMixedOffsets(); + SmallVector> offsetsList; + for (SmallVector offsets : + StaticTileOffsetRange(shape, *targetShape)) { + auto adds = xegpu::addWithRightAligned( + rewriter, loc, mixedOffsets, + getAsIndexOpFoldResult(op.getContext(), offsets)); + offsetsList.push_back(adds); + } + + SmallVector newOps; + for (SmallVector offsets : offsetsList) { + auto newOp = rewriter.create( + op.getLoc(), newValueTy, op.getMemDesc(), offsets, + layout.dropInstData()); + newOps.push_back(newOp); + } + Value castOp = unpack(newOps, op.getType(), *targetShape, loc, rewriter); + 
rewriter.replaceOp(op, castOp); + return success(); + } +}; + +struct UnrollStoreMatrixOp : public UnrollPattern { + using UnrollPattern::UnrollPattern; + LogicalResult matchAndRewrite(xegpu::StoreMatrixOp op, + PatternRewriter &rewriter) const override { + std::optional> targetShape = getTargetShape(op); + if (!targetShape) + return failure(); + + Location loc = op.getLoc(); + VectorType valueTy = op.getData().getType(); + ArrayRef shape = valueTy.getShape(); + auto layout = dyn_cast(op.getLayoutAttr()); + + SmallVector convertedValTypes = + getUnrolledTypes(valueTy, *targetShape); + SmallVector convertedValues = + pack(op.getData(), convertedValTypes, *targetShape, loc, rewriter); + + SmallVector mixedOffsets = op.getMixedOffsets(); + SmallVector> offsetsList; + for (SmallVector offsets : + StaticTileOffsetRange(shape, *targetShape)) { + auto adds = xegpu::addWithRightAligned( + rewriter, loc, mixedOffsets, + getAsIndexOpFoldResult(op.getContext(), offsets)); + offsetsList.push_back(adds); + } + + for (auto [v, offsets] : llvm::zip_equal(convertedValues, offsetsList)) + rewriter.create(loc, v, op.getMemDesc(), offsets, + layout.dropInstData()); + + rewriter.eraseOp(op); + return success(); + } +}; + } // namespace void mlir::xegpu::populateXeGPUUnrollPatterns( RewritePatternSet &patterns, const xegpu::UnrollOptions &options) { - patterns.add(patterns.getContext(), - options); + patterns + .add( + patterns.getContext(), options); } diff --git a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp index 6835f64ad8ef7..f77749fd77831 100644 --- a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp +++ b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp @@ -134,6 +134,14 @@ xegpu::LayoutAttr xegpu::getLayoutAttr(const Value value) { if (auto loadNd = dyn_cast(defOp)) return getLayoutAttr(loadNd.getTensorDesc()); + // for LoadMatrixOp, the layout is attached to the property of the op + if (auto loadOp = dyn_cast(defOp)) + return 
dyn_cast_if_present(loadOp.getLayoutAttr()); + + // for StoreMatrixOp, the layout is attached to the property of the op + if (auto storeOp = dyn_cast(defOp)) + return dyn_cast_if_present(storeOp.getLayoutAttr()); + std::string layoutName = getLayoutName(result); if (defOp->hasAttr(layoutName)) return defOp->getAttrOfType(layoutName); @@ -153,6 +161,13 @@ xegpu::LayoutAttr xegpu::getLayoutAttr(const Value value) { xegpu::LayoutAttr xegpu::getLayoutAttr(const OpOperand &opr) { Operation *op = opr.getOwner(); + + if (auto loadOp = dyn_cast(op)) + return dyn_cast_if_present(loadOp.getLayoutAttr()); + + if (auto storeOp = dyn_cast(op)) + return dyn_cast_if_present(storeOp.getLayoutAttr()); + std::string layoutName = xegpu::getLayoutName(opr); if (op->hasAttr(layoutName)) return op->getAttrOfType(layoutName); @@ -180,6 +195,8 @@ xegpu::setLayoutAttr(const mlir::OpOperand &operand, void xegpu::setLayoutAttrs(Operation *op, function_ref getLayoutImpl) { op->walk([&](Operation *nestOp) { + if (isa(nestOp)) + return; for (OpOperand &opr : nestOp->getOpOperands()) { auto layout = getLayoutImpl(opr.get()); setLayoutAttr(opr, layout); diff --git a/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir b/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir index d986e5bd1cfb4..9d63c2ddd4895 100644 --- a/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir +++ b/mlir/test/Dialect/XeGPU/xegpu-blocking.mlir @@ -561,3 +561,26 @@ gpu.module @test_kernel { gpu.return %e : vector<8x32x2xf16> } } + +// ----- +gpu.module @test_kernel { + //CHECK-LABEL: unroll_load_matrix + gpu.func @unroll_load_matrix(%arg0: memref<4096xi8, 3>) -> vector<32x32xf32> { + %0 = xegpu.create_mem_desc %arg0 : memref<4096xi8, 3> -> !xegpu.mem_desc<32x32xf32> + //CHECK-COUNT-8: xegpu.load_matrix {{.*}} : !xegpu.mem_desc<32x32xf32>, index, index -> vector<8x16xf32> + //CHECK-COUNT-8: vector.insert_strided_slice {{.*}} : vector<8x16xf32> into vector<32x32xf32> + %1 = xegpu.load_matrix %0[0, 0] <{layout = #xegpu.layout}>: 
!xegpu.mem_desc<32x32xf32> -> vector<32x32xf32> + gpu.return %1: vector<32x32xf32> + } +} + +// ----- +gpu.module @test_kernel { + // CHECK-LABEL: unroll_store_matrix + gpu.func @unroll_store_matrix(%value: vector<32x32xf32>, %arg0 : memref<32768xi8, 3>) { + %mdesc = xegpu.create_mem_desc %arg0 : memref<32768xi8, 3> -> !xegpu.mem_desc<64x128xf32> + // CHECK-COUNT-8: xegpu.store_matrix {{.*}} : vector<8x16xf32>, !xegpu.mem_desc<64x128xf32>, index, index + xegpu.store_matrix %value, %mdesc[0, 0] {layout = #xegpu.layout} : vector<32x32xf32>, !xegpu.mem_desc<64x128xf32> + gpu.return + } +} From 442c18aed86ab2423e20401d8c17f9c5b73543b3 Mon Sep 17 00:00:00 2001 From: Chao Chen Date: Wed, 27 Aug 2025 17:54:16 +0000 Subject: [PATCH 3/6] merge --- mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp index 95aa1fc58f4f6..2e17e559fdd2d 100644 --- a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp +++ b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp @@ -136,11 +136,11 @@ xegpu::DistributeLayoutAttr xegpu::getDistributeLayoutAttr(const Value value) { // for LoadMatrixOp, the layout is attached to the property of the op if (auto loadOp = dyn_cast(defOp)) - return dyn_cast_if_present(loadOp.getLayoutAttr()); + return loadOp.getLayoutAttr(); // for StoreMatrixOp, the layout is attached to the property of the op if (auto storeOp = dyn_cast(defOp)) - return dyn_cast_if_present(storeOp.getLayoutAttr()); + return storeOp.getLayoutAttr(); std::string layoutName = getLayoutName(result); if (defOp->hasAttr(layoutName)) @@ -164,10 +164,10 @@ xegpu::getDistributeLayoutAttr(const OpOperand &opr) { Operation *op = opr.getOwner(); if (auto loadOp = dyn_cast(op)) - return dyn_cast_if_present(loadOp.getLayoutAttr()); + return loadOp.getLayoutAttr(); if (auto storeOp = dyn_cast(op)) - return dyn_cast_if_present(storeOp.getLayoutAttr()); + return 
storeOp.getLayoutAttr(); std::string layoutName = xegpu::getLayoutName(opr); if (op->hasAttr(layoutName)) @@ -199,6 +199,7 @@ void xegpu::setDistributeLayoutAttrs( op->walk([&](Operation *nestOp) { if (isa(nestOp)) return; + for (OpOperand &opr : nestOp->getOpOperands()) { auto layout = getLayoutImpl(opr.get()); setDistributeLayoutAttr(opr, layout); @@ -471,5 +472,4 @@ xegpu::addWithRightAligned(OpBuilder &builder, Location loc, results.push_back(builder.createOrFold(loc, lval, rval)); } return results; - return {}; } From a368430b6636f3285062c45b7657d9d9103485f7 Mon Sep 17 00:00:00 2001 From: Chao Chen Date: Wed, 27 Aug 2025 17:59:56 +0000 Subject: [PATCH 4/6] roll back unnecessary change --- mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp index 1ccb4e89fd6a5..5d5ff69e06886 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUBlocking.cpp @@ -205,20 +205,19 @@ XeGPUBlockingPass::getTileShape(Operation *op) const { bool XeGPUBlockingPass::needsUnroll(Operation *op) const { // skip the op if any of its operands or results has workgroup level layouts - bool hasSgLayoutOperands = + bool hasWgLayoutOperands = llvm::any_of(op->getOpOperands(), [](OpOperand &opr) { xegpu::DistributeLayoutAttr layout = xegpu::getDistributeLayoutAttr(opr); return layout && layout.isForWorkgroup(); }); - bool hasSgLayoutResults = + bool hasWgLayoutResults = llvm::any_of(op->getOpResults(), [](OpResult result) { xegpu::DistributeLayoutAttr layout = xegpu::getDistributeLayoutAttr(result); return layout && layout.isForWorkgroup(); }); - - if (hasSgLayoutOperands || hasSgLayoutResults) { + if (hasWgLayoutOperands || hasWgLayoutResults) { LDBG() << "skip unrolling for op with workgroup level layout: " << *op; return false; } From 
3f5d69299a65cc1854e4fe502dd2628629a31599 Mon Sep 17 00:00:00 2001 From: Chao Chen Date: Wed, 27 Aug 2025 18:26:19 +0000 Subject: [PATCH 5/6] add unit test --- mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir | 27 +++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir index f4a49da71605f..c0fb373835e3d 100644 --- a/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir +++ b/mlir/test/Dialect/XeGPU/xegpu-wg-to-sg.mlir @@ -26,6 +26,33 @@ gpu.module @test_1_1_assignment { gpu.return } + // CHECK-LABEL: create_nd_tdesc_from_higher_rank_memref + // CHECK-SAME: [[ARG_0:%.*]]: memref<3x256x128xf32> + gpu.func @create_nd_tdesc_from_higher_rank_memref(%src: memref<3x256x128xf32>) { + //CHECK: [[SGID:%.+]] = gpu.subgroup_id : index + //CHECK: [[SGIDY:%.+]] = affine.apply #map()[[[SGID]]] + //CHECK: [[SGIDX:%.+]] = affine.apply #map1()[[[SGID]]] + //CHECK: [[C32:%.+]] = arith.constant 32 : index + //CHECK: [[LY:%.+]] = index.mul [[SGIDY]], [[C32]] + //CHECK: [[LX:%.+]] = index.mul [[SGIDX]], [[C32]] + //CHECK: [[C0:%.+]] = arith.constant 0 : index + //CHECK: [[C0_2:%.+]] = arith.constant 0 : index + //CHECK: [[UY:%.+]] = arith.addi [[LY]], [[C0]] : index + //CHECK: [[UX:%.+]] = arith.addi [[LX]], [[C0_2]] : index + //CHECK: [[C256:%.+]] = arith.constant 256 : index + //CHECK: [[MODY:%.+]] = index.remu [[UY]], [[C256]] + //CHECK: [[C128:%.+]] = arith.constant 128 : index + //CHECK: [[MODX:%.+]] = index.remu [[UX]], [[C128]] + //CHECK: [[C0_3:%.+]] = arith.constant 0 : index + //CHECK: [[Y:%.+]] = index.add [[MODY]], [[C0_3]] + //CHECK: [[C0_4:%.+]] = arith.constant 0 : index + //CHECK: [[X:%.+]] = index.add [[MODX]], [[C0_4]] + //CHECK: [[TDESC:%.+]] = xegpu.create_nd_tdesc [[ARG_0]][1, [[Y]], [[X]]] : memref<3x256x128xf32> -> !xegpu.tensor_desc<32x32xf32, #xegpu.layout> + %tdesc = xegpu.create_nd_tdesc %src[1, 0, 0] : memref<3x256x128xf32> + -> !xegpu.tensor_desc<256x128xf32, #xegpu.layout> 
+ gpu.return + } + // CHECK-LABEL: load_nd_tdesc // CHECK-SAME: %[[ARG_0:.*]]: memref<256x128xf32> gpu.func @load_nd_tdesc(%src: memref<256x128xf32>) { From a0512d9f95f01b79b13192a37899972ad3314dfb Mon Sep 17 00:00:00 2001 From: Chao Chen Date: Wed, 3 Sep 2025 15:09:44 +0000 Subject: [PATCH 6/6] address comments --- .../mlir/Dialect/XeGPU/Utils/XeGPUUtils.h | 6 +++++- .../Dialect/XeGPU/Transforms/XeGPUUnroll.cpp | 14 ++++++------- mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp | 21 ++++++++++++++----- 3 files changed, 28 insertions(+), 13 deletions(-) diff --git a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h index 701d851eade35..04cfd58d846a7 100644 --- a/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h +++ b/mlir/include/mlir/Dialect/XeGPU/Utils/XeGPUUtils.h @@ -144,6 +144,11 @@ void doSCFStructuralTypeConversionWithTensorType(Operation *op, /// if no GPU module parent or XeVM target attribute exists. std::optional getChipStr(Operation *op); +/// Generates element-wise addition ops of two arrays with same length. +SmallVector addElementwise(OpBuilder &builder, Location loc, + ArrayRef lhs, + ArrayRef rhs); + /// Generates element-wise addition ops of two arrays with automatic alignment. 
/// When the input arrays have different sizes, the shorter array is /// right-aligned with the longer array, and the unmatched leading elements from @@ -157,7 +162,6 @@ std::optional getChipStr(Operation *op); SmallVector addWithRightAligned(OpBuilder &builder, Location loc, ArrayRef lhs, ArrayRef rhs); - } // namespace xegpu } // namespace mlir diff --git a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp index 219e4e6f44618..d24d82780ebaa 100644 --- a/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp +++ b/mlir/lib/Dialect/XeGPU/Transforms/XeGPUUnroll.cpp @@ -686,12 +686,12 @@ struct UnrollLoadMatrixOp : public UnrollPattern { using UnrollPattern::UnrollPattern; LogicalResult matchAndRewrite(xegpu::LoadMatrixOp op, PatternRewriter &rewriter) const override { + Location loc = op.getLoc(); + VectorType valueTy = op.getType(); std::optional> targetShape = getTargetShape(op); - if (!targetShape) + if (!targetShape || targetShape->size() != (size_t)valueTy.getRank()) return failure(); - Location loc = op.getLoc(); - VectorType valueTy = op.getType(); Type elemTy = valueTy.getElementType(); ArrayRef shape = valueTy.getShape(); auto layout = dyn_cast(op.getLayoutAttr()); @@ -702,17 +702,17 @@ struct UnrollLoadMatrixOp : public UnrollPattern { SmallVector> offsetsList; for (SmallVector offsets : StaticTileOffsetRange(shape, *targetShape)) { - auto adds = xegpu::addWithRightAligned( + auto adds = xegpu::addElementwise( rewriter, loc, mixedOffsets, getAsIndexOpFoldResult(op.getContext(), offsets)); offsetsList.push_back(adds); } SmallVector newOps; + layout = layout.dropInstData(); for (SmallVector offsets : offsetsList) { auto newOp = rewriter.create( - op.getLoc(), newValueTy, op.getMemDesc(), offsets, - layout.dropInstData()); + op.getLoc(), newValueTy, op.getMemDesc(), offsets, layout); newOps.push_back(newOp); } Value castOp = unpack(newOps, op.getType(), *targetShape, loc, rewriter); @@ -743,7 +743,7 @@ struct 
UnrollStoreMatrixOp : public UnrollPattern { SmallVector> offsetsList; for (SmallVector offsets : StaticTileOffsetRange(shape, *targetShape)) { - auto adds = xegpu::addWithRightAligned( + auto adds = xegpu::addElementwise( rewriter, loc, mixedOffsets, getAsIndexOpFoldResult(op.getContext(), offsets)); offsetsList.push_back(adds); diff --git a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp index 2e17e559fdd2d..b72d5648b29f9 100644 --- a/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp +++ b/mlir/lib/Dialect/XeGPU/Utils/XeGPUUtils.cpp @@ -447,6 +447,21 @@ std::optional xegpu::getChipStr(Operation *op) { return std::nullopt; } +/// Generates element-wise addition ops of two arrays with same length. +SmallVector xegpu::addElementwise(OpBuilder &builder, + Location loc, + ArrayRef lhs, + ArrayRef rhs) { + assert(lhs.size() == rhs.size() && "lhs and rhs must have the same size"); + SmallVector results; + for (auto [l, r] : llvm::zip_equal(lhs, rhs)) { + auto lval = getValueOrCreateConstantIndexOp(builder, loc, l); + auto rval = getValueOrCreateConstantIndexOp(builder, loc, r); + results.push_back(builder.createOrFold(loc, lval, rval)); + } + return results; +} + /// Generates element-wise addition ops of two arrays with automatic alignment. /// When the input arrays have different sizes, the shorter array is /// right-aligned with the longer array, and the unmatched leading elements from @@ -466,10 +481,6 @@ xegpu::addWithRightAligned(OpBuilder &builder, Location loc, ArrayRef b = lhs.size() >= rhs.size() ? rhs : lhs; SmallVector results(a.take_front(a.size() - b.size())); a = a.slice(a.size() - b.size()); - for (auto [l, r] : llvm::zip(a, b)) { - auto lval = getValueOrCreateConstantIndexOp(builder, loc, l); - auto rval = getValueOrCreateConstantIndexOp(builder, loc, r); - results.push_back(builder.createOrFold(loc, lval, rval)); - } + results.append(addElementwise(builder, loc, a, b)); return results; }