diff --git a/clang/include/clang/CIR/Dialect/IR/CIROps.td b/clang/include/clang/CIR/Dialect/IR/CIROps.td
index a08e16df9222..b06c61577150 100644
--- a/clang/include/clang/CIR/Dialect/IR/CIROps.td
+++ b/clang/include/clang/CIR/Dialect/IR/CIROps.td
@@ -3563,6 +3563,63 @@ def LLVMIntrinsicCallOp : CIR_Op<"llvm.intrinsic"> {
 }
 
+//===----------------------------------------------------------------------===//
+// InvariantGroupOp
+//===----------------------------------------------------------------------===//
+
+def InvariantGroupOp
+    : CIR_Op<"invariant_group", [Pure, SameOperandsAndResultType]> {
+  let summary = "Start an invariant group";
+  let description = [{
+    The `cir.invariant_group` operation takes a single pointer value as its
+    argument and returns the same pointer value with fresh [invariant group]
+    information. All loads and stores that access the returned pointer value
+    are presumed by the optimizer to load or store the same value.
+
+    [invariant group]: https://llvm.org/docs/LangRef.html#invariant-group-metadata
+
+    This operation is not emitted during CIRGen. Instead, it is created when
+    hoisting constant alloca operations to the entry block of a function. It
+    effectively marks the syntactic scope of the constant local variable
+    represented by the hoisted alloca operation, which allows for better LLVM
+    IR generation with potentially more optimizations.
+
+    For example, if we have the following CIR before alloca hoisting:
+
+    ```mlir
+    cir.func @foo() {
+      cir.scope {
+        %0 = cir.alloca !s32i : !cir.ptr<!s32i>
+        use(%0)
+      }
+    }
+    ```
+
+    After alloca hoisting:
+
+    ```mlir
+    cir.func @foo() {
+      %0 = cir.alloca !s32i : !cir.ptr<!s32i>
+      cir.scope {
+        %1 = cir.invariant_group %0 : !cir.ptr<!s32i>
+        use(%1)
+      }
+    }
+    ```
+
+    During LLVM IR lowering, load and store operations whose pointer operand
+    comes from `cir.invariant_group` are lowered to the corresponding LLVM
+    instructions with invariant group metadata attached.
+  }];
+
+  let arguments = (ins CIR_PointerType:$ptr);
+  let results = (outs CIR_PointerType:$result);
+
+  let assemblyFormat = [{
+    $ptr `:` type($result) attr-dict
+  }];
+}
+
 //===----------------------------------------------------------------------===//
 // DeleteArrayOp
 //===----------------------------------------------------------------------===//
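To make the motivation concrete before the pass changes below, here is a minimal C++ sketch (illustrative only, not part of this patch; `opaque` is a hypothetical external function) of the situation the op is designed for: a `const` local declared inside a loop reuses a single hoisted stack slot across iterations, so each iteration must start a fresh invariant group.

```cpp
// Illustrative sketch, not from this patch. `opaque` stands for any call
// the optimizer cannot see through.
int opaque();

int sum_of_consts() {
  int acc = 0;
  for (int i = 0; i < 4; ++i) {
    // One cir.alloca backs `x`; after hoisting it lives in the entry block.
    // Without a fresh invariant group per iteration, the optimizer could
    // assume every iteration reloads the value stored by the first one.
    const int x = opaque();
    acc += x;
  }
  return acc;
}
```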
diff --git a/clang/lib/CIR/Dialect/Transforms/HoistAllocas.cpp b/clang/lib/CIR/Dialect/Transforms/HoistAllocas.cpp
index 4b29c7235a02..a4de5f2af3ed 100644
--- a/clang/lib/CIR/Dialect/Transforms/HoistAllocas.cpp
+++ b/clang/lib/CIR/Dialect/Transforms/HoistAllocas.cpp
@@ -28,7 +28,142 @@ struct HoistAllocasPass : public HoistAllocasBase<HoistAllocasPass> {
   void runOnOperation() override;
 };
 
-static void process(cir::FuncOp func) {
+static bool isOpInLoop(mlir::Operation *op) {
+  return op->getParentOfType<cir::LoopOpInterface>();
+}
+
+static bool hasStoreToAllocaInWhileCond(cir::AllocaOp alloca) {
+  // This function determines whether the given alloca operation represents
+  // a variable defined as a while loop's condition.
+  //
+  // Specifically, C/C++ allows the condition of a while loop to be a variable
+  // declaration:
+  //
+  //   while (const int x = foo()) { /* body... */ }
+  //
+  // CIRGen emits the following CIR for the code above:
+  //
+  //   cir.scope {
+  //     %x.slot = cir.alloca !s32i [init, const]
+  //     cir.while {
+  //       %0 = cir.call @foo()
+  //       cir.store %0, %x.slot
+  //       %1 = cir.load %x.slot
+  //       %2 = cir.cast int_to_bool %1
+  //       cir.condition(%2)
+  //     } do {
+  //       // loop body goes here.
+  //     }
+  //   }
+  //
+  // Note that %x.slot is emitted outside the cir.while operation. Ideally,
+  // the cir.while operation should cover this cir.alloca operation, but
+  // currently CIR does not work this way. When hoisting such an alloca
+  // operation, one must remove the "const" flag from it; otherwise the LLVM
+  // lowering code will mistakenly attach invariant group metadata to the
+  // load and store operations in the while body, indicating that all loads
+  // and stores across all iterations of the loop yield the same value.
+
+  for (mlir::Operation *user : alloca->getUsers()) {
+    if (!mlir::isa<cir::StoreOp>(user))
+      continue;
+
+    auto store = mlir::cast<cir::StoreOp>(user);
+    mlir::Operation *storeParentOp = store->getParentOp();
+    if (!mlir::isa<cir::WhileOp>(storeParentOp))
+      continue;
+
+    auto whileOp = mlir::cast<cir::WhileOp>(storeParentOp);
+    return &whileOp.getCond() == store->getParentRegion();
+  }
+
+  return false;
+}
+
+static void processConstAlloca(cir::AllocaOp alloca) {
+  // When optimization is enabled, LLVM lowering emits invariant group
+  // metadata for loads and stores to alloca-ed objects with the "const"
+  // attribute. For example, the following CIR:
+  //
+  //   %slot = cir.alloca !s32i [init, const]
+  //   cir.store %0, %slot
+  //   %1 = cir.load %slot
+  //
+  // is lowered to the following LLVM IR:
+  //
+  //   %slot = alloca i32, i64 1
+  //   store i32 %0, ptr %slot, !invariant.group !0
+  //   %1 = load i32, ptr %slot, !invariant.group !0
+  //
+  // The invariant group metadata tells the LLVM optimizer that the store
+  // and the load access the same value in %slot.
+  //
+  // So far so good. Things get tricky when such an alloca operation appears
+  // in the body of a loop construct:
+  //
+  //   cir.some_loop_construct {
+  //     %slot = cir.alloca !s32i [init, const]
+  //     cir.store %0, %slot
+  //     %1 = cir.load %slot
+  //   }
+  //
+  // After alloca hoisting, the CIR code above would be transformed into:
+  //
+  //   %slot = cir.alloca !s32i [init, const]
+  //   cir.some_loop_construct {
+  //     cir.store %0, %slot
+  //     %1 = cir.load %slot
+  //   }
+  //
+  // Notice how alloca hoisting changes the semantics of the program in such
+  // a case. The transformed code now tells the optimizer that the load and
+  // store operations load and store the same value **across all iterations
+  // of the loop**!
+  //
+  // To overcome this problem, we instead transform the program into this:
+  //
+  //   %slot = cir.alloca !s32i [init, const]
+  //   cir.some_loop_construct {
+  //     %slot.inv = cir.invariant_group %slot
+  //     cir.store %0, %slot.inv
+  //     %1 = cir.load %slot.inv
+  //   }
+  //
+  // The cir.invariant_group operation attaches fresh invariant information
+  // to the operand pointer and yields a pointer carrying that fresh
+  // invariant information. Upon each loop iteration, the old invariant
+  // information is discarded and new invariant information is attached, so
+  // the original semantics of the program are preserved. During LLVM
+  // lowering, the cir.invariant_group operation eventually becomes an
+  // intrinsic call to @llvm.launder.invariant.group.
+
+  if (isOpInLoop(alloca)) {
+    // Mark the alloca-ed pointer as invariant via the cir.invariant_group
+    // operation.
+    mlir::OpBuilder builder(alloca);
+    auto invariantGroupOp =
+        builder.create<cir::InvariantGroupOp>(alloca.getLoc(), alloca);
+
+    // And replace all uses of the original alloca-ed pointer with the marked
+    // pointer (which carries the invariant group information).
+    alloca->replaceUsesWithIf(
+        invariantGroupOp,
+        [op = invariantGroupOp.getOperation()](mlir::OpOperand &use) {
+          return use.getOwner() != op;
+        });
+  } else if (hasStoreToAllocaInWhileCond(alloca)) {
+    // The alloca represents a variable declared as the condition of a while
+    // loop. In CIR, the alloca would be emitted in a scope outside of the
+    // while loop. We have to remove the constant flag during hoisting;
+    // otherwise we would be telling the optimizer that the alloca-ed value
+    // is constant across all iterations of the while loop.
+    //
+    // See the body of the hasStoreToAllocaInWhileCond function for more
+    // details.
+    alloca.setConstant(false);
+  }
+}
+
+static void process(mlir::ModuleOp mod, cir::FuncOp func) {
   if (func.getRegion().empty())
     return;
 
@@ -47,25 +182,35 @@ static void process(cir::FuncOp func) {
     return;
 
   mlir::Operation *insertPoint = &*entryBlock.begin();
+  auto optInfoAttr = mlir::cast_if_present<cir::OptInfoAttr>(
+      mod->getAttr(cir::CIRDialect::getOptInfoAttrName()));
+  unsigned optLevel = optInfoAttr ? optInfoAttr.getLevel() : 0;
 
   for (auto alloca : allocas) {
-    alloca->moveBefore(insertPoint);
     if (alloca.getConstant()) {
-      // Hoisted alloca may come from the body of a loop, in which case the
-      // stack slot is re-used by multiple objects alive in different iterations
-      // of the loop. In theory, each of these objects are still constant within
-      // their lifetimes, but currently we're not emitting metadata to further
-      // describe this. So for now let's behave conservatively and remove the
-      // const flag on nested allocas when hoisting them.
-      alloca.setConstant(false);
+      if (optLevel == 0) {
+        // Under non-optimized builds, just remove the constant flag.
+        alloca.setConstant(false);
+        continue;
+      }
+
+      processConstAlloca(alloca);
     }
+
+    alloca->moveBefore(insertPoint);
   }
 }
 
 void HoistAllocasPass::runOnOperation() {
   llvm::TimeTraceScope scope("Hoist Allocas");
   llvm::SmallVector<mlir::Operation *, 16> ops;
-  getOperation()->walk([&](cir::FuncOp op) { process(op); });
+
+  Operation *op = getOperation();
+  auto mod = mlir::dyn_cast<mlir::ModuleOp>(op);
+  if (!mod)
+    mod = op->getParentOfType<mlir::ModuleOp>();
+
+  getOperation()->walk([&](cir::FuncOp op) { process(mod, op); });
 }
 
 } // namespace
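As a companion to `hasStoreToAllocaInWhileCond` above, here is a compilable sketch of the source pattern it detects (hypothetical helper functions, mirroring the example in the comment):

```cpp
// Illustrative sketch mirroring the comment in hasStoreToAllocaInWhileCond.
int next_value();
void observe(const int &);

void drain() {
  // `x` is declared in the while condition, so a conceptually new const
  // object is created each iteration, yet CIRGen emits a single alloca in
  // the scope enclosing cir.while. The store in the condition region is
  // what the pass looks for before dropping the alloca's "const" flag.
  while (const int x = next_value())
    observe(x);
}
```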
diff --git a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp
index 3ac4de81422b..3b9f1def6db8 100644
--- a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp
+++ b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp
@@ -1612,6 +1612,15 @@ getLLVMMemOrder(std::optional<cir::MemOrder> &memorder) {
   llvm_unreachable("unknown memory order");
 }
 
+static bool isLoadOrStoreInvariant(mlir::Value addr) {
+  if (auto addrAllocaOp =
+          mlir::dyn_cast_if_present<cir::AllocaOp>(addr.getDefiningOp()))
+    return addrAllocaOp.getConstant();
+  if (mlir::isa_and_present<cir::InvariantGroupOp>(addr.getDefiningOp()))
+    return true;
+  return false;
+}
+
 mlir::LogicalResult CIRToLLVMLoadOpLowering::matchAndRewrite(
     cir::LoadOp op, OpAdaptor adaptor,
     mlir::ConversionPatternRewriter &rewriter) const {
@@ -1631,12 +1640,8 @@ mlir::LogicalResult CIRToLLVMLoadOpLowering::matchAndRewrite(
   auto invariant = false;
   // Under -O1 or higher optimization levels, add the invariant metadata if the
   // load operation loads from a constant object.
-  if (lowerMod &&
-      lowerMod->getContext().getCodeGenOpts().OptimizationLevel > 0) {
-    auto addrAllocaOp =
-        mlir::dyn_cast_if_present<cir::AllocaOp>(op.getAddr().getDefiningOp());
-    invariant = addrAllocaOp && addrAllocaOp.getConstant();
-  }
+  if (lowerMod && lowerMod->getContext().getCodeGenOpts().OptimizationLevel > 0)
+    invariant = isLoadOrStoreInvariant(op.getAddr());
 
   // TODO: nontemporal, syncscope.
   auto newLoad = rewriter.create<mlir::LLVM::LoadOp>(
@@ -1674,12 +1679,8 @@ mlir::LogicalResult CIRToLLVMStoreOpLowering::matchAndRewrite(
   auto invariant = false;
   // Under -O1 or higher optimization levels, add the invariant metadata if the
   // store operation stores to a constant object.
-  if (lowerMod &&
-      lowerMod->getContext().getCodeGenOpts().OptimizationLevel > 0) {
-    auto addrAllocaOp =
-        mlir::dyn_cast_if_present<cir::AllocaOp>(op.getAddr().getDefiningOp());
-    invariant = addrAllocaOp && addrAllocaOp.getConstant();
-  }
+  if (lowerMod && lowerMod->getContext().getCodeGenOpts().OptimizationLevel > 0)
+    invariant = isLoadOrStoreInvariant(op.getAddr());
 
   // Convert adapted value to its memory type if needed.
   mlir::Value value = emitToMemory(rewriter, dataLayout,
@@ -3666,6 +3667,20 @@ mlir::LogicalResult CIRToLLVMInlineAsmOpLowering::matchAndRewrite(
   return mlir::success();
 }
 
+mlir::LogicalResult CIRToLLVMInvariantGroupOpLowering::matchAndRewrite(
+    cir::InvariantGroupOp op, OpAdaptor adaptor,
+    mlir::ConversionPatternRewriter &rewriter) const {
+  if (!lowerMod ||
+      lowerMod->getContext().getCodeGenOpts().OptimizationLevel == 0) {
+    rewriter.replaceOp(op, adaptor.getPtr());
+    return mlir::success();
+  }
+
+  rewriter.replaceOpWithNewOp<mlir::LLVM::LaunderInvariantGroupOp>(
+      op, adaptor.getPtr());
+  return mlir::success();
+}
+
 mlir::LogicalResult CIRToLLVMPrefetchOpLowering::matchAndRewrite(
     cir::PrefetchOp op, OpAdaptor adaptor,
     mlir::ConversionPatternRewriter &rewriter) const {
@@ -4107,7 +4122,8 @@ void populateCIRToLLVMConversionPatterns(
       CIRToLLVMBaseDataMemberOpLowering,
       CIRToLLVMCmpOpLowering,
      CIRToLLVMDerivedDataMemberOpLowering,
-      CIRToLLVMGetRuntimeMemberOpLowering
+      CIRToLLVMGetRuntimeMemberOpLowering,
+      CIRToLLVMInvariantGroupOpLowering
       // clang-format on
       >(converter, patterns.getContext(), lowerModule);
   patterns.add<
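To see the payoff of the metadata emitted here, consider this hedged C++ illustration (it restates the existing `local_const_optimize` test from const-alloca.cpp rather than adding new behavior): with `!invariant.group` on both accesses, LLVM may forward the stored value to the final load even across the opaque call.

```cpp
// Restatement of the local_const_optimize test from const-alloca.cpp.
int produce_int();
void blackbox(const int &);

int local_const_optimize() {
  const int x = produce_int();
  blackbox(x); // may not modify x: its loads/stores share one invariant group
  return x;    // the reload folds to the value stored above (ret i32 %init)
}
```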
diff --git a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.h b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.h
index f5441c7d11ac..5aafd1a2ecab 100644
--- a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.h
+++ b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.h
@@ -975,6 +975,21 @@ class CIRToLLVMInlineAsmOpLowering
       mlir::ConversionPatternRewriter &) const override;
 };
 
+class CIRToLLVMInvariantGroupOpLowering
+    : public mlir::OpConversionPattern<cir::InvariantGroupOp> {
+  cir::LowerModule *lowerMod;
+
+public:
+  CIRToLLVMInvariantGroupOpLowering(const mlir::TypeConverter &typeConverter,
+                                    mlir::MLIRContext *context,
+                                    cir::LowerModule *lowerModule)
+      : OpConversionPattern(typeConverter, context), lowerMod(lowerModule) {}
+
+  mlir::LogicalResult
+  matchAndRewrite(cir::InvariantGroupOp op, OpAdaptor,
+                  mlir::ConversionPatternRewriter &) const override;
+};
+
 class CIRToLLVMPrefetchOpLowering
     : public mlir::OpConversionPattern<cir::PrefetchOp> {
 public:
diff --git a/clang/test/CIR/CodeGen/const-alloca.cpp b/clang/test/CIR/CodeGen/const-alloca.cpp
index 7cc9a5b57517..cd64a91ecf5d 100644
--- a/clang/test/CIR/CodeGen/const-alloca.cpp
+++ b/clang/test/CIR/CodeGen/const-alloca.cpp
@@ -5,6 +5,7 @@
 
 int produce_int();
 void blackbox(const int &);
+void consume(int);
 
 void local_const_int() {
   const int x = produce_int();
@@ -85,3 +86,87 @@ int local_const_optimize() {
 // LLVM-NEXT: call void @_Z8blackboxRKi(ptr nonnull %[[#slot]])
 // LLVM-NEXT: ret i32 %[[#init]]
 // LLVM-NEXT: }
+
+int local_scoped_const() {
+  {
+    const int x = produce_int();
+    blackbox(x);
+    return x;
+  }
+}
+
+// CIR-LABEL: @_Z18local_scoped_constv()
+// CIR: cir.scope {
+// CIR-NEXT: %[[#x_slot:]] = cir.alloca !s32i, !cir.ptr<!s32i>, ["x", init, const]
+// CIR-NEXT: %[[#init:]] = cir.call @_Z11produce_intv() : () -> !s32i
+// CIR-NEXT: cir.store %[[#init]], %[[#x_slot]] : !s32i, !cir.ptr<!s32i>
+// CIR-NEXT: cir.call @_Z8blackboxRKi(%[[#x_slot]]) : (!cir.ptr<!s32i>) -> ()
+// CIR-NEXT: %[[#x_reload:]] = cir.load %[[#x_slot]] : !cir.ptr<!s32i>, !s32i
+// CIR-NEXT: cir.store %[[#x_reload]], %[[#ret_slot:]] : !s32i, !cir.ptr<!s32i>
+// CIR-NEXT: %[[#ret:]] = cir.load %[[#ret_slot]] : !cir.ptr<!s32i>, !s32i
+// CIR-NEXT: cir.return %[[#ret]] : !s32i
+// CIR-NEXT: }
+// CIR: }
+
+// LLVM-LABEL: @_Z18local_scoped_constv()
+// LLVM-NEXT: %[[#x_slot:]] = alloca i32, align 4
+// LLVM-NEXT: %[[#init:]] = tail call i32 @_Z11produce_intv()
+// LLVM-NEXT: store i32 %[[#init]], ptr %[[#x_slot]], align 4, !tbaa !{{.+}}, !invariant.group !{{.+}}
+// LLVM-NEXT: call void @_Z8blackboxRKi(ptr nonnull %[[#x_slot]])
+// LLVM-NEXT: ret i32 %[[#init]]
+// LLVM-NEXT: }
+
+void local_const_in_loop() {
+  for (int i = 0; i < 10; ++i) {
+    const int x = produce_int();
+    blackbox(x);
+    consume(x);
+  }
+}
+
+// CIR-LABEL: @_Z19local_const_in_loopv
+// CIR: cir.scope {
+// CIR: cir.for : cond {
+// CIR: } body {
+// CIR-NEXT: cir.scope {
+// CIR-NEXT: %[[#x_slot:]] = cir.alloca !s32i, !cir.ptr<!s32i>, ["x", init, const]
+// CIR-NEXT: %[[#init:]] = cir.call @_Z11produce_intv() : () -> !s32i
+// CIR-NEXT: cir.store %[[#init]], %[[#x_slot]] : !s32i, !cir.ptr<!s32i>
+// CIR-NEXT: cir.call @_Z8blackboxRKi(%[[#x_slot]]) : (!cir.ptr<!s32i>) -> ()
+// CIR-NEXT: %[[#x_reload:]] = cir.load %[[#x_slot]] : !cir.ptr<!s32i>, !s32i
+// CIR-NEXT: cir.call @_Z7consumei(%[[#x_reload]]) : (!s32i) -> ()
+// CIR-NEXT: }
+// CIR-NEXT: cir.yield
+// CIR-NEXT: } step {
+// CIR: }
+// CIR-NEXT: }
+// CIR-NEXT: cir.return
+// CIR-NEXT: }
+
+// LLVM-LABEL: @_Z19local_const_in_loopv()
+// LLVM: %[[#x_ptr:]] = call ptr @llvm.launder.invariant.group.p0(ptr nonnull %1)
+// LLVM-NEXT: %[[#init:]] = call i32 @_Z11produce_intv()
+// LLVM-NEXT: store i32 %[[#init]], ptr %[[#x_ptr]], align 4, !tbaa !{{.+}}, !invariant.group !{{.+}}
+// LLVM-NEXT: call void @_Z8blackboxRKi(ptr nonnull %[[#x_ptr]])
+// LLVM-NEXT: call void @_Z7consumei(i32 %[[#init]])
+// LLVM: }
+
+void local_const_in_while_condition() {
+  while (const int x = produce_int()) {
+    blackbox(x);
+  }
+}
+
+// LLVM-LABEL: @_Z30local_const_in_while_conditionv()
+// LLVM: %[[#x_slot:]] = alloca i32, align 4
+// LLVM-NEXT: %[[#init:]] = tail call i32 @_Z11produce_intv()
+// LLVM-NEXT: store i32 %[[#init]], ptr %[[#x_slot]], align 4
+// LLVM-NEXT: %[[loop_cond:.+]] = icmp eq i32 %[[#init]], 0
+// LLVM-NEXT: br i1 %[[loop_cond]], label %{{.+}}, label %[[loop_body:.+]]
+// LLVM: [[loop_body]]:
+// LLVM-NEXT: call void @_Z8blackboxRKi(ptr nonnull %[[#x_slot]])
+// LLVM-NEXT: %[[#next:]] = call i32 @_Z11produce_intv()
+// LLVM-NEXT: store i32 %[[#next]], ptr %[[#x_slot]], align 4
+// LLVM-NEXT: %[[cond:.+]] = icmp eq i32 %[[#next]], 0
+// LLVM-NEXT: br i1 %[[cond]], label %{{.+}}, label %[[loop_body]]
+// LLVM: }