[CIR] Lower nested local constant alloca #1261

Merged
merged 1 commit into from Feb 17, 2025

57 changes: 57 additions & 0 deletions clang/include/clang/CIR/Dialect/IR/CIROps.td
@@ -3563,6 +3563,63 @@ def LLVMIntrinsicCallOp : CIR_Op<"llvm.intrinsic"> {

}

//===----------------------------------------------------------------------===//
// InvariantGroupOp
//===----------------------------------------------------------------------===//

def InvariantGroupOp
: CIR_Op<"invariant_group", [Pure, SameOperandsAndResultType]> {
let summary = "Start an invariant group";
let description = [{
The `cir.invariant_group` operation takes a single pointer value as argument
and returns the same pointer value with fresh [invariant group] information.
All loads and stores that access the returned pointer value are presumed by
the optimizer to load or store the same value.

[invariant group]: https://llvm.org/docs/LangRef.html#invariant-group-metadata

This operation is not emitted during CIRGen. Instead, it is created when
hoisting constant alloca operations to the entry block of a function. It
effectively marks the syntactic scope of the constant local variable
represented by the hoisted alloca operation, which allows for better LLVM IR
generation with potentially more optimizations.

For example, if we have the following CIR before alloca hoisting:

```mlir
cir.func @foo() {
  cir.scope {
    %0 = cir.alloca !s32i : !cir.ptr<!s32i>
    use(%0)
  }
}
```

After alloca hoisting:

```mlir
cir.func @foo() {
  %0 = cir.alloca !s32i : !cir.ptr<!s32i>
  cir.scope {
    %1 = cir.invariant_group %0 : !cir.ptr<!s32i>
    use(%1)
  }
}
```

During LLVM IR lowering, load and store operations whose pointer operand
comes from `cir.invariant_group` are lowered to the corresponding LLVM
instructions with invariant group metadata attached.
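
For instance, assuming `use(%1)` above is a store followed by a load, the
hoisted example could lower to LLVM IR roughly as follows (a sketch; value
names, alignment, and metadata ids are illustrative):

```llvm
declare ptr @llvm.launder.invariant.group.p0(ptr)

define void @foo() {
  %0 = alloca i32, i64 1, align 4
  ; lowered from cir.invariant_group
  %1 = call ptr @llvm.launder.invariant.group.p0(ptr %0)
  store i32 42, ptr %1, align 4, !invariant.group !0
  %2 = load i32, ptr %1, align 4, !invariant.group !0
  ret void
}

!0 = !{}
```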
}];

let arguments = (ins CIR_PointerType:$ptr);
let results = (outs CIR_PointerType:$result);

let assemblyFormat = [{
$ptr `:` type($result) attr-dict
}];
}

//===----------------------------------------------------------------------===//
// DeleteArrayOp
//===----------------------------------------------------------------------===//
165 changes: 155 additions & 10 deletions clang/lib/CIR/Dialect/Transforms/HoistAllocas.cpp
@@ -28,7 +28,142 @@ struct HoistAllocasPass : public HoistAllocasBase<HoistAllocasPass> {
void runOnOperation() override;
};

static void process(cir::FuncOp func) {
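// Returns true if the given operation is nested inside a CIR loop construct.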
static bool isOpInLoop(mlir::Operation *op) {
return op->getParentOfType<cir::LoopOpInterface>();
}

static bool hasStoreToAllocaInWhileCond(cir::AllocaOp alloca) {
// This function determines whether the given alloca operation represents
// a variable defined as a while loop's condition.
//
// Specifically, C/C++ allows the condition of a while loop to be a variable
// declaration:
//
//   while (const int x = foo()) { /* body... */ }
//
// CIRGen would emit the following CIR for the above code:
//
//   cir.scope {
//     %x.slot = cir.alloca !s32i [init, const]
//     cir.while {
//       %0 = cir.call @foo()
//       cir.store %0, %x.slot
//       %1 = cir.load %x.slot
//       %2 = cir.cast int_to_bool %1
//       cir.condition(%2)
//     } do {
//       // loop body goes here.
//     }
//   }
//
// Note that %x.slot is emitted outside the cir.while operation. Ideally, the
// cir.while operation should cover this cir.alloca operation, but currently
// CIR does not work this way. When hoisting such an alloca operation, one
// must remove the "const" flag from it, otherwise LLVM lowering code will
// mistakenly attach invariant group metadata to the load and store operations
// in the while body, implying that they load and store the same value across
// all iterations of the loop.

for (mlir::Operation *user : alloca->getUsers()) {
if (!mlir::isa<cir::StoreOp>(user))
continue;

auto store = mlir::cast<cir::StoreOp>(user);
mlir::Operation *storeParentOp = store->getParentOp();
if (!mlir::isa<cir::WhileOp>(storeParentOp))
continue;

auto whileOp = mlir::cast<cir::WhileOp>(storeParentOp);
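// The store initializes a while-condition variable exactly when it lives in
// the while op's condition region.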
return &whileOp.getCond() == store->getParentRegion();
}

return false;
}

static void processConstAlloca(cir::AllocaOp alloca) {
// When optimization is enabled, LLVM lowering emits invariant group metadata
// for loads and stores to alloca-ed objects that carry the "const" attribute.
// For example, the following CIR:
//
//   %slot = cir.alloca !s32i [init, const]
//   cir.store %0, %slot
//   %1 = cir.load %slot
//
// would be lowered to the following LLVM IR:
//
//   %slot = alloca i32, i64 1
//   store i32 %0, ptr %slot, !invariant.group !0
//   %1 = load i32, ptr %slot, !invariant.group !0
//
// The invariant group metadata tells the LLVM optimizer that the store and
// load instructions store and load the same value from %slot.
//
// So far so good. Things get tricky when such an alloca operation appears
// in the body of a loop construct:
//
//   cir.some_loop_construct {
//     %slot = cir.alloca !s32i [init, const]
//     cir.store %0, %slot
//     %1 = cir.load %slot
//   }
//
// After alloca hoisting, the CIR code above would be transformed into:
//
//   %slot = cir.alloca !s32i [init, const]
//   cir.some_loop_construct {
//     cir.store %0, %slot
//     %1 = cir.load %slot
//   }
//
// Notice how alloca hoisting changes the semantics of the program in such a
// case. The transformed code now tells the optimizer that the load and store
// operations load and store the same value **across all iterations of the
// loop**!
//
// To overcome this problem, we instead transform the program into this:
//
//   %slot = cir.alloca !s32i [init, const]
//   cir.some_loop_construct {
//     %slot.inv = cir.invariant_group %slot
//     cir.store %0, %slot.inv
//     %1 = cir.load %slot.inv
//   }
//
// The cir.invariant_group operation attaches fresh invariant information to
// the operand pointer and yields a pointer carrying that fresh invariant
// information. On each loop iteration, the old invariant information is
// discarded and new invariant information is attached, so the correct
// program semantics are retained. During LLVM lowering, the
// cir.invariant_group operation eventually becomes an intrinsic call to
// @llvm.launder.invariant.group.

if (isOpInLoop(alloca)) {
// Mark the alloca-ed pointer as invariant via the cir.invariant_group
// operation.
mlir::OpBuilder builder(alloca);
auto invariantGroupOp =
builder.create<cir::InvariantGroupOp>(alloca.getLoc(), alloca);

// And replace all uses of the original alloca-ed pointer with the marked
// pointer (which carries invariant group information).
alloca->replaceUsesWithIf(
invariantGroupOp,
[op = invariantGroupOp.getOperation()](mlir::OpOperand &use) {
return use.getOwner() != op;
});
} else if (hasStoreToAllocaInWhileCond(alloca)) {
// The alloca represents a variable declared as the condition of a while
// loop. In CIR, such an alloca is emitted in a scope outside of the while
// loop. We have to remove the constant flag during hoisting, otherwise we
// would be telling the optimizer that the alloca-ed value is constant
// across all iterations of the while loop.
//
// See the body of the hasStoreToAllocaInWhileCond function for more details.
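//
// A sketch of the intended result for the while-condition example after
// hoisting (illustrative; names follow the example above):
//
//   %x.slot = cir.alloca !s32i [init]   // "const" removed
//   cir.scope {
//     cir.while {
//       %0 = cir.call @foo()
//       cir.store %0, %x.slot
//       %1 = cir.load %x.slot
//       %2 = cir.cast int_to_bool %1
//       cir.condition(%2)
//     } do {
//       // loop body goes here.
//     }
//   }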
alloca.setConstant(false);
}
}

static void process(mlir::ModuleOp mod, cir::FuncOp func) {
if (func.getRegion().empty())
return;

@@ -47,25 +182,35 @@ static void process(cir::FuncOp func) {
return;

mlir::Operation *insertPoint = &*entryBlock.begin();
auto optInfoAttr = mlir::cast_if_present<cir::OptInfoAttr>(
mod->getAttr(cir::CIRDialect::getOptInfoAttrName()));
unsigned optLevel = optInfoAttr ? optInfoAttr.getLevel() : 0;

for (auto alloca : allocas) {
alloca->moveBefore(insertPoint);
if (alloca.getConstant()) {
// Hoisted alloca may come from the body of a loop, in which case the
// stack slot is re-used by multiple objects alive in different iterations
// of the loop. In theory, each of these objects is still constant within
// their lifetimes, but currently we're not emitting metadata to further
// describe this. So for now let's behave conservatively and remove the
// const flag on nested allocas when hoisting them.
alloca.setConstant(false);
if (optLevel == 0) {
// Under non-optimized builds, just remove the constant flag.
alloca.setConstant(false);
continue;
}

processConstAlloca(alloca);
}

alloca->moveBefore(insertPoint);
}
}

void HoistAllocasPass::runOnOperation() {
llvm::TimeTraceScope scope("Hoist Allocas");
llvm::SmallVector<Operation *, 16> ops;
getOperation()->walk([&](cir::FuncOp op) { process(op); });

Operation *op = getOperation();
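// getOperation() may be the module itself or an operation nested within a
// module; find the enclosing ModuleOp either way so that process() can
// query the optimization level.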
auto mod = mlir::dyn_cast<mlir::ModuleOp>(op);
if (!mod)
mod = op->getParentOfType<mlir::ModuleOp>();

getOperation()->walk([&](cir::FuncOp op) { process(mod, op); });
}

} // namespace
42 changes: 29 additions & 13 deletions clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp
@@ -1612,6 +1612,15 @@ getLLVMMemOrder(std::optional<cir::MemOrder> &memorder) {
llvm_unreachable("unknown memory order");
}

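// Returns true when the given address is known to point to a constant
// object: either the result of an alloca marked "const", or the result of a
// cir.invariant_group marker.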
static bool isLoadOrStoreInvariant(mlir::Value addr) {
if (auto addrAllocaOp =
mlir::dyn_cast_if_present<cir::AllocaOp>(addr.getDefiningOp()))
return addrAllocaOp.getConstant();
if (mlir::isa_and_present<cir::InvariantGroupOp>(addr.getDefiningOp()))
return true;
return false;
}

mlir::LogicalResult CIRToLLVMLoadOpLowering::matchAndRewrite(
cir::LoadOp op, OpAdaptor adaptor,
mlir::ConversionPatternRewriter &rewriter) const {
@@ -1631,12 +1640,8 @@ mlir::LogicalResult CIRToLLVMLoadOpLowering::matchAndRewrite(
auto invariant = false;
// Under -O1 or higher optimization levels, add the invariant metadata if the
// load operation loads from a constant object.
if (lowerMod &&
lowerMod->getContext().getCodeGenOpts().OptimizationLevel > 0) {
auto addrAllocaOp =
mlir::dyn_cast_if_present<cir::AllocaOp>(op.getAddr().getDefiningOp());
invariant = addrAllocaOp && addrAllocaOp.getConstant();
}
if (lowerMod && lowerMod->getContext().getCodeGenOpts().OptimizationLevel > 0)
invariant = isLoadOrStoreInvariant(op.getAddr());

// TODO: nontemporal, syncscope.
auto newLoad = rewriter.create<mlir::LLVM::LoadOp>(
@@ -1674,12 +1679,8 @@ mlir::LogicalResult CIRToLLVMStoreOpLowering::matchAndRewrite(
auto invariant = false;
// Under -O1 or higher optimization levels, add the invariant metadata if the
// store operation stores to a constant object.
if (lowerMod &&
lowerMod->getContext().getCodeGenOpts().OptimizationLevel > 0) {
auto addrAllocaOp =
mlir::dyn_cast_if_present<cir::AllocaOp>(op.getAddr().getDefiningOp());
invariant = addrAllocaOp && addrAllocaOp.getConstant();
}
if (lowerMod && lowerMod->getContext().getCodeGenOpts().OptimizationLevel > 0)
invariant = isLoadOrStoreInvariant(op.getAddr());

// Convert adapted value to its memory type if needed.
mlir::Value value = emitToMemory(rewriter, dataLayout,
@@ -3666,6 +3667,20 @@ mlir::LogicalResult CIRToLLVMInlineAsmOpLowering::matchAndRewrite(
return mlir::success();
}

mlir::LogicalResult CIRToLLVMInvariantGroupOpLowering::matchAndRewrite(
cir::InvariantGroupOp op, OpAdaptor adaptor,
mlir::ConversionPatternRewriter &rewriter) const {
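// Without a LowerModule or at -O0, invariant group information carries no
// benefit; fold the marker away by simply forwarding the pointer.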
if (!lowerMod ||
lowerMod->getContext().getCodeGenOpts().OptimizationLevel == 0) {
rewriter.replaceOp(op, adaptor.getPtr());
return mlir::success();
}

rewriter.replaceOpWithNewOp<mlir::LLVM::LaunderInvariantGroupOp>(
op, adaptor.getPtr());
return mlir::success();
}

mlir::LogicalResult CIRToLLVMPrefetchOpLowering::matchAndRewrite(
cir::PrefetchOp op, OpAdaptor adaptor,
mlir::ConversionPatternRewriter &rewriter) const {
@@ -4107,7 +4122,8 @@ void populateCIRToLLVMConversionPatterns(
CIRToLLVMBaseDataMemberOpLowering,
CIRToLLVMCmpOpLowering,
CIRToLLVMDerivedDataMemberOpLowering,
CIRToLLVMGetRuntimeMemberOpLowering
CIRToLLVMGetRuntimeMemberOpLowering,
CIRToLLVMInvariantGroupOpLowering
// clang-format on
>(converter, patterns.getContext(), lowerModule);
patterns.add<
15 changes: 15 additions & 0 deletions clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.h
@@ -975,6 +975,21 @@ class CIRToLLVMInlineAsmOpLowering
mlir::ConversionPatternRewriter &) const override;
};

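/// Lowers cir.invariant_group to @llvm.launder.invariant.group when
/// optimizations are enabled, and folds it away otherwise.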
class CIRToLLVMInvariantGroupOpLowering
: public mlir::OpConversionPattern<cir::InvariantGroupOp> {
cir::LowerModule *lowerMod;

public:
CIRToLLVMInvariantGroupOpLowering(const mlir::TypeConverter &typeConverter,
mlir::MLIRContext *context,
cir::LowerModule *lowerModule)
: OpConversionPattern(typeConverter, context), lowerMod(lowerModule) {}

mlir::LogicalResult
matchAndRewrite(cir::InvariantGroupOp op, OpAdaptor,
mlir::ConversionPatternRewriter &) const override;
};

class CIRToLLVMPrefetchOpLowering
: public mlir::OpConversionPattern<cir::PrefetchOp> {
public: