From c6ac622b172fb1e25798495b4291d355643687a0 Mon Sep 17 00:00:00 2001
From: Sirui Mu
Date: Sun, 29 Dec 2024 22:04:51 +0800
Subject: [PATCH] [CIR] Lower nested local constant alloca

This patch adds support for lowering local constants in nested scopes,
including those in nested loops.

For constant allocas in non-loop inner scopes, this patch keeps their
constant flags during alloca hoisting. LLVM lowering then emits the
necessary invariant metadata for those allocas.

For constant allocas in a loop, this patch introduces a new operation
`cir.invariant_group` that marks the beginning of the lifetime of the
constant objects. This operation is placed at the original location of the
alloca operation before hoisting. This patch updates LLVM lowering to emit
the necessary invariant metadata when loading and storing through such
pointers.

This patch also takes care of the special case where the constant alloca
represents a variable declared in the condition part of a while loop. In
such a case, this patch removes the constant flag on the alloca operation
when hoisting it.
---
 clang/include/clang/CIR/Dialect/IR/CIROps.td  |  57 ++++++
 .../CIR/Dialect/Transforms/HoistAllocas.cpp   | 165 ++++++++++++++++--
 .../CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp |  42 +++--
 .../CIR/Lowering/DirectToLLVM/LowerToLLVM.h   |  15 ++
 clang/test/CIR/CodeGen/const-alloca.cpp       |  85 +++++++++
 5 files changed, 341 insertions(+), 23 deletions(-)

diff --git a/clang/include/clang/CIR/Dialect/IR/CIROps.td b/clang/include/clang/CIR/Dialect/IR/CIROps.td
index a08e16df9222..b06c61577150 100644
--- a/clang/include/clang/CIR/Dialect/IR/CIROps.td
+++ b/clang/include/clang/CIR/Dialect/IR/CIROps.td
@@ -3563,6 +3563,63 @@ def LLVMIntrinsicCallOp : CIR_Op<"llvm.intrinsic"> {
 
 }
 
+//===----------------------------------------------------------------------===//
+// InvariantGroupOp
+//===----------------------------------------------------------------------===//
+
+def InvariantGroupOp
+    : CIR_Op<"invariant_group", [Pure, SameOperandsAndResultType]> {
+  let summary = "Start an invariant group";
+  let description = [{
+    The `cir.invariant_group` operation takes a single pointer value as its
+    argument and returns the same pointer value with fresh [invariant group]
+    information. All loads and stores that access the returned pointer value
+    are presumed by the optimizer to load or store the same value.
+
+    [invariant group]: https://llvm.org/docs/LangRef.html#invariant-group-metadata
+
+    This operation is not emitted during CIRGen. Instead, it is created when
+    hoisting constant alloca operations to the entry block of a function. This
+    operation effectively marks the syntactic scope of the constant local
+    variable represented by the hoisted alloca operation, and it allows for
+    better LLVM IR generation with potentially more optimizations.
+
+    For example, if we have the following CIR before alloca hoisting:
+
+    ```mlir
+    cir.func @foo() {
+      cir.scope {
+        %0 = cir.alloca !s32i : !cir.ptr<!s32i>
+        use(%0)
+      }
+    }
+    ```
+
+    After alloca hoisting:
+
+    ```mlir
+    cir.func @foo() {
+      %0 = cir.alloca !s32i : !cir.ptr<!s32i>
+      cir.scope {
+        %1 = cir.invariant_group %0 : !cir.ptr<!s32i>
+        use(%1)
+      }
+    }
+    ```
+
+    During LLVM IR lowering, load and store operations whose pointer operand
+    comes from `cir.invariant_group` are lowered to the corresponding LLVM
+    instructions with invariant group metadata attached.
+  }];
+
+  let arguments = (ins CIR_PointerType:$ptr);
+  let results = (outs CIR_PointerType:$result);
+
+  let assemblyFormat = [{
+    $ptr `:` type($result) attr-dict
+  }];
+}
+
 //===----------------------------------------------------------------------===//
 // DeleteArrayOp
 //===----------------------------------------------------------------------===//

diff --git a/clang/lib/CIR/Dialect/Transforms/HoistAllocas.cpp b/clang/lib/CIR/Dialect/Transforms/HoistAllocas.cpp
index 4b29c7235a02..a4de5f2af3ed 100644
--- a/clang/lib/CIR/Dialect/Transforms/HoistAllocas.cpp
+++ b/clang/lib/CIR/Dialect/Transforms/HoistAllocas.cpp
@@ -28,7 +28,142 @@ struct HoistAllocasPass : public HoistAllocasBase<HoistAllocasPass> {
   void runOnOperation() override;
 };
 
-static void process(cir::FuncOp func) {
+static bool isOpInLoop(mlir::Operation *op) {
+  return op->getParentOfType<cir::LoopOpInterface>();
+}
+
+static bool hasStoreToAllocaInWhileCond(cir::AllocaOp alloca) {
+  // This function determines whether the given alloca operation represents
+  // a variable defined as a while loop's condition.
+  //
+  // Specifically, C/C++ allows the condition of a while loop to be a
+  // variable declaration:
+  //
+  //   while (const int x = foo()) { /* body... */ }
+  //
+  // CIRGen would emit the following CIR for the above code:
+  //
+  //   cir.scope {
+  //     %x.slot = cir.alloca !s32i [init, const]
+  //     cir.while {
+  //       %0 = cir.call @foo()
+  //       cir.store %0, %x.slot
+  //       %1 = cir.load %x.slot
+  //       %2 = cir.cast int_to_bool %1
+  //       cir.condition(%2)
+  //     } do {
+  //       // loop body goes here.
+  //     }
+  //   }
+  //
+  // Note that %x.slot is emitted outside the cir.while operation. Ideally,
+  // the cir.while operation should cover this cir.alloca operation, but
+  // currently CIR does not work this way. When hoisting such an alloca
+  // operation, one must remove the "const" flag from it; otherwise LLVM
+  // lowering will mistakenly attach invariant group metadata to the load and
+  // store operations in the while body, indicating that all loads and stores
+  // across all iterations of the loop load and store the same value.
+
+  for (mlir::Operation *user : alloca->getUsers()) {
+    if (!mlir::isa<cir::StoreOp>(user))
+      continue;
+
+    auto store = mlir::cast<cir::StoreOp>(user);
+    mlir::Operation *storeParentOp = store->getParentOp();
+    if (!mlir::isa<cir::WhileOp>(storeParentOp))
+      continue;
+
+    auto whileOp = mlir::cast<cir::WhileOp>(storeParentOp);
+    return &whileOp.getCond() == store->getParentRegion();
+  }
+
+  return false;
+}
+
+static void processConstAlloca(cir::AllocaOp alloca) {
+  // When optimization is enabled, LLVM lowering emits invariant group
+  // metadata for loads and stores to alloca-ed objects with the "const"
+  // attribute. For example, the following CIR:
+  //
+  //   %slot = cir.alloca !s32i [init, const]
+  //   cir.store %0, %slot
+  //   %1 = cir.load %slot
+  //
+  // would be lowered to the following LLVM IR:
+  //
+  //   %slot = alloca i32, i64 1
+  //   store i32 %0, ptr %slot, !invariant.group !0
+  //   %1 = load i32, ptr %slot, !invariant.group !0
+  //
+  // The invariant group metadata tells the LLVM optimizer that the store and
+  // load instructions store and load the same value from %slot.
+  //
+  // So far so good.
+  // Things start to get tricky when such an alloca operation appears in the
+  // body of a loop construct:
+  //
+  //   cir.some_loop_construct {
+  //     %slot = cir.alloca !s32i [init, const]
+  //     cir.store %0, %slot
+  //     %1 = cir.load %slot
+  //   }
+  //
+  // After alloca hoisting, the CIR code above would be transformed into:
+  //
+  //   %slot = cir.alloca !s32i [init, const]
+  //   cir.some_loop_construct {
+  //     cir.store %0, %slot
+  //     %1 = cir.load %slot
+  //   }
+  //
+  // Notice how alloca hoisting changes the semantics of the program in such
+  // a case. The transformed code now tells the optimizer that the load and
+  // store operations load and store the same value **across all iterations
+  // of the loop**!
+  //
+  // To overcome this problem, we instead transform the program into this:
+  //
+  //   %slot = cir.alloca !s32i [init, const]
+  //   cir.some_loop_construct {
+  //     %slot.inv = cir.invariant_group %slot
+  //     cir.store %0, %slot.inv
+  //     %1 = cir.load %slot.inv
+  //   }
+  //
+  // The cir.invariant_group operation attaches fresh invariant information
+  // to the operand pointer and yields a pointer with the fresh invariant
+  // information. Upon each loop iteration, the old invariant information is
+  // discarded and new invariant information is attached, thus preserving the
+  // correct program semantics. During LLVM lowering, the cir.invariant_group
+  // operation eventually becomes an intrinsic call to
+  // @llvm.launder.invariant.group.
+
+  if (isOpInLoop(alloca)) {
+    // Mark the alloca-ed pointer as invariant via the cir.invariant_group
+    // operation.
+    mlir::OpBuilder builder(alloca);
+    auto invariantGroupOp =
+        builder.create<cir::InvariantGroupOp>(alloca.getLoc(), alloca);
+
+    // And replace all uses of the original alloca-ed pointer with the marked
+    // pointer (which carries invariant group information).
+    alloca->replaceUsesWithIf(
+        invariantGroupOp,
+        [op = invariantGroupOp.getOperation()](mlir::OpOperand &use) {
+          return use.getOwner() != op;
+        });
+  } else if (hasStoreToAllocaInWhileCond(alloca)) {
+    // The alloca represents a variable declared as the condition of a while
+    // loop. In CIR, the alloca would be emitted at a scope outside of the
+    // while loop. We have to remove the constant flag during hoisting,
+    // otherwise we would be telling the optimizer that the alloca-ed value
+    // is constant across all iterations of the while loop.
+    //
+    // See the body of the hasStoreToAllocaInWhileCond function for more
+    // details.
+    alloca.setConstant(false);
+  }
+}
+
+static void process(mlir::ModuleOp mod, cir::FuncOp func) {
   if (func.getRegion().empty())
     return;
 
@@ -47,25 +182,35 @@ static void process(cir::FuncOp func) {
     return;
 
   mlir::Operation *insertPoint = &*entryBlock.begin();
+  auto optInfoAttr = mlir::cast_if_present<cir::OptInfoAttr>(
+      mod->getAttr(cir::CIRDialect::getOptInfoAttrName()));
+  unsigned optLevel = optInfoAttr ? optInfoAttr.getLevel() : 0;
 
   for (auto alloca : allocas) {
-    alloca->moveBefore(insertPoint);
     if (alloca.getConstant()) {
-      // Hoisted alloca may come from the body of a loop, in which case the
-      // stack slot is re-used by multiple objects alive in different
-      // iterations of the loop. In theory, each of these objects are still
-      // constant within their lifetimes, but currently we're not emitting
-      // metadata to further describe this. So for now let's behave
-      // conservatively and remove the const flag on nested allocas when
-      // hoisting them.
-      alloca.setConstant(false);
+      if (optLevel == 0) {
+        // Under non-optimized builds, just remove the constant flag.
+        alloca.setConstant(false);
+        continue;
+      }
+
+      processConstAlloca(alloca);
     }
+
+    alloca->moveBefore(insertPoint);
   }
 }
 
 void HoistAllocasPass::runOnOperation() {
   llvm::TimeTraceScope scope("Hoist Allocas");
   llvm::SmallVector<mlir::Operation *> ops;
-  getOperation()->walk([&](cir::FuncOp op) { process(op); });
+
+  Operation *op = getOperation();
+  auto mod = mlir::dyn_cast<mlir::ModuleOp>(op);
+  if (!mod)
+    mod = op->getParentOfType<mlir::ModuleOp>();
+
+  getOperation()->walk([&](cir::FuncOp op) { process(mod, op); });
 }
 
 } // namespace

diff --git a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp
index 3ac4de81422b..3b9f1def6db8 100644
--- a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp
+++ b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp
@@ -1612,6 +1612,15 @@ getLLVMMemOrder(std::optional<cir::MemOrder> &memorder) {
   llvm_unreachable("unknown memory order");
 }
 
+static bool isLoadOrStoreInvariant(mlir::Value addr) {
+  if (auto addrAllocaOp =
+          mlir::dyn_cast_if_present<cir::AllocaOp>(addr.getDefiningOp()))
+    return addrAllocaOp.getConstant();
+  if (mlir::isa_and_present<cir::InvariantGroupOp>(addr.getDefiningOp()))
+    return true;
+  return false;
+}
+
 mlir::LogicalResult CIRToLLVMLoadOpLowering::matchAndRewrite(
     cir::LoadOp op, OpAdaptor adaptor,
     mlir::ConversionPatternRewriter &rewriter) const {
@@ -1631,12 +1640,8 @@ mlir::LogicalResult CIRToLLVMLoadOpLowering::matchAndRewrite(
   auto invariant = false;
   // Under -O1 or higher optimization levels, add the invariant metadata if the
   // load operation loads from a constant object.
-  if (lowerMod &&
-      lowerMod->getContext().getCodeGenOpts().OptimizationLevel > 0) {
-    auto addrAllocaOp =
-        mlir::dyn_cast_if_present<cir::AllocaOp>(op.getAddr().getDefiningOp());
-    invariant = addrAllocaOp && addrAllocaOp.getConstant();
-  }
+  if (lowerMod && lowerMod->getContext().getCodeGenOpts().OptimizationLevel > 0)
+    invariant = isLoadOrStoreInvariant(op.getAddr());
 
   // TODO: nontemporal, syncscope.
   auto newLoad = rewriter.create<mlir::LLVM::LoadOp>(
@@ -1674,12 +1679,8 @@ mlir::LogicalResult CIRToLLVMStoreOpLowering::matchAndRewrite(
   auto invariant = false;
   // Under -O1 or higher optimization levels, add the invariant metadata if the
   // store operation stores to a constant object.
-  if (lowerMod &&
-      lowerMod->getContext().getCodeGenOpts().OptimizationLevel > 0) {
-    auto addrAllocaOp =
-        mlir::dyn_cast_if_present<cir::AllocaOp>(op.getAddr().getDefiningOp());
-    invariant = addrAllocaOp && addrAllocaOp.getConstant();
-  }
+  if (lowerMod && lowerMod->getContext().getCodeGenOpts().OptimizationLevel > 0)
+    invariant = isLoadOrStoreInvariant(op.getAddr());
 
   // Convert adapted value to its memory type if needed.
   mlir::Value value = emitToMemory(rewriter, dataLayout,
@@ -3666,6 +3667,20 @@ mlir::LogicalResult CIRToLLVMInlineAsmOpLowering::matchAndRewrite(
   return mlir::success();
 }
 
+mlir::LogicalResult CIRToLLVMInvariantGroupOpLowering::matchAndRewrite(
+    cir::InvariantGroupOp op, OpAdaptor adaptor,
+    mlir::ConversionPatternRewriter &rewriter) const {
+  if (!lowerMod ||
+      lowerMod->getContext().getCodeGenOpts().OptimizationLevel == 0) {
+    rewriter.replaceOp(op, adaptor.getPtr());
+    return mlir::success();
+  }
+
+  rewriter.replaceOpWithNewOp<mlir::LLVM::LaunderInvariantGroupOp>(
+      op, adaptor.getPtr());
+  return mlir::success();
+}
+
 mlir::LogicalResult CIRToLLVMPrefetchOpLowering::matchAndRewrite(
     cir::PrefetchOp op, OpAdaptor adaptor,
     mlir::ConversionPatternRewriter &rewriter) const {
@@ -4107,7 +4122,8 @@ void populateCIRToLLVMConversionPatterns(
       CIRToLLVMBaseDataMemberOpLowering,
       CIRToLLVMCmpOpLowering,
       CIRToLLVMDerivedDataMemberOpLowering,
-      CIRToLLVMGetRuntimeMemberOpLowering
+      CIRToLLVMGetRuntimeMemberOpLowering,
+      CIRToLLVMInvariantGroupOpLowering
       // clang-format on
       >(converter, patterns.getContext(), lowerModule);
   patterns.add<
diff --git a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.h b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.h
index f5441c7d11ac..5aafd1a2ecab 100644
--- a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.h
+++ b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.h
@@ -975,6 +975,21 @@ class CIRToLLVMInlineAsmOpLowering : public mlir::OpConversionPattern<cir::InlineAsmOp> {
                   mlir::ConversionPatternRewriter &) const override;
 };
 
+class CIRToLLVMInvariantGroupOpLowering
+    : public mlir::OpConversionPattern<cir::InvariantGroupOp> {
+  cir::LowerModule *lowerMod;
+
+public:
+  CIRToLLVMInvariantGroupOpLowering(const mlir::TypeConverter &typeConverter,
+                                    mlir::MLIRContext *context,
+                                    cir::LowerModule *lowerModule)
+      : OpConversionPattern(typeConverter, context), lowerMod(lowerModule) {}
+
+  mlir::LogicalResult
+  matchAndRewrite(cir::InvariantGroupOp op, OpAdaptor,
+                  mlir::ConversionPatternRewriter &) const override;
+};
+
 class CIRToLLVMPrefetchOpLowering
     : public mlir::OpConversionPattern<cir::PrefetchOp> {
 public:
diff --git a/clang/test/CIR/CodeGen/const-alloca.cpp b/clang/test/CIR/CodeGen/const-alloca.cpp
index 7cc9a5b57517..cd64a91ecf5d 100644
--- a/clang/test/CIR/CodeGen/const-alloca.cpp
+++ b/clang/test/CIR/CodeGen/const-alloca.cpp
@@ -5,6 +5,7 @@
 int produce_int();
 void blackbox(const int &);
+void consume(int);
 
 void local_const_int() {
   const int x = produce_int();
@@ -85,3 +86,87 @@ int local_const_optimize() {
 // LLVM-NEXT: call void @_Z8blackboxRKi(ptr nonnull %[[#slot]])
 // LLVM-NEXT: ret i32 %[[#init]]
 // LLVM-NEXT: }
+
+int local_scoped_const() {
+  {
+    const int x = produce_int();
+    blackbox(x);
+    return x;
+  }
+}
+
+// CIR-LABEL: @_Z18local_scoped_constv()
+// CIR: cir.scope {
+// CIR-NEXT: %[[#x_slot:]] = cir.alloca !s32i, !cir.ptr<!s32i>, ["x", init, const]
+// CIR-NEXT: %[[#init:]] = cir.call @_Z11produce_intv() : () -> !s32i
+// CIR-NEXT: cir.store %[[#init]], %[[#x_slot]] : !s32i, !cir.ptr<!s32i>
+// CIR-NEXT: cir.call @_Z8blackboxRKi(%[[#x_slot]]) : (!cir.ptr<!s32i>) -> ()
+// CIR-NEXT: %[[#x_reload:]] = cir.load %[[#x_slot]] : !cir.ptr<!s32i>, !s32i
+// CIR-NEXT: cir.store %[[#x_reload]], %[[#ret_slot:]] : !s32i, !cir.ptr<!s32i>
+// CIR-NEXT: %[[#ret:]] = cir.load %[[#ret_slot]] : !cir.ptr<!s32i>, !s32i
+// CIR-NEXT: cir.return %[[#ret]] : !s32i
+// CIR-NEXT: }
+// CIR: }
+
+// LLVM-LABEL: @_Z18local_scoped_constv()
+// LLVM-NEXT: %[[#x_slot:]] = alloca i32, align 4
+// LLVM-NEXT: %[[#init:]] = tail call i32 @_Z11produce_intv()
+// LLVM-NEXT: store i32 %[[#init]], ptr %[[#x_slot]], align 4, !tbaa !{{.+}}, !invariant.group !{{.+}}
+// LLVM-NEXT: call void @_Z8blackboxRKi(ptr nonnull %[[#x_slot]])
+// LLVM-NEXT: ret i32 %[[#init]]
+// LLVM-NEXT: }
+
+void local_const_in_loop() {
+  for (int i = 0; i < 10; ++i) {
+    const int x = produce_int();
+    blackbox(x);
+    consume(x);
+  }
+}
+
+// CIR-LABEL: @_Z19local_const_in_loopv
+// CIR: cir.scope {
+// CIR: cir.for : cond {
+// CIR: } body {
+// CIR-NEXT: cir.scope {
+// CIR-NEXT: %[[#x_slot:]] = cir.alloca !s32i, !cir.ptr<!s32i>, ["x", init, const]
+// CIR-NEXT: %[[#init:]] = cir.call @_Z11produce_intv() : () -> !s32i
+// CIR-NEXT: cir.store %[[#init]], %[[#x_slot]] : !s32i, !cir.ptr<!s32i>
+// CIR-NEXT: cir.call @_Z8blackboxRKi(%[[#x_slot]]) : (!cir.ptr<!s32i>) -> ()
+// CIR-NEXT: %[[#x_reload:]] = cir.load %[[#x_slot]] : !cir.ptr<!s32i>, !s32i
+// CIR-NEXT: cir.call @_Z7consumei(%[[#x_reload]]) : (!s32i) -> ()
+// CIR-NEXT: }
+// CIR-NEXT: cir.yield
+// CIR-NEXT: } step {
+// CIR: }
+// CIR-NEXT: }
+// CIR-NEXT: cir.return
+// CIR-NEXT: }
+
+// LLVM-LABEL: @_Z19local_const_in_loopv()
+// LLVM: %[[#x_ptr:]] = call ptr @llvm.launder.invariant.group.p0(ptr nonnull %1)
+// LLVM-NEXT: %[[#init:]] = call i32 @_Z11produce_intv()
+// LLVM-NEXT: store i32 %[[#init]], ptr %[[#x_ptr]], align 4, !tbaa !{{.+}}, !invariant.group !{{.+}}
+// LLVM-NEXT: call void @_Z8blackboxRKi(ptr nonnull %[[#x_ptr]])
+// LLVM-NEXT: call void @_Z7consumei(i32 %[[#init]])
+// LLVM: }
+
+void local_const_in_while_condition() {
+  while (const int x = produce_int()) {
+    blackbox(x);
+  }
+}
+
+// LLVM-LABEL: @_Z30local_const_in_while_conditionv()
+// LLVM: %[[#x_slot:]] = alloca i32, align 4
+// LLVM-NEXT: %[[#init:]] = tail call i32 @_Z11produce_intv()
+// LLVM-NEXT: store i32 %[[#init]], ptr %[[#x_slot]], align 4
+// LLVM-NEXT: %[[loop_cond:.+]] = icmp eq i32 %[[#init]], 0
+// LLVM-NEXT: br i1 %[[loop_cond]], label %{{.+}}, label %[[loop_body:.+]]
+// LLVM: [[loop_body]]:
+// LLVM-NEXT: call void @_Z8blackboxRKi(ptr nonnull %[[#x_slot]])
+// LLVM-NEXT: %[[#next:]] = call i32 @_Z11produce_intv()
+// LLVM-NEXT: store i32 %[[#next]], ptr %[[#x_slot]], align 4
+// LLVM-NEXT: %[[cond:.+]] = icmp eq i32 %[[#next]], 0
+// LLVM-NEXT: br i1 %[[cond]], label %{{.+}}, label %[[loop_body]]
+// LLVM: }
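
As a reviewer aid (not part of the patch), here is a minimal standalone C++ sketch of the loop case that `cir.invariant_group` exists to keep correct. `produce_int` and `consume` mirror the declarations already used in const-alloca.cpp; the function name `loop_case` is illustrative:

```cpp
// Illustrative sketch only. After hoisting, every iteration of the loop
// shares a single stack slot for `x`. If loads and stores to that shared
// slot kept one invariant group across iterations, the optimizer could
// legally fold each iteration's load of `x` to the value stored in the
// first iteration. Laundering the pointer once per iteration
// (cir.invariant_group, lowered to @llvm.launder.invariant.group) gives
// each iteration's `x` a fresh invariant group instead.
int produce_int();  // may return a different value on each call
void consume(int);

void loop_case() {
  for (int i = 0; i < 10; ++i) {
    const int x = produce_int(); // const only within one iteration
    consume(x);                  // must observe this iteration's value
  }
}
```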
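Likewise, a standalone sketch of the while-condition special case detected by `hasStoreToAllocaInWhileCond` (again illustrative, not part of the patch): the store that re-initializes `x` lives in the loop's condition region while its alloca sits outside the `cir.while`, so the pass drops the const flag rather than laundering the pointer:

```cpp
// Illustrative sketch only. `x` is declared in the while condition, so
// CIRGen emits its alloca in the scope *outside* cir.while, while the store
// that re-initializes it executes on every iteration inside the condition
// region. Keeping the const flag on the hoisted alloca would claim the slot
// holds a single value across all iterations, which is false.
int produce_int();
void blackbox(const int &);

void while_condition_case() {
  while (const int x = produce_int()) { // x re-initialized each iteration
    blackbox(x);
  }
}
```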