From d2bccf721dcb4ba0b1b94284a8756febd6f43848 Mon Sep 17 00:00:00 2001
From: Chuanqi Xu
Date: Fri, 8 Nov 2024 09:45:09 +0800
Subject: [PATCH 1/2] [CIR] [Lowering] [X86_64] Support VAArg in shape

---
 clang/include/clang/CIR/ABIArgInfo.h          |   2 +
 .../CIR/Dialect/Builder/CIRBaseBuilder.h      |  11 +
 .../Dialect/Transforms/LoweringPrepare.cpp    |   9 +-
 .../Transforms/LoweringPrepareCXXABI.h        |   1 +
 .../Transforms/LoweringPrepareX86ABI.h        |   0
 .../Transforms/TargetLowering/ABIInfoImpl.cpp |   7 +
 .../Transforms/TargetLowering/ABIInfoImpl.h   |   1 +
 .../Transforms/TargetLowering/CIRCXXABI.h     |  20 -
 .../TargetLowering/CIRLowerContext.cpp        |  12 +
 .../Transforms/TargetLowering/CMakeLists.txt  |   1 +
 .../TargetLowering/ItaniumCXXABI.cpp          |   1 +
 .../Targets/LoweringPrepareX86CXXABI.cpp      | 362 ++++++++++++++++++
 .../Transforms/TargetLowering/Targets/X86.cpp |  90 +----
 .../TargetLowering/Targets/X86_64ABIInfo.h    |  97 +++++
 clang/test/CIR/CodeGen/abstract-cond.c        |   2 -
 clang/test/CIR/Lowering/var-arg-x86_64.c      |  40 ++
 16 files changed, 544 insertions(+), 112 deletions(-)
 create mode 100644 clang/lib/CIR/Dialect/Transforms/LoweringPrepareX86ABI.h
 create mode 100644 clang/lib/CIR/Dialect/Transforms/TargetLowering/Targets/LoweringPrepareX86CXXABI.cpp
 create mode 100644 clang/lib/CIR/Dialect/Transforms/TargetLowering/Targets/X86_64ABIInfo.h
 create mode 100644 clang/test/CIR/Lowering/var-arg-x86_64.c

diff --git a/clang/include/clang/CIR/ABIArgInfo.h b/clang/include/clang/CIR/ABIArgInfo.h
index 818d3b62f13f..ad261d13c5c7 100644
--- a/clang/include/clang/CIR/ABIArgInfo.h
+++ b/clang/include/clang/CIR/ABIArgInfo.h
@@ -254,6 +254,8 @@ class ABIArgInfo {
   bool isExpand() const { return TheKind == Expand; }
   bool isCoerceAndExpand() const { return TheKind == CoerceAndExpand; }
 
+  bool isIgnore() const { return TheKind == Ignore; }
+
   bool isSignExt() const {
     assert(isExtend() && "Invalid kind!");
     return SignExt;
diff --git a/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h b/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h
index 49f1256db284..4df769481d13 100644
--- a/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h
+++ b/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h
@@ -49,6 +49,17 @@ class CIRBaseBuilderTy : public mlir::OpBuilder {
                           getAttr<mlir::cir::IntAttr>(ty, val));
   }
 
+  mlir::Value getSignedInt(mlir::Location loc, int64_t val, unsigned numBits) {
+    return getConstAPSInt(
+        loc, llvm::APSInt(llvm::APInt(numBits, val), /*isUnsigned=*/false));
+  }
+
+  mlir::Value getUnsignedInt(mlir::Location loc, uint64_t val,
+                             unsigned numBits) {
+    return getConstAPSInt(
+        loc, llvm::APSInt(llvm::APInt(numBits, val), /*isUnsigned=*/true));
+  }
+
   mlir::Value getConstAPInt(mlir::Location loc, mlir::Type typ,
                             const llvm::APInt &val) {
     return create<mlir::cir::ConstantOp>(loc, typ,
diff --git a/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp b/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp
index b11a028cbc2f..df7c5c575227 100644
--- a/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp
+++ b/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp
@@ -126,9 +126,16 @@ struct LoweringPreparePass : public LoweringPrepareBase<LoweringPreparePass> {
   void setASTContext(clang::ASTContext *c) {
     astCtx = c;
-    auto abiStr = c->getTargetInfo().getABI();
+    const clang::TargetInfo &target = c->getTargetInfo();
+    auto abiStr = target.getABI();
     switch (c->getCXXABIKind()) {
     case clang::TargetCXXABI::GenericItanium:
+      if (target.getTriple().getArch() == llvm::Triple::x86_64) {
+        cxxABI.reset(
+            ::cir::LoweringPrepareCXXABI::createX86ABI(/*is64bit=*/true));
+        break;
+      }
+
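+      // Only the 64-bit lowering is implemented so far (lowerVAArgX86_32 is
+      // still a stub), so 32-bit x86 keeps the generic Itanium lowering
+      // below.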
       cxxABI.reset(::cir::LoweringPrepareCXXABI::createItaniumABI());
       break;
     case clang::TargetCXXABI::GenericAArch64:
diff --git a/clang/lib/CIR/Dialect/Transforms/LoweringPrepareCXXABI.h b/clang/lib/CIR/Dialect/Transforms/LoweringPrepareCXXABI.h
index 42e8917b43b6..3c252ba336a7 100644
--- a/clang/lib/CIR/Dialect/Transforms/LoweringPrepareCXXABI.h
+++ b/clang/lib/CIR/Dialect/Transforms/LoweringPrepareCXXABI.h
@@ -28,6 +28,7 @@ class LoweringPrepareCXXABI {
 public:
   static LoweringPrepareCXXABI *createItaniumABI();
   static LoweringPrepareCXXABI *createAArch64ABI(::cir::AArch64ABIKind k);
+  static LoweringPrepareCXXABI *createX86ABI(bool is64Bit);
 
   virtual mlir::Value lowerVAArg(CIRBaseBuilderTy &builder,
                                  mlir::cir::VAArgOp op,
diff --git a/clang/lib/CIR/Dialect/Transforms/LoweringPrepareX86ABI.h b/clang/lib/CIR/Dialect/Transforms/LoweringPrepareX86ABI.h
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/clang/lib/CIR/Dialect/Transforms/TargetLowering/ABIInfoImpl.cpp b/clang/lib/CIR/Dialect/Transforms/TargetLowering/ABIInfoImpl.cpp
index 493ddffdce3d..ff199f0cc189 100644
--- a/clang/lib/CIR/Dialect/Transforms/TargetLowering/ABIInfoImpl.cpp
+++ b/clang/lib/CIR/Dialect/Transforms/TargetLowering/ABIInfoImpl.cpp
@@ -54,5 +54,12 @@ CIRCXXABI::RecordArgABI getRecordArgABI(const StructType RT,
   return CXXABI.getRecordArgABI(RT);
 }
 
+CIRCXXABI::RecordArgABI getRecordArgABI(mlir::Type ty, CIRCXXABI &CXXABI) {
+  auto sTy = mlir::dyn_cast<StructType>(ty);
+  if (!sTy)
+    return CIRCXXABI::RAA_Default;
+  return getRecordArgABI(sTy, CXXABI);
+}
+
 } // namespace cir
 } // namespace mlir
diff --git a/clang/lib/CIR/Dialect/Transforms/TargetLowering/ABIInfoImpl.h b/clang/lib/CIR/Dialect/Transforms/TargetLowering/ABIInfoImpl.h
index 9e45bc4e0ecc..8873aa7a49aa 100644
--- a/clang/lib/CIR/Dialect/Transforms/TargetLowering/ABIInfoImpl.h
+++ b/clang/lib/CIR/Dialect/Transforms/TargetLowering/ABIInfoImpl.h
@@ -31,6 +31,7 @@ bool isAggregateTypeForABI(Type T);
 Type useFirstFieldIfTransparentUnion(Type Ty);
 CIRCXXABI::RecordArgABI getRecordArgABI(const StructType RT,
                                         CIRCXXABI &CXXABI);
+CIRCXXABI::RecordArgABI getRecordArgABI(mlir::Type ty, CIRCXXABI &CXXABI);
 
 } // namespace cir
 } // namespace mlir
diff --git a/clang/lib/CIR/Dialect/Transforms/TargetLowering/CIRCXXABI.h b/clang/lib/CIR/Dialect/Transforms/TargetLowering/CIRCXXABI.h
index 42e666999005..331d4dadffcc 100644
--- a/clang/lib/CIR/Dialect/Transforms/TargetLowering/CIRCXXABI.h
+++ b/clang/lib/CIR/Dialect/Transforms/TargetLowering/CIRCXXABI.h
@@ -68,24 +68,4 @@ CIRCXXABI *CreateItaniumCXXABI(LowerModule &CGM);
 } // namespace cir
 } // namespace mlir
 
-// FIXME(cir): Merge this into the CIRCXXABI class above. To do so, this code
-// should be updated to follow some level of codegen parity.
-namespace cir {
-
-class LoweringPrepareCXXABI {
-public:
-  static LoweringPrepareCXXABI *createItaniumABI();
-  static LoweringPrepareCXXABI *createAArch64ABI(::cir::AArch64ABIKind k);
-
-  virtual mlir::Value lowerVAArg(CIRBaseBuilderTy &builder,
-                                 mlir::cir::VAArgOp op,
-                                 const cir::CIRDataLayout &datalayout) = 0;
-  virtual ~LoweringPrepareCXXABI() {}
-
-  virtual mlir::Value lowerDynamicCast(CIRBaseBuilderTy &builder,
-                                       clang::ASTContext &astCtx,
-                                       mlir::cir::DynamicCastOp op) = 0;
-};
-} // namespace cir
-
 #endif // LLVM_CLANG_LIB_CIR_DIALECT_TRANSFORMS_TARGETLOWERING_CIRCXXABI_H
diff --git a/clang/lib/CIR/Dialect/Transforms/TargetLowering/CIRLowerContext.cpp b/clang/lib/CIR/Dialect/Transforms/TargetLowering/CIRLowerContext.cpp
index c4912c651d21..122d7273f2fc 100644
--- a/clang/lib/CIR/Dialect/Transforms/TargetLowering/CIRLowerContext.cpp
+++ b/clang/lib/CIR/Dialect/Transforms/TargetLowering/CIRLowerContext.cpp
@@ -94,6 +94,18 @@ clang::TypeInfo CIRLowerContext::getTypeInfoImpl(const Type T) const {
       Align = Target->getDoubleAlign();
       break;
     }
+    if (auto longDoubleTy = dyn_cast<LongDoubleType>(T)) {
+      if (getLangOpts().OpenMP && getLangOpts().OpenMPIsTargetDevice &&
+          (Target->getLongDoubleWidth() != AuxTarget->getLongDoubleWidth() ||
+           Target->getLongDoubleAlign() != AuxTarget->getLongDoubleAlign())) {
+        Width = AuxTarget->getLongDoubleWidth();
+        Align = AuxTarget->getLongDoubleAlign();
+      } else {
+        Width = Target->getLongDoubleWidth();
+        Align = Target->getLongDoubleAlign();
+      }
+      break;
+    }
     cir_cconv_unreachable("Unknown builtin type!");
     break;
   }
diff --git a/clang/lib/CIR/Dialect/Transforms/TargetLowering/CMakeLists.txt b/clang/lib/CIR/Dialect/Transforms/TargetLowering/CMakeLists.txt
index 218656c3b144..d3cb9fc96f1a 100644
--- a/clang/lib/CIR/Dialect/Transforms/TargetLowering/CMakeLists.txt
+++ b/clang/lib/CIR/Dialect/Transforms/TargetLowering/CMakeLists.txt
@@ -17,6 +17,7 @@ add_clang_library(TargetLowering
   Targets/X86.cpp
   Targets/LoweringPrepareAArch64CXXABI.cpp
   Targets/LoweringPrepareItaniumCXXABI.cpp
+  Targets/LoweringPrepareX86CXXABI.cpp
 
   DEPENDS
   clangBasic
diff --git a/clang/lib/CIR/Dialect/Transforms/TargetLowering/ItaniumCXXABI.cpp b/clang/lib/CIR/Dialect/Transforms/TargetLowering/ItaniumCXXABI.cpp
index 87a1c5061aef..7fdf19f01cf1 100644
--- a/clang/lib/CIR/Dialect/Transforms/TargetLowering/ItaniumCXXABI.cpp
+++ b/clang/lib/CIR/Dialect/Transforms/TargetLowering/ItaniumCXXABI.cpp
@@ -20,6 +20,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "../LoweringPrepareCXXABI.h"
 #include "CIRCXXABI.h"
 #include "LowerModule.h"
 #include "llvm/Support/ErrorHandling.h"
diff --git a/clang/lib/CIR/Dialect/Transforms/TargetLowering/Targets/LoweringPrepareX86CXXABI.cpp b/clang/lib/CIR/Dialect/Transforms/TargetLowering/Targets/LoweringPrepareX86CXXABI.cpp
new file mode 100644
index 000000000000..79942be0aecd
--- /dev/null
+++ b/clang/lib/CIR/Dialect/Transforms/TargetLowering/Targets/LoweringPrepareX86CXXABI.cpp
@@ -0,0 +1,362 @@
+//====- LoweringPrepareX86CXXABI.cpp - X86 ABI specific code ----------====//
+//
+// Part of the LLVM Project,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===------------------------------------------------------------------===//
+//
+// This file provides X86{_64, _32} C++ ABI specific code that is used during
+// LLVMIR lowering prepare.
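+// The entry point is lowerVAArg, which expands cir.va_arg following the
+// AMD64-ABI 3.5.7 algorithm: classify the type, try the register save
+// area, and fall back to overflow_arg_area otherwise.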
+//
+//===------------------------------------------------------------------===//
+
+#include "../LowerModule.h"
+#include "../LoweringPrepareItaniumCXXABI.h"
+#include "X86_64ABIInfo.h"
+
+using namespace clang;
+using namespace cir;
+
+namespace {
+class LoweringPrepareX86CXXABI : public LoweringPrepareItaniumCXXABI {
+  bool is64;
+
+public:
+  LoweringPrepareX86CXXABI(bool is64) : is64(is64) {}
+  mlir::Value lowerVAArg(cir::CIRBaseBuilderTy &builder, mlir::cir::VAArgOp op,
+                         const cir::CIRDataLayout &datalayout) override {
+    if (is64)
+      return lowerVAArgX86_64(builder, op, datalayout);
+
+    return lowerVAArgX86_32(builder, op, datalayout);
+  }
+
+  mlir::Value lowerVAArgX86_64(cir::CIRBaseBuilderTy &builder,
+                               mlir::cir::VAArgOp op,
+                               const cir::CIRDataLayout &datalayout);
+  mlir::Value lowerVAArgX86_32(cir::CIRBaseBuilderTy &builder,
+                               mlir::cir::VAArgOp op,
+                               const cir::CIRDataLayout &datalayout) {
+    llvm_unreachable("lowerVAArg for X86_32 not implemented yet");
+  }
+};
+
+std::unique_ptr<mlir::cir::LowerModule> getLowerModule(mlir::cir::VAArgOp op) {
+  mlir::ModuleOp mo = op->getParentOfType<mlir::ModuleOp>();
+  if (!mo)
+    return nullptr;
+
+  mlir::PatternRewriter rewriter(mo.getContext());
+  return mlir::cir::createLowerModule(mo, rewriter);
+}
+
+mlir::Value buildX86_64VAArgFromMemory(cir::CIRBaseBuilderTy &builder,
+                                       const cir::CIRDataLayout &datalayout,
+                                       mlir::Value valist, mlir::Type Ty,
+                                       mlir::Location loc) {
+  mlir::Value overflow_arg_area_p =
+      builder.createGetMemberOp(loc, valist, "overflow_arg_area", 2);
+  mlir::Value overflow_arg_area = builder.createLoad(loc, overflow_arg_area_p);
+
+  // AMD64-ABI 3.5.7p5: Step 7. Align l->overflow_arg_area upwards to a 16
+  // byte boundary if alignment needed by type exceeds 8 byte boundary.
+  // It isn't stated explicitly in the standard, but in practice we use
+  // alignment greater than 16 where necessary.
+  unsigned alignment = datalayout.getABITypeAlign(Ty).value() / 8;
+  if (alignment > 8)
+    // overflow_arg_area = emitRoundPointerUpToAlignment(builder,
+    //                                                   overflow_arg_area,
+    //                                                   alignment);
+    llvm_unreachable("NYI");
+
+  // AMD64-ABI 3.5.7p5: Step 8. Fetch type from l->overflow_arg_area.
+  mlir::Value res = overflow_arg_area;
+
+  // AMD64-ABI 3.5.7p5: Step 9. Set l->overflow_arg_area to:
+  // l->overflow_arg_area + sizeof(type).
+  // AMD64-ABI 3.5.7p5: Step 10. Align l->overflow_arg_area upwards to
+  // an 8 byte boundary.
+  uint64_t sizeInBytes = datalayout.getTypeStoreSize(Ty).getFixedValue();
+  mlir::Value stride = builder.getSignedInt(loc, ((sizeInBytes + 7) & ~7), 32);
+  mlir::Value castedPtr =
+      builder.createPtrBitcast(overflow_arg_area, builder.getSIntNTy(8));
+  overflow_arg_area = builder.createPtrStride(loc, castedPtr, stride);
+  builder.createStore(loc, overflow_arg_area, overflow_arg_area_p);
+
+  return res;
+}
+
+mlir::Value LoweringPrepareX86CXXABI::lowerVAArgX86_64(
+    cir::CIRBaseBuilderTy &builder, mlir::cir::VAArgOp op,
+    const cir::CIRDataLayout &datalayout) {
+  using namespace mlir::cir;
+
+  // FIXME: return early since X86_64ABIInfo::classify can't handle these
+  // types. Let's hope LLVM's va_arg instruction can take care of it.
+  // Remove this when X86_64ABIInfo::classify can take care of every type.
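+  // (Pointer, vector, and _Complex arguments, for instance, still take this
+  // early-return path today.)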
+  if (!mlir::isa<IntType, SingleType, DoubleType, BoolType,
+                 StructType>(op.getType()))
+    return nullptr;
+
+  // Assume that va_list type is correct; should be pointer to LLVM type:
+  // struct {
+  //   i32 gp_offset;
+  //   i32 fp_offset;
+  //   i8* overflow_arg_area;
+  //   i8* reg_save_area;
+  // };
+  unsigned neededInt, neededSSE;
+
+  std::unique_ptr<mlir::cir::LowerModule> lowerModule = getLowerModule(op);
+  if (!lowerModule)
+    return nullptr;
+
+  mlir::Type ty = op.getType();
+
+  // FIXME: How should we access the X86AVXABILevel?
+  X86_64ABIInfo abiInfo(lowerModule->getTypes(), X86AVXABILevel::None);
+  ABIArgInfo ai = abiInfo.classifyArgumentType(
+      ty, 0, neededInt, neededSSE, /*isNamedArg=*/false, /*IsRegCall=*/false);
+
+  // Empty records are ignored for parameter passing purposes.
+  if (ai.isIgnore())
+    return nullptr;
+
+  mlir::Location loc = op.getLoc();
+  mlir::Value valist = op.getOperand();
+
+  // AMD64-ABI 3.5.7p5: Step 1. Determine whether type may be passed
+  // in the registers. If not go to step 7.
+  if (!neededInt && !neededSSE)
+    return builder.createLoad(
+        loc,
+        builder.createPtrBitcast(buildX86_64VAArgFromMemory(
+                                     builder, datalayout, valist, ty, loc),
+                                 ty));
+
+  auto currentBlock = builder.getInsertionBlock();
+
+  // AMD64-ABI 3.5.7p5: Step 2. Compute num_gp to hold the number of
+  // general purpose registers needed to pass type and num_fp to hold
+  // the number of floating point registers needed.
+
+  // AMD64-ABI 3.5.7p5: Step 3. Verify whether arguments fit into
+  // registers. In the case: l->gp_offset > 48 - num_gp * 8 or
+  // l->fp_offset > 304 - num_fp * 16 go to step 7.
+  //
+  // NOTE: 304 is a typo, there are (6 * 8 + 8 * 16) = 176 bytes of
+  // register save space.
+
+  mlir::Value inRegs;
+  mlir::Value gp_offset_p, fp_offset_p;
+  mlir::Value gp_offset, fp_offset;
+
+  if (neededInt) {
+    gp_offset_p = builder.createGetMemberOp(loc, valist, "gp_offset", 0);
+    gp_offset = builder.createLoad(loc, gp_offset_p);
+    inRegs = builder.getUnsignedInt(loc, 48 - neededInt * 8, 32);
+    inRegs = builder.createCompare(loc, mlir::cir::CmpOpKind::le, gp_offset,
+                                   inRegs);
+  }
+
+  if (neededSSE) {
+    fp_offset_p = builder.createGetMemberOp(loc, valist, "fp_offset", 1);
+    fp_offset = builder.createLoad(loc, fp_offset_p);
+    mlir::Value fitsInFP =
+        builder.getUnsignedInt(loc, 176 - neededSSE * 16, 32);
+    fitsInFP = builder.createCompare(loc, mlir::cir::CmpOpKind::le, fp_offset,
+                                     fitsInFP);
+    inRegs = inRegs ? builder.createAnd(inRegs, fitsInFP) : fitsInFP;
+  }
+
+  mlir::Block *contBlock = currentBlock->splitBlock(op);
+  mlir::Block *inRegBlock = builder.createBlock(contBlock);
+  mlir::Block *inMemBlock = builder.createBlock(contBlock);
+
+  builder.setInsertionPointToEnd(currentBlock);
+  builder.create<mlir::cir::BrCondOp>(loc, inRegs, inRegBlock, inMemBlock);
+
+  // Emit code to load the value if it was passed in registers.
+  builder.setInsertionPointToStart(inRegBlock);
+
+  // AMD64-ABI 3.5.7p5: Step 4. Fetch type from l->reg_save_area with
+  // an offset of l->gp_offset and/or l->fp_offset. This may require
+  // copying to a temporary location in case the parameter is passed
+  // in different register classes or requires an alignment greater
+  // than 8 for general purpose registers and 16 for XMM registers.
+  //
+  // FIXME: This really results in shameful code when we end up needing to
+  // collect arguments from different places; often what should result in a
+  // simple assembling of a structure from scattered addresses has many more
+  // loads than necessary. Can we clean this up?
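+  //
+  // Register save area layout (AMD64 psABI), assuming the prologue spilled
+  // all argument registers:
+  //   bytes   0..47  : rdi, rsi, rdx, rcx, r8, r9  (6 x 8 bytes)
+  //   bytes  48..175 : xmm0..xmm7                  (8 x 16 bytes)
+  // gp_offset and fp_offset above are byte offsets into this area.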
+  mlir::Value regSaveArea = builder.createLoad(
+      loc, builder.createGetMemberOp(loc, valist, "reg_save_area", 3));
+  mlir::Value regAddr;
+
+  uint64_t tyAlign = datalayout.getABITypeAlign(ty).value();
+  // The alignment of result address.
+  uint64_t alignment = 0;
+  if (neededInt && neededSSE) {
+    // FIXME: Cleanup.
+    assert(ai.isDirect() && "Unexpected ABI info for mixed regs");
+    StructType structTy = mlir::cast<StructType>(ai.getCoerceToType());
+    mlir::cir::PointerType addrTy = builder.getPointerTo(ty);
+
+    mlir::Value tmp = builder.createAlloca(loc, addrTy, ty, "tmp",
+                                           CharUnits::fromQuantity(tyAlign));
+    tmp = builder.createPtrBitcast(tmp, structTy);
+    assert(structTy.getNumElements() == 2 &&
+           "Unexpected ABI info for mixed regs");
+    mlir::Type tyLo = structTy.getMembers()[0];
+    mlir::Type tyHi = structTy.getMembers()[1];
+    assert((isFPOrFPVectorTy(tyLo) ^ isFPOrFPVectorTy(tyHi)) &&
+           "Unexpected ABI info for mixed regs");
+    mlir::Value gpAddr = builder.createPtrStride(loc, regSaveArea, gp_offset);
+    mlir::Value fpAddr = builder.createPtrStride(loc, regSaveArea, fp_offset);
+    mlir::Value regLoAddr = isFPOrFPVectorTy(tyLo) ? fpAddr : gpAddr;
+    mlir::Value regHiAddr = isFPOrFPVectorTy(tyHi) ? gpAddr : fpAddr;
+
+    // Copy the first element.
+    // FIXME: Our choice of alignment here and below is probably pessimistic.
+    mlir::Value v = builder.createAlignedLoad(
+        loc, regLoAddr, datalayout.getABITypeAlign(tyLo).value());
+    builder.createStore(loc, v,
+                        builder.createGetMemberOp(loc, tmp, "gp_offset", 0));
+
+    // Copy the second element.
+    v = builder.createAlignedLoad(loc, regHiAddr,
+                                  datalayout.getABITypeAlign(tyHi).value());
+    builder.createStore(loc, v,
+                        builder.createGetMemberOp(loc, tmp, "fp_offset", 1));
+
+    tmp = builder.createPtrBitcast(tmp, ty);
+    regAddr = tmp;
+  } else if (neededInt || neededSSE == 1) {
+    uint64_t tySize = datalayout.getTypeStoreSize(ty).getFixedValue();
+
+    mlir::Type coTy;
+    if (ai.isDirect())
+      coTy = ai.getCoerceToType();
+
+    mlir::Value gpOrFpOffset = neededInt ? gp_offset : fp_offset;
+    alignment = neededInt ? 8 : 16;
+    uint64_t regSize = neededInt ? neededInt * 8 : 16;
+    // There are two cases that require special handling:
+    // 1)
+    //    ```
+    //    struct {
+    //      struct {} a[8];
+    //      int b;
+    //    };
+    //    ```
+    //    The lower 8 bytes of the structure are not stored,
+    //    so an 8-byte offset is needed when accessing the structure.
+    // 2)
+    //    ```
+    //    struct {
+    //      long long a;
+    //      struct {} b;
+    //    };
+    //    ```
+    //    The stored size of this structure is smaller than its actual size,
+    //    which may lead to reading past the end of the register save area.
+    if (coTy && (ai.getDirectOffset() == 8 || regSize < tySize)) {
+      mlir::cir::PointerType addrTy = builder.getPointerTo(ty);
+      mlir::Value tmp = builder.createAlloca(loc, addrTy, ty, "tmp",
+                                             CharUnits::fromQuantity(tyAlign));
+      mlir::Value addr =
+          builder.createPtrStride(loc, regSaveArea, gpOrFpOffset);
+      mlir::Value src = builder.createAlignedLoad(
+          loc, builder.createPtrBitcast(addr, coTy), tyAlign);
+      mlir::Value ptrOffset =
+          builder.getUnsignedInt(loc, ai.getDirectOffset(), 32);
+      mlir::Value dst = builder.createPtrStride(loc, tmp, ptrOffset);
+      builder.createStore(loc, src, dst);
+      regAddr = tmp;
+    } else {
+      regAddr = builder.createPtrStride(loc, regSaveArea, gpOrFpOffset);
+
+      // Copy into a temporary if the type is more aligned than the
+      // register save area.
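+      // (For example, a struct carrying __attribute__((aligned(16))) that
+      // is still classified INTEGER: its GP slot is only 8-byte aligned.)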
+      if (neededInt && tyAlign > 8) {
+        mlir::cir::PointerType addrTy = builder.getPointerTo(ty);
+        mlir::Value tmp = builder.createAlloca(
+            loc, addrTy, ty, "tmp", CharUnits::fromQuantity(tyAlign));
+        builder.createMemCpy(loc, tmp, regAddr,
+                             builder.getUnsignedInt(loc, tySize, 32));
+        regAddr = tmp;
+      }
+    }
+
+  } else {
+    assert(neededSSE == 2 && "Invalid number of needed registers!");
+    // SSE registers are spaced 16 bytes apart in the register save
+    // area, we need to collect the two eightbytes together.
+    // The ABI isn't explicit about this, but it seems reasonable
+    // to assume that the slots are 16-byte aligned, since the stack is
+    // naturally 16-byte aligned and the prologue is expected to store
+    // all the SSE registers to the RSA.
+
+    mlir::Value regAddrLo =
+        builder.createPtrStride(loc, regSaveArea, fp_offset);
+    mlir::Value regAddrHi = builder.createPtrStride(
+        loc, regAddrLo, builder.getUnsignedInt(loc, 16, /*numBits=*/32));
+
+    mlir::MLIRContext *Context = abiInfo.getContext().getMLIRContext();
+    StructType structTy =
+        ai.canHaveCoerceToType()
+            ? cast<StructType>(ai.getCoerceToType())
+            : StructType::get(
+                  Context,
+                  {DoubleType::get(Context), DoubleType::get(Context)},
+                  /*packed=*/false, StructType::Struct);
+    mlir::cir::PointerType addrTy = builder.getPointerTo(ty);
+    mlir::Value tmp = builder.createAlloca(loc, addrTy, ty, "tmp",
+                                           CharUnits::fromQuantity(tyAlign));
+    tmp = builder.createPtrBitcast(tmp, structTy);
+    mlir::Value v = builder.createLoad(
+        loc, builder.createPtrBitcast(regAddrLo, structTy.getMembers()[0]));
+    builder.createStore(loc, v, builder.createGetMemberOp(loc, tmp, "", 0));
+    v = builder.createLoad(
+        loc, builder.createPtrBitcast(regAddrHi, structTy.getMembers()[1]));
+    builder.createStore(loc, v, builder.createGetMemberOp(loc, tmp, "", 1));
+
+    tmp = builder.createPtrBitcast(tmp, ty);
+    regAddr = tmp;
+  }
+
+  // AMD64-ABI 3.5.7p5: Step 5. Set:
+  // l->gp_offset = l->gp_offset + num_gp * 8
+  // l->fp_offset = l->fp_offset + num_fp * 16.
+  if (neededInt) {
+    mlir::Value offset = builder.getUnsignedInt(loc, neededInt * 8, 32);
+    builder.createStore(loc, builder.createAdd(gp_offset, offset),
+                        gp_offset_p);
+  }
+
+  if (neededSSE) {
+    mlir::Value offset = builder.getUnsignedInt(loc, neededSSE * 8, 32);
+    builder.createStore(loc, builder.createAdd(fp_offset, offset),
+                        fp_offset_p);
+  }
+
+  builder.create<mlir::cir::BrOp>(loc, mlir::ValueRange{regAddr}, contBlock);
+
+  // Emit code to load the value if it was passed in memory.
+  builder.setInsertionPointToStart(inMemBlock);
+  mlir::Value memAddr =
+      buildX86_64VAArgFromMemory(builder, datalayout, valist, ty, loc);
+  builder.create<mlir::cir::BrOp>(loc, mlir::ValueRange{memAddr}, contBlock);
+
+  // Return the appropriate result.
+  builder.setInsertionPointToStart(contBlock);
+  mlir::Value res_addr = contBlock->addArgument(regAddr.getType(), loc);
+
+  return alignment
+             ?
 builder.createAlignedLoad(
+                   loc, builder.createPtrBitcast(res_addr, ty), alignment)
+             : builder.createLoad(loc,
+                                  builder.createPtrBitcast(res_addr, ty));
+}
+} // namespace
+
+cir::LoweringPrepareCXXABI *
+cir::LoweringPrepareCXXABI::createX86ABI(bool is64Bit) {
+  return new LoweringPrepareX86CXXABI(is64Bit);
+}
diff --git a/clang/lib/CIR/Dialect/Transforms/TargetLowering/Targets/X86.cpp b/clang/lib/CIR/Dialect/Transforms/TargetLowering/Targets/X86.cpp
index b50702a5ee68..05ad15f4ffa7 100644
--- a/clang/lib/CIR/Dialect/Transforms/TargetLowering/Targets/X86.cpp
+++ b/clang/lib/CIR/Dialect/Transforms/TargetLowering/Targets/X86.cpp
@@ -5,6 +5,7 @@
 #include "LowerModule.h"
 #include "LowerTypes.h"
 #include "TargetInfo.h"
+#include "X86_64ABIInfo.h"
 #include "clang/CIR/ABIArgInfo.h"
 #include "clang/CIR/Dialect/IR/CIRDataLayout.h"
 #include "clang/CIR/Dialect/IR/CIRTypes.h"
@@ -102,95 +103,6 @@ Type getFPTypeAtOffset(Type IRType, unsigned IROffset,
 
 } // namespace
 
-class X86_64ABIInfo : public ABIInfo {
-  using Class = ::cir::X86ArgClass;
-
-  /// Implement the X86_64 ABI merging algorithm.
-  ///
-  /// Merge an accumulating classification \arg Accum with a field
-  /// classification \arg Field.
-  ///
-  /// \param Accum - The accumulating classification. This should
-  /// always be either NoClass or the result of a previous merge
-  /// call. In addition, this should never be Memory (the caller
-  /// should just return Memory for the aggregate).
-  static Class merge(Class Accum, Class Field);
-
-  /// Implement the X86_64 ABI post merging algorithm.
-  ///
-  /// Post merger cleanup, reduces a malformed Hi and Lo pair to
-  /// final MEMORY or SSE classes when necessary.
-  ///
-  /// \param AggregateSize - The size of the current aggregate in
-  /// the classification process.
-  ///
-  /// \param Lo - The classification for the parts of the type
-  /// residing in the low word of the containing object.
-  ///
-  /// \param Hi - The classification for the parts of the type
-  /// residing in the higher words of the containing object.
-  ///
-  void postMerge(unsigned AggregateSize, Class &Lo, Class &Hi) const;
-
-  /// Determine the x86_64 register classes in which the given type T should be
-  /// passed.
-  ///
-  /// \param Lo - The classification for the parts of the type
-  /// residing in the low word of the containing object.
-  ///
-  /// \param Hi - The classification for the parts of the type
-  /// residing in the high word of the containing object.
-  ///
-  /// \param OffsetBase - The bit offset of this type in the
-  /// containing object. Some parameters are classified different
-  /// depending on whether they straddle an eightbyte boundary.
-  ///
-  /// \param isNamedArg - Whether the argument in question is a "named"
-  /// argument, as used in AMD64-ABI 3.5.7.
-  ///
-  /// \param IsRegCall - Whether the calling conversion is regcall.
-  ///
-  /// If a word is unused its result will be NoClass; if a type should
-  /// be passed in Memory then at least the classification of \arg Lo
-  /// will be Memory.
-  ///
-  /// The \arg Lo class will be NoClass iff the argument is ignored.
-  ///
-  /// If the \arg Lo class is ComplexX87, then the \arg Hi class will
-  /// also be ComplexX87.
-  void classify(Type T, uint64_t OffsetBase, Class &Lo, Class &Hi,
-                bool isNamedArg, bool IsRegCall = false) const;
-
-  Type GetSSETypeAtOffset(Type IRType, unsigned IROffset, Type SourceTy,
-                          unsigned SourceOffset) const;
-
-  Type GetINTEGERTypeAtOffset(Type DestTy, unsigned IROffset, Type SourceTy,
-                              unsigned SourceOffset) const;
-
-  /// The 0.98 ABI revision clarified a lot of ambiguities,
-  /// unfortunately in ways that were not always consistent with
-  /// certain previous compilers. In particular, platforms which
-  /// required strict binary compatibility with older versions of GCC
-  /// may need to exempt themselves.
-  bool honorsRevision0_98() const {
-    return !getTarget().getTriple().isOSDarwin();
-  }
-
-  X86AVXABILevel AVXLevel;
-
-public:
-  X86_64ABIInfo(LowerTypes &CGT, X86AVXABILevel AVXLevel)
-      : ABIInfo(CGT), AVXLevel(AVXLevel) {}
-
-  ::cir::ABIArgInfo classifyReturnType(Type RetTy) const;
-
-  ABIArgInfo classifyArgumentType(Type Ty, unsigned freeIntRegs,
-                                  unsigned &neededInt, unsigned &neededSSE,
-                                  bool isNamedArg, bool IsRegCall) const;
-
-  void computeInfo(LowerFunctionInfo &FI) const override;
-};
-
 class X86_64TargetLoweringInfo : public TargetLoweringInfo {
 public:
   X86_64TargetLoweringInfo(LowerTypes &LM, X86AVXABILevel AVXLevel)
diff --git a/clang/lib/CIR/Dialect/Transforms/TargetLowering/Targets/X86_64ABIInfo.h b/clang/lib/CIR/Dialect/Transforms/TargetLowering/Targets/X86_64ABIInfo.h
new file mode 100644
index 000000000000..0955d204d3a1
--- /dev/null
+++ b/clang/lib/CIR/Dialect/Transforms/TargetLowering/Targets/X86_64ABIInfo.h
@@ -0,0 +1,97 @@
+#include "ABIInfo.h"
+#include "clang/CIR/Target/x86.h"
+
+namespace mlir {
+namespace cir {
+class X86_64ABIInfo : public ABIInfo {
+  using Class = ::cir::X86ArgClass;
+
+  /// Implement the X86_64 ABI merging algorithm.
+  ///
+  /// Merge an accumulating classification \arg Accum with a field
+  /// classification \arg Field.
+  ///
+  /// \param Accum - The accumulating classification. This should
+  /// always be either NoClass or the result of a previous merge
+  /// call. In addition, this should never be Memory (the caller
+  /// should just return Memory for the aggregate).
+  static Class merge(Class Accum, Class Field);
+
+  /// Implement the X86_64 ABI post merging algorithm.
+  ///
+  /// Post merger cleanup, reduces a malformed Hi and Lo pair to
+  /// final MEMORY or SSE classes when necessary.
+  ///
+  /// \param AggregateSize - The size of the current aggregate in
+  /// the classification process.
+  ///
+  /// \param Lo - The classification for the parts of the type
+  /// residing in the low word of the containing object.
+  ///
+  /// \param Hi - The classification for the parts of the type
+  /// residing in the higher words of the containing object.
+  ///
+  void postMerge(unsigned AggregateSize, Class &Lo, Class &Hi) const;
+
+  /// Determine the x86_64 register classes in which the given type T should
+  /// be passed.
+  ///
+  /// \param Lo - The classification for the parts of the type
+  /// residing in the low word of the containing object.
+  ///
+  /// \param Hi - The classification for the parts of the type
+  /// residing in the high word of the containing object.
+  ///
+  /// \param OffsetBase - The bit offset of this type in the
+  /// containing object. Some parameters are classified differently
+  /// depending on whether they straddle an eightbyte boundary.
+  ///
+  /// \param isNamedArg - Whether the argument in question is a "named"
+  /// argument, as used in AMD64-ABI 3.5.7.
+  ///
+  /// \param IsRegCall - Whether the calling convention is regcall.
+  ///
+  /// If a word is unused its result will be NoClass; if a type should
+  /// be passed in Memory then at least the classification of \arg Lo
+  /// will be Memory.
+  ///
+  /// The \arg Lo class will be NoClass iff the argument is ignored.
+  ///
+  /// If the \arg Lo class is ComplexX87, then the \arg Hi class will
+  /// also be ComplexX87.
+  void classify(Type T, uint64_t OffsetBase, Class &Lo, Class &Hi,
+                bool isNamedArg, bool IsRegCall = false) const;
+
+  Type GetSSETypeAtOffset(Type IRType, unsigned IROffset, Type SourceTy,
+                          unsigned SourceOffset) const;
+
+  Type GetINTEGERTypeAtOffset(Type DestTy, unsigned IROffset, Type SourceTy,
+                              unsigned SourceOffset) const;
+
+  /// The 0.98 ABI revision clarified a lot of ambiguities,
+  /// unfortunately in ways that were not always consistent with
+  /// certain previous compilers. In particular, platforms which
+  /// required strict binary compatibility with older versions of GCC
+  /// may need to exempt themselves.
+  bool honorsRevision0_98() const {
+    return !getTarget().getTriple().isOSDarwin();
+  }
+
+  ::cir::X86AVXABILevel AVXLevel;
+
+public:
+  X86_64ABIInfo(LowerTypes &CGT, ::cir::X86AVXABILevel AVXLevel)
+      : ABIInfo(CGT), AVXLevel(AVXLevel) {}
+
+  ::cir::ABIArgInfo classifyReturnType(Type RetTy) const;
+
+  ::cir::ABIArgInfo classifyArgumentType(Type Ty, unsigned freeIntRegs,
+                                         unsigned &neededInt,
+                                         unsigned &neededSSE, bool isNamedArg,
+                                         bool IsRegCall) const;
+
+  void computeInfo(LowerFunctionInfo &FI) const override;
+};
+
+} // namespace cir
+} // namespace mlir
\ No newline at end of file
diff --git a/clang/test/CIR/CodeGen/abstract-cond.c b/clang/test/CIR/CodeGen/abstract-cond.c
index dc3df811d8f4..c736c01983ff 100644
--- a/clang/test/CIR/CodeGen/abstract-cond.c
+++ b/clang/test/CIR/CodeGen/abstract-cond.c
@@ -1,7 +1,5 @@
 // RUN: %clang_cc1 -triple aarch64-none-linux-android21 -fclangir -emit-cir %s -o %t.cir
 // RUN: FileCheck --check-prefix=CIR --input-file=%t.cir %s
-// RUN: %clang_cc1 -triple aarch64-none-linux-android21 -fclangir -emit-llvm -fno-clangir-call-conv-lowering %s -o %t.ll
-// RUN: FileCheck --check-prefix=LLVM --input-file=%t.ll %s
 
 // ?: in "lvalue"
 struct s6 { int f0; };
diff --git a/clang/test/CIR/Lowering/var-arg-x86_64.c b/clang/test/CIR/Lowering/var-arg-x86_64.c
new file mode 100644
index 000000000000..f9ce354dffb2
--- /dev/null
+++ b/clang/test/CIR/Lowering/var-arg-x86_64.c
@@ -0,0 +1,40 @@
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir -emit-llvm -fno-clangir-call-conv-lowering %s -o %t.ll
+// RUN: FileCheck --input-file=%t.ll %s
+
+#include <stdarg.h>
+
+double f1(int n, ...) {
+  va_list valist;
+  va_start(valist, n);
+  double res = va_arg(valist, double);
+  va_end(valist);
+  return res;
+}
+
+// CHECK: [[VA_LIST_TYPE:%.+]] = type { i32, i32, ptr, ptr }
+
+// CHECK: define {{.*}}@f1
+// CHECK: [[VA_LIST_ALLOCA:%.+]] = alloca {{.*}}[[VA_LIST_TYPE]]
+// CHECK: [[VA_LIST:%.+]] = getelementptr {{.*}} [[VA_LIST_ALLOCA]], i32 0
+// CHECK: call {{.*}}@llvm.va_start.p0(ptr [[VA_LIST]])
+// CHECK: [[VA_LIST2:%.+]] = getelementptr {{.*}} [[VA_LIST_ALLOCA]], i32 0
+// CHECK: [[FP_OFFSET_P:%.+]] = getelementptr {{.*}} [[VA_LIST2]], i32 0, i32 1
+// CHECK: [[FP_OFFSET:%.+]] = load {{.*}}, ptr [[FP_OFFSET_P]]
+// CHECK: [[COMPARED:%.+]] = icmp ule i32 {{.*}}, 160
+// CHECK: br i1 [[COMPARED]], label %[[THEN_BB:.+]], label %[[ELSE_BB:.+]],
+//
+// CHECK: [[THEN_BB]]:
+// CHECK: [[UPDATED_FP_OFFSET:%.+]] = add i32 [[FP_OFFSET]], 8
+// CHECK: store i32 [[UPDATED_FP_OFFSET]], ptr [[FP_OFFSET_P]]
+// CHECK: br label %[[CONT_BB:.+]],
+//
+// CHECK: [[ELSE_BB]]:
+// CHECK: [[OVERFLOW_ARG_AREA_ADDR:%.+]] = getelementptr {{.*}} [[VA_LIST2]], i32 0, i32 2
+// CHECK: [[OVERFLOW_ARG_AREA:%.+]] = load ptr, ptr [[OVERFLOW_ARG_AREA_ADDR]]
+// CHECK: [[OVERFLOW_ARG_AREA_OFFSET:%.+]] = getelementptr {{.*}} [[OVERFLOW_ARG_AREA]], i64 8
+// CHECK: store ptr [[OVERFLOW_ARG_AREA_OFFSET]], ptr [[OVERFLOW_ARG_AREA_ADDR]]
+// CHECK: br label %[[CONT_BB]]
+//
+// CHECK: [[CONT_BB]]:
+// CHECK: [[VA_LIST3:%.+]] = getelementptr {{.*}} [[VA_LIST_ALLOCA]], i32 0
+// CHECK: call {{.*}}@llvm.va_end.p0(ptr [[VA_LIST3]])

From 7d28be4e7e2479cfc1753313e98c94da252555e2 Mon Sep 17 00:00:00 2001
From: Chuanqi Xu
Date: Thu, 7 Nov 2024 16:46:51 +0800
Subject: [PATCH 2/2] [CIR] [Lowering] [X86_64] Support VAArg for LongDouble

---
 .../Targets/LoweringPrepareX86CXXABI.cpp      |   5 +-
 .../Transforms/TargetLowering/Targets/X86.cpp | 118 +++++++++++++++++-
 .../TargetLowering/Targets/X86_64ABIInfo.h    |  10 +-
 clang/test/CIR/Lowering/var-arg-x86_64.c      |  28 +++++
 4 files changed, 153 insertions(+), 8 deletions(-)

diff --git a/clang/lib/CIR/Dialect/Transforms/TargetLowering/Targets/LoweringPrepareX86CXXABI.cpp b/clang/lib/CIR/Dialect/Transforms/TargetLowering/Targets/LoweringPrepareX86CXXABI.cpp
index 79942be0aecd..8d1c5527bf6f 100644
--- a/clang/lib/CIR/Dialect/Transforms/TargetLowering/Targets/LoweringPrepareX86CXXABI.cpp
+++ b/clang/lib/CIR/Dialect/Transforms/TargetLowering/Targets/LoweringPrepareX86CXXABI.cpp
@@ -47,7 +47,6 @@ std::unique_ptr<mlir::cir::LowerModule> getLowerModule(mlir::cir::VAArgOp op) {
   mlir::ModuleOp mo = op->getParentOfType<mlir::ModuleOp>();
   if (!mo)
     return nullptr;
-
   mlir::PatternRewriter rewriter(mo.getContext());
   return mlir::cir::createLowerModule(mo, rewriter);
 }
@@ -96,7 +95,7 @@ mlir::Value LoweringPrepareX86CXXABI::lowerVAArgX86_64(
   // Let's hope LLVM's va_arg instruction can take care of it.
   // Remove this when X86_64ABIInfo::classify can take care of every type.
   if (!mlir::isa<IntType, SingleType, DoubleType, BoolType,
-                 StructType>(op.getType()))
+                 StructType, LongDoubleType>(op.getType()))
     return nullptr;
 
   // Assume that va_list type is correct; should be pointer to LLVM type:
@@ -111,7 +110,6 @@ mlir::Value LoweringPrepareX86CXXABI::lowerVAArgX86_64(
   std::unique_ptr<mlir::cir::LowerModule> lowerModule = getLowerModule(op);
   if (!lowerModule)
     return nullptr;
-
   mlir::Type ty = op.getType();
 
   // FIXME: How should we access the X86AVXABILevel?
@@ -172,7 +170,6 @@ mlir::Value LoweringPrepareX86CXXABI::lowerVAArgX86_64(
   mlir::Block *contBlock = currentBlock->splitBlock(op);
   mlir::Block *inRegBlock = builder.createBlock(contBlock);
   mlir::Block *inMemBlock = builder.createBlock(contBlock);
-
   builder.setInsertionPointToEnd(currentBlock);
   builder.create<mlir::cir::BrCondOp>(loc, inRegs, inRegBlock, inMemBlock);
 
diff --git a/clang/lib/CIR/Dialect/Transforms/TargetLowering/Targets/X86.cpp b/clang/lib/CIR/Dialect/Transforms/TargetLowering/Targets/X86.cpp
index 05ad15f4ffa7..3a5ded33e894 100644
--- a/clang/lib/CIR/Dialect/Transforms/TargetLowering/Targets/X86.cpp
+++ b/clang/lib/CIR/Dialect/Transforms/TargetLowering/Targets/X86.cpp
@@ -166,6 +166,21 @@ void X86_64ABIInfo::classify(Type Ty, uint64_t OffsetBase, Class &Lo, Class &Hi,
     Current = Class::SSE;
     return;
 
+  } else if (isa<LongDoubleType>(Ty)) {
+    const llvm::fltSemantics *LDF =
+        &getContext().getTargetInfo().getLongDoubleFormat();
+    if (LDF == &llvm::APFloat::IEEEquad()) {
+      Lo = Class::SSE;
+      Hi = Class::SSEUp;
+    } else if (LDF == &llvm::APFloat::x87DoubleExtended()) {
+      Lo = Class::X87;
+      Hi = Class::X87Up;
+    } else if (LDF == &llvm::APFloat::IEEEdouble()) {
+      Current = Class::SSE;
+    } else {
+      llvm_unreachable("unexpected long double representation!");
+    }
+    return;
   } else if (isa<IntType, BoolType>(Ty)) {
     Current = Class::Integer;
   } else if (const auto RT = dyn_cast<StructType>(Ty)) {
@@ -268,6 +283,65 @@ void X86_64ABIInfo::classify(Type Ty, uint64_t OffsetBase, Class &Lo, Class &Hi,
   cir_cconv_unreachable("NYI");
 }
 
+ABIArgInfo X86_64ABIInfo::getIndirectResult(mlir::Type ty,
+                                            unsigned freeIntRegs) const {
+  // If this is a scalar LLVM value then assume LLVM will pass it in the
+  // right place naturally.
+  //
+  // This assumption is optimistic, as there could be free registers
+  // available when we need to pass this argument in memory, and LLVM could
+  // try to pass the argument in the free register. This does not seem to
+  // happen currently, but this code would be much safer if we could mark
+  // the argument with 'onstack'. See PR12193.
+  if (!isAggregateTypeForABI(ty) /* && IsIllegalVectorType(Ty) &&*/
+      /*!Ty->isBitIntType()*/) {
+    // FIXME: Handling enum type?
+
+    return (isPromotableIntegerTypeForABI(ty) ? ABIArgInfo::getExtend(ty)
+                                              : ABIArgInfo::getDirect());
+  }
+
+  if (CIRCXXABI::RecordArgABI RAA = getRecordArgABI(ty, getCXXABI()))
+    return getNaturalAlignIndirect(ty, RAA == CIRCXXABI::RAA_DirectInMemory);
+
+  // Compute the byval alignment. We specify the alignment of the byval in
+  // all cases so that the mid-level optimizer knows the alignment of the
+  // byval.
+  unsigned align = std::max(getContext().getTypeAlign(ty) / 8, 8U);
+
+  // Attempt to avoid passing indirect results using byval when possible.
+  // This is important for good codegen.
+  //
+  // We do this by coercing the value into a scalar type which the backend
+  // can handle naturally (i.e., without using byval).
+  //
+  // For simplicity, we currently only do this when we have exhausted all of
+  // the free integer registers. Doing this when there are free integer
+  // registers would require more care, as we would have to ensure that the
+  // coerced value did not claim the unused register. That would require
+  // either reordering the arguments to the function (so that any subsequent
+  // inreg values came first), or only doing this optimization when there
+  // were no following arguments that might be inreg.
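+  //
+  // For example, once all six integer registers are exhausted, a
+  // `struct { int a, b; }` (8 bytes, 8-byte aligned) is coerced to a single
+  // 64-bit integer below instead of going byval.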
+  //
+  // We currently expect it to be rare (particularly in well written code)
+  // for arguments to be passed on the stack when there are still free
+  // integer registers available (this would typically imply large structs
+  // being passed by value), so this seems like a fair tradeoff for now.
+  //
+  // We can revisit this if the backend grows support for 'onstack' parameter
+  // attributes. See PR12193.
+  if (freeIntRegs == 0) {
+    uint64_t size = getContext().getTypeSize(ty);
+
+    // If this type fits in an eightbyte, coerce it into the matching
+    // integral type, which will end up on the stack (with alignment 8).
+    if (align == 8 && size <= 64)
+      return ABIArgInfo::getDirect(
+          mlir::cir::IntType::get(LT.getMLIRContext(), size, false));
+  }
+
+  return ABIArgInfo::getIndirect(align);
+}
+
 /// Return a type that will be passed by the backend in the low 8 bytes of an
 /// XMM register, corresponding to the SSE class.
 Type X86_64ABIInfo::GetSSETypeAtOffset(Type IRType, unsigned IROffset,
@@ -278,7 +352,7 @@ Type X86_64ABIInfo::GetSSETypeAtOffset(Type IRType, unsigned IROffset,
       (unsigned)getContext().getTypeSize(SourceTy) / 8 - SourceOffset;
   Type T0 = getFPTypeAtOffset(IRType, IROffset, TD);
   if (!T0 || isa<DoubleType>(T0))
-    return T0; // NOTE(cir): Not sure if this is correct.
+    return ::mlir::cir::DoubleType::get(LT.getMLIRContext());
 
   Type T1 = {};
   unsigned T0Size = TD.getTypeAllocSize(T0);
@@ -296,6 +370,8 @@ Type X86_64ABIInfo::GetSSETypeAtOffset(Type IRType, unsigned IROffset,
     return T0;
   }
 
+  return ::mlir::cir::DoubleType::get(LT.getMLIRContext());
+
   cir_cconv_unreachable("NYI");
 }
 
@@ -538,6 +614,22 @@ ABIArgInfo X86_64ABIInfo::classifyArgumentType(Type Ty, unsigned freeIntRegs,
       ++neededSSE;
     break;
   }
+  // AMD64-ABI 3.2.3p3: Rule 1. If the class is MEMORY, pass the argument
+  // on the stack.
+  case Class::Memory:
+
+  // AMD64-ABI 3.2.3p3: Rule 5. If the class is X87, X87UP or
+  // COMPLEX_X87, it is passed in memory.
+  case Class::X87:
+  case Class::ComplexX87:
+    if (getRecordArgABI(Ty, getCXXABI()) == CIRCXXABI::RAA_Indirect)
+      ++neededInt;
+    return getIndirectResult(Ty, freeIntRegs);
+
+  case Class::SSEUp:
+  case Class::X87Up:
+    llvm_unreachable("Invalid classification for lo word.");
+
   default:
     cir_cconv_assert_or_abort(
         !::cir::MissingFeatures::X86ArgTypeClassification(), "NYI");
@@ -545,6 +637,11 @@ ABIArgInfo X86_64ABIInfo::classifyArgumentType(Type Ty, unsigned freeIntRegs,
 
   Type HighPart = {};
   switch (Hi) {
+  case Class::Memory:
+  case Class::X87:
+  case Class::ComplexX87:
+    llvm_unreachable("Invalid classification for hi word.");
+
   case Class::NoClass:
     break;
 
@@ -557,8 +654,23 @@ ABIArgInfo X86_64ABIInfo::classifyArgumentType(Type Ty, unsigned freeIntRegs,
       return ABIArgInfo::getDirect(HighPart, 8);
     break;
 
-  default:
-    cir_cconv_unreachable("NYI");
+  // X87Up generally doesn't occur here (long double is passed in
+  // memory), except in situations involving unions.
+  case Class::X87Up:
+  case Class::SSE:
+    ++neededSSE;
+    HighPart = GetSSETypeAtOffset(Ty, 8, Ty, 8);
+
+    if (Lo == Class::NoClass) // Pass HighPart at offset 8 in memory.
+      return ABIArgInfo::getDirect(HighPart, 8);
+    break;
+
+  // AMD64-ABI 3.2.3p3: Rule 4. If the class is SSEUP, the
+  // eightbyte is passed in the upper half of the last used SSE
+  // register. This only happens when 128-bit vectors are passed.
+  case Class::SSEUp:
+    llvm_unreachable("NYI && We need to implement GetByteVectorType");
+    break;
   }
 
   // If a high part was specified, merge it together with the low part.  It is
diff --git a/clang/lib/CIR/Dialect/Transforms/TargetLowering/Targets/X86_64ABIInfo.h b/clang/lib/CIR/Dialect/Transforms/TargetLowering/Targets/X86_64ABIInfo.h
index 0955d204d3a1..60b238dcd568 100644
--- a/clang/lib/CIR/Dialect/Transforms/TargetLowering/Targets/X86_64ABIInfo.h
+++ b/clang/lib/CIR/Dialect/Transforms/TargetLowering/Targets/X86_64ABIInfo.h
@@ -68,6 +68,14 @@ class X86_64ABIInfo : public ABIInfo {
   Type GetINTEGERTypeAtOffset(Type DestTy, unsigned IROffset, Type SourceTy,
                               unsigned SourceOffset) const;
 
+  /// getIndirectResult - Given a source type \arg Ty, return a suitable
+  /// result such that the argument will be passed in memory.
+  ///
+  /// \param freeIntRegs - The number of free integer registers remaining
+  /// available.
+  ::cir::ABIArgInfo getIndirectResult(mlir::Type ty,
+                                      unsigned freeIntRegs) const;
+
   /// The 0.98 ABI revision clarified a lot of ambiguities,
   /// unfortunately in ways that were not always consistent with
   /// certain previous compilers. In particular, platforms which
@@ -94,4 +102,4 @@ class X86_64ABIInfo : public ABIInfo {
 };
 
 } // namespace cir
-} // namespace mlir
\ No newline at end of file
+} // namespace mlir
diff --git a/clang/test/CIR/Lowering/var-arg-x86_64.c b/clang/test/CIR/Lowering/var-arg-x86_64.c
index f9ce354dffb2..23b215175bde 100644
--- a/clang/test/CIR/Lowering/var-arg-x86_64.c
+++ b/clang/test/CIR/Lowering/var-arg-x86_64.c
@@ -38,3 +38,31 @@ double f1(int n, ...) {
 // CHECK: [[CONT_BB]]:
 // CHECK: [[VA_LIST3:%.+]] = getelementptr {{.*}} [[VA_LIST_ALLOCA]], i32 0
 // CHECK: call {{.*}}@llvm.va_end.p0(ptr [[VA_LIST3]])
+
+long double f2(int n, ...) {
+  va_list valist;
+  va_start(valist, n);
+  long double res = va_arg(valist, long double);
+  va_end(valist);
+  return res;
+}
+
+// CHECK: define {{.*}}@f2
+// CHECK: [[RESULT:%.+]] = alloca x86_fp80
+// CHECK: [[VA_LIST_ALLOCA:%.+]] = alloca {{.*}}[[VA_LIST_TYPE]]
+// CHECK: [[RES:%.+]] = alloca x86_fp80
+// CHECK: [[VA_LIST:%.+]] = getelementptr {{.*}} [[VA_LIST_ALLOCA]], i32 0
+// CHECK: call {{.*}}@llvm.va_start.p0(ptr [[VA_LIST]])
+// CHECK: [[VA_LIST2:%.+]] = getelementptr {{.*}} [[VA_LIST_ALLOCA]], i32 0
+// CHECK: [[OVERFLOW_AREA_P:%.+]] = getelementptr {{.*}} [[VA_LIST2]], i32 0, i32 2
+// CHECK: [[OVERFLOW_AREA:%.+]] = load {{.*}}, ptr [[OVERFLOW_AREA_P]]
+// CHECK: [[OVERFLOW_AREA_NEXT:%.+]] = getelementptr i8, ptr [[OVERFLOW_AREA]], i64 16
+// CHECK: store ptr [[OVERFLOW_AREA_NEXT]], ptr [[OVERFLOW_AREA_P]]
+// CHECK: [[VALUE:%.+]] = load x86_fp80, ptr [[OVERFLOW_AREA]]
+// CHECK: store x86_fp80 [[VALUE]], ptr [[RES]]
+// CHECK: [[VA_LIST2:%.+]] = getelementptr {{.*}} [[VA_LIST_ALLOCA]], i32 0
+// CHECK: call {{.*}}@llvm.va_end.p0(ptr [[VA_LIST2]])
+// CHECK: [[VALUE2:%.+]] = load x86_fp80, ptr [[RES]]
+// CHECK: store x86_fp80 [[VALUE2]], ptr [[RESULT]]
+// CHECK: [[RETURN_VALUE:%.+]] = load x86_fp80, ptr [[RESULT]]
+// CHECK: ret x86_fp80 [[RETURN_VALUE]]
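
--
For reference, a minimal sketch of a caller that exercises both lowered paths
in one function (illustrative only; the function and variable names are not
part of the patch or its tests). The double argument goes through the
fp_offset/register-save-area branch, while the long double (X87 class on
x86-64 Linux) is always fetched from overflow_arg_area:

#include <stdarg.h>

long double mix(int n, ...) {
  va_list ap;
  va_start(ap, n);
  double d = va_arg(ap, double);             /* SSE class: register save area */
  long double ld = va_arg(ap, long double);  /* X87 class: overflow_arg_area */
  va_end(ap);
  return d + ld;
}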