[clang] [llvm] [WIP] Expand variadic functions in IR (PR #89007)

Wed Apr 17 03:35:19 PDT 2024

================
@@ -0,0 +1,1056 @@
+//===-- ExpandVariadicsPass.cpp --------------------------------*- C++ -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This is an optimization pass for variadic functions. If called from codegen,
+// it can serve as the implementation of variadic functions for a given target.
+//
+// The strategy is to turn the ... part of a varidic function into a va_list
+// and fix up the call sites. This is completely effective if the calling
+// convention can declare that to be the right thing, e.g. on GPUs or where
+// the application is wholly statically linked. In the usual case, it will
+// replace known calls to known variadic functions with calls that are amenable
+// to inlining and other optimisations.
+//
+// The target-dependent parts are in class VariadicABIInfo. Enabling a new
+// target means adding a case to VariadicABIInfo::create() along with tests.
+// This will be especially simple if the va_list representation is a char*.
+//
+// The majority of the plumbing is splitting the variadic function into a
+// single basic block that packs the variadic arguments into a va_list and
+// a second function that does the work of the original. The target specific
+// part is packing arguments into a contiguous buffer that the clang expansion
+// of va_arg will do the right thing with.
+//
+// The aggregate effect is to unblock other transforms, most critically the
+// general purpose inliner. Known calls to variadic functions become zero cost.
+//
+// Consistency with clang is primarily tested by emitting va_arg using clang
+// then expanding the variadic functions using this pass, followed by trying
+// to constant fold the functions to no-ops.
+//
+// Target specific behaviour is tested in IR - mainly checking that values are
+// put into positions in call frames that make sense for that particular target.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/IPO/ExpandVariadics.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Passes/OptimizationLevel.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/TargetParser/Triple.h"
+
+#include <cstdio>
+
+#define DEBUG_TYPE "expand-variadics"
+
+using namespace llvm;
+
+cl::opt<ExpandVariadicsMode> ExpandVariadicsModeOption(
+    DEBUG_TYPE "-override", cl::desc("Override the behaviour of " DEBUG_TYPE),
+    cl::init(ExpandVariadicsMode::unspecified),
+    cl::values(clEnumValN(ExpandVariadicsMode::unspecified, "unspecified",
+                          "Use the implementation defaults"),
+               clEnumValN(ExpandVariadicsMode::disable, "disable",
+                          "Disable the pass entirely"),
+               clEnumValN(ExpandVariadicsMode::optimize, "optimize",
+                          "Optimise without changing ABI"),
+               clEnumValN(ExpandVariadicsMode::lowering, "lowering",
+                          "Change variadic calling convention")));
+
+namespace {
+
+// Module implements getFunction() which returns nullptr on missing declaration
+// and getOrInsertFunction which creates one when absent. Intrinsics.h
+// implements getDeclaration which creates one when missing. This should be
+// changed to be consistent with Module()'s naming. Implementing as a local
+// function here in the meantime to decouple from that process.
+Function *getPreexistingDeclaration(Module *M, Intrinsic::ID id,
+                                    ArrayRef<Type *> Tys = std::nullopt) {
+  auto *FT = Intrinsic::getType(M->getContext(), id, Tys);
+  return M->getFunction(Tys.empty() ? Intrinsic::getName(id)
+                                    : Intrinsic::getName(id, Tys, M, FT));
+}
+
+// Lots of targets use a void* pointed at a buffer for va_list.
+// Some use more complicated iterator constructs. Type erase that
+// so the rest of the pass can operation on either.
+// Virtual functions where different targets want different behaviour,
+// normal where all implemented targets presently have the same.
+struct VAListInterface {
+  virtual ~VAListInterface() {}
+
+  // Whether a valist instance is passed by value or by address
+  // I.e. does it need to be alloca'ed and stored into, or can
+  // it be passed directly in a SSA register
+  virtual bool passedInSSARegister() = 0;
+
+  // The type of a va_list iterator object
+  virtual Type *vaListType(LLVMContext &Ctx) = 0;
+
+  // The type of a va_list as a function argument as lowered by C
+  virtual Type *vaListParameterType(Module &M) = 0;
+
+  // Initialise an allocated va_list object to point to an already
+  // initialised contiguous memory region.
+  // Return the value to pass as the va_list argument
+  virtual Value *initializeVAList(LLVMContext &Ctx, IRBuilder<> &Builder,
+                                  AllocaInst *, Value * /*buffer*/) = 0;
+
+  // Simple lowering suffices for va_end, va_copy for current targets
+  bool vaEndIsNop() { return true; }
+  bool vaCopyIsMemcpy() { return true; }
+};
+
+// The majority case - a void* into an alloca
+struct VoidPtr final : public VAListInterface {
+  bool passedInSSARegister() override { return true; }
+
+  Type *vaListType(LLVMContext &Ctx) override {
+    return PointerType::getUnqual(Ctx);
+  }
+
+  Type *vaListParameterType(Module &M) override {
+    return PointerType::getUnqual(M.getContext());
+  }
+
+  Value *initializeVAList(LLVMContext &Ctx, IRBuilder<> &Builder,
+                          AllocaInst * /*va_list*/, Value *buffer) override {
+    return buffer;
+  }
+};
+
+struct VoidPtrAllocaAddrspace final : public VAListInterface {
+
+  bool passedInSSARegister() override { return true; }
+
+  Type *vaListType(LLVMContext &Ctx) override {
+    return PointerType::getUnqual(Ctx);
+  }
+
+  Type *vaListParameterType(Module &M) override {
+    const DataLayout &DL = M.getDataLayout();
+    return DL.getAllocaPtrType(M.getContext());
+  }
+
+  Value *initializeVAList(LLVMContext &Ctx, IRBuilder<> &Builder,
+                          AllocaInst * /*va_list*/, Value *buffer) override {
+    return buffer;
+  }
+};
+
+// SystemV as used by X64 Linux and others
+struct SystemV final : public VAListInterface {
+  bool passedInSSARegister() override { return false; }
+
+  Type *vaListType(LLVMContext &Ctx) override {
+    auto I32 = Type::getInt32Ty(Ctx);
+    auto Ptr = PointerType::getUnqual(Ctx);
+    return ArrayType::get(StructType::get(Ctx, {I32, I32, Ptr, Ptr}), 1);
+  }
+
+  Type *vaListParameterType(Module &M) override {
+    return PointerType::getUnqual(M.getContext());
+  }
+
+  Value *initializeVAList(LLVMContext &Ctx, IRBuilder<> &Builder,
+                          AllocaInst *VaList, Value *VoidBuffer) override {
+    assert(VaList->getAllocatedType() == vaListType(Ctx));
+
+    Type *VaListTy = vaListType(Ctx);
+
+    Type *I32 = Type::getInt32Ty(Ctx);
+    Type *I64 = Type::getInt64Ty(Ctx);
+
+    Value *Idxs[3] = {
+        ConstantInt::get(I64, 0),
+        ConstantInt::get(I32, 0),
+        nullptr,
+    };
+
+    Idxs[2] = ConstantInt::get(I32, 0);
+    Builder.CreateStore(
+        ConstantInt::get(I32, 48),
+        Builder.CreateInBoundsGEP(VaListTy, VaList, Idxs, "gp_offset"));
+
+    Idxs[2] = ConstantInt::get(I32, 1);
+    Builder.CreateStore(
+        ConstantInt::get(I32, 6 * 8 + 8 * 16),
+        Builder.CreateInBoundsGEP(VaListTy, VaList, Idxs, "fp_offset"));
+
+    Idxs[2] = ConstantInt::get(I32, 2);
+    Builder.CreateStore(
+        VoidBuffer,
+        Builder.CreateInBoundsGEP(VaListTy, VaList, Idxs, "overfow_arg_area"));
+
+    Idxs[2] = ConstantInt::get(I32, 3);
+    Builder.CreateStore(
+        ConstantPointerNull::get(PointerType::getUnqual(Ctx)),
+        Builder.CreateInBoundsGEP(VaListTy, VaList, Idxs, "reg_save_area"));
+
+    return VaList;
+  }
+};
+
+class VariadicABIInfo {
+
+  VariadicABIInfo(uint32_t MinAlign, uint32_t MaxAlign,
+                  std::unique_ptr<VAListInterface> VAList)
+      : MinAlign(MinAlign), MaxAlign(MaxAlign), VAList(std::move(VAList)) {}
+
+  template <typename T>
+  static VariadicABIInfo create(uint32_t MinAlign, uint32_t MaxAlign) {
+    return {MinAlign, MaxAlign, std::make_unique<T>()};
+  }
+
+public:
+  const uint32_t MinAlign;
+  const uint32_t MaxAlign;
+  std::unique_ptr<VAListInterface> VAList;
+
+  VariadicABIInfo() : VariadicABIInfo(0, 0, nullptr) {}
+  explicit operator bool() const { return static_cast<bool>(VAList); }
+
+  VariadicABIInfo(VariadicABIInfo &&Self)
+      : MinAlign(Self.MinAlign), MaxAlign(Self.MaxAlign),
+        VAList(Self.VAList.release()) {}
+
+  VariadicABIInfo &operator=(VariadicABIInfo &&Other) {
+    this->~VariadicABIInfo();
+    new (this) VariadicABIInfo(std::move(Other));
+    return *this;
+  }
+
+  static VariadicABIInfo create(llvm::Triple const &Triple) {
+    const bool IsLinuxABI = Triple.isOSLinux() || Triple.isOSCygMing();
+
+    switch (Triple.getArch()) {
+
+    case Triple::r600:
+    case Triple::amdgcn: {
+      return create<VoidPtrAllocaAddrspace>(1, 0);
+    }
+
+    case Triple::nvptx:
+    case Triple::nvptx64: {
+      return create<VoidPtr>(4, 0);
+    }
+
+    case Triple::x86: {
+      // These seem to all fall out the same, despite getTypeStackAlign
+      // implying otherwise.
+
+      if (Triple.isOSDarwin()) {
+        // X86_32ABIInfo::getTypeStackAlignInBytes is misleading for this.
+        // The slotSize(4) implies a minimum alignment
+        // The AllowHigherAlign = true means there is no maximum alignment.
+
+        return create<VoidPtr>(4, 0);
+      }
+      if (Triple.getOS() == llvm::Triple::Win32) {
+        return create<VoidPtr>(4, 0);
+      }
+
+      if (IsLinuxABI) {
+        return create<VoidPtr>(4, 0);
+      }
+
+      break;
+    }
+
+    case Triple::x86_64: {
+      if (Triple.isWindowsMSVCEnvironment() || Triple.isOSWindows()) {
+        // x64 msvc emit vaarg passes > 8 byte values by pointer
+        // however the variadic call instruction created does not, e.g.
+        // a <4 x f32> will be passed as itself, not as a pointer or byval.
+        // Postponing resolution of that for now.
+        // Expected min/max align of 8.
+        return {};
+      }
+
+      // SystemV X64 documented behaviour:
+      // Slots are at least eight byte aligned and at most 16 byte aligned.
+      // If the type needs more than sixteen byte alignment, it still only gets
+      // that much alignment on the stack.
+      // X64 behaviour in clang:
+      // Slots are at least eight byte aligned and at most naturally aligned
+      // This matches clang, not the ABI docs.
+
+      if (Triple.isOSDarwin()) {
+        return create<SystemV>(8, 8);
+      }
+
+      if (IsLinuxABI) {
+        return create<SystemV>(8, 8);
+      }
+
+      break;
+    }
+
+    default:
+      break;
+    }
+
+    return {};
+  }
+};
+
+class ExpandVariadics : public ModulePass {
+
+  // The pass construction sets the default (optimize when called from middle
+  // end, lowering when called from the backend). The command line variable
+  // overrides that. This is useful for testing and debugging. It also allows
+  // building an applications with variadic functions wholly removed if one
+  // has sufficient control over the dependencies, e.g. a statically linked
+  // clang that has no variadic function calls remaining in the binary.
+  static ExpandVariadicsMode
+  withCommandLineOverride(ExpandVariadicsMode LLVMRequested) {
+    ExpandVariadicsMode UserRequested = ExpandVariadicsModeOption;
+    return (UserRequested == ExpandVariadicsMode::unspecified) ? LLVMRequested
+                                                               : UserRequested;
+  }
+
+public:
+  static char ID;
+  const ExpandVariadicsMode Mode;
+  VariadicABIInfo ABI;
+
+  ExpandVariadics(ExpandVariadicsMode Mode)
+      : ModulePass(ID), Mode(withCommandLineOverride(Mode)) {}
+  StringRef getPassName() const override { return "Expand variadic functions"; }
+
+  // Rewrite a variadic call site
+  bool expandCall(Module &M, IRBuilder<> &Builder, CallBase *CB, FunctionType *,
+                  Function *NF);
+
+  // Given a variadic function, return a function taking a va_list that can be
+  // called instead of the original. Mutates F.
+  Function *deriveInlinableVariadicFunctionPair(Module &M, IRBuilder<> &Builder,
+                                                Function &F);
+
+  bool runOnFunction(Module &M, IRBuilder<> &Builder, Function *F);
+
+  // Entry point
+  bool runOnModule(Module &M) override;
+
+  bool rewriteABI() { return Mode == ExpandVariadicsMode::lowering; }
+
+  void memcpyVAListPointers(const DataLayout &DL, IRBuilder<> &Builder,
+                            Value *Dst, Value *Src) {
+    auto &Ctx = Builder.getContext();
+    Type *VaListTy = ABI.VAList->vaListType(Ctx);
+    uint64_t Size = DL.getTypeAllocSize(VaListTy).getFixedValue();
+    // todo: on amdgcn this should be in terms of addrspace 5
+    Builder.CreateMemCpyInline(Dst, {}, Src, {},
+                               ConstantInt::get(Type::getInt32Ty(Ctx), Size));
+  }
+
+  bool expandVAIntrinsicCall(IRBuilder<> &Builder, const DataLayout &DL,
+                             VAStartInst *Inst);
+
+  bool expandVAIntrinsicCall(IRBuilder<> &, const DataLayout &,
+                             VAEndInst *Inst);
+
+  bool expandVAIntrinsicCall(IRBuilder<> &Builder, const DataLayout &DL,
+                             VACopyInst *Inst);
+
+  template <Intrinsic::ID ID, typename InstructionType>
+  bool expandIntrinsicUsers(Module &M, IRBuilder<> &Builder,
+                            PointerType *ArgType) {
+    bool Changed = false;
+    const DataLayout &DL = M.getDataLayout();
+    if (Function *Intrinsic = getPreexistingDeclaration(&M, ID, {ArgType})) {
+      for (User *U : Intrinsic->users()) {
+        if (auto *I = dyn_cast<InstructionType>(U)) {
+          Changed |= expandVAIntrinsicCall(Builder, DL, I);
+        }
+      }
+      if (Intrinsic->use_empty())
+        Intrinsic->eraseFromParent();
+    }
+    return Changed;
+  }
+
+  FunctionType *inlinableVariadicFunctionType(Module &M, FunctionType *FTy) {
+    SmallVector<Type *> ArgTypes(FTy->param_begin(), FTy->param_end());
+    ArgTypes.push_back(ABI.VAList->vaListParameterType(M));
+    return FunctionType::get(FTy->getReturnType(), ArgTypes,
+                             /*IsVarArgs*/ false);
+  }
+
+  static ConstantInt *sizeOfAlloca(LLVMContext &Ctx, const DataLayout &DL,
+                                   AllocaInst *Alloced) {
+    Type *AllocaType = Alloced->getAllocatedType();
+    TypeSize AllocaTypeSize = DL.getTypeAllocSize(AllocaType);
+    uint64_t AsInt = AllocaTypeSize.getFixedValue();
+    return ConstantInt::get(Type::getInt64Ty(Ctx), AsInt);
+  }
+
+  static SmallSet<unsigned, 2> supportedAddressSpaces(const DataLayout &DL) {
+    // FIXME: It looks like a module can contain arbitrary integers for address
+    // spaces in which case we might need to check _lots_ of cases. Maybe add a
+    // rule to the verifier that the vastart/vaend intrinsics can have arguments
+    // in 0 or in allocaaddrspace but nowhere else
+    SmallSet<unsigned, 2> Set;
+    Set.insert(0); // things tend to end up in zero
+    Set.insert(
+        DL.getAllocaAddrSpace()); // the argument should be in alloca addrspace
+    return Set;
+  }
+
+  // this could be partially target specific
+  bool expansionApplicableToFunction(Module &M, Function *F) {
+    if (F->isIntrinsic() || !F->isVarArg() ||
+        F->hasFnAttribute(Attribute::Naked)) {
+      return false;
+    }
+
+    // TODO: work out what to do with the cs_chain functions documented as
+    // non-variadic that are variadic in some lit tests
+    if (F->getCallingConv() != CallingConv::C)
+      return false;
+
+    if (!rewriteABI()) {
+      // e.g. can't replace a weak function unless changing the original symbol
+      if (GlobalValue::isInterposableLinkage(F->getLinkage())) {
+        return false;
+      }
+    }
+
+    if (!rewriteABI()) {
+      // If optimising, err on the side of leaving things alone
+      for (const Use &U : F->uses()) {
+        const auto *CB = dyn_cast<CallBase>(U.getUser());
+
+        if (!CB)
+          return false;
+
+        if (CB->isMustTailCall())
+          return false;
+
+        if (!CB->isCallee(&U) ||
+            CB->getFunctionType() != F->getFunctionType()) {
+          return false;
+        }
+      }
+    }
+
+    // Branch funnels look like variadic functions but aren't:
+    //
+    // define hidden void @__typeid_typeid1_0_branch_funnel(ptr nest %0, ...) {
+    //  musttail call void (...) @llvm.icall.branch.funnel(ptr %0, ptr @vt1_1,
+    //  ptr @vf1_1, ...) ret void
+    // }
+    //
+    // %1 = call i32 @__typeid_typeid1_0_branch_funnel(ptr nest %vtable, ptr
+    // %obj, i32 1)
+    //
+    // If this function contains a branch funnel intrinsic, don't transform it.
+
+    if (Function *Funnel =
+            getPreexistingDeclaration(&M, Intrinsic::icall_branch_funnel)) {
+      for (const User *U : Funnel->users()) {
+        if (auto *I = dyn_cast<CallBase>(U)) {
+          if (F == I->getFunction()) {
+            return false;
+          }
+        }
+      }
+    }
+
+    return true;
+  }
+
+  bool callinstRewritable(CallBase *CB) {
+    if (CallInst *CI = dyn_cast<CallInst>(CB)) {
+      if (CI->isMustTailCall()) {
+        // Cannot expand musttail calls
+        if (rewriteABI()) {
+          // Todo: Sema?
+          report_fatal_error("Cannot lower musttail variadic call");
+        } else {
+          return false;
+        }
+      }
+    }
+
+    return true;
+  }
+
+  class ExpandedCallFrame {
+    // Helper for constructing an alloca instance containing the arguments bound
+    // to the variadic ... parameter, rearranged to allow indexing through a
+    // va_list iterator
+    //
+    // The awkward memory layout is to allow direct access to a contiguous array
+    // of types for the conversion to a struct type
+    enum { N = 4 };
+    SmallVector<Type *, N> FieldTypes;
+    enum Tag { IsByVal, NotByVal, Padding };
+    SmallVector<std::pair<Value *, Tag>, N> Fields;
+
+    template <Tag tag> void append(Type *T, Value *V) {
+      FieldTypes.push_back(T);
+      Fields.push_back({V, tag});
+    }
+
+  public:
+    void value(Type *T, Value *V) { append<NotByVal>(T, V); }
+
+    void byVal(Type *T, Value *V) { append<IsByVal>(T, V); }
+
+    void padding(LLVMContext &Ctx, uint64_t By) {
+      append<Padding>(ArrayType::get(Type::getInt8Ty(Ctx), By), nullptr);
+    }
+
+    size_t size() const { return FieldTypes.size(); }
+    bool empty() const { return FieldTypes.empty(); }
+
+    StructType *asStruct(LLVMContext &Ctx, StringRef Name) {
+      const bool IsPacked = true;
+      return StructType::create(Ctx, FieldTypes,
+                                (Twine(Name) + ".vararg").str(), IsPacked);
+    }
+
+    void initialiseStructAlloca(const DataLayout &DL, IRBuilder<> &Builder,
+                                AllocaInst *Alloced) {
+
+      StructType *VarargsTy = cast<StructType>(Alloced->getAllocatedType());
+
+      for (size_t I = 0; I < size(); I++) {
+        auto [V, tag] = Fields[I];
+        if (!V)
+          continue;
+
+        auto R = Builder.CreateStructGEP(VarargsTy, Alloced, I);
+        if (tag == IsByVal) {
+          Type *ByValType = FieldTypes[I];
+          Builder.CreateMemCpy(R, {}, V, {},
+                               DL.getTypeAllocSize(ByValType).getFixedValue());
+        } else {
+          Builder.CreateStore(V, R);
+        }
+      }
+    }
+  };
+};
+
+bool ExpandVariadics::runOnModule(Module &M) {
+  bool Changed = false;
+  if (Mode == ExpandVariadicsMode::disable)
+    return Changed;
+
+  llvm::Triple Triple(M.getTargetTriple());
+
+  if (Triple.getArch() == Triple::UnknownArch) {
+    // If we don't know the triple, we can't lower varargs
+    return false;
+  }
+
+  ABI = VariadicABIInfo::create(Triple);
+  if (!ABI) {
+    if (Mode == ExpandVariadicsMode::lowering) {
+      report_fatal_error(
+          "Requested variadic lowering is unimplemented on this target");
+    }
+    return Changed;
+  }
+
+  const DataLayout &DL = M.getDataLayout();
+  auto &Ctx = M.getContext();
+  IRBuilder<> Builder(Ctx);
+
+  // At pass input, va_start intrinsics only occur in variadic functions, as
+  // checked by the IR verifier.
+
+  // The lowering pass needs to run on all variadic functions.
+  // The optimise could run on only those that call va_start
+  // in exchange for additional book keeping to avoid transforming
+  // the same function multiple times when it contains multiple va_start.
+  // Leaving that compile time optimisation for a later patch.
+  for (Function &F : llvm::make_early_inc_range(M))
+    Changed |= runOnFunction(M, Builder, &F);
+
+  // After runOnFunction, all known calls to known variadic functions have been
+  // replaced. va_start intrinsics are presently (and invalidly!) only present
+  // in functions thart used to be variadic and have now been mutated to take a
+  // va_list instead. If lowering as opposed to optimising, calls to unknown
+  // variadic functions have also been replaced.
+
+  // Warning: Intrinsics acting on other ones are missed
+  auto CandidateAddressSpaces = supportedAddressSpaces(DL);
+
+  for (unsigned Addrspace : CandidateAddressSpaces) {
+    PointerType *ArgType = PointerType::get(Ctx, Addrspace);
+    Changed |= expandIntrinsicUsers<Intrinsic::vastart, VAStartInst>(M, Builder,
+                                                                     ArgType);
+    Changed |=
+        expandIntrinsicUsers<Intrinsic::vaend, VAEndInst>(M, Builder, ArgType);
+    Changed |= expandIntrinsicUsers<Intrinsic::vacopy, VACopyInst>(M, Builder,
+                                                                   ArgType);
+  }
+
+  // Variadic intrinsics are now gone. The va_start have been replaced with the
+  // equivalent of a va_copy from the newly appended va_list argument, va_end
+  // and va_copy are removed. All that remains is for the lowering pass to find
+  // indirect calls and rewrite those as well.
+
+  if (Mode == ExpandVariadicsMode::lowering) {
+    for (Function &F : llvm::make_early_inc_range(M)) {
+      if (F.isDeclaration())
+        continue;
+
+      // Now need to track down indirect calls. Can't find those
+      // by walking uses of variadic functions, need to crawl the instruction
+      // stream. Fortunately this is only necessary for the ABI rewrite case.
+      for (BasicBlock &BB : F) {
+        for (Instruction &I : llvm::make_early_inc_range(BB)) {
+          if (CallBase *CB = dyn_cast<CallBase>(&I)) {
+            if (CB->isIndirectCall()) {
+              FunctionType *FTy = CB->getFunctionType();
+              if (FTy->isVarArg()) {
+                Changed |= expandCall(M, Builder, CB, FTy, 0);
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+
+  return Changed;
+}
+
+bool ExpandVariadics::runOnFunction(Module &M, IRBuilder<> &Builder,
+                                    Function *F) {
+  bool Changed = false;
+
+  // fprintf(stderr, "Called runOn: %s\n", F->getName().str().c_str());
+
+  // This check might be too coarse - there are probably cases where
+  // splitting a function is bad but it's usable without splitting
+  if (!expansionApplicableToFunction(M, F))
+    return false;
+
+  // TODO: Leave "thunk" attribute functions alone?
+
+  // Need more tests than this. Weak etc. Some are in expansionApplicable.
+  if (F->isDeclaration() && !rewriteABI()) {
+    return false;
+  }
+
+  // TODO: Is the lazy construction here still useful?
+  Function *Equivalent = deriveInlinableVariadicFunctionPair(M, Builder, *F);
+
+  for (User *U : llvm::make_early_inc_range(F->users())) {
+    // TODO: A test where the call instruction takes a variadic function as
+    // a parameter other than the one it is calling
+    if (CallBase *CB = dyn_cast<CallBase>(U)) {
+      Value *calledOperand = CB->getCalledOperand();
+      if (F == calledOperand) {
+        Changed |= expandCall(M, Builder, CB, F->getFunctionType(), Equivalent);
+      }
+    }
+  }
+
+  if (rewriteABI()) {
+    // No direct calls remain to F, remaining uses are things like address
+    // escaping, modulo errors in this implementation.
+    for (User *U : llvm::make_early_inc_range(F->users()))
+      if (CallBase *CB = dyn_cast<CallBase>(U)) {
+        Value *calledOperand = CB->getCalledOperand();
+        if (F == calledOperand) {
+          report_fatal_error(
+              "ExpandVA abi requires eliminating call uses first\n");
+        }
+      }
+
+    Changed = true;
+    // Converting the original variadic function in-place into the equivalent
+    // one.
+    Equivalent->setLinkage(F->getLinkage());
+    Equivalent->setVisibility(F->getVisibility());
+    Equivalent->takeName(F);
+
+    // Indirect calls still need to be patched up
+    // DAE bitcasts it, todo: check block addresses
+    F->replaceAllUsesWith(Equivalent);
+    F->eraseFromParent();
+  }
+
+  return Changed;
+}
+
+Function *ExpandVariadics::deriveInlinableVariadicFunctionPair(
+    Module &M, IRBuilder<> &Builder, Function &F) {
+  // The purpose here is split the variadic function F into two functions
+  // One is a variadic function that bundles the passed argument into a va_list
+  // and passes it to the second function. The second function does whatever
+  // the original F does, except that it takes a va_list instead of the ...
+
+  assert(expansionApplicableToFunction(M, &F));
+
+  auto &Ctx = M.getContext();
+  const DataLayout &DL = M.getDataLayout();
+
+  // Returned value isDeclaration() is equal to F.isDeclaration()
+  // but that invariant is not satisfied throughout this function
+  const bool FunctionIsDefinition = !F.isDeclaration();
+
+  FunctionType *FTy = F.getFunctionType();
+  SmallVector<Type *> ArgTypes(FTy->param_begin(), FTy->param_end());
+  ArgTypes.push_back(ABI.VAList->vaListParameterType(M));
+
+  FunctionType *NFTy = inlinableVariadicFunctionType(M, F.getFunctionType());
+  Function *NF = Function::Create(NFTy, F.getLinkage(), F.getAddressSpace());
+
+  // Note - same attribute handling as DeadArgumentElimination
+  NF->copyAttributesFrom(&F);
+  NF->setComdat(F.getComdat()); // beware weak
+  F.getParent()->getFunctionList().insert(F.getIterator(), NF);
+  NF->setName(F.getName() + ".valist");
+  NF->IsNewDbgInfoFormat = F.IsNewDbgInfoFormat;
+
+  // New function is default visibility and internal
+  // Need to set visibility before linkage to avoid an assert in setVisibility
+  NF->setVisibility(GlobalValue::DefaultVisibility);
+  NF->setLinkage(GlobalValue::InternalLinkage);
+
+  AttrBuilder ParamAttrs(Ctx);
+  ParamAttrs.addAttribute(Attribute::NoAlias);
+
+  // TODO: When can the va_list argument have addAlignmentAttr called on it?
+  // It improves codegen lot in the non-inlined case. Probably target
+  // specific.
+
+  AttributeList Attrs = NF->getAttributes();
+  Attrs = Attrs.addParamAttributes(Ctx, NFTy->getNumParams() - 1, ParamAttrs);
+  NF->setAttributes(Attrs);
+
+  // Splice the implementation into the new function with minimal changes
+  if (FunctionIsDefinition) {
+    NF->splice(NF->begin(), &F);
+
+    auto NewArg = NF->arg_begin();
+    for (Argument &Arg : F.args()) {
+      Arg.replaceAllUsesWith(NewArg);
+      NewArg->setName(Arg.getName()); // takeName without killing the old one
+      ++NewArg;
+    }
+    NewArg->setName("varargs");
+  }
+
+  SmallVector<std::pair<unsigned, MDNode *>, 1> MDs;
+  F.getAllMetadata(MDs);
+  for (auto [KindID, Node] : MDs)
+    NF->addMetadata(KindID, *Node);
+
+  if (FunctionIsDefinition) {
+    // The blocks have been stolen so it's now a declaration
+    assert(F.isDeclaration());
+    Type *VaListTy = ABI.VAList->vaListType(Ctx);
+
+    auto *BB = BasicBlock::Create(Ctx, "entry", &F);
+    Builder.SetInsertPoint(BB);
+
+    Value *VaListInstance = Builder.CreateAlloca(VaListTy, nullptr, "va_list");
+
+    Builder.CreateIntrinsic(Intrinsic::vastart, {DL.getAllocaPtrType(Ctx)},
+                            {VaListInstance});
+
+    SmallVector<Value *> Args;
+    for (Argument &A : F.args())
+      Args.push_back(&A);
+
+    // Shall we put the extra arg in alloca addrspace? Probably yes
+    VaListInstance = Builder.CreatePointerBitCastOrAddrSpaceCast(
+        VaListInstance, ABI.VAList->vaListParameterType(M));
+    Args.push_back(VaListInstance);
+
+    CallInst *Result = Builder.CreateCall(NF, Args);
+    Result->setTailCallKind(CallInst::TCK_Tail);
+
+    assert(ABI.VAList->vaEndIsNop()); // If this changes, insert a va_end here
+
+    if (Result->getType()->isVoidTy())
+      Builder.CreateRetVoid();
+    else
+      Builder.CreateRet(Result);
+  }
+
+  assert(F.isDeclaration() == NF->isDeclaration());
+
+  return NF;
+}
+
+bool ExpandVariadics::expandCall(Module &M, IRBuilder<> &Builder, CallBase *CB,
+                                 FunctionType *VarargFunctionType,
+                                 Function *NF) {
+  bool Changed = false;
+  const DataLayout &DL = M.getDataLayout();
+
+  if (!callinstRewritable(CB)) {
+    return Changed;
+  }
+
+  // This is something of a problem because the call instructions' idea of the
+  // function type doesn't necessarily match reality, before or after this
+  // pass
+  // Since the plan here is to build a new instruction there is no
+  // particular benefit to trying to preserve an incorrect initial type
+  // If the types don't match and we aren't changing ABI, leave it alone
+  // in case someone is deliberately doing dubious type punning through a
+  // varargs.
+  FunctionType *FuncType = CB->getFunctionType();
+  if (FuncType != VarargFunctionType) {
+    if (!rewriteABI()) {
+      return Changed;
+    }
+    FuncType = VarargFunctionType;
+  }
+
+  auto &Ctx = CB->getContext();
+
+  // Align the struct on ABI.MinAlign to start with
+  Align MaxFieldAlign(ABI.MinAlign ? ABI.MinAlign : 1);
+
+  // The strategy here is to allocate a call frame containing the variadic
+  // arguments laid out such that a target specific va_list can be initialised
+  // with it, such that target specific va_arg instructions will correctly
+  // iterate over it. Primarily this means getting the alignment right.
+
+  ExpandedCallFrame Frame;
+
+  uint64_t CurrentOffset = 0;
+  for (unsigned I = FuncType->getNumParams(), E = CB->arg_size(); I < E; ++I) {
+    Value *ArgVal = CB->getArgOperand(I);
+    bool IsByVal = CB->paramHasAttr(I, Attribute::ByVal);
+    Type *ArgType = IsByVal ? CB->getParamByValType(I) : ArgVal->getType();
+    Align DataAlign = DL.getABITypeAlign(ArgType);
+
+    uint64_t DataAlignV = DataAlign.value();
+
+    // Currently using 0 as a sentinel to mean ignored
+    if (ABI.MinAlign && DataAlignV < ABI.MinAlign)
+      DataAlignV = ABI.MinAlign;
+    if (ABI.MaxAlign && DataAlignV > ABI.MaxAlign)
+      DataAlignV = ABI.MaxAlign;
+
+    DataAlign = Align(DataAlignV);
+    MaxFieldAlign = std::max(MaxFieldAlign, DataAlign);
+
+    if (uint64_t Rem = CurrentOffset % DataAlignV) {
+      // Inject explicit padding to deal with alignment requirements
+      uint64_t Padding = DataAlignV - Rem;
+      Frame.padding(Ctx, Padding);
+      CurrentOffset += Padding;
+    }
+
+    if (IsByVal) {
+      Frame.byVal(ArgType, ArgVal);
+    } else {
+      Frame.value(ArgType, ArgVal);
+    }
+    CurrentOffset += DL.getTypeAllocSize(ArgType).getFixedValue();
+  }
+
+  if (Frame.empty()) {
+    // Not passing any arguments, hopefully va_arg won't try to read any
+    // Creating a single byte frame containing nothing to point the va_list
+    // instance as that is less special-casey in the compiler and probably
+    // easier to interpret in a debugger.
+    Frame.padding(Ctx, 1);
+  }
+
+  Function *CBF = CB->getParent()->getParent();
+
+  StructType *VarargsTy = Frame.asStruct(Ctx, CBF->getName());
+
+  // Put the alloca to hold the variadic args in the entry basic block.
+  // The clumsy construction is to set a the alignment on the instance
+  Builder.SetInsertPointPastAllocas(CBF);
+
+  // The struct instance needs to be at least MaxFieldAlign for the alignment of
+  // the fields to be correct at runtime. Use the native stack alignment instead
+  // if that's greater as that tends to give better codegen.
+  Align AllocaAlign = MaxFieldAlign;
+  if (DL.exceedsNaturalStackAlignment(Align(1024))) {
----------------
arsenm wrote:

Where did 1024 come from? 

https://github.com/llvm/llvm-project/pull/89007