[clang] [llvm] [WIP] Expand variadic functions in IR (PR #89007)
Matt Arsenault via llvm-commits
llvm-commits at lists.llvm.org
Wed Apr 17 03:35:17 PDT 2024
================
@@ -0,0 +1,1056 @@
+//===-- ExpandVariadicsPass.cpp --------------------------------*- C++ -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This is an optimization pass for variadic functions. If called from codegen,
+// it can serve as the implementation of variadic functions for a given target.
+//
+// The strategy is to turn the ... part of a variadic function into a va_list
+// and fix up the call sites. This is completely effective if the calling
+// convention can declare that to be the right thing, e.g. on GPUs or where
+// the application is wholly statically linked. In the usual case, it will
+// replace known calls to known variadic functions with calls that are amenable
+// to inlining and other optimisations.
+//
+// The target-dependent parts are in class VariadicABIInfo. Enabling a new
+// target means adding a case to VariadicABIInfo::create() along with tests.
+// This will be especially simple if the va_list representation is a char*.
+//
+// The majority of the plumbing is splitting the variadic function into a
+// single basic block that packs the variadic arguments into a va_list and
+// a second function that does the work of the original. The target specific
+// part is packing arguments into a contiguous buffer that the clang expansion
+// of va_arg will do the right thing with.
+//
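+// For example, in rough IR terms (a sketch: the .valist suffix and the exact
+// types are illustrative, not the pass's actual naming scheme):
+//
+//   define i32 @sum(i32 %n, ...)
+//
+// is split into a one-block wrapper plus a clone that does the original work,
+//
+//   define i32 @sum(i32 %n, ...) {
+//     %va = alloca <target va_list type>
+//     call void @llvm.va_start(ptr %va)
+//     %r = call i32 @sum.valist(i32 %n, ptr %va)
+//     call void @llvm.va_end(ptr %va)
+//     ret i32 %r
+//   }
+//   define internal i32 @sum.valist(i32 %n, ptr %va) { <original body> }
+//
+// and a known call site such as
+//
+//   %r = call i32 (i32, ...) @sum(i32 2, i32 10, i32 20)
+//
+// becomes a packed buffer of the trailing arguments passed to the clone:
+//
+//   %buf = alloca { i32, i32 }
+//   ; store 10 and 20 into %buf at their ABI-mandated offsets
+//   %r = call i32 @sum.valist(i32 2, ptr %buf)
+//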
+// The aggregate effect is to unblock other transforms, most critically the
+// general purpose inliner. Known calls to variadic functions become zero cost.
+//
+// Consistency with clang is primarily tested by emitting va_arg using clang
+// then expanding the variadic functions using this pass, followed by trying
+// to constant fold the functions to no-ops.
+//
+// Target specific behaviour is tested in IR, mainly by checking that values
+// are put into positions in call frames that make sense for that particular
+// target.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/IPO/ExpandVariadics.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Passes/OptimizationLevel.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/TargetParser/Triple.h"
+
+#include <cstdio>
+
+#define DEBUG_TYPE "expand-variadics"
+
+using namespace llvm;
+
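+// The override can be exercised from the command line, e.g.
+//   opt -expand-variadics-override=optimize ...
+// (a usage sketch; the exact invocation depends on how the pass is
+// scheduled).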
+cl::opt<ExpandVariadicsMode> ExpandVariadicsModeOption(
+ DEBUG_TYPE "-override", cl::desc("Override the behaviour of " DEBUG_TYPE),
+ cl::init(ExpandVariadicsMode::unspecified),
+ cl::values(clEnumValN(ExpandVariadicsMode::unspecified, "unspecified",
+ "Use the implementation defaults"),
+ clEnumValN(ExpandVariadicsMode::disable, "disable",
+ "Disable the pass entirely"),
+ clEnumValN(ExpandVariadicsMode::optimize, "optimize",
+ "Optimise without changing ABI"),
+ clEnumValN(ExpandVariadicsMode::lowering, "lowering",
+ "Change variadic calling convention")));
+
+namespace {
+
+// Module implements getFunction(), which returns nullptr when the declaration
+// is missing, and getOrInsertFunction, which creates one when absent.
+// Intrinsics.h implements getDeclaration, which creates one when missing.
+// That naming should be made consistent with Module's. Implemented as a local
+// function here in the meantime to decouple from that process.
+Function *getPreexistingDeclaration(Module *M, Intrinsic::ID id,
+ ArrayRef<Type *> Tys = std::nullopt) {
+ auto *FT = Intrinsic::getType(M->getContext(), id, Tys);
+ return M->getFunction(Tys.empty() ? Intrinsic::getName(id)
+ : Intrinsic::getName(id, Tys, M, FT));
+}
+
+// Lots of targets use a void* pointing at a buffer for the va_list.
+// Some use more complicated iterator constructs. Type erase that so the
+// rest of the pass can operate on either.
+// Virtual functions are used where different targets want different
+// behaviour; non-virtual where all implemented targets presently behave
+// the same.
+struct VAListInterface {
+ virtual ~VAListInterface() {}
+
+  // Whether a va_list instance is passed by value or by address,
+  // i.e. whether it needs to be alloca'ed and stored into, or can
+  // be passed directly in an SSA register.
+ virtual bool passedInSSARegister() = 0;
+
+ // The type of a va_list iterator object
+ virtual Type *vaListType(LLVMContext &Ctx) = 0;
+
+ // The type of a va_list as a function argument as lowered by C
+ virtual Type *vaListParameterType(Module &M) = 0;
+
+ // Initialise an allocated va_list object to point to an already
+ // initialised contiguous memory region.
+ // Return the value to pass as the va_list argument
+ virtual Value *initializeVAList(LLVMContext &Ctx, IRBuilder<> &Builder,
+ AllocaInst *, Value * /*buffer*/) = 0;
+
+  // Simple lowering suffices for va_end and va_copy on current targets
+ bool vaEndIsNop() { return true; }
+ bool vaCopyIsMemcpy() { return true; }
+};
+
+// The majority case - a void* into an alloca
+struct VoidPtr final : public VAListInterface {
+ bool passedInSSARegister() override { return true; }
+
+ Type *vaListType(LLVMContext &Ctx) override {
+ return PointerType::getUnqual(Ctx);
+ }
+
+ Type *vaListParameterType(Module &M) override {
+ return PointerType::getUnqual(M.getContext());
+ }
+
+ Value *initializeVAList(LLVMContext &Ctx, IRBuilder<> &Builder,
+ AllocaInst * /*va_list*/, Value *buffer) override {
+ return buffer;
+ }
+};
+
+struct VoidPtrAllocaAddrspace final : public VAListInterface {
+
+ bool passedInSSARegister() override { return true; }
+
+ Type *vaListType(LLVMContext &Ctx) override {
+ return PointerType::getUnqual(Ctx);
+ }
+
+ Type *vaListParameterType(Module &M) override {
+ const DataLayout &DL = M.getDataLayout();
+ return DL.getAllocaPtrType(M.getContext());
+ }
+
+ Value *initializeVAList(LLVMContext &Ctx, IRBuilder<> &Builder,
+ AllocaInst * /*va_list*/, Value *buffer) override {
+ return buffer;
+ }
+};
+
+// SystemV as used by X64 Linux and others
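+// The va_list object modelled here is, per the SysV AMD64 ABI:
+//   struct __va_list_tag {
+//     unsigned gp_offset;       // set to 48 == 6*8: all GP registers consumed
+//     unsigned fp_offset;       // set to 176 == 6*8 + 8*16: FP also consumed
+//     void *overflow_arg_area;  // pointed at the packed argument buffer
+//     void *reg_save_area;      // unused by this lowering, set to null
+//   };
+// so va_arg reads everything from the overflow area.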
+struct SystemV final : public VAListInterface {
+ bool passedInSSARegister() override { return false; }
+
+ Type *vaListType(LLVMContext &Ctx) override {
+ auto I32 = Type::getInt32Ty(Ctx);
+ auto Ptr = PointerType::getUnqual(Ctx);
+ return ArrayType::get(StructType::get(Ctx, {I32, I32, Ptr, Ptr}), 1);
+ }
+
+ Type *vaListParameterType(Module &M) override {
+ return PointerType::getUnqual(M.getContext());
+ }
+
+ Value *initializeVAList(LLVMContext &Ctx, IRBuilder<> &Builder,
+ AllocaInst *VaList, Value *VoidBuffer) override {
+ assert(VaList->getAllocatedType() == vaListType(Ctx));
+
+ Type *VaListTy = vaListType(Ctx);
+
+ Type *I32 = Type::getInt32Ty(Ctx);
+ Type *I64 = Type::getInt64Ty(Ctx);
+
+ Value *Idxs[3] = {
+ ConstantInt::get(I64, 0),
+ ConstantInt::get(I32, 0),
+ nullptr,
+ };
+
+ Idxs[2] = ConstantInt::get(I32, 0);
+ Builder.CreateStore(
+ ConstantInt::get(I32, 48),
+ Builder.CreateInBoundsGEP(VaListTy, VaList, Idxs, "gp_offset"));
+
+ Idxs[2] = ConstantInt::get(I32, 1);
+ Builder.CreateStore(
+ ConstantInt::get(I32, 6 * 8 + 8 * 16),
+ Builder.CreateInBoundsGEP(VaListTy, VaList, Idxs, "fp_offset"));
+
+ Idxs[2] = ConstantInt::get(I32, 2);
+ Builder.CreateStore(
+ VoidBuffer,
+        Builder.CreateInBoundsGEP(VaListTy, VaList, Idxs, "overflow_arg_area"));
+
+ Idxs[2] = ConstantInt::get(I32, 3);
+ Builder.CreateStore(
+ ConstantPointerNull::get(PointerType::getUnqual(Ctx)),
+ Builder.CreateInBoundsGEP(VaListTy, VaList, Idxs, "reg_save_area"));
+
+ return VaList;
+ }
+};
+
+class VariadicABIInfo {
+
+ VariadicABIInfo(uint32_t MinAlign, uint32_t MaxAlign,
+ std::unique_ptr<VAListInterface> VAList)
+ : MinAlign(MinAlign), MaxAlign(MaxAlign), VAList(std::move(VAList)) {}
+
+ template <typename T>
+ static VariadicABIInfo create(uint32_t MinAlign, uint32_t MaxAlign) {
+ return {MinAlign, MaxAlign, std::make_unique<T>()};
+ }
+
+public:
+ const uint32_t MinAlign;
+ const uint32_t MaxAlign;
+ std::unique_ptr<VAListInterface> VAList;
+
+ VariadicABIInfo() : VariadicABIInfo(0, 0, nullptr) {}
+ explicit operator bool() const { return static_cast<bool>(VAList); }
+
+ VariadicABIInfo(VariadicABIInfo &&Self)
+ : MinAlign(Self.MinAlign), MaxAlign(Self.MaxAlign),
+ VAList(Self.VAList.release()) {}
+
+ VariadicABIInfo &operator=(VariadicABIInfo &&Other) {
+ this->~VariadicABIInfo();
+ new (this) VariadicABIInfo(std::move(Other));
+ return *this;
+ }
+
+ static VariadicABIInfo create(llvm::Triple const &Triple) {
+ const bool IsLinuxABI = Triple.isOSLinux() || Triple.isOSCygMing();
+
+ switch (Triple.getArch()) {
+
+ case Triple::r600:
+ case Triple::amdgcn: {
+ return create<VoidPtrAllocaAddrspace>(1, 0);
+ }
+
+ case Triple::nvptx:
+ case Triple::nvptx64: {
+ return create<VoidPtr>(4, 0);
+ }
+
+ case Triple::x86: {
+    // These all seem to fall out the same, despite getTypeStackAlign
+    // implying otherwise.
+
+ if (Triple.isOSDarwin()) {
+ // X86_32ABIInfo::getTypeStackAlignInBytes is misleading for this.
+      // The slotSize(4) implies a minimum alignment of four, and
+      // AllowHigherAlign = true means there is no maximum alignment.
+
+ return create<VoidPtr>(4, 0);
+ }
+ if (Triple.getOS() == llvm::Triple::Win32) {
+ return create<VoidPtr>(4, 0);
+ }
+
+ if (IsLinuxABI) {
+ return create<VoidPtr>(4, 0);
+ }
+
+ break;
+ }
+
+ case Triple::x86_64: {
+ if (Triple.isWindowsMSVCEnvironment() || Triple.isOSWindows()) {
+      // x64 MSVC's va_arg lowering passes values larger than eight bytes by
+      // pointer; however the variadic call instruction created here does not,
+      // e.g. a <4 x f32> will be passed as itself, not as a pointer or byval.
+      // Postponing resolution of that for now.
+      // Expected min/max align of 8.
+ return {};
+ }
+
+      // SystemV X64 documented behaviour:
+      // Slots are at least eight byte aligned and at most sixteen byte
+      // aligned. If the type needs more than sixteen byte alignment, it
+      // still only gets sixteen byte alignment on the stack.
+      // X64 behaviour in clang:
+      // Slots are at least eight byte aligned and at most naturally aligned.
+      // The implementation here matches clang, not the ABI docs.
+
+ if (Triple.isOSDarwin()) {
+ return create<SystemV>(8, 8);
+ }
+
+ if (IsLinuxABI) {
+ return create<SystemV>(8, 8);
+ }
+
+ break;
+ }
+
+ default:
+ break;
+ }
+
+ return {};
+ }
+};
+
+class ExpandVariadics : public ModulePass {
+
+  // The pass construction sets the default (optimize when called from the
+  // middle end, lowering when called from the backend). The command line
+  // variable overrides that, which is useful for testing and debugging. It
+  // also allows building an application with variadic functions wholly
+  // removed if one has sufficient control over the dependencies, e.g. a
+  // statically linked clang that has no variadic function calls remaining
+  // in the binary.
+ static ExpandVariadicsMode
+ withCommandLineOverride(ExpandVariadicsMode LLVMRequested) {
+ ExpandVariadicsMode UserRequested = ExpandVariadicsModeOption;
+ return (UserRequested == ExpandVariadicsMode::unspecified) ? LLVMRequested
+ : UserRequested;
+ }
+
+public:
+ static char ID;
+ const ExpandVariadicsMode Mode;
+ VariadicABIInfo ABI;
+
+ ExpandVariadics(ExpandVariadicsMode Mode)
+ : ModulePass(ID), Mode(withCommandLineOverride(Mode)) {}
+ StringRef getPassName() const override { return "Expand variadic functions"; }
+
+ // Rewrite a variadic call site
+ bool expandCall(Module &M, IRBuilder<> &Builder, CallBase *CB, FunctionType *,
+ Function *NF);
+
+ // Given a variadic function, return a function taking a va_list that can be
+ // called instead of the original. Mutates F.
+ Function *deriveInlinableVariadicFunctionPair(Module &M, IRBuilder<> &Builder,
+ Function &F);
+
+ bool runOnFunction(Module &M, IRBuilder<> &Builder, Function *F);
+
+ // Entry point
+ bool runOnModule(Module &M) override;
+
+ bool rewriteABI() { return Mode == ExpandVariadicsMode::lowering; }
+
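+  // Copies a va_list object as raw bytes; suffices for va_copy on targets
+  // where vaCopyIsMemcpy() holds, e.g. (a sketch):
+  //   va_copy(dst, src)  ==>  llvm.memcpy.inline(dst, src, sizeof(va_list))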
+ void memcpyVAListPointers(const DataLayout &DL, IRBuilder<> &Builder,
+ Value *Dst, Value *Src) {
+ auto &Ctx = Builder.getContext();
+ Type *VaListTy = ABI.VAList->vaListType(Ctx);
+ uint64_t Size = DL.getTypeAllocSize(VaListTy).getFixedValue();
+    // TODO: on amdgcn this should be in terms of addrspace 5
+ Builder.CreateMemCpyInline(Dst, {}, Src, {},
+ ConstantInt::get(Type::getInt32Ty(Ctx), Size));
+ }
+
+ bool expandVAIntrinsicCall(IRBuilder<> &Builder, const DataLayout &DL,
+ VAStartInst *Inst);
+
+ bool expandVAIntrinsicCall(IRBuilder<> &, const DataLayout &,
+ VAEndInst *Inst);
+
+ bool expandVAIntrinsicCall(IRBuilder<> &Builder, const DataLayout &DL,
+ VACopyInst *Inst);
+
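+  // Rewrite every user of one va_* intrinsic, erasing the declaration once it
+  // is unused. Expected to be instantiated along the lines of (a sketch):
+  //   expandIntrinsicUsers<Intrinsic::vastart, VAStartInst>(M, Builder, PtrTy);
+  // once per pointer type in supportedAddressSpaces().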
+ template <Intrinsic::ID ID, typename InstructionType>
+ bool expandIntrinsicUsers(Module &M, IRBuilder<> &Builder,
+ PointerType *ArgType) {
+ bool Changed = false;
+ const DataLayout &DL = M.getDataLayout();
+ if (Function *Intrinsic = getPreexistingDeclaration(&M, ID, {ArgType})) {
+ for (User *U : Intrinsic->users()) {
+ if (auto *I = dyn_cast<InstructionType>(U)) {
+ Changed |= expandVAIntrinsicCall(Builder, DL, I);
+ }
+ }
+ if (Intrinsic->use_empty())
+ Intrinsic->eraseFromParent();
+ }
+ return Changed;
+ }
+
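+  // Append the va_list parameter to the fixed parameters and drop the
+  // variadic marker, e.g. for a void* va_list ABI (a sketch):
+  //   i32 (i32, ...)  ==>  i32 (i32, ptr)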
+ FunctionType *inlinableVariadicFunctionType(Module &M, FunctionType *FTy) {
+ SmallVector<Type *> ArgTypes(FTy->param_begin(), FTy->param_end());
+ ArgTypes.push_back(ABI.VAList->vaListParameterType(M));
+ return FunctionType::get(FTy->getReturnType(), ArgTypes,
+ /*IsVarArgs*/ false);
+ }
+
+ static ConstantInt *sizeOfAlloca(LLVMContext &Ctx, const DataLayout &DL,
+ AllocaInst *Alloced) {
+ Type *AllocaType = Alloced->getAllocatedType();
+ TypeSize AllocaTypeSize = DL.getTypeAllocSize(AllocaType);
+ uint64_t AsInt = AllocaTypeSize.getFixedValue();
+ return ConstantInt::get(Type::getInt64Ty(Ctx), AsInt);
+ }
+
+ static SmallSet<unsigned, 2> supportedAddressSpaces(const DataLayout &DL) {
+ // FIXME: It looks like a module can contain arbitrary integers for address
+ // spaces in which case we might need to check _lots_ of cases. Maybe add a
+ // rule to the verifier that the vastart/vaend intrinsics can have arguments
+ // in 0 or in allocaaddrspace but nowhere else
+    SmallSet<unsigned, 2> Set;
+    Set.insert(0); // Things tend to end up in address space zero.
+    // The va_list argument should be in the alloca address space.
+    Set.insert(DL.getAllocaAddrSpace());
+    return Set;
+ }
+
+  // This could be partially target specific.
+ bool expansionApplicableToFunction(Module &M, Function *F) {
+ if (F->isIntrinsic() || !F->isVarArg() ||
+ F->hasFnAttribute(Attribute::Naked)) {
+ return false;
+ }
+
+ // TODO: work out what to do with the cs_chain functions documented as
+ // non-variadic that are variadic in some lit tests
+ if (F->getCallingConv() != CallingConv::C)
+ return false;
+
+ if (!rewriteABI()) {
+ // e.g. can't replace a weak function unless changing the original symbol
+ if (GlobalValue::isInterposableLinkage(F->getLinkage())) {
+ return false;
+ }
+ }
+
+ if (!rewriteABI()) {
+ // If optimising, err on the side of leaving things alone
----------------
arsenm wrote:
Move all this to an allUsesAreTrivialCalls helper function?
https://github.com/llvm/llvm-project/pull/89007