[clang] [llvm] [transforms] Inline simple variadic functions (PR #81058)

Wed Feb 7 22:52:29 PST 2024

================
@@ -0,0 +1,698 @@
+//===-- ExpandVariadicsPass.cpp --------------------------------*- C++ -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This is an optimisation pass for variadic functions. If called from codegen,
+// it can serve as the implementation of variadic functions for a given target.
+//
+// The target-dependent parts are in namespace VariadicABIInfo. Enabling a new
+// target means adding a case to VariadicABIInfo::create() along with tests.
+//
+// The module pass using that information is class ExpandVariadics.
+//
+// The strategy is:
+// 1. Test whether a variadic function is sufficiently simple
+// 2. If it was, calls to it can be replaced with calls to a different function
+// 3. If it wasn't, try to split it into a simple function and a remainder
+// 4. Optionally rewrite the varadic function calling convention as well
+//
+// This pass considers "sufficiently simple" to mean a variadic function that
+// calls into a different function taking a va_list to do the real work. For
+// example, libc might implement fprintf as a single basic block calling into
+// vfprintf. This pass can then rewrite call to the variadic into some code
+// to construct a target-specific value to use for the va_list and a call
+// into the non-variadic implementation function. There's a test for that.
+//
+// Most other variadic functions whose definition is known can be converted into
+// that form. Create a new internal function taking a va_list where the original
+// took a ... parameter. Move the blocks across. Create a new block containing a
+// va_start that calls into the new function. This is nearly target independent.
+//
+// Where this transform is consistent with the ABI, e.g. AMDGPU or NVPTX, or
+// where the ABI can be chosen to align with this transform, the function
+// interface can be rewritten along with calls to unknown variadic functions.
+//
+// The aggregate effect is to unblock other transforms, most critically the
+// general purpose inliner. Known calls to variadic functions become zero cost.
+//
+// This pass does define some target specific information which is partially
+// redundant with other parts of the compiler. In particular, the call frame
+// it builds must be the exact complement of the va_arg lowering performed
+// by clang. The va_list construction is similar to work done by the backend
+// for targets that lower variadics there, though distinct in that this pass
+// constructs the pieces using alloca instead of relative to stack pointers.
+//
+// Consistency with clang is primarily tested by emitting va_arg using clang
+// then expanding the variadic functions using this pass, followed by trying
+// to constant fold the functions to no-ops.
+//
+// Target specific behaviour is tested in IR - mainly checking that values are
+// put into positions in call frames that make sense for that particular target.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/IPO/ExpandVariadics.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/TargetParser/Triple.h"
+
+#define DEBUG_TYPE "expand-variadics"
+
+using namespace llvm;
+
+namespace {
+namespace VariadicABIInfo {
+
+// calling convention for passing as valist object, same as it would be in C
+// aarch64 uses byval
+enum class valistCC { value, pointer, /*byval*/ };
+
+struct Interface {
+protected:
+  Interface(uint32_t MinAlign, uint32_t MaxAlign)
+      : MinAlign(MinAlign), MaxAlign(MaxAlign) {}
+
+public:
+  virtual ~Interface() {}
+  const uint32_t MinAlign;
+  const uint32_t MaxAlign;
+
+  // Most ABIs use a void* or char* for va_list, others can specialise
+  virtual Type *vaListType(LLVMContext &Ctx) {
+    return PointerType::getUnqual(Ctx);
+  }
+
+  // How the vaListType is passed
+  virtual valistCC vaListCC() { return valistCC::value; }
+
+  // The valist might need to be stack allocated.
+  virtual bool valistOnStack() { return false; }
+
+  virtual void initializeVAList(LLVMContext &Ctx, IRBuilder<> &Builder,
+                                AllocaInst * /*va_list*/, Value * /*buffer*/) {
+    // Function needs to be implemented if valist is on the stack
+    assert(!valistOnStack());
+    __builtin_unreachable();
+  }
+
+  // All targets currently implemented use a ptr for the valist parameter
+  Type *vaListParameterType(LLVMContext &Ctx) {
+    return PointerType::getUnqual(Ctx);
+  }
+
+  bool VAEndIsNop() { return true; }
+
+  bool VACopyIsMemcpy() { return true; }
+};
+
+struct X64SystemV final : public Interface {
+  // X64 documented behaviour:
+  // Slots are at least eight byte aligned and at most 16 byte aligned.
+  // If the type needs more than sixteen byte alignment, it still only gets
+  // that much alignment on the stack.
+  // X64 behaviour in clang:
+  // Slots are at least eight byte aligned and at most naturally aligned
+  // This matches clang, not the ABI docs.
+  X64SystemV() : Interface(8, 0) {}
+
+  Type *vaListType(LLVMContext &Ctx) override {
+    auto I32 = Type::getInt32Ty(Ctx);
+    auto Ptr = PointerType::getUnqual(Ctx);
+    return ArrayType::get(StructType::get(Ctx, {I32, I32, Ptr, Ptr}), 1);
+  }
+  valistCC vaListCC() override { return valistCC::pointer; }
+
+  bool valistOnStack() override { return true; }
+
+  void initializeVAList(LLVMContext &Ctx, IRBuilder<> &Builder,
+                        AllocaInst *va_list, Value *voidBuffer) override {
+    assert(valistOnStack());
+    assert(va_list != nullptr);
+    assert(va_list->getAllocatedType() == vaListType(Ctx));
+
+    Type *va_list_ty = vaListType(Ctx);
+
+    Type *I32 = Type::getInt32Ty(Ctx);
+    Type *I64 = Type::getInt64Ty(Ctx);
+
+    Value *Idxs[3] = {
+        ConstantInt::get(I64, 0),
+        ConstantInt::get(I32, 0),
+        nullptr,
+    };
+
+    Idxs[2] = ConstantInt::get(I32, 0);
+    Builder.CreateStore(
+        ConstantInt::get(I32, 48),
+        Builder.CreateInBoundsGEP(va_list_ty, va_list, Idxs, "gp_offset"));
+
+    Idxs[2] = ConstantInt::get(I32, 1);
+    Builder.CreateStore(
+        ConstantInt::get(I32, 6 * 8 + 8 * 16),
+        Builder.CreateInBoundsGEP(va_list_ty, va_list, Idxs, "fp_offset"));
+
+    Idxs[2] = ConstantInt::get(I32, 2);
+    Builder.CreateStore(voidBuffer,
+                        Builder.CreateInBoundsGEP(va_list_ty, va_list, Idxs,
+                                                  "overfow_arg_area"));
+
+    Idxs[2] = ConstantInt::get(I32, 3);
+    Builder.CreateStore(
+        ConstantPointerNull::get(PointerType::getUnqual(Ctx)),
+        Builder.CreateInBoundsGEP(va_list_ty, va_list, Idxs, "reg_save_area"));
+  }
+};
+
+std::unique_ptr<Interface> create(Module &M) {
+  llvm::Triple Triple(M.getTargetTriple());
+  const bool IsLinuxABI = Triple.isOSLinux() || Triple.isOSCygMing();
+
+  switch (Triple.getArch()) {
+
+  case Triple::x86: {
+    // These seem to all fall out the same, despite getTypeStackAlign
+    // implying otherwise.
+    if (Triple.isOSDarwin()) {
+      struct X86Darwin final : public Interface {
+        // X86_32ABIInfo::getTypeStackAlignInBytes is misleading for this.
+        // The slotSize(4) implies a minimum alignment
+        // The AllowHigherAlign = true means there is no maximum alignment.
+        X86Darwin() : Interface(4, 0) {}
+      };
+
+      return std::make_unique<X86Darwin>();
+    }
+    if (Triple.getOS() == llvm::Triple::Win32) {
+      struct X86Windows final : public Interface {
+        X86Windows() : Interface(4, 0) {}
+      };
+      return std::make_unique<X86Windows>();
+    }
+
+    if (IsLinuxABI) {
+      struct X86Linux final : public Interface {
+        X86Linux() : Interface(4, 0) {}
+      };
+      return std::make_unique<X86Linux>();
+    }
+    break;
+  }
+
+  case Triple::x86_64: {
+    if (Triple.isWindowsMSVCEnvironment() || Triple.isOSWindows()) {
+      struct X64Windows final : public Interface {
+        X64Windows() : Interface(8, 8) {}
+      };
+      // x64 msvc emit vaarg passes > 8 byte values by pointer
+      // however the variadic call instruction created does not, e.g.
+      // a <4 x f32> will be passed as itself, not as a pointer or byval.
+      // Postponing resolution of that for now.
+      return nullptr;
+    }
+
+    if (Triple.isOSDarwin()) {
+      return std::make_unique<VariadicABIInfo::X64SystemV>();
+    }
+
+    if (IsLinuxABI) {
+      return std::make_unique<VariadicABIInfo::X64SystemV>();
+    }
+
+    break;
+  }
+
+  default:
+    return nullptr;
+  }
+
+  return nullptr;
+}
+
+} // namespace VariadicABIInfo
+
+class ExpandVariadics : public ModulePass {
+public:
+  static char ID;
+  std::unique_ptr<VariadicABIInfo::Interface> ABI;
+
+  ExpandVariadics() : ModulePass(ID) {}
+  StringRef getPassName() const override { return "Expand variadic functions"; }
+
+  // A predicate in that return nullptr means false
+  // Returns the function target to use when inlining on success
+  Function *isFunctionInlinable(Module &M, Function *F);
+
+  // Rewrite a call site.
+  void ExpandCall(Module &M, CallInst *CB, Function *VarargF, Function *NF);
+
+  // this could be partially target specific
+  bool expansionApplicableToFunction(Module &M, Function *F) {
+    if (F->isIntrinsic() || !F->isVarArg() ||
+        F->hasFnAttribute(Attribute::Naked))
+      return false;
+
+    if (F->getCallingConv() != CallingConv::C)
+      return false;
+
+    if (GlobalValue::isInterposableLinkage(F->getLinkage()))
+      return false;
----------------
arsenm wrote:

Simpler as F->isDefinitionExact?

Don't we want to do this unconditionally for AMDGPU?

https://github.com/llvm/llvm-project/pull/81058