[clang] [llvm] [WIP] Expand variadic functions in IR (PR #89007)

Jon Chesterfield via cfe-commits cfe-commits at lists.llvm.org
Tue Apr 16 17:30:50 PDT 2024


https://github.com/JonChesterfield created https://github.com/llvm/llvm-project/pull/89007

Rewrite calls to variadic functions into calls to an equivalent non-variadic function.

This makes calls to known variadic functions a zero cost abstraction. The GPUs use it as a backend implementation.

The pass runs for implemented targets on known functions at O1 or above. For targets using it as codegen, it runs from the backend on known and unknown functions. It interacts well with inlining, so it is scheduled shortly before the inliner.

Relative to the abandoned pull/81058, this is the whole end to end transform as opposed to a subset for easier review. The feedback on that is applied, in particular this makes no attempt to detect existing va_list functions, so in some cases it'll make the IR worse in a fashion that the inliner completely reverts. The ABI abstraction is reworked to be more orthogonal which hopefully makes it clear that the switch isn't worth scattering across the target classes.

Marked WIP because vector types in structs on X64 might be misbehaving. Still testing this - it's solid enough to build a working clang and libxml2 and should meet the use cases in gpu libc and I'll put a patch up for the rocm CI - but would like more test coverage and probably another architecture implemented before landing.

From cb3aba6c8da4697c58b988e63f25cdadc479aaa8 Mon Sep 17 00:00:00 2001
From: Jon Chesterfield <jonathanchesterfield at gmail.com>
Date: Wed, 17 Apr 2024 00:32:12 +0100
Subject: [PATCH] [WIP] Expand variadic functions in IR

---
 clang/lib/CodeGen/ABIInfoImpl.cpp             |   13 +-
 clang/lib/CodeGen/Targets/AMDGPU.cpp          |    8 +-
 clang/lib/CodeGen/Targets/NVPTX.cpp           |    8 +-
 clang/test/CodeGen/expand-variadic-call.c     |  314 +++++
 clang/test/CodeGen/variadic-wrapper-removal.c |   85 ++
 .../CodeGenCXX/inline-then-fold-variadics.cpp |  247 ++++
 llvm/include/llvm/InitializePasses.h          |    1 +
 .../llvm/Transforms/IPO/ExpandVariadics.h     |   43 +
 llvm/lib/Passes/PassBuilder.cpp               |    1 +
 llvm/lib/Passes/PassBuilderPipelines.cpp      |    4 +
 llvm/lib/Passes/PassRegistry.def              |    1 +
 llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def |    1 +
 .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp |    3 +
 llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp  |    4 +
 llvm/lib/Transforms/IPO/CMakeLists.txt        |    1 +
 llvm/lib/Transforms/IPO/ExpandVariadics.cpp   | 1056 +++++++++++++++++
 .../CodeGen/AMDGPU/expand-variadic-call.ll    |  499 ++++++++
 llvm/test/CodeGen/AMDGPU/llc-pipeline.ll      |    5 +
 llvm/test/CodeGen/AMDGPU/unsupported-calls.ll |   19 -
 .../CodeGen/NVPTX/expand-variadic-call.ll     |  468 ++++++++
 .../X86/expand-variadic-call-i386-darwin.ll   |  449 +++++++
 .../X86/expand-variadic-call-i386-linux.ll    |  449 +++++++
 .../X86/expand-variadic-call-i686-msvc.ll     |  467 ++++++++
 .../X86/expand-variadic-call-x64-darwin.ll    |  688 +++++++++++
 .../X86/expand-variadic-call-x64-linux.ll     |  688 +++++++++++
 llvm/test/Other/new-pm-defaults.ll            |    1 +
 .../Other/new-pm-thinlto-postlink-defaults.ll |    1 +
 .../new-pm-thinlto-postlink-pgo-defaults.ll   |    1 +
 ...-pm-thinlto-postlink-samplepgo-defaults.ll |    2 +-
 .../Other/new-pm-thinlto-prelink-defaults.ll  |    1 +
 .../new-pm-thinlto-prelink-pgo-defaults.ll    |    1 +
 ...w-pm-thinlto-prelink-samplepgo-defaults.ll |    1 +
 .../expand-va-intrinsic-outliner.ll           |   86 ++
 .../expand-va-intrinsic-split-linkage.ll      |  225 ++++
 .../expand-va-intrinsic-split-simple.ll       |  121 ++
 .../llvm/lib/Transforms/IPO/BUILD.gn          |    1 +
 36 files changed, 5939 insertions(+), 24 deletions(-)
 create mode 100644 clang/test/CodeGen/expand-variadic-call.c
 create mode 100644 clang/test/CodeGen/variadic-wrapper-removal.c
 create mode 100644 clang/test/CodeGenCXX/inline-then-fold-variadics.cpp
 create mode 100644 llvm/include/llvm/Transforms/IPO/ExpandVariadics.h
 create mode 100644 llvm/lib/Transforms/IPO/ExpandVariadics.cpp
 create mode 100644 llvm/test/CodeGen/AMDGPU/expand-variadic-call.ll
 create mode 100644 llvm/test/CodeGen/NVPTX/expand-variadic-call.ll
 create mode 100644 llvm/test/CodeGen/X86/expand-variadic-call-i386-darwin.ll
 create mode 100644 llvm/test/CodeGen/X86/expand-variadic-call-i386-linux.ll
 create mode 100644 llvm/test/CodeGen/X86/expand-variadic-call-i686-msvc.ll
 create mode 100644 llvm/test/CodeGen/X86/expand-variadic-call-x64-darwin.ll
 create mode 100644 llvm/test/CodeGen/X86/expand-variadic-call-x64-linux.ll
 create mode 100644 llvm/test/Transforms/ExpandVariadics/expand-va-intrinsic-outliner.ll
 create mode 100644 llvm/test/Transforms/ExpandVariadics/expand-va-intrinsic-split-linkage.ll
 create mode 100644 llvm/test/Transforms/ExpandVariadics/expand-va-intrinsic-split-simple.ll

diff --git a/clang/lib/CodeGen/ABIInfoImpl.cpp b/clang/lib/CodeGen/ABIInfoImpl.cpp
index 3e34d82cb399ba..c4a1829ab8863f 100644
--- a/clang/lib/CodeGen/ABIInfoImpl.cpp
+++ b/clang/lib/CodeGen/ABIInfoImpl.cpp
@@ -154,11 +154,20 @@ llvm::Value *CodeGen::emitRoundPointerUpToAlignment(CodeGenFunction &CGF,
                                                     llvm::Value *Ptr,
                                                     CharUnits Align) {
   // OverflowArgArea = (OverflowArgArea + Align - 1) & -Align;
+  Ptr = CGF.Builder.CreateAddrSpaceCast(Ptr, CGF.AllocaInt8PtrTy,
+                                        Ptr->getName() + ".addrcast");
   llvm::Value *RoundUp = CGF.Builder.CreateConstInBoundsGEP1_32(
       CGF.Builder.getInt8Ty(), Ptr, Align.getQuantity() - 1);
+
+  // ptrmask is sensitive to the bitwidth of the mask
+  unsigned IndexTypeSize =
+      CGF.CGM.getDataLayout().getIndexTypeSizeInBits(RoundUp->getType());
+  llvm::IntegerType *MaskType =
+      llvm::IntegerType::get(CGF.getLLVMContext(), IndexTypeSize);
+
   return CGF.Builder.CreateIntrinsic(
-      llvm::Intrinsic::ptrmask, {CGF.AllocaInt8PtrTy, CGF.IntPtrTy},
-      {RoundUp, llvm::ConstantInt::get(CGF.IntPtrTy, -Align.getQuantity())},
+      llvm::Intrinsic::ptrmask, {CGF.AllocaInt8PtrTy, MaskType},
+      {RoundUp, llvm::ConstantInt::get(MaskType, -Align.getQuantity())},
       nullptr, Ptr->getName() + ".aligned");
 }
 
diff --git a/clang/lib/CodeGen/Targets/AMDGPU.cpp b/clang/lib/CodeGen/Targets/AMDGPU.cpp
index 44e86c0b40f686..e274741a6ee652 100644
--- a/clang/lib/CodeGen/Targets/AMDGPU.cpp
+++ b/clang/lib/CodeGen/Targets/AMDGPU.cpp
@@ -115,7 +115,13 @@ void AMDGPUABIInfo::computeInfo(CGFunctionInfo &FI) const {
 
 Address AMDGPUABIInfo::EmitVAArg(CodeGenFunction &CGF, Address VAListAddr,
                                  QualType Ty) const {
-  llvm_unreachable("AMDGPU does not support varargs");
+  const bool IsIndirect = false;
+  const bool AllowHigherAlign = true;
+  // We would rather not naturally align values, but splitting {char, short}
+  // into two separate arguments makes that difficult.
+  return emitVoidPtrVAArg(CGF, VAListAddr, Ty, IsIndirect,
+                          getContext().getTypeInfoInChars(Ty),
+                          CharUnits::fromQuantity(1), AllowHigherAlign);
 }
 
 ABIArgInfo AMDGPUABIInfo::classifyReturnType(QualType RetTy) const {
diff --git a/clang/lib/CodeGen/Targets/NVPTX.cpp b/clang/lib/CodeGen/Targets/NVPTX.cpp
index 7dce5042c3dc20..d6c256e5e6f7b9 100644
--- a/clang/lib/CodeGen/Targets/NVPTX.cpp
+++ b/clang/lib/CodeGen/Targets/NVPTX.cpp
@@ -215,7 +215,13 @@ void NVPTXABIInfo::computeInfo(CGFunctionInfo &FI) const {
 
 Address NVPTXABIInfo::EmitVAArg(CodeGenFunction &CGF, Address VAListAddr,
                                 QualType Ty) const {
-  llvm_unreachable("NVPTX does not support varargs");
+  // TODO: Work out to what extent this correlates with PTX.
+  // Doubles get passed with 8-byte alignment and C promotes smaller integer
+  // types to int. Printf doesn't really do structs, so it is hard to guess
+  // what the right thing is for that.
+  return emitVoidPtrVAArg(CGF, VAListAddr, Ty, false,
+                          getContext().getTypeInfoInChars(Ty),
+                          CharUnits::fromQuantity(4), true);
 }
 
 void NVPTXTargetCodeGenInfo::setTargetAttributes(
diff --git a/clang/test/CodeGen/expand-variadic-call.c b/clang/test/CodeGen/expand-variadic-call.c
new file mode 100644
index 00000000000000..baff54544207ea
--- /dev/null
+++ b/clang/test/CodeGen/expand-variadic-call.c
@@ -0,0 +1,314 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+
+// REQUIRES: x86-registered-target
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -target-cpu x86-64-v4 -std=c23 -O1 -ffreestanding -mllvm --expand-variadics-override=disable -emit-llvm -o - %s | FileCheck %s
+
+// This test sanity checks calling a variadic function with the expansion transform disabled.
+// The IR test cases {arch}/expand-variadic-call-*.ll correspond to IR generated from this test case.
+
+typedef __builtin_va_list va_list;
+#define va_copy(dest, src) __builtin_va_copy(dest, src)
+#define va_start(ap, ...) __builtin_va_start(ap, 0)
+#define va_end(ap) __builtin_va_end(ap)
+#define va_arg(ap, type) __builtin_va_arg(ap, type)
+
+// 32 bit x86 alignment uses getTypeStackAlign for special cases
+// Whitebox testing.
+// Needs a type >= 16 which is either a simd or a struct containing a simd
+// darwinvectorabi should force 4 bytes
+// linux vectors with align 16/32/64 return that alignment
+
+
+// Might want various copy/end style constructs in a separate test
+
+void vararg(...);
+void valist(va_list);
+
+// CHECK-LABEL: @copy(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[CP:%.*]] = alloca [1 x %struct.__va_list_tag], align 16
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr nonnull [[CP]]) #[[ATTR7:[0-9]+]]
+// CHECK-NEXT:    call void @llvm.va_copy.p0(ptr nonnull [[CP]], ptr [[VA:%.*]])
+// CHECK-NEXT:    call void @valist(ptr noundef nonnull [[CP]]) #[[ATTR8:[0-9]+]]
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr nonnull [[CP]]) #[[ATTR7]]
+// CHECK-NEXT:    ret void
+//
+void copy(va_list va)
+{
+  va_list cp;
+  va_copy(cp, va);
+  valist(cp);
+}
+
+// CHECK-LABEL: @start_once(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[S:%.*]] = alloca [1 x %struct.__va_list_tag], align 16
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr nonnull [[S]]) #[[ATTR7]]
+// CHECK-NEXT:    call void @llvm.va_start.p0(ptr nonnull [[S]])
+// CHECK-NEXT:    call void @valist(ptr noundef nonnull [[S]]) #[[ATTR8]]
+// CHECK-NEXT:    call void @llvm.va_end.p0(ptr [[S]])
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr nonnull [[S]]) #[[ATTR7]]
+// CHECK-NEXT:    ret void
+//
+void start_once(...)
+{
+  va_list s;
+  va_start(s);
+  valist(s);
+  va_end(s);
+}
+
+// CHECK-LABEL: @start_twice(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[S0:%.*]] = alloca [1 x %struct.__va_list_tag], align 16
+// CHECK-NEXT:    [[S1:%.*]] = alloca [1 x %struct.__va_list_tag], align 16
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr nonnull [[S0]]) #[[ATTR7]]
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr nonnull [[S1]]) #[[ATTR7]]
+// CHECK-NEXT:    call void @llvm.va_start.p0(ptr nonnull [[S0]])
+// CHECK-NEXT:    call void @valist(ptr noundef nonnull [[S0]]) #[[ATTR8]]
+// CHECK-NEXT:    call void @llvm.va_end.p0(ptr [[S0]])
+// CHECK-NEXT:    call void @llvm.va_start.p0(ptr nonnull [[S1]])
+// CHECK-NEXT:    call void @valist(ptr noundef nonnull [[S1]]) #[[ATTR8]]
+// CHECK-NEXT:    call void @llvm.va_end.p0(ptr [[S1]])
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr nonnull [[S1]]) #[[ATTR7]]
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr nonnull [[S0]]) #[[ATTR7]]
+// CHECK-NEXT:    ret void
+//
+void start_twice(...)
+{
+  va_list s0,s1;
+  va_start(s0);
+  valist(s0);
+  va_end(s0);
+  va_start(s1);
+  valist(s1);
+  va_end(s1);
+}
+
+// vectors with alignment 16/32/64 are natively aligned on linux x86
+// v32f32 would be a m1024 type, larger than x64 defines at time of writing
+typedef int i32;
+typedef float v4f32 __attribute__((__vector_size__(16), __aligned__(16)));
+typedef float v8f32 __attribute__((__vector_size__(32), __aligned__(32)));
+typedef float v16f32 __attribute__((__vector_size__(64), __aligned__(64)));
+typedef float v32f32 __attribute__((__vector_size__(128), __aligned__(128)));
+
+
+// Pass a single value to wrapped() via vararg(...)
+// CHECK-LABEL: @single_i32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    tail call void (...) @vararg(i32 noundef [[X:%.*]]) #[[ATTR8]]
+// CHECK-NEXT:    ret void
+//
+void single_i32(i32 x)
+{
+  vararg(x);
+}
+
+
+// CHECK-LABEL: @single_double(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    tail call void (...) @vararg(double noundef [[X:%.*]]) #[[ATTR8]]
+// CHECK-NEXT:    ret void
+//
+void single_double(double x)
+{
+  vararg(x);
+}
+
+// CHECK-LABEL: @single_v4f32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    tail call void (...) @vararg(<4 x float> noundef [[X:%.*]]) #[[ATTR8]]
+// CHECK-NEXT:    ret void
+//
+void single_v4f32(v4f32 x)
+{
+  vararg(x);
+}
+
+// CHECK-LABEL: @single_v8f32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    tail call void (...) @vararg(<8 x float> noundef [[X:%.*]]) #[[ATTR8]]
+// CHECK-NEXT:    ret void
+//
+void single_v8f32(v8f32 x)
+{
+  vararg(x);
+}
+
+// CHECK-LABEL: @single_v16f32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    tail call void (...) @vararg(<16 x float> noundef [[X:%.*]]) #[[ATTR8]]
+// CHECK-NEXT:    ret void
+//
+void single_v16f32(v16f32 x)
+{
+  vararg(x);
+}
+
+// CHECK-LABEL: @single_v32f32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[INDIRECT_ARG_TEMP:%.*]] = alloca <32 x float>, align 128
+// CHECK-NEXT:    [[X:%.*]] = load <32 x float>, ptr [[TMP0:%.*]], align 128, !tbaa [[TBAA2:![0-9]+]]
+// CHECK-NEXT:    store <32 x float> [[X]], ptr [[INDIRECT_ARG_TEMP]], align 128, !tbaa [[TBAA2]]
+// CHECK-NEXT:    tail call void (...) @vararg(ptr noundef nonnull byval(<32 x float>) align 128 [[INDIRECT_ARG_TEMP]]) #[[ATTR8]]
+// CHECK-NEXT:    ret void
+//
+void single_v32f32(v32f32 x)
+{
+  vararg(x);
+}
+
+
+
+// CHECK-LABEL: @i32_double(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    tail call void (...) @vararg(i32 noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR8]]
+// CHECK-NEXT:    ret void
+//
+void i32_double(i32 x, double y)
+{
+  vararg(x, y);
+}
+
+// CHECK-LABEL: @double_i32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    tail call void (...) @vararg(double noundef [[X:%.*]], i32 noundef [[Y:%.*]]) #[[ATTR8]]
+// CHECK-NEXT:    ret void
+//
+void double_i32(double x, i32 y)
+{
+  vararg(x, y);
+}
+
+
+// A struct used by libc variadic tests
+
+typedef struct {
+  char c;
+  short s;
+  int i;
+  long l;
+  float f;
+  double d;
+}  libcS;
+
+// CHECK-LABEL: @i32_libcS(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    tail call void (...) @vararg(i32 noundef [[X:%.*]], ptr noundef nonnull byval([[STRUCT_LIBCS:%.*]]) align 8 [[Y:%.*]]) #[[ATTR8]]
+// CHECK-NEXT:    ret void
+//
+void i32_libcS(i32 x, libcS y)
+{
+  vararg(x, y);
+}
+
+// CHECK-LABEL: @libcS_i32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    tail call void (...) @vararg(ptr noundef nonnull byval([[STRUCT_LIBCS:%.*]]) align 8 [[X:%.*]], i32 noundef [[Y:%.*]]) #[[ATTR8]]
+// CHECK-NEXT:    ret void
+//
+void libcS_i32(libcS x, i32 y)
+{
+  vararg(x, y);
+}
+
+
+// CHECK-LABEL: @i32_v4f32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    tail call void (...) @vararg(i32 noundef [[X:%.*]], <4 x float> noundef [[Y:%.*]]) #[[ATTR8]]
+// CHECK-NEXT:    ret void
+//
+void i32_v4f32(i32 x, v4f32 y)
+{
+  vararg(x, y);
+}
+
+// CHECK-LABEL: @v4f32_i32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    tail call void (...) @vararg(<4 x float> noundef [[X:%.*]], i32 noundef [[Y:%.*]]) #[[ATTR8]]
+// CHECK-NEXT:    ret void
+//
+void v4f32_i32(v4f32 x, i32 y)
+{
+  vararg(x, y);
+}
+
+// CHECK-LABEL: @i32_v8f32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    tail call void (...) @vararg(i32 noundef [[X:%.*]], <8 x float> noundef [[Y:%.*]]) #[[ATTR8]]
+// CHECK-NEXT:    ret void
+//
+void i32_v8f32(i32 x, v8f32 y)
+{
+  vararg(x, y);
+}
+
+// CHECK-LABEL: @v8f32_i32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    tail call void (...) @vararg(<8 x float> noundef [[X:%.*]], i32 noundef [[Y:%.*]]) #[[ATTR8]]
+// CHECK-NEXT:    ret void
+//
+void v8f32_i32(v8f32 x, i32 y)
+{
+  vararg(x, y);
+}
+
+// CHECK-LABEL: @i32_v16f32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    tail call void (...) @vararg(i32 noundef [[X:%.*]], <16 x float> noundef [[Y:%.*]]) #[[ATTR8]]
+// CHECK-NEXT:    ret void
+//
+void i32_v16f32(i32 x, v16f32 y)
+{
+  vararg(x, y);
+}
+
+// CHECK-LABEL: @v16f32_i32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    tail call void (...) @vararg(<16 x float> noundef [[X:%.*]], i32 noundef [[Y:%.*]]) #[[ATTR8]]
+// CHECK-NEXT:    ret void
+//
+void v16f32_i32(v16f32 x, i32 y)
+{
+  vararg(x, y);
+}
+
+// CHECK-LABEL: @i32_v32f32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[INDIRECT_ARG_TEMP:%.*]] = alloca <32 x float>, align 128
+// CHECK-NEXT:    [[Y:%.*]] = load <32 x float>, ptr [[TMP0:%.*]], align 128, !tbaa [[TBAA2]]
+// CHECK-NEXT:    store <32 x float> [[Y]], ptr [[INDIRECT_ARG_TEMP]], align 128, !tbaa [[TBAA2]]
+// CHECK-NEXT:    tail call void (...) @vararg(i32 noundef [[X:%.*]], ptr noundef nonnull byval(<32 x float>) align 128 [[INDIRECT_ARG_TEMP]]) #[[ATTR8]]
+// CHECK-NEXT:    ret void
+//
+void i32_v32f32(i32 x, v32f32 y)
+{
+  vararg(x, y);
+}
+
+// CHECK-LABEL: @v32f32_i32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[INDIRECT_ARG_TEMP:%.*]] = alloca <32 x float>, align 128
+// CHECK-NEXT:    [[X:%.*]] = load <32 x float>, ptr [[TMP0:%.*]], align 128, !tbaa [[TBAA2]]
+// CHECK-NEXT:    store <32 x float> [[X]], ptr [[INDIRECT_ARG_TEMP]], align 128, !tbaa [[TBAA2]]
+// CHECK-NEXT:    tail call void (...) @vararg(ptr noundef nonnull byval(<32 x float>) align 128 [[INDIRECT_ARG_TEMP]], i32 noundef [[Y:%.*]]) #[[ATTR8]]
+// CHECK-NEXT:    ret void
+//
+void v32f32_i32(v32f32 x, i32 y)
+{
+  vararg(x, y);
+}
+
+#if __AMDGPU__ || __NVPTX__
+
+
+void (*volatile vararg_ptr)(...) = &vararg;
+
+void indirect_single_i32(i32 x)
+{
+  vararg_ptr(x);
+}
+
+
+#endif
diff --git a/clang/test/CodeGen/variadic-wrapper-removal.c b/clang/test/CodeGen/variadic-wrapper-removal.c
new file mode 100644
index 00000000000000..13499612f0757f
--- /dev/null
+++ b/clang/test/CodeGen/variadic-wrapper-removal.c
@@ -0,0 +1,85 @@
+// REQUIRES: x86-registered-target
+// RUN: %clang_cc1 -triple i386-unknown-linux-gnu -O1 -emit-llvm -o - %s | opt --passes='module(expand-variadics,inline)' -S | FileCheck %s
+// RUN: %clang_cc1 -triple=x86_64-linux-gnu -O1 -emit-llvm -o - %s | opt --passes='module(expand-variadics,inline)' -S | FileCheck %s
+
+// neither arm arch is implemented yet, leaving it here as a reminder
+// armv6 is a ptr as far as the struct is concerned, but possibly also a [1 x i32] passed by value
+// that seems insistent, maybe leave 32 bit arm alone for now
+// aarch64 is a struct of five things passed using byval memcpy
+
+// R-N: %clang_cc1 -triple=armv6-none--eabi -O1 -emit-llvm -o - %s | opt --passes=expand-variadics -S | FileCheck %s
+// R-N: %clang_cc1 -triple=aarch64-none-linux-gnu -O1 -emit-llvm -o - %s | opt --passes=expand-variadics -S | FileCheck %s
+
+
+
+// expand-variadics rewrites calls to variadic functions into calls to
+// equivalent functions that take a va_list argument. A property of the
+// implementation is that said "equivalent function" may be a pre-existing one.
+// This is equivalent to inlining a sufficiently simple variadic wrapper.
+
+#include <stdarg.h>
+
+typedef int FILE; // close enough for this test
+
+// fprintf is sometimes implemented as a call to vfprintf. That fits the
+// pattern the transform pass recognises - given an implementation of fprintf
+// in the IR module, calls to it can be rewritten into calls into vfprintf.
+
+// CHECK-LABEL: define{{.*}} i32 @fprintf(
+// CHECK-LABEL: define{{.*}} i32 @call_fprintf(
+// CHECK-NOT:   @fprintf
+// CHECK:       @vfprintf
+int vfprintf(FILE *restrict f, const char *restrict fmt, va_list ap);
+int fprintf(FILE *restrict f, const char *restrict fmt, ...)
+{
+  int ret;
+  va_list ap;
+  va_start(ap, fmt);
+  ret = vfprintf(f, fmt, ap);
+  va_end(ap);
+  return ret;
+}
+int call_fprintf(FILE *f)
+{
+  int x = 42;
+  double y = 3.14;
+  return fprintf(f, "int %d dbl %g\n", x, y);
+}
+
+// Void return type is also OK
+
+// CHECK-LABEL: define{{.*}} void @no_result(
+// CHECK-LABEL: define{{.*}} void @call_no_result(
+// CHECK-NOT:   @no_result
+// CHECK:       @vno_result
+void vno_result(const char * fmt, va_list);
+void no_result(const char * fmt, ...)
+{
+  va_list ap;
+  va_start(ap, fmt);
+  vno_result(fmt, ap);
+  va_end(ap);
+}
+void call_no_result(FILE *f)
+{
+  int x = 101;
+  no_result("", x);
+}
+
+// The va_end in the forwarding implementation is optional where it's a no-op
+
+// CHECK-LABEL: define{{.*}} i32 @no_vaend(
+// CHECK-LABEL: define{{.*}} i32 @call_no_vaend(
+// CHECK-NOT:   @no_vaend
+// CHECK:       @vno_vaend
+int vno_vaend(int x, va_list);
+int no_vaend(int x, ...)
+{
+  va_list ap;
+  va_start(ap, x);
+  return vno_vaend(x, ap);
+}
+int call_no_vaend(int x)
+{
+  return no_vaend(x, 10, 2.5f);
+}
diff --git a/clang/test/CodeGenCXX/inline-then-fold-variadics.cpp b/clang/test/CodeGenCXX/inline-then-fold-variadics.cpp
new file mode 100644
index 00000000000000..c7b5863b2632a6
--- /dev/null
+++ b/clang/test/CodeGenCXX/inline-then-fold-variadics.cpp
@@ -0,0 +1,247 @@
+// RUN: %clang_cc1 -triple i386-unknown-linux-gnu -Wno-varargs -O1 -emit-llvm -o - %s | opt --passes=expand-variadics | opt -S -O1 | FileCheck %s --check-prefixes=CHECK,X86Linux
+
+
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -Wno-varargs -O1 -emit-llvm -o - %s | opt --passes=expand-variadics | opt -S -O1 | FileCheck %s --check-prefixes=CHECK,X64SystemV
+
+
+// RUN: %clang_cc1 -triple i386-apple-darwin -Wno-varargs -O1 -emit-llvm -o - %s | opt --passes=expand-variadics | opt -S -O1 | FileCheck %s --check-prefixes=CHECK,X86Darwin
+
+// RUN: %clang_cc1 -triple x86_64-apple-darwin -Wno-varargs -O1 -emit-llvm -o - %s | opt --passes=expand-variadics | opt -S -O1 | FileCheck %s --check-prefixes=CHECK,X64SystemV
+
+
+// The clang test suite has _lots_ of windows related triples in it
+// 'x86_64-pc-windows-msvc|i686-windows-msvc|thumbv7-windows|aarch64-windows|i686-windows|x86_64-windows|x86_64-unknown-windows-msvc|i386-windows-pc|x86_64--windows-msvc|i686--windows-msvc|x86_64-unknown-windows-gnu|i686-unknown-windows-msvc|i686-unknown-windows-gnu|arm64ec-pc-windows-msvc19.20.0|i686-pc-windows-msvc19.14.0|i686-pc-windows|x86_64--windows-gnu|i686--windows-gnu|thumbv7--windows|i386-windows|x86_64-unknown-windows-pc|i686--windows|x86_64--windows|i686-w64-windows-gnu'
+
+// Might be detecting an inconsistency - maybe different alignment
+// Presently failing on an unusual calling convention
+
+// i686 windows emits suboptimal codegen. sroa removes a field from a struct which misaligns a field which blocks store/load forwarding
+// RUN: %clang_cc1 -triple i686-windows-msvc -Wno-varargs -O1 -emit-llvm -o - %s | opt --passes=expand-variadics | opt -S -O1 | FileCheck %s --check-prefixes=CHECK,X86Windows
+
+
+// 64-bit Windows va_arg passes most types indirectly, but the call instruction uses the types by value
+// ___: %clang_cc1 -triple x86_64-pc-windows-msvc -Wno-varargs -O1 -emit-llvm -o - %s | opt --passes=expand-variadics | opt -S -O1 | FileCheck %s --check-prefixes=CHECK
+
+
+
+// amdgpu emits a sequence of addrspace casts that aren't folded yet
+// todo: match it anyway
+// R-N: %clang_cc1 -triple amdgcn-amd-amdhsa -Wno-varargs -O1 -emit-llvm -o - %s | opt --passes=expand-variadics | opt -S -O1 | FileCheck %s
+
+// Requires the instcombine patch that hasn't landed yet
+// RUN: %clang_cc1 -triple nvptx64-nvidia-cuda -Wno-varargs -O1 -emit-llvm -o - %s | opt --passes=expand-variadics | opt -S -O1 | FileCheck %s
+
+
+
+
+
+// Not yet implemented on arm
+// Also there are various x86 variants that should be in the triple
+
+// Checks for consistency between clang and expand-variadics
+// 1. Use clang to lower va_arg
+// 2. Use expand-variadics to lower the rest of the variadic operations
+// 3. Use opt -O1 to simplify the functions to ret %arg
+// The simplification to ret %arg will fail when the two are not consistent, modulo bugs elsewhere.
+
+#include <stdarg.h>
+
+template <typename X, typename Y>
+static X first(...) {
+  va_list va;
+  __builtin_va_start(va, 0);
+  X r = va_arg(va, X);
+  va_end(va);
+  return r;
+}
+
+template <typename X, typename Y>
+static Y second(...) {
+  va_list va;
+  __builtin_va_start(va, 0);
+  va_arg(va, X);
+  Y r = va_arg(va, Y);
+  va_end(va);
+  return r;
+}
+
+typedef float float4 __attribute__((__vector_size__(16), __aligned__(16)));
+typedef float float8 __attribute__((__vector_size__(32), __aligned__(32)));
+typedef float float16 __attribute__((__vector_size__(64), __aligned__(64)));
+typedef float float32 __attribute__((__vector_size__(128), __aligned__(128)));
+
+
+extern "C"
+{
+// CHECK-LABEL: define{{.*}} i32 @first_i32_i32(i32{{.*}} %x, i32{{.*}} %y)
+// CHECK:       entry:
+// CHECK:       ret i32 %x
+int first_i32_i32(int x, int y)
+{
+  return first<int,int>(x, y);
+}
+
+// CHECK-LABEL: define{{.*}} i32 @second_i32_i32(i32{{.*}} %x, i32{{.*}} %y)
+// CHECK:       entry:
+// CHECK:       ret i32 %y
+int second_i32_i32(int x, int y)
+{
+  return second<int,int>(x, y);
+}
+}
+
+// Permutations of an int and a double
+extern "C"
+{
+// CHECK-LABEL: define{{.*}} i32 @first_i32_f64(i32{{.*}} %x, double{{.*}} %y)
+// CHECK:       entry:
+// CHECK:       ret i32 %x
+int first_i32_f64(int x, double y)
+{
+  return first<int,double>(x, y);
+}
+  
+// CHECK-LABEL: define{{.*}} double @second_i32_f64(i32{{.*}} %x, double{{.*}} %y)
+// CHECK:       entry:
+
+// X86Linux:    ret double %y
+// X64SystemV:  ret double %y
+// X86Darwin:   ret double %y
+// X86Windows:  [[TMP0:%.*]] = alloca <{ [4 x i8], double }>, align 4
+// X86Windows:  [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 4
+// X86Windows:  store double %y, ptr [[TMP1]], align 4
+// X86Windows:  [[TMP2:%.*]] = load double, ptr [[TMP0]], align 4
+// X86Windows:  ret double [[TMP2]]
+double second_i32_f64(int x, double y)
+{
+  return second<int,double>(x, y);
+}
+
+// CHECK-LABEL: define{{.*}} double @first_f64_i32(double{{.*}} %x, i32{{.*}} %y)
+// CHECK:       entry:
+// CHECK:       ret double %x
+double first_f64_i32(double x, int y)
+{
+  return first<double,int>(x, y);
+}
+
+// CHECK-LABEL: define{{.*}} i32 @second_f64_i32(double{{.*}} %x, i32{{.*}} %y)
+// CHECK:       entry:
+// CHECK:       ret i32 %y
+int second_f64_i32(double x, int y)
+{
+  return second<double,int>(x, y);
+}   
+}
+
+
+// Permutations of an int and a float4
+extern "C"
+{
+
+// CHECK-LABEL: define{{.*}} i32 @first_i32_v4f32(i32{{.*}} %x, ptr{{.*}} %y)
+// CHECK:       entry:
+// CHECK:       ret i32 %x
+int first_i32_v4f32(int x, float4 * y)
+{
+  return first<int,float4>(x, *y);
+}
+  
+// CHECK-LABEL: define{{.*}} void @second_i32_v4f32(i32{{.*}} %x, ptr{{.*}} %y, ptr{{.*}} %r)
+// CHECK:       entry:
+// X86Linux:    [[TMP0:%.*]] = load <4 x float>, ptr %y, align 16
+// X86Linux:    store <4 x float> [[TMP0]], ptr %r, align 16
+// X64SystemV:  [[TMP0:%.*]] = load <4 x float>, ptr %y, align 16
+// X64SystemV:  store <4 x float> [[TMP0]], ptr %r, align 16
+// X86Darwin:   [[TMP0:%.*]] = load <2 x i64>, ptr %y, align 16
+// X86Darwin:   store <2 x i64> [[TMP0]], ptr %r, align 16
+// X86Windows:  [[TMP0:%.*]] = alloca <{ [12 x i8], <4 x float> }>, align 4
+// X86Windows:  [[TMP1:%.*]] = load <4 x float>, ptr %y, align 16
+// X86Windows:  [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 12
+// X86Windows:  store <4 x float> [[TMP1]], ptr [[TMP2]], align 4
+// X86Windows:  [[TMP3:%.*]] = load <4 x float>, ptr [[TMP0]], align 4
+// X86Windows:  store <4 x float> [[TMP3]], ptr %r, align 16
+// CHECK:       ret void
+void second_i32_v4f32(int x, float4 * y, float4* r)
+{
+  *r = second<int,float4>(x, *y);
+}
+
+    
+// CHECK-LABEL: define{{.*}} void @first_v4f32_i32(ptr{{.*}} %x, i32{{.*}} %y, ptr{{.*}} %r)
+// CHECK:       entry:
+// X86Linux:    [[TMP0:%.*]] = load <4 x float>, ptr %x, align 16
+// X86Linux:    store <4 x float> [[TMP0]], ptr %r, align 16
+// X64SystemV:  [[TMP0:%.*]] = load <4 x float>, ptr %x, align 16
+// X64SystemV:  store <4 x float> [[TMP0]], ptr %r, align 16
+// X86Darwin:   [[TMP0:%.*]] = load <2 x i64>, ptr %x, align 16
+// X86Darwin:   store <2 x i64> [[TMP0]], ptr %r, align 16
+// CHECK:       ret void
+  void first_v4f32_i32(float4* x, int y, float4* r)
+{
+ *r =first<float4,int>(*x, y);
+}
+
+// CHECK-LABEL: define{{.*}} i32 @second_v4f32_i32(ptr{{.*}} %x, i32{{.*}} %y)
+// CHECK:       entry:
+// CHECK:       ret i32 %y
+int second_v4f32_i32(float4* x, int y)
+{
+  return second<float4,int>(*x, y);
+}
+
+}
+
+// A large struct with awkwardly aligned fields
+
+typedef struct {
+  char c;
+  short s;
+  int i;
+  long l;
+  float f;
+  double d;
+} libcS;
+
+extern "C"
+{
+
+// CHECK-LABEL: define{{.*}} i32 @first_i32_libcS(i32{{.*}} %x, ptr{{.*}} %y)
+// CHECK:       entry:
+// CHECK:       ret i32 %x
+int first_i32_libcS(int x, libcS * y)
+{
+  return first<int,libcS>(x, *y);
+}
+  
+// CHECK-LABEL: define{{.*}} void @second_i32_libcS(i32{{.*}} %x, ptr{{.*}} %y, ptr{{.*}} %r)
+// CHECK:       entry:
+// CHECK:       ret void
+void second_i32_libcS(int x, libcS * y, libcS* r)
+{
+  *r = second<int,libcS>(x, *y);
+}
+
+    
+// CHECK-LABEL: define{{.*}} void @first_libcS_i32(ptr{{.*}} %x, i32{{.*}} %y, ptr{{.*}} %r)
+// CHECK:       entry:
+
+  void first_libcS_i32(libcS* x, int y, libcS* r)
+{
+ *r =first<libcS,int>(*x, y);
+}
+
+// CHECK-LABEL: define{{.*}} i32 @second_libcS_i32(ptr{{.*}} %x, i32{{.*}} %y)
+// CHECK:       entry:
+// CHECK:       ret i32 %y
+int second_libcS_i32(libcS* x, int y)
+{
+  return second<libcS,int>(*x, y);
+}
+
+  
+}
+
+
+
+            
diff --git a/llvm/include/llvm/InitializePasses.h b/llvm/include/llvm/InitializePasses.h
index 9ba75d491c1c9c..5da681781da975 100644
--- a/llvm/include/llvm/InitializePasses.h
+++ b/llvm/include/llvm/InitializePasses.h
@@ -106,6 +106,7 @@ void initializeExpandLargeDivRemLegacyPassPass(PassRegistry&);
 void initializeExpandMemCmpLegacyPassPass(PassRegistry &);
 void initializeExpandPostRAPass(PassRegistry&);
 void initializeExpandReductionsPass(PassRegistry&);
+void initializeExpandVariadicsPass(PassRegistry &);
 void initializeExpandVectorPredicationPass(PassRegistry &);
 void initializeExternalAAWrapperPassPass(PassRegistry&);
 void initializeFEntryInserterPass(PassRegistry&);
diff --git a/llvm/include/llvm/Transforms/IPO/ExpandVariadics.h b/llvm/include/llvm/Transforms/IPO/ExpandVariadics.h
new file mode 100644
index 00000000000000..67fa746813ea0e
--- /dev/null
+++ b/llvm/include/llvm/Transforms/IPO/ExpandVariadics.h
@@ -0,0 +1,43 @@
+//===- ExpandVariadics.h - expand variadic functions ------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+#ifndef LLVM_TRANSFORMS_IPO_EXPANDVARIADICS_H
+#define LLVM_TRANSFORMS_IPO_EXPANDVARIADICS_H
+
+#include "llvm/IR/PassManager.h"
+
+namespace llvm {
+
+class Module;
+class ModulePass;
+class OptimizationLevel;
+
+enum class ExpandVariadicsMode {
+  unspecified,
+  disable,
+  optimize,
+  lowering,
+};
+
+class ExpandVariadicsPass : public PassInfoMixin<ExpandVariadicsPass> {
+  const ExpandVariadicsMode ConstructedMode;
+
+public:
+  // Operates under the passed mode unless overridden on the command line
+  ExpandVariadicsPass(ExpandVariadicsMode ConstructedMode);
+
+  // Chooses disable or optimize based on optimization level
+  ExpandVariadicsPass(OptimizationLevel Level);
+
+  PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
+};
+
+ModulePass *createExpandVariadicsPass(ExpandVariadicsMode);
+
+} // end namespace llvm
+
+#endif // LLVM_TRANSFORMS_IPO_EXPANDVARIADICS_H
diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp
index 8d408ca2363a98..ada05770461151 100644
--- a/llvm/lib/Passes/PassBuilder.cpp
+++ b/llvm/lib/Passes/PassBuilder.cpp
@@ -135,6 +135,7 @@
 #include "llvm/Transforms/IPO/DeadArgumentElimination.h"
 #include "llvm/Transforms/IPO/ElimAvailExtern.h"
 #include "llvm/Transforms/IPO/EmbedBitcodePass.h"
+#include "llvm/Transforms/IPO/ExpandVariadics.h"
 #include "llvm/Transforms/IPO/ForceFunctionAttrs.h"
 #include "llvm/Transforms/IPO/FunctionAttrs.h"
 #include "llvm/Transforms/IPO/FunctionImport.h"
diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp
index 3bb2ce0ae3460b..b1a306e15fa3c0 100644
--- a/llvm/lib/Passes/PassBuilderPipelines.cpp
+++ b/llvm/lib/Passes/PassBuilderPipelines.cpp
@@ -48,6 +48,7 @@
 #include "llvm/Transforms/IPO/DeadArgumentElimination.h"
 #include "llvm/Transforms/IPO/ElimAvailExtern.h"
 #include "llvm/Transforms/IPO/EmbedBitcodePass.h"
+#include "llvm/Transforms/IPO/ExpandVariadics.h"
 #include "llvm/Transforms/IPO/ForceFunctionAttrs.h"
 #include "llvm/Transforms/IPO/FunctionAttrs.h"
 #include "llvm/Transforms/IPO/GlobalDCE.h"
@@ -1172,6 +1173,9 @@ PassBuilder::buildModuleSimplificationPipeline(OptimizationLevel Level,
   if (EnablePGOForceFunctionAttrs)
     MPM.addPass(PGOForceFunctionAttrsPass(PGOOpt->ColdOptType));
 
+  // ExpandVariadics interacts well with the function inliner.
+  MPM.addPass(ExpandVariadicsPass(Level));
+
   MPM.addPass(AlwaysInlinerPass(/*InsertLifetimeIntrinsics=*/true));
 
   if (EnableModuleInliner)
diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def
index 2fbc7f7d88ba39..d7d29383f9846d 100644
--- a/llvm/lib/Passes/PassRegistry.def
+++ b/llvm/lib/Passes/PassRegistry.def
@@ -59,6 +59,7 @@ MODULE_PASS("dot-callgraph", CallGraphDOTPrinterPass())
 MODULE_PASS("dxil-upgrade", DXILUpgradePass())
 MODULE_PASS("elim-avail-extern", EliminateAvailableExternallyPass())
 MODULE_PASS("extract-blocks", BlockExtractorPass({}, false))
+MODULE_PASS("expand-variadics", ExpandVariadicsPass(OptimizationLevel::O0))
 MODULE_PASS("forceattrs", ForceFunctionAttrsPass())
 MODULE_PASS("function-import", FunctionImportPass())
 MODULE_PASS("globalopt", GlobalOptPass())
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
index 90f36fadf35903..70d634edc35e11 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
@@ -24,6 +24,7 @@ MODULE_PASS("amdgpu-lower-ctor-dtor", AMDGPUCtorDtorLoweringPass())
 MODULE_PASS("amdgpu-lower-module-lds", AMDGPULowerModuleLDSPass(*this))
 MODULE_PASS("amdgpu-printf-runtime-binding", AMDGPUPrintfRuntimeBindingPass())
 MODULE_PASS("amdgpu-unify-metadata", AMDGPUUnifyMetadataPass())
+MODULE_PASS("expand-variadics", ExpandVariadicsPass(ExpandVariadicsMode::lowering))
 #undef MODULE_PASS
 
 #ifndef FUNCTION_PASS
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index f7e552177d6f50..99ae8a17134a78 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -54,6 +54,7 @@
 #include "llvm/Transforms/HipStdPar/HipStdPar.h"
 #include "llvm/Transforms/IPO.h"
 #include "llvm/Transforms/IPO/AlwaysInliner.h"
+#include "llvm/Transforms/IPO/ExpandVariadics.h"
 #include "llvm/Transforms/IPO/GlobalDCE.h"
 #include "llvm/Transforms/IPO/Internalize.h"
 #include "llvm/Transforms/Scalar.h"
@@ -967,6 +968,8 @@ void AMDGPUPassConfig::addIRPasses() {
   if (isPassEnabled(EnableImageIntrinsicOptimizer))
     addPass(createAMDGPUImageIntrinsicOptimizerPass(&TM));
 
+  addPass(createExpandVariadicsPass(ExpandVariadicsMode::lowering));
+
   // Function calls are not supported, so make sure we inline everything.
   addPass(createAMDGPUAlwaysInlinePass());
   addPass(createAlwaysInlinerLegacyPass());
diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
index 78f48652c9920f..e5db0114592877 100644
--- a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
@@ -33,6 +33,7 @@
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetOptions.h"
 #include "llvm/TargetParser/Triple.h"
+#include "llvm/Transforms/IPO/ExpandVariadics.h"
 #include "llvm/Transforms/Scalar.h"
 #include "llvm/Transforms/Scalar/GVN.h"
 #include "llvm/Transforms/Vectorize/LoadStoreVectorizer.h"
@@ -323,6 +324,9 @@ void NVPTXPassConfig::addIRPasses() {
       AAR.addAAResult(WrapperPass->getResult());
   }));
 
+  // Should run before anything (else!) that adjusts calling conventions
+  addPass(createExpandVariadicsPass(ExpandVariadicsMode::lowering));
+
   // NVVMReflectPass is added in addEarlyAsPossiblePasses, so hopefully running
   // it here does nothing.  But since we need it for correctness when lowering
   // to NVPTX, run it here too, in case whoever built our pass pipeline didn't
diff --git a/llvm/lib/Transforms/IPO/CMakeLists.txt b/llvm/lib/Transforms/IPO/CMakeLists.txt
index 5fbdbc3a014f9a..92a9697720efd4 100644
--- a/llvm/lib/Transforms/IPO/CMakeLists.txt
+++ b/llvm/lib/Transforms/IPO/CMakeLists.txt
@@ -12,6 +12,7 @@ add_llvm_component_library(LLVMipo
   DeadArgumentElimination.cpp
   ElimAvailExtern.cpp
   EmbedBitcodePass.cpp
+  ExpandVariadics.cpp
   ExtractGV.cpp
   ForceFunctionAttrs.cpp
   FunctionAttrs.cpp
diff --git a/llvm/lib/Transforms/IPO/ExpandVariadics.cpp b/llvm/lib/Transforms/IPO/ExpandVariadics.cpp
new file mode 100644
index 00000000000000..e85f91e4060c49
--- /dev/null
+++ b/llvm/lib/Transforms/IPO/ExpandVariadics.cpp
@@ -0,0 +1,1056 @@
+//===-- ExpandVariadicsPass.cpp --------------------------------*- C++ -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This is an optimization pass for variadic functions. If called from codegen,
+// it can serve as the implementation of variadic functions for a given target.
+//
+// The strategy is to turn the ... part of a variadic function into a va_list
+// and fix up the call sites. This is completely effective if the calling
+// convention can declare that to be the right thing, e.g. on GPUs or where
+// the application is wholly statically linked. In the usual case, it will
+// replace known calls to known variadic functions with calls that are amenable
+// to inlining and other optimisations.
+//
+// The target-dependent parts are in class VariadicABIInfo. Enabling a new
+// target means adding a case to VariadicABIInfo::create() along with tests.
+// This will be especially simple if the va_list representation is a char*.
+//
+// The majority of the plumbing is splitting the variadic function into a
+// single basic block that packs the variadic arguments into a va_list and
+// a second function that does the work of the original. The target specific
+// part is packing arguments into a contiguous buffer that the clang expansion
+// of va_arg will do the right thing with.
+//
+// The aggregate effect is to unblock other transforms, most critically the
+// general purpose inliner. Known calls to variadic functions become zero cost.
+//
+// Consistency with clang is primarily tested by emitting va_arg using clang
+// then expanding the variadic functions using this pass, followed by trying
+// to constant fold the functions to no-ops.
+//
+// Target specific behaviour is tested in IR - mainly checking that values are
+// put into positions in call frames that make sense for that particular target.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/IPO/ExpandVariadics.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Passes/OptimizationLevel.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/TargetParser/Triple.h"
+
+#include <cstdio>
+
+#define DEBUG_TYPE "expand-variadics"
+
+using namespace llvm;
+
+cl::opt<ExpandVariadicsMode> ExpandVariadicsModeOption(
+    DEBUG_TYPE "-override", cl::desc("Override the behaviour of " DEBUG_TYPE),
+    cl::init(ExpandVariadicsMode::unspecified),
+    cl::values(clEnumValN(ExpandVariadicsMode::unspecified, "unspecified",
+                          "Use the implementation defaults"),
+               clEnumValN(ExpandVariadicsMode::disable, "disable",
+                          "Disable the pass entirely"),
+               clEnumValN(ExpandVariadicsMode::optimize, "optimize",
+                          "Optimise without changing ABI"),
+               clEnumValN(ExpandVariadicsMode::lowering, "lowering",
+                          "Change variadic calling convention")));
+
+namespace {
+
+// Module implements getFunction() which returns nullptr on missing declaration
+// and getOrInsertFunction which creates one when absent. Intrinsics.h
+// implements getDeclaration which creates one when missing. This should be
+// changed to be consistent with Module()'s naming. Implementing as a local
+// function here in the meantime to decouple from that process.
+Function *getPreexistingDeclaration(Module *M, Intrinsic::ID id,
+                                    ArrayRef<Type *> Tys = std::nullopt) {
+  auto *FT = Intrinsic::getType(M->getContext(), id, Tys);
+  return M->getFunction(Tys.empty() ? Intrinsic::getName(id)
+                                    : Intrinsic::getName(id, Tys, M, FT));
+}
+
+// Lots of targets use a void* pointed at a buffer for va_list.
+// Some use more complicated iterator constructs. Type erase that
+// so the rest of the pass can operate on either.
+// Virtual functions where different targets want different behaviour,
+// normal where all implemented targets presently have the same.
+struct VAListInterface {
+  virtual ~VAListInterface() {}
+
+  // Whether a valist instance is passed by value or by address
+  // I.e. does it need to be alloca'ed and stored into, or can
+  // it be passed directly in a SSA register
+  virtual bool passedInSSARegister() = 0;
+
+  // The type of a va_list iterator object
+  virtual Type *vaListType(LLVMContext &Ctx) = 0;
+
+  // The type of a va_list as a function argument as lowered by C
+  virtual Type *vaListParameterType(Module &M) = 0;
+
+  // Initialise an allocated va_list object to point to an already
+  // initialised contiguous memory region.
+  // Return the value to pass as the va_list argument
+  virtual Value *initializeVAList(LLVMContext &Ctx, IRBuilder<> &Builder,
+                                  AllocaInst *, Value * /*buffer*/) = 0;
+
+  // Simple lowering suffices for va_end, va_copy for current targets
+  bool vaEndIsNop() { return true; }
+  bool vaCopyIsMemcpy() { return true; }
+};
+
+// The majority case - a void* into an alloca
+struct VoidPtr final : public VAListInterface {
+  bool passedInSSARegister() override { return true; }
+
+  Type *vaListType(LLVMContext &Ctx) override {
+    return PointerType::getUnqual(Ctx);
+  }
+
+  Type *vaListParameterType(Module &M) override {
+    return PointerType::getUnqual(M.getContext());
+  }
+
+  Value *initializeVAList(LLVMContext &Ctx, IRBuilder<> &Builder,
+                          AllocaInst * /*va_list*/, Value *buffer) override {
+    return buffer;
+  }
+};
+
+struct VoidPtrAllocaAddrspace final : public VAListInterface {
+
+  bool passedInSSARegister() override { return true; }
+
+  Type *vaListType(LLVMContext &Ctx) override {
+    return PointerType::getUnqual(Ctx);
+  }
+
+  Type *vaListParameterType(Module &M) override {
+    const DataLayout &DL = M.getDataLayout();
+    return DL.getAllocaPtrType(M.getContext());
+  }
+
+  Value *initializeVAList(LLVMContext &Ctx, IRBuilder<> &Builder,
+                          AllocaInst * /*va_list*/, Value *buffer) override {
+    return buffer;
+  }
+};
+
+// SystemV as used by X64 Linux and others
+struct SystemV final : public VAListInterface {
+  bool passedInSSARegister() override { return false; }
+
+  Type *vaListType(LLVMContext &Ctx) override {
+    auto I32 = Type::getInt32Ty(Ctx);
+    auto Ptr = PointerType::getUnqual(Ctx);
+    return ArrayType::get(StructType::get(Ctx, {I32, I32, Ptr, Ptr}), 1);
+  }
+
+  Type *vaListParameterType(Module &M) override {
+    return PointerType::getUnqual(M.getContext());
+  }
+
+  Value *initializeVAList(LLVMContext &Ctx, IRBuilder<> &Builder,
+                          AllocaInst *VaList, Value *VoidBuffer) override {
+    assert(VaList->getAllocatedType() == vaListType(Ctx));
+
+    Type *VaListTy = vaListType(Ctx);
+
+    Type *I32 = Type::getInt32Ty(Ctx);
+    Type *I64 = Type::getInt64Ty(Ctx);
+
+    Value *Idxs[3] = {
+        ConstantInt::get(I64, 0),
+        ConstantInt::get(I32, 0),
+        nullptr,
+    };
+
+    Idxs[2] = ConstantInt::get(I32, 0);
+    Builder.CreateStore(
+        ConstantInt::get(I32, 48),
+        Builder.CreateInBoundsGEP(VaListTy, VaList, Idxs, "gp_offset"));
+
+    Idxs[2] = ConstantInt::get(I32, 1);
+    Builder.CreateStore(
+        ConstantInt::get(I32, 6 * 8 + 8 * 16),
+        Builder.CreateInBoundsGEP(VaListTy, VaList, Idxs, "fp_offset"));
+
+    Idxs[2] = ConstantInt::get(I32, 2);
+    Builder.CreateStore(
+        VoidBuffer,
+        Builder.CreateInBoundsGEP(VaListTy, VaList, Idxs, "overfow_arg_area"));
+
+    Idxs[2] = ConstantInt::get(I32, 3);
+    Builder.CreateStore(
+        ConstantPointerNull::get(PointerType::getUnqual(Ctx)),
+        Builder.CreateInBoundsGEP(VaListTy, VaList, Idxs, "reg_save_area"));
+
+    return VaList;
+  }
+};
+
+class VariadicABIInfo {
+
+  VariadicABIInfo(uint32_t MinAlign, uint32_t MaxAlign,
+                  std::unique_ptr<VAListInterface> VAList)
+      : MinAlign(MinAlign), MaxAlign(MaxAlign), VAList(std::move(VAList)) {}
+
+  template <typename T>
+  static VariadicABIInfo create(uint32_t MinAlign, uint32_t MaxAlign) {
+    return {MinAlign, MaxAlign, std::make_unique<T>()};
+  }
+
+public:
+  const uint32_t MinAlign;
+  const uint32_t MaxAlign;
+  std::unique_ptr<VAListInterface> VAList;
+
+  VariadicABIInfo() : VariadicABIInfo(0, 0, nullptr) {}
+  explicit operator bool() const { return static_cast<bool>(VAList); }
+
+  VariadicABIInfo(VariadicABIInfo &&Self)
+      : MinAlign(Self.MinAlign), MaxAlign(Self.MaxAlign),
+        VAList(Self.VAList.release()) {}
+
+  VariadicABIInfo &operator=(VariadicABIInfo &&Other) {
+    this->~VariadicABIInfo();
+    new (this) VariadicABIInfo(std::move(Other));
+    return *this;
+  }
+
+  static VariadicABIInfo create(llvm::Triple const &Triple) {
+    const bool IsLinuxABI = Triple.isOSLinux() || Triple.isOSCygMing();
+
+    switch (Triple.getArch()) {
+
+    case Triple::r600:
+    case Triple::amdgcn: {
+      return create<VoidPtrAllocaAddrspace>(1, 0);
+    }
+
+    case Triple::nvptx:
+    case Triple::nvptx64: {
+      return create<VoidPtr>(4, 0);
+    }
+
+    case Triple::x86: {
+      // These seem to all fall out the same, despite getTypeStackAlign
+      // implying otherwise.
+
+      if (Triple.isOSDarwin()) {
+        // X86_32ABIInfo::getTypeStackAlignInBytes is misleading for this.
+        // The slotSize(4) implies a minimum alignment
+        // The AllowHigherAlign = true means there is no maximum alignment.
+
+        return create<VoidPtr>(4, 0);
+      }
+      if (Triple.getOS() == llvm::Triple::Win32) {
+        return create<VoidPtr>(4, 0);
+      }
+
+      if (IsLinuxABI) {
+        return create<VoidPtr>(4, 0);
+      }
+
+      break;
+    }
+
+    case Triple::x86_64: {
+      if (Triple.isWindowsMSVCEnvironment() || Triple.isOSWindows()) {
+        // x64 msvc emit vaarg passes > 8 byte values by pointer
+        // however the variadic call instruction created does not, e.g.
+        // a <4 x f32> will be passed as itself, not as a pointer or byval.
+        // Postponing resolution of that for now.
+        // Expected min/max align of 8.
+        return {};
+      }
+
+      // SystemV X64 documented behaviour:
+      // Slots are at least eight byte aligned and at most 16 byte aligned.
+      // If the type needs more than sixteen byte alignment, it still only gets
+      // that much alignment on the stack.
+      // X64 behaviour in clang:
+      // Slots are at least eight byte aligned and at most naturally aligned
+      // This matches clang, not the ABI docs.
+
+      if (Triple.isOSDarwin()) {
+        return create<SystemV>(8, 8);
+      }
+
+      if (IsLinuxABI) {
+        return create<SystemV>(8, 8);
+      }
+
+      break;
+    }
+
+    default:
+      break;
+    }
+
+    return {};
+  }
+};
+
+class ExpandVariadics : public ModulePass {
+
+  // The pass construction sets the default (optimize when called from middle
+  // end, lowering when called from the backend). The command line variable
+  // overrides that. This is useful for testing and debugging. It also allows
+  // building an applications with variadic functions wholly removed if one
+  // has sufficient control over the dependencies, e.g. a statically linked
+  // clang that has no variadic function calls remaining in the binary.
+  static ExpandVariadicsMode
+  withCommandLineOverride(ExpandVariadicsMode LLVMRequested) {
+    ExpandVariadicsMode UserRequested = ExpandVariadicsModeOption;
+    return (UserRequested == ExpandVariadicsMode::unspecified) ? LLVMRequested
+                                                               : UserRequested;
+  }
+
+public:
+  static char ID;
+  const ExpandVariadicsMode Mode;
+  VariadicABIInfo ABI;
+
+  ExpandVariadics(ExpandVariadicsMode Mode)
+      : ModulePass(ID), Mode(withCommandLineOverride(Mode)) {}
+  StringRef getPassName() const override { return "Expand variadic functions"; }
+
+  // Rewrite a variadic call site
+  bool expandCall(Module &M, IRBuilder<> &Builder, CallBase *CB, FunctionType *,
+                  Function *NF);
+
+  // Given a variadic function, return a function taking a va_list that can be
+  // called instead of the original. Mutates F.
+  Function *deriveInlinableVariadicFunctionPair(Module &M, IRBuilder<> &Builder,
+                                                Function &F);
+
+  bool runOnFunction(Module &M, IRBuilder<> &Builder, Function *F);
+
+  // Entry point
+  bool runOnModule(Module &M) override;
+
+  bool rewriteABI() { return Mode == ExpandVariadicsMode::lowering; }
+
+  void memcpyVAListPointers(const DataLayout &DL, IRBuilder<> &Builder,
+                            Value *Dst, Value *Src) {
+    auto &Ctx = Builder.getContext();
+    Type *VaListTy = ABI.VAList->vaListType(Ctx);
+    uint64_t Size = DL.getTypeAllocSize(VaListTy).getFixedValue();
+    // todo: on amdgcn this should be in terms of addrspace 5
+    Builder.CreateMemCpyInline(Dst, {}, Src, {},
+                               ConstantInt::get(Type::getInt32Ty(Ctx), Size));
+  }
+
+  bool expandVAIntrinsicCall(IRBuilder<> &Builder, const DataLayout &DL,
+                             VAStartInst *Inst);
+
+  bool expandVAIntrinsicCall(IRBuilder<> &, const DataLayout &,
+                             VAEndInst *Inst);
+
+  bool expandVAIntrinsicCall(IRBuilder<> &Builder, const DataLayout &DL,
+                             VACopyInst *Inst);
+
+  template <Intrinsic::ID ID, typename InstructionType>
+  bool expandIntrinsicUsers(Module &M, IRBuilder<> &Builder,
+                            PointerType *ArgType) {
+    bool Changed = false;
+    const DataLayout &DL = M.getDataLayout();
+    if (Function *Intrinsic = getPreexistingDeclaration(&M, ID, {ArgType})) {
+      for (User *U : Intrinsic->users()) {
+        if (auto *I = dyn_cast<InstructionType>(U)) {
+          Changed |= expandVAIntrinsicCall(Builder, DL, I);
+        }
+      }
+      if (Intrinsic->use_empty())
+        Intrinsic->eraseFromParent();
+    }
+    return Changed;
+  }
+
+  FunctionType *inlinableVariadicFunctionType(Module &M, FunctionType *FTy) {
+    SmallVector<Type *> ArgTypes(FTy->param_begin(), FTy->param_end());
+    ArgTypes.push_back(ABI.VAList->vaListParameterType(M));
+    return FunctionType::get(FTy->getReturnType(), ArgTypes,
+                             /*IsVarArgs*/ false);
+  }
+
+  static ConstantInt *sizeOfAlloca(LLVMContext &Ctx, const DataLayout &DL,
+                                   AllocaInst *Alloced) {
+    Type *AllocaType = Alloced->getAllocatedType();
+    TypeSize AllocaTypeSize = DL.getTypeAllocSize(AllocaType);
+    uint64_t AsInt = AllocaTypeSize.getFixedValue();
+    return ConstantInt::get(Type::getInt64Ty(Ctx), AsInt);
+  }
+
+  static SmallSet<unsigned, 2> supportedAddressSpaces(const DataLayout &DL) {
+    // FIXME: It looks like a module can contain arbitrary integers for address
+    // spaces in which case we might need to check _lots_ of cases. Maybe add a
+    // rule to the verifier that the vastart/vaend intrinsics can have arguments
+    // in 0 or in allocaaddrspace but nowhere else
+    SmallSet<unsigned, 2> Set;
+    Set.insert(0); // things tend to end up in zero
+    Set.insert(
+        DL.getAllocaAddrSpace()); // the argument should be in alloca addrspace
+    return Set;
+  }
+
+  // this could be partially target specific
+  bool expansionApplicableToFunction(Module &M, Function *F) {
+    if (F->isIntrinsic() || !F->isVarArg() ||
+        F->hasFnAttribute(Attribute::Naked)) {
+      return false;
+    }
+
+    // TODO: work out what to do with the cs_chain functions documented as
+    // non-variadic that are variadic in some lit tests
+    if (F->getCallingConv() != CallingConv::C)
+      return false;
+
+    if (!rewriteABI()) {
+      // e.g. can't replace a weak function unless changing the original symbol
+      if (GlobalValue::isInterposableLinkage(F->getLinkage())) {
+        return false;
+      }
+    }
+
+    if (!rewriteABI()) {
+      // If optimising, err on the side of leaving things alone
+      for (const Use &U : F->uses()) {
+        const auto *CB = dyn_cast<CallBase>(U.getUser());
+
+        if (!CB)
+          return false;
+
+        if (CB->isMustTailCall())
+          return false;
+
+        if (!CB->isCallee(&U) ||
+            CB->getFunctionType() != F->getFunctionType()) {
+          return false;
+        }
+      }
+    }
+
+    // Branch funnels look like variadic functions but aren't:
+    //
+    // define hidden void @__typeid_typeid1_0_branch_funnel(ptr nest %0, ...) {
+    //  musttail call void (...) @llvm.icall.branch.funnel(ptr %0, ptr @vt1_1,
+    //  ptr @vf1_1, ...) ret void
+    // }
+    //
+    // %1 = call i32 @__typeid_typeid1_0_branch_funnel(ptr nest %vtable, ptr
+    // %obj, i32 1)
+    //
+    // If this function contains a branch funnel intrinsic, don't transform it.
+
+    if (Function *Funnel =
+            getPreexistingDeclaration(&M, Intrinsic::icall_branch_funnel)) {
+      for (const User *U : Funnel->users()) {
+        if (auto *I = dyn_cast<CallBase>(U)) {
+          if (F == I->getFunction()) {
+            return false;
+          }
+        }
+      }
+    }
+
+    return true;
+  }
+
+  bool callinstRewritable(CallBase *CB) {
+    if (CallInst *CI = dyn_cast<CallInst>(CB)) {
+      if (CI->isMustTailCall()) {
+        // Cannot expand musttail calls
+        if (rewriteABI()) {
+          // Todo: Sema?
+          report_fatal_error("Cannot lower musttail variadic call");
+        } else {
+          return false;
+        }
+      }
+    }
+
+    return true;
+  }
+
+  class ExpandedCallFrame {
+    // Helper for constructing an alloca instance containing the arguments bound
+    // to the variadic ... parameter, rearranged to allow indexing through a
+    // va_list iterator
+    //
+    // The awkward memory layout is to allow direct access to a contiguous array
+    // of types for the conversion to a struct type
+    enum { N = 4 };
+    SmallVector<Type *, N> FieldTypes;
+    enum Tag { IsByVal, NotByVal, Padding };
+    SmallVector<std::pair<Value *, Tag>, N> Fields;
+
+    template <Tag tag> void append(Type *T, Value *V) {
+      FieldTypes.push_back(T);
+      Fields.push_back({V, tag});
+    }
+
+  public:
+    void value(Type *T, Value *V) { append<NotByVal>(T, V); }
+
+    void byVal(Type *T, Value *V) { append<IsByVal>(T, V); }
+
+    void padding(LLVMContext &Ctx, uint64_t By) {
+      append<Padding>(ArrayType::get(Type::getInt8Ty(Ctx), By), nullptr);
+    }
+
+    size_t size() const { return FieldTypes.size(); }
+    bool empty() const { return FieldTypes.empty(); }
+
+    StructType *asStruct(LLVMContext &Ctx, StringRef Name) {
+      const bool IsPacked = true;
+      return StructType::create(Ctx, FieldTypes,
+                                (Twine(Name) + ".vararg").str(), IsPacked);
+    }
+
+    void initialiseStructAlloca(const DataLayout &DL, IRBuilder<> &Builder,
+                                AllocaInst *Alloced) {
+
+      StructType *VarargsTy = cast<StructType>(Alloced->getAllocatedType());
+
+      for (size_t I = 0; I < size(); I++) {
+        auto [V, tag] = Fields[I];
+        if (!V)
+          continue;
+
+        auto R = Builder.CreateStructGEP(VarargsTy, Alloced, I);
+        if (tag == IsByVal) {
+          Type *ByValType = FieldTypes[I];
+          Builder.CreateMemCpy(R, {}, V, {},
+                               DL.getTypeAllocSize(ByValType).getFixedValue());
+        } else {
+          Builder.CreateStore(V, R);
+        }
+      }
+    }
+  };
+};
+
+bool ExpandVariadics::runOnModule(Module &M) {
+  bool Changed = false;
+  if (Mode == ExpandVariadicsMode::disable)
+    return Changed;
+
+  llvm::Triple Triple(M.getTargetTriple());
+
+  if (Triple.getArch() == Triple::UnknownArch) {
+    // If we don't know the triple, we can't lower varargs
+    return false;
+  }
+
+  ABI = VariadicABIInfo::create(Triple);
+  if (!ABI) {
+    if (Mode == ExpandVariadicsMode::lowering) {
+      report_fatal_error(
+          "Requested variadic lowering is unimplemented on this target");
+    }
+    return Changed;
+  }
+
+  const DataLayout &DL = M.getDataLayout();
+  auto &Ctx = M.getContext();
+  IRBuilder<> Builder(Ctx);
+
+  // At pass input, va_start intrinsics only occur in variadic functions, as
+  // checked by the IR verifier.
+
+  // The lowering pass needs to run on all variadic functions.
+  // The optimise mode could run on only those that call va_start
+  // in exchange for additional book keeping to avoid transforming
+  // the same function multiple times when it contains multiple va_start.
+  // Leaving that compile time optimisation for a later patch.
+  for (Function &F : llvm::make_early_inc_range(M))
+    Changed |= runOnFunction(M, Builder, &F);
+
+  // After runOnFunction, all known calls to known variadic functions have been
+  // replaced. va_start intrinsics are presently (and invalidly!) only present
+  // in functions that used to be variadic and have now been mutated to take a
+  // va_list instead. If lowering as opposed to optimising, calls to unknown
+  // variadic functions have also been replaced.
+
+  // Warning: Intrinsics acting on other ones are missed
+  auto CandidateAddressSpaces = supportedAddressSpaces(DL);
+
+  for (unsigned Addrspace : CandidateAddressSpaces) {
+    PointerType *ArgType = PointerType::get(Ctx, Addrspace);
+    Changed |= expandIntrinsicUsers<Intrinsic::vastart, VAStartInst>(M, Builder,
+                                                                     ArgType);
+    Changed |=
+        expandIntrinsicUsers<Intrinsic::vaend, VAEndInst>(M, Builder, ArgType);
+    Changed |= expandIntrinsicUsers<Intrinsic::vacopy, VACopyInst>(M, Builder,
+                                                                   ArgType);
+  }
+
+  // Variadic intrinsics are now gone. The va_start have been replaced with the
+  // equivalent of a va_copy from the newly appended va_list argument, va_end
+  // and va_copy are removed. All that remains is for the lowering pass to find
+  // indirect calls and rewrite those as well.
+
+  if (Mode == ExpandVariadicsMode::lowering) {
+    for (Function &F : llvm::make_early_inc_range(M)) {
+      if (F.isDeclaration())
+        continue;
+
+      // Now need to track down indirect calls. Can't find those
+      // by walking uses of variadic functions, need to crawl the instruction
+      // stream. Fortunately this is only necessary for the ABI rewrite case.
+      for (BasicBlock &BB : F) {
+        for (Instruction &I : llvm::make_early_inc_range(BB)) {
+          if (CallBase *CB = dyn_cast<CallBase>(&I)) {
+            if (CB->isIndirectCall()) {
+              FunctionType *FTy = CB->getFunctionType();
+              if (FTy->isVarArg()) {
+                Changed |= expandCall(M, Builder, CB, FTy, 0);
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+
+  return Changed;
+}
+
+bool ExpandVariadics::runOnFunction(Module &M, IRBuilder<> &Builder,
+                                    Function *F) {
+  bool Changed = false;
+
+  // fprintf(stderr, "Called runOn: %s\n", F->getName().str().c_str());
+
+  // This check might be too coarse - there are probably cases where
+  // splitting a function is bad but it's usable without splitting
+  if (!expansionApplicableToFunction(M, F))
+    return false;
+
+  // TODO: Leave "thunk" attribute functions alone?
+
+  // Need more tests than this. Weak etc. Some are in expansionApplicable.
+  if (F->isDeclaration() && !rewriteABI()) {
+    return false;
+  }
+
+  // TODO: Is the lazy construction here still useful?
+  Function *Equivalent = deriveInlinableVariadicFunctionPair(M, Builder, *F);
+
+  for (User *U : llvm::make_early_inc_range(F->users())) {
+    // TODO: A test where the call instruction takes a variadic function as
+    // a parameter other than the one it is calling
+    if (CallBase *CB = dyn_cast<CallBase>(U)) {
+      Value *calledOperand = CB->getCalledOperand();
+      if (F == calledOperand) {
+        Changed |= expandCall(M, Builder, CB, F->getFunctionType(), Equivalent);
+      }
+    }
+  }
+
+  if (rewriteABI()) {
+    // No direct calls remain to F, remaining uses are things like address
+    // escaping, modulo errors in this implementation.
+    for (User *U : llvm::make_early_inc_range(F->users()))
+      if (CallBase *CB = dyn_cast<CallBase>(U)) {
+        Value *calledOperand = CB->getCalledOperand();
+        if (F == calledOperand) {
+          report_fatal_error(
+              "ExpandVA abi requires eliminating call uses first\n");
+        }
+      }
+
+    Changed = true;
+    // Converting the original variadic function in-place into the equivalent
+    // one.
+    Equivalent->setLinkage(F->getLinkage());
+    Equivalent->setVisibility(F->getVisibility());
+    Equivalent->takeName(F);
+
+    // Indirect calls still need to be patched up
+    // DAE bitcasts it, todo: check block addresses
+    F->replaceAllUsesWith(Equivalent);
+    F->eraseFromParent();
+  }
+
+  return Changed;
+}
+
+Function *ExpandVariadics::deriveInlinableVariadicFunctionPair(
+    Module &M, IRBuilder<> &Builder, Function &F) {
+  // The purpose here is to split the variadic function F into two functions
+  // One is a variadic function that bundles the passed argument into a va_list
+  // and passes it to the second function. The second function does whatever
+  // the original F does, except that it takes a va_list instead of the ...
+
+  assert(expansionApplicableToFunction(M, &F));
+
+  auto &Ctx = M.getContext();
+  const DataLayout &DL = M.getDataLayout();
+
+  // Returned value isDeclaration() is equal to F.isDeclaration()
+  // but that invariant is not satisfied throughout this function
+  const bool FunctionIsDefinition = !F.isDeclaration();
+
+  FunctionType *FTy = F.getFunctionType();
+  SmallVector<Type *> ArgTypes(FTy->param_begin(), FTy->param_end());
+  ArgTypes.push_back(ABI.VAList->vaListParameterType(M));
+
+  FunctionType *NFTy = inlinableVariadicFunctionType(M, F.getFunctionType());
+  Function *NF = Function::Create(NFTy, F.getLinkage(), F.getAddressSpace());
+
+  // Note - same attribute handling as DeadArgumentElimination
+  NF->copyAttributesFrom(&F);
+  NF->setComdat(F.getComdat()); // beware weak
+  F.getParent()->getFunctionList().insert(F.getIterator(), NF);
+  NF->setName(F.getName() + ".valist");
+  NF->IsNewDbgInfoFormat = F.IsNewDbgInfoFormat;
+
+  // New function is default visibility and internal
+  // Need to set visibility before linkage to avoid an assert in setVisibility
+  NF->setVisibility(GlobalValue::DefaultVisibility);
+  NF->setLinkage(GlobalValue::InternalLinkage);
+
+  AttrBuilder ParamAttrs(Ctx);
+  ParamAttrs.addAttribute(Attribute::NoAlias);
+
+  // TODO: When can the va_list argument have addAlignmentAttr called on it?
+  // It improves codegen lot in the non-inlined case. Probably target
+  // specific.
+
+  AttributeList Attrs = NF->getAttributes();
+  Attrs = Attrs.addParamAttributes(Ctx, NFTy->getNumParams() - 1, ParamAttrs);
+  NF->setAttributes(Attrs);
+
+  // Splice the implementation into the new function with minimal changes
+  if (FunctionIsDefinition) {
+    NF->splice(NF->begin(), &F);
+
+    auto NewArg = NF->arg_begin();
+    for (Argument &Arg : F.args()) {
+      Arg.replaceAllUsesWith(NewArg);
+      NewArg->setName(Arg.getName()); // takeName without killing the old one
+      ++NewArg;
+    }
+    NewArg->setName("varargs");
+  }
+
+  SmallVector<std::pair<unsigned, MDNode *>, 1> MDs;
+  F.getAllMetadata(MDs);
+  for (auto [KindID, Node] : MDs)
+    NF->addMetadata(KindID, *Node);
+
+  if (FunctionIsDefinition) {
+    // The blocks have been stolen so it's now a declaration
+    assert(F.isDeclaration());
+    Type *VaListTy = ABI.VAList->vaListType(Ctx);
+
+    auto *BB = BasicBlock::Create(Ctx, "entry", &F);
+    Builder.SetInsertPoint(BB);
+
+    Value *VaListInstance = Builder.CreateAlloca(VaListTy, nullptr, "va_list");
+
+    Builder.CreateIntrinsic(Intrinsic::vastart, {DL.getAllocaPtrType(Ctx)},
+                            {VaListInstance});
+
+    SmallVector<Value *> Args;
+    for (Argument &A : F.args())
+      Args.push_back(&A);
+
+    // Shall we put the extra arg in alloca addrspace? Probably yes
+    VaListInstance = Builder.CreatePointerBitCastOrAddrSpaceCast(
+        VaListInstance, ABI.VAList->vaListParameterType(M));
+    Args.push_back(VaListInstance);
+
+    CallInst *Result = Builder.CreateCall(NF, Args);
+    Result->setTailCallKind(CallInst::TCK_Tail);
+
+    assert(ABI.VAList->vaEndIsNop()); // If this changes, insert a va_end here
+
+    if (Result->getType()->isVoidTy())
+      Builder.CreateRetVoid();
+    else
+      Builder.CreateRet(Result);
+  }
+
+  assert(F.isDeclaration() == NF->isDeclaration());
+
+  return NF;
+}
+
+// Rewrite one call/invoke of a variadic function into a call of the va_list
+// equivalent NF (or of the original callee when NF is null and types match):
+// the trailing arguments are stored into a frame alloca laid out per the
+// target ABI, a va_list is initialised to point at that frame, and the
+// va_list is appended as the final argument of the new call.
+// Returns true iff the instruction was rewritten (IR changed).
+bool ExpandVariadics::expandCall(Module &M, IRBuilder<> &Builder, CallBase *CB,
+                                 FunctionType *VarargFunctionType,
+                                 Function *NF) {
+  bool Changed = false;
+  const DataLayout &DL = M.getDataLayout();
+
+  if (!callinstRewritable(CB)) {
+    return Changed;
+  }
+
+  // This is something of a problem because the call instructions' idea of the
+  // function type doesn't necessarily match reality, before or after this
+  // pass
+  // Since the plan here is to build a new instruction there is no
+  // particular benefit to trying to preserve an incorrect initial type
+  // If the types don't match and we aren't changing ABI, leave it alone
+  // in case someone is deliberately doing dubious type punning through a
+  // varargs.
+  FunctionType *FuncType = CB->getFunctionType();
+  if (FuncType != VarargFunctionType) {
+    if (!rewriteABI()) {
+      return Changed;
+    }
+    FuncType = VarargFunctionType;
+  }
+
+  auto &Ctx = CB->getContext();
+
+  // Align the struct on ABI.MinAlign to start with
+  Align MaxFieldAlign(ABI.MinAlign ? ABI.MinAlign : 1);
+
+  // The strategy here is to allocate a call frame containing the variadic
+  // arguments laid out such that a target specific va_list can be initialised
+  // with it, such that target specific va_arg instructions will correctly
+  // iterate over it. Primarily this means getting the alignment right.
+
+  ExpandedCallFrame Frame;
+
+  // Walk only the trailing (variadic) operands, tracking the running byte
+  // offset so padding can be inserted to satisfy each field's alignment.
+  uint64_t CurrentOffset = 0;
+  for (unsigned I = FuncType->getNumParams(), E = CB->arg_size(); I < E; ++I) {
+    Value *ArgVal = CB->getArgOperand(I);
+    bool IsByVal = CB->paramHasAttr(I, Attribute::ByVal);
+    Type *ArgType = IsByVal ? CB->getParamByValType(I) : ArgVal->getType();
+    Align DataAlign = DL.getABITypeAlign(ArgType);
+
+    uint64_t DataAlignV = DataAlign.value();
+
+    // Currently using 0 as a sentinel to mean ignored
+    if (ABI.MinAlign && DataAlignV < ABI.MinAlign)
+      DataAlignV = ABI.MinAlign;
+    if (ABI.MaxAlign && DataAlignV > ABI.MaxAlign)
+      DataAlignV = ABI.MaxAlign;
+
+    DataAlign = Align(DataAlignV);
+    MaxFieldAlign = std::max(MaxFieldAlign, DataAlign);
+
+    if (uint64_t Rem = CurrentOffset % DataAlignV) {
+      // Inject explicit padding to deal with alignment requirements
+      uint64_t Padding = DataAlignV - Rem;
+      Frame.padding(Ctx, Padding);
+      CurrentOffset += Padding;
+    }
+
+    if (IsByVal) {
+      Frame.byVal(ArgType, ArgVal);
+    } else {
+      Frame.value(ArgType, ArgVal);
+    }
+    CurrentOffset += DL.getTypeAllocSize(ArgType).getFixedValue();
+  }
+
+  if (Frame.empty()) {
+    // Not passing any arguments, hopefully va_arg won't try to read any
+    // Creating a single byte frame containing nothing to point the va_list
+    // instance as that is less special-casey in the compiler and probably
+    // easier to interpret in a debugger.
+    Frame.padding(Ctx, 1);
+  }
+
+  Function *CBF = CB->getParent()->getParent();
+
+  StructType *VarargsTy = Frame.asStruct(Ctx, CBF->getName());
+
+  // Put the alloca to hold the variadic args in the entry basic block.
+  // The clumsy construction is to set a the alignment on the instance
+  Builder.SetInsertPointPastAllocas(CBF);
+
+  // The struct instance needs to be at least MaxFieldAlign for the alignment of
+  // the fields to be correct at runtime. Use the native stack alignment instead
+  // if that's greater as that tends to give better codegen.
+  Align AllocaAlign = MaxFieldAlign;
+  // Probe with an oversized alignment to discover whether the target declares
+  // a natural stack alignment at all (see TODO below).
+  if (DL.exceedsNaturalStackAlignment(Align(1024))) {
+    // TODO: DL.getStackAlignment could return a MaybeAlign instead of assert
+    AllocaAlign = std::max(AllocaAlign, DL.getStackAlignment());
+  }
+
+  AllocaInst *Alloced = Builder.Insert(
+      new AllocaInst(VarargsTy, DL.getAllocaAddrSpace(), nullptr, AllocaAlign),
+      "vararg_buffer");
+  Changed = true;
+  assert(Alloced->getAllocatedType() == VarargsTy);
+
+  // Initialise the fields in the struct
+  Builder.SetInsertPoint(CB);
+
+  Builder.CreateLifetimeStart(Alloced, sizeOfAlloca(Ctx, DL, Alloced));
+
+  Frame.initialiseStructAlloca(DL, Builder, Alloced);
+
+  unsigned NumArgs = FuncType->getNumParams();
+
+  SmallVector<Value *> Args;
+  Args.assign(CB->arg_begin(), CB->arg_begin() + NumArgs);
+
+  // Initialise a va_list pointing to that struct and pass it as the last
+  // argument
+  AllocaInst *VaList = nullptr;
+  {
+    if (!ABI.VAList->passedInSSARegister()) {
+      Type *VaListTy = ABI.VAList->vaListType(Ctx);
+      Builder.SetInsertPointPastAllocas(CBF);
+      VaList = Builder.CreateAlloca(VaListTy, nullptr, "va_list");
+      Builder.SetInsertPoint(CB);
+      Builder.CreateLifetimeStart(VaList, sizeOfAlloca(Ctx, DL, VaList));
+    }
+    Args.push_back(ABI.VAList->initializeVAList(Ctx, Builder, VaList, Alloced));
+  }
+
+  // Attributes excluding any on the vararg arguments
+  AttributeList PAL = CB->getAttributes();
+  if (!PAL.isEmpty()) {
+    SmallVector<AttributeSet, 8> ArgAttrs;
+    for (unsigned ArgNo = 0; ArgNo < NumArgs; ArgNo++)
+      ArgAttrs.push_back(PAL.getParamAttrs(ArgNo));
+    PAL =
+        AttributeList::get(Ctx, PAL.getFnAttrs(), PAL.getRetAttrs(), ArgAttrs);
+  }
+
+  SmallVector<OperandBundleDef, 1> OpBundles;
+  CB->getOperandBundlesAsDefs(OpBundles);
+
+  CallBase *NewCB = nullptr;
+  // TODO, other instructions? Haven't managed to write variadic inline asm yet
+  if (CallInst *CI = dyn_cast<CallInst>(CB)) {
+
+    Value *Dst = NF ? NF : CI->getCalledOperand();
+    FunctionType *NFTy = inlinableVariadicFunctionType(M, VarargFunctionType);
+
+    NewCB = CallInst::Create(NFTy, Dst, Args, OpBundles, "", CI);
+
+    CallInst::TailCallKind TCK = CI->getTailCallKind();
+    assert(TCK != CallInst::TCK_MustTail); // guarded at prologue
+
+    // It doesn't get to be a tail call any more
+    // might want to guard this with arch, x64 and aarch64 document that
+    // varargs can't be tail called anyway
+    // Not totally convinced this is necessary but dead store elimination
+    // will discard the stores to the Alloca and pass uninitialised memory along
+    // instead when the function is marked tailcall
+    if (TCK == CallInst::TCK_Tail) {
+      TCK = CallInst::TCK_None;
+    }
+    CI->setTailCallKind(TCK);
+
+  } else if (InvokeInst *II = dyn_cast<InvokeInst>(CB)) {
+    assert(NF);
+    // NOTE(review): the lifetime.end calls emitted below land at the current
+    // insert point, i.e. after this new invoke terminator and before the old
+    // (soon erased) CB — confirm this cannot leave instructions trailing a
+    // terminator in the invoke case.
+    NewCB = InvokeInst::Create(NF, II->getNormalDest(), II->getUnwindDest(),
+                               Args, OpBundles, "", CB);
+  } else {
+    // Reached only for call-like instructions that are neither CallInst nor
+    // InvokeInst (e.g. callbr); the previous message wrongly named CallInst.
+    report_fatal_error("Unimplemented variadic lowering for unknown CallBase");
+  }
+
+  if (VaList)
+    Builder.CreateLifetimeEnd(VaList, sizeOfAlloca(Ctx, DL, VaList));
+
+  Builder.CreateLifetimeEnd(Alloced, sizeOfAlloca(Ctx, DL, Alloced));
+
+  NewCB->setAttributes(PAL);
+  NewCB->takeName(CB);
+  NewCB->setCallingConv(CB->getCallingConv());
+
+  NewCB->setDebugLoc(DebugLoc());
+
+  // I think this is upsetting the debug handling (DISubprogram attached to more
+  // than one function) Need to move metadata, not copy it?
+  NewCB->copyMetadata(*CB, {LLVMContext::MD_prof, LLVMContext::MD_dbg});
+
+  CB->replaceAllUsesWith(NewCB);
+  CB->eraseFromParent();
+  return Changed;
+}
+
+// Expand llvm.va_start inside a function that has already been rewritten to
+// take a trailing va_list parameter: the intrinsic becomes a copy of that
+// parameter into the storage va_start points at. Returns true on change.
+bool ExpandVariadics::expandVAIntrinsicCall(IRBuilder<> &Builder,
+                                            const DataLayout &DL,
+                                            VAStartInst *Inst) {
+  Function *ContainingFunction = Inst->getFunction();
+  // Still genuinely variadic: leave the intrinsic for the function-level
+  // rewrite to handle.
+  if (ContainingFunction->isVarArg())
+    return false;
+
+  // The last argument is a vaListParameterType
+  Argument *PassedVaList =
+      ContainingFunction->getArg(ContainingFunction->arg_size() - 1);
+
+  // va_start takes a pointer to a va_list, e.g. one on the stack
+  Value *VaStartArg = Inst->getArgList();
+
+  Builder.SetInsertPoint(Inst);
+  if (ABI.VAList->passedInSSARegister()) {
+    // The va_list fits in a register; store it directly into the slot.
+    Builder.CreateStore(PassedVaList, VaStartArg);
+  } else {
+    // src and dst are both pointers
+    memcpyVAListPointers(DL, Builder, VaStartArg, PassedVaList);
+  }
+
+  Inst->eraseFromParent();
+  return true;
+}
+
+// Expand llvm.va_end by deleting it; always reports a change.
+bool ExpandVariadics::expandVAIntrinsicCall(IRBuilder<> &, const DataLayout &,
+                                            VAEndInst *Inst) {
+  // A no-op on all the architectures implemented so far
+  Inst->eraseFromParent();
+  return true;
+}
+
+// Expand llvm.va_copy into a memcpy between the two va_list slots; always
+// reports a change.
+bool ExpandVariadics::expandVAIntrinsicCall(IRBuilder<> &Builder,
+                                            const DataLayout &DL,
+                                            VACopyInst *Inst) {
+  Builder.SetInsertPoint(Inst);
+  memcpyVAListPointers(DL, Builder, Inst->getDest(), Inst->getSrc());
+  Inst->eraseFromParent();
+  return true;
+}
+
+} // namespace
+
+char ExpandVariadics::ID = 0;
+
+INITIALIZE_PASS(ExpandVariadics, DEBUG_TYPE, "Expand variadic functions", false,
+                false)
+
+// Legacy pass-manager factory.
+ModulePass *llvm::createExpandVariadicsPass(ExpandVariadicsMode Mode) {
+  return new ExpandVariadics(Mode);
+}
+
+// New pass-manager entry point; conservatively invalidates all analyses when
+// any rewrite happened.
+PreservedAnalyses ExpandVariadicsPass::run(Module &M, ModuleAnalysisManager &) {
+  return ExpandVariadics(ConstructedMode).runOnModule(M)
+             ? PreservedAnalyses::none()
+             : PreservedAnalyses::all();
+}
+
+// Pipeline constructor: the pass is disabled at O0, set to optimize otherwise.
+ExpandVariadicsPass::ExpandVariadicsPass(OptimizationLevel Level)
+    : ExpandVariadicsPass(Level == OptimizationLevel::O0
+                              ? ExpandVariadicsMode::disable
+                              : ExpandVariadicsMode::optimize) {}
+
+ExpandVariadicsPass::ExpandVariadicsPass(ExpandVariadicsMode Mode)
+    : ConstructedMode(Mode) {}
diff --git a/llvm/test/CodeGen/AMDGPU/expand-variadic-call.ll b/llvm/test/CodeGen/AMDGPU/expand-variadic-call.ll
new file mode 100644
index 00000000000000..b270bb5bfb74b0
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/expand-variadic-call.ll
@@ -0,0 +1,499 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: -p --function-signature
+; RUN: opt -S --passes=expand-variadics --expand-variadics-override=lowering < %s | FileCheck %s
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"
+target triple = "amdgcn-amd-amdhsa"
+
+; Check the variables are lowered to the locations this target expects
+
+; The struct types below describe the call frames generated for each test
+; CHECK: %single_i32.vararg = type <{ i32 }>
+; CHECK: %single_double.vararg = type <{ double }>
+; CHECK: %single_v4f32.vararg = type <{ <4 x float> }>
+; CHECK: %single_v8f32.vararg = type <{ <8 x float> }>
+; CHECK: %single_v16f32.vararg = type <{ <16 x float> }>
+; CHECK: %single_v32f32.vararg = type <{ <32 x float> }>
+; CHECK: %i32_double.vararg = type <{ i32, [4 x i8], double }>
+; CHECK: %double_i32.vararg = type <{ double, i32 }>
+; CHECK: %i32_libcS.vararg = type <{ i32, i8, [1 x i8], i16, i32, [4 x i8], i64, float, [4 x i8], double }>
+; CHECK: %libcS_i32.vararg = type <{ i8, [1 x i8], i16, i32, i64, float, [4 x i8], double, i32 }>
+; CHECK: %i32_v4f32.vararg = type <{ i32, [12 x i8], <4 x float> }>
+; CHECK: %v4f32_i32.vararg = type <{ <4 x float>, i32 }>
+; CHECK: %i32_v8f32.vararg = type <{ i32, [28 x i8], <8 x float> }>
+; CHECK: %v8f32_i32.vararg = type <{ <8 x float>, i32 }>
+; CHECK: %i32_v16f32.vararg = type <{ i32, [60 x i8], <16 x float> }>
+; CHECK: %v16f32_i32.vararg = type <{ <16 x float>, i32 }>
+; CHECK: %i32_v32f32.vararg = type <{ i32, [124 x i8], <32 x float> }>
+; CHECK: %v32f32_i32.vararg = type <{ <32 x float>, i32 }>
+; CHECK: %indirect_single_i32.vararg = type <{ i32 }>
+
+ at vararg_ptr = hidden addrspace(1) global ptr @vararg, align 8
+
+; va_copy is lowered to an inline memcpy of the 8-byte va_list slot.
+define hidden void @copy(ptr noundef %va) {
+; CHECK-LABEL: define {{[^@]+}}@copy(ptr noundef %va) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %va.addr = alloca ptr, align 8, addrspace(5)
+; CHECK-NEXT:    %cp = alloca ptr, align 8, addrspace(5)
+; CHECK-NEXT:    %va.addr.ascast = addrspacecast ptr addrspace(5) %va.addr to ptr
+; CHECK-NEXT:    %cp.ascast = addrspacecast ptr addrspace(5) %cp to ptr
+; CHECK-NEXT:    store ptr %va, ptr addrspace(5) %va.addr, align 8, !tbaa !3
+; CHECK-NEXT:    call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) %cp) #2
+; CHECK-NEXT:    call void @llvm.memcpy.inline.p0.p0.i32(ptr %cp.ascast, ptr %va.addr.ascast, i32 8, i1 false)
+; CHECK-NEXT:    %0 = load ptr, ptr addrspace(5) %cp, align 8, !tbaa !3
+; CHECK-NEXT:    call void @valist(ptr noundef %0) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) %cp) #2
+; CHECK-NEXT:    ret void
+;
+entry:
+  %va.addr = alloca ptr, align 8, addrspace(5)
+  %cp = alloca ptr, align 8, addrspace(5)
+  %va.addr.ascast = addrspacecast ptr addrspace(5) %va.addr to ptr
+  %cp.ascast = addrspacecast ptr addrspace(5) %cp to ptr
+  store ptr %va, ptr addrspace(5) %va.addr, align 8, !tbaa !4
+  call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) %cp) #2
+  call void @llvm.va_copy.p0(ptr %cp.ascast, ptr nonnull %va.addr.ascast)
+  %0 = load ptr, ptr addrspace(5) %cp, align 8, !tbaa !4
+  call void @valist(ptr noundef %0) #2
+  call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) %cp) #2
+  ret void
+}
+
+declare void @llvm.lifetime.start.p5(i64 immarg, ptr addrspace(5) nocapture) #0
+
+declare void @llvm.va_copy.p0(ptr, ptr) #1
+
+declare hidden void @valist(ptr noundef)
+
+declare void @llvm.lifetime.end.p5(i64 immarg, ptr addrspace(5) nocapture) #0
+
+; va_start in a variadic definition becomes a store of the incoming
+; trailing varargs pointer; va_end disappears entirely.
+define hidden void @start_once(...) {
+; CHECK-LABEL: define {{[^@]+}}@start_once(ptr addrspace(5) noalias %varargs) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %s = alloca ptr, align 8, addrspace(5)
+; CHECK-NEXT:    %s.ascast = addrspacecast ptr addrspace(5) %s to ptr
+; CHECK-NEXT:    call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) %s) #2
+; CHECK-NEXT:    store ptr addrspace(5) %varargs, ptr %s.ascast, align 4
+; CHECK-NEXT:    %0 = load ptr, ptr addrspace(5) %s, align 8, !tbaa !3
+; CHECK-NEXT:    call void @valist(ptr noundef %0) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) %s) #2
+; CHECK-NEXT:    ret void
+;
+entry:
+  %s = alloca ptr, align 8, addrspace(5)
+  %s.ascast = addrspacecast ptr addrspace(5) %s to ptr
+  call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) %s) #2
+  call void @llvm.va_start.p0(ptr %s.ascast)
+  %0 = load ptr, ptr addrspace(5) %s, align 8, !tbaa !4
+  call void @valist(ptr noundef %0) #2
+  call void @llvm.va_end.p0(ptr %s.ascast)
+  call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) %s) #2
+  ret void
+}
+
+declare void @llvm.va_start.p0(ptr) #1
+
+declare void @llvm.va_end.p0(ptr) #1
+
+; Each va_start stores the same incoming varargs pointer, so starting twice
+; is two independent stores; both va_ends vanish.
+define hidden void @start_twice(...) {
+; CHECK-LABEL: define {{[^@]+}}@start_twice(ptr addrspace(5) noalias %varargs) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %s0 = alloca ptr, align 8, addrspace(5)
+; CHECK-NEXT:    %s1 = alloca ptr, align 8, addrspace(5)
+; CHECK-NEXT:    %s0.ascast = addrspacecast ptr addrspace(5) %s0 to ptr
+; CHECK-NEXT:    %s1.ascast = addrspacecast ptr addrspace(5) %s1 to ptr
+; CHECK-NEXT:    call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) %s0) #2
+; CHECK-NEXT:    call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) %s1) #2
+; CHECK-NEXT:    store ptr addrspace(5) %varargs, ptr %s0.ascast, align 4
+; CHECK-NEXT:    %0 = load ptr, ptr addrspace(5) %s0, align 8, !tbaa !3
+; CHECK-NEXT:    call void @valist(ptr noundef %0) #2
+; CHECK-NEXT:    store ptr addrspace(5) %varargs, ptr %s1.ascast, align 4
+; CHECK-NEXT:    %1 = load ptr, ptr addrspace(5) %s1, align 8, !tbaa !3
+; CHECK-NEXT:    call void @valist(ptr noundef %1) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) %s1) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) %s0) #2
+; CHECK-NEXT:    ret void
+;
+entry:
+  %s0 = alloca ptr, align 8, addrspace(5)
+  %s1 = alloca ptr, align 8, addrspace(5)
+  %s0.ascast = addrspacecast ptr addrspace(5) %s0 to ptr
+  %s1.ascast = addrspacecast ptr addrspace(5) %s1 to ptr
+  call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) %s0) #2
+  call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) %s1) #2
+  call void @llvm.va_start.p0(ptr %s0.ascast)
+  %0 = load ptr, ptr addrspace(5) %s0, align 8, !tbaa !4
+  call void @valist(ptr noundef %0) #2
+  call void @llvm.va_end.p0(ptr %s0.ascast)
+  call void @llvm.va_start.p0(ptr %s1.ascast)
+  %1 = load ptr, ptr addrspace(5) %s1, align 8, !tbaa !4
+  call void @valist(ptr noundef %1) #2
+  call void @llvm.va_end.p0(ptr %s1.ascast)
+  call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) %s1) #2
+  call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) %s0) #2
+  ret void
+}
+
+; A single i32 vararg occupies a 4-byte frame with 4-byte alignment.
+define hidden void @single_i32(i32 noundef %x) {
+; CHECK-LABEL: define {{[^@]+}}@single_i32(i32 noundef %x) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %single_i32.vararg, align 4, addrspace(5)
+; CHECK-NEXT:    call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %single_i32.vararg, ptr addrspace(5) %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store i32 %x, ptr addrspace(5) %0, align 4
+; CHECK-NEXT:    call void @vararg(ptr addrspace(5) %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(i32 noundef %x) #2
+  ret void
+}
+
+declare hidden void @vararg(...)
+
+; Single over-aligned arguments: the frame alloca takes the alignment of the
+; argument type (8 for double, 16/32/64/128 for the vector widths below).
+define hidden void @single_double(double noundef %x) {
+; CHECK-LABEL: define {{[^@]+}}@single_double(double noundef %x) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %single_double.vararg, align 8, addrspace(5)
+; CHECK-NEXT:    call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %single_double.vararg, ptr addrspace(5) %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store double %x, ptr addrspace(5) %0, align 8
+; CHECK-NEXT:    call void @vararg(ptr addrspace(5) %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(double noundef %x) #2
+  ret void
+}
+
+define hidden void @single_v4f32(<4 x float> noundef %x) {
+; CHECK-LABEL: define {{[^@]+}}@single_v4f32(<4 x float> noundef %x) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %single_v4f32.vararg, align 16, addrspace(5)
+; CHECK-NEXT:    call void @llvm.lifetime.start.p5(i64 16, ptr addrspace(5) %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %single_v4f32.vararg, ptr addrspace(5) %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store <4 x float> %x, ptr addrspace(5) %0, align 16
+; CHECK-NEXT:    call void @vararg(ptr addrspace(5) %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p5(i64 16, ptr addrspace(5) %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(<4 x float> noundef %x) #2
+  ret void
+}
+
+define hidden void @single_v8f32(<8 x float> noundef %x) {
+; CHECK-LABEL: define {{[^@]+}}@single_v8f32(<8 x float> noundef %x) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %single_v8f32.vararg, align 32, addrspace(5)
+; CHECK-NEXT:    call void @llvm.lifetime.start.p5(i64 32, ptr addrspace(5) %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %single_v8f32.vararg, ptr addrspace(5) %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store <8 x float> %x, ptr addrspace(5) %0, align 32
+; CHECK-NEXT:    call void @vararg(ptr addrspace(5) %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p5(i64 32, ptr addrspace(5) %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(<8 x float> noundef %x) #2
+  ret void
+}
+
+define hidden void @single_v16f32(<16 x float> noundef %x) {
+; CHECK-LABEL: define {{[^@]+}}@single_v16f32(<16 x float> noundef %x) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %single_v16f32.vararg, align 64, addrspace(5)
+; CHECK-NEXT:    call void @llvm.lifetime.start.p5(i64 64, ptr addrspace(5) %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %single_v16f32.vararg, ptr addrspace(5) %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store <16 x float> %x, ptr addrspace(5) %0, align 64
+; CHECK-NEXT:    call void @vararg(ptr addrspace(5) %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p5(i64 64, ptr addrspace(5) %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(<16 x float> noundef %x) #2
+  ret void
+}
+
+define hidden void @single_v32f32(<32 x float> noundef %x) {
+; CHECK-LABEL: define {{[^@]+}}@single_v32f32(<32 x float> noundef %x) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %single_v32f32.vararg, align 128, addrspace(5)
+; CHECK-NEXT:    call void @llvm.lifetime.start.p5(i64 128, ptr addrspace(5) %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %single_v32f32.vararg, ptr addrspace(5) %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store <32 x float> %x, ptr addrspace(5) %0, align 128
+; CHECK-NEXT:    call void @vararg(ptr addrspace(5) %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p5(i64 128, ptr addrspace(5) %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(<32 x float> noundef %x) #2
+  ret void
+}
+
+; Argument-order sensitivity: i32 before double inserts 4 bytes of padding
+; (frame type %i32_double.vararg above); double before i32 needs none.
+define hidden void @i32_double(i32 noundef %x, double noundef %y) {
+; CHECK-LABEL: define {{[^@]+}}@i32_double(i32 noundef %x, double noundef %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %i32_double.vararg, align 8, addrspace(5)
+; CHECK-NEXT:    call void @llvm.lifetime.start.p5(i64 16, ptr addrspace(5) %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %i32_double.vararg, ptr addrspace(5) %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store i32 %x, ptr addrspace(5) %0, align 4
+; CHECK-NEXT:    %1 = getelementptr inbounds %i32_double.vararg, ptr addrspace(5) %vararg_buffer, i32 0, i32 2
+; CHECK-NEXT:    store double %y, ptr addrspace(5) %1, align 8
+; CHECK-NEXT:    call void @vararg(ptr addrspace(5) %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p5(i64 16, ptr addrspace(5) %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(i32 noundef %x, double noundef %y) #2
+  ret void
+}
+
+define hidden void @double_i32(double noundef %x, i32 noundef %y) {
+; CHECK-LABEL: define {{[^@]+}}@double_i32(double noundef %x, i32 noundef %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %double_i32.vararg, align 8, addrspace(5)
+; CHECK-NEXT:    call void @llvm.lifetime.start.p5(i64 12, ptr addrspace(5) %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %double_i32.vararg, ptr addrspace(5) %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store double %x, ptr addrspace(5) %0, align 8
+; CHECK-NEXT:    %1 = getelementptr inbounds %double_i32.vararg, ptr addrspace(5) %vararg_buffer, i32 0, i32 1
+; CHECK-NEXT:    store i32 %y, ptr addrspace(5) %1, align 4
+; CHECK-NEXT:    call void @vararg(ptr addrspace(5) %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p5(i64 12, ptr addrspace(5) %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(double noundef %x, i32 noundef %y) #2
+  ret void
+}
+
+; Coerced struct members are stored individually at their natural offsets,
+; with explicit padding fields where the running offset is misaligned.
+define hidden void @i32_libcS(i32 noundef %x, i8 %y.coerce0, i16 %y.coerce1, i32 %y.coerce2, i64 %y.coerce3, float %y.coerce4, double %y.coerce5) {
+; CHECK-LABEL: define {{[^@]+}}@i32_libcS(i32 noundef %x, i8 %y.coerce0, i16 %y.coerce1, i32 %y.coerce2, i64 %y.coerce3, float %y.coerce4, double %y.coerce5) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %i32_libcS.vararg, align 8, addrspace(5)
+; CHECK-NEXT:    call void @llvm.lifetime.start.p5(i64 40, ptr addrspace(5) %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %i32_libcS.vararg, ptr addrspace(5) %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store i32 %x, ptr addrspace(5) %0, align 4
+; CHECK-NEXT:    %1 = getelementptr inbounds %i32_libcS.vararg, ptr addrspace(5) %vararg_buffer, i32 0, i32 1
+; CHECK-NEXT:    store i8 %y.coerce0, ptr addrspace(5) %1, align 1
+; CHECK-NEXT:    %2 = getelementptr inbounds %i32_libcS.vararg, ptr addrspace(5) %vararg_buffer, i32 0, i32 3
+; CHECK-NEXT:    store i16 %y.coerce1, ptr addrspace(5) %2, align 2
+; CHECK-NEXT:    %3 = getelementptr inbounds %i32_libcS.vararg, ptr addrspace(5) %vararg_buffer, i32 0, i32 4
+; CHECK-NEXT:    store i32 %y.coerce2, ptr addrspace(5) %3, align 4
+; CHECK-NEXT:    %4 = getelementptr inbounds %i32_libcS.vararg, ptr addrspace(5) %vararg_buffer, i32 0, i32 6
+; CHECK-NEXT:    store i64 %y.coerce3, ptr addrspace(5) %4, align 8
+; CHECK-NEXT:    %5 = getelementptr inbounds %i32_libcS.vararg, ptr addrspace(5) %vararg_buffer, i32 0, i32 7
+; CHECK-NEXT:    store float %y.coerce4, ptr addrspace(5) %5, align 4
+; CHECK-NEXT:    %6 = getelementptr inbounds %i32_libcS.vararg, ptr addrspace(5) %vararg_buffer, i32 0, i32 9
+; CHECK-NEXT:    store double %y.coerce5, ptr addrspace(5) %6, align 8
+; CHECK-NEXT:    call void @vararg(ptr addrspace(5) %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p5(i64 40, ptr addrspace(5) %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(i32 noundef %x, i8 %y.coerce0, i16 %y.coerce1, i32 %y.coerce2, i64 %y.coerce3, float %y.coerce4, double %y.coerce5) #2
+  ret void
+}
+
+define hidden void @libcS_i32(i8 %x.coerce0, i16 %x.coerce1, i32 %x.coerce2, i64 %x.coerce3, float %x.coerce4, double %x.coerce5, i32 noundef %y) {
+; CHECK-LABEL: define {{[^@]+}}@libcS_i32(i8 %x.coerce0, i16 %x.coerce1, i32 %x.coerce2, i64 %x.coerce3, float %x.coerce4, double %x.coerce5, i32 noundef %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %libcS_i32.vararg, align 8, addrspace(5)
+; CHECK-NEXT:    call void @llvm.lifetime.start.p5(i64 36, ptr addrspace(5) %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %libcS_i32.vararg, ptr addrspace(5) %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store i8 %x.coerce0, ptr addrspace(5) %0, align 1
+; CHECK-NEXT:    %1 = getelementptr inbounds %libcS_i32.vararg, ptr addrspace(5) %vararg_buffer, i32 0, i32 2
+; CHECK-NEXT:    store i16 %x.coerce1, ptr addrspace(5) %1, align 2
+; CHECK-NEXT:    %2 = getelementptr inbounds %libcS_i32.vararg, ptr addrspace(5) %vararg_buffer, i32 0, i32 3
+; CHECK-NEXT:    store i32 %x.coerce2, ptr addrspace(5) %2, align 4
+; CHECK-NEXT:    %3 = getelementptr inbounds %libcS_i32.vararg, ptr addrspace(5) %vararg_buffer, i32 0, i32 4
+; CHECK-NEXT:    store i64 %x.coerce3, ptr addrspace(5) %3, align 8
+; CHECK-NEXT:    %4 = getelementptr inbounds %libcS_i32.vararg, ptr addrspace(5) %vararg_buffer, i32 0, i32 5
+; CHECK-NEXT:    store float %x.coerce4, ptr addrspace(5) %4, align 4
+; CHECK-NEXT:    %5 = getelementptr inbounds %libcS_i32.vararg, ptr addrspace(5) %vararg_buffer, i32 0, i32 7
+; CHECK-NEXT:    store double %x.coerce5, ptr addrspace(5) %5, align 8
+; CHECK-NEXT:    %6 = getelementptr inbounds %libcS_i32.vararg, ptr addrspace(5) %vararg_buffer, i32 0, i32 8
+; CHECK-NEXT:    store i32 %y, ptr addrspace(5) %6, align 4
+; CHECK-NEXT:    call void @vararg(ptr addrspace(5) %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p5(i64 36, ptr addrspace(5) %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(i8 %x.coerce0, i16 %x.coerce1, i32 %x.coerce2, i64 %x.coerce3, float %x.coerce4, double %x.coerce5, i32 noundef %y) #2
+  ret void
+}
+
+; Padding before a vector grows with the vector's alignment (12 bytes before
+; <4 x float>, 28 before <8 x float>); none is needed after the vector.
+define hidden void @i32_v4f32(i32 noundef %x, <4 x float> noundef %y) {
+; CHECK-LABEL: define {{[^@]+}}@i32_v4f32(i32 noundef %x, <4 x float> noundef %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %i32_v4f32.vararg, align 16, addrspace(5)
+; CHECK-NEXT:    call void @llvm.lifetime.start.p5(i64 32, ptr addrspace(5) %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %i32_v4f32.vararg, ptr addrspace(5) %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store i32 %x, ptr addrspace(5) %0, align 4
+; CHECK-NEXT:    %1 = getelementptr inbounds %i32_v4f32.vararg, ptr addrspace(5) %vararg_buffer, i32 0, i32 2
+; CHECK-NEXT:    store <4 x float> %y, ptr addrspace(5) %1, align 16
+; CHECK-NEXT:    call void @vararg(ptr addrspace(5) %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p5(i64 32, ptr addrspace(5) %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(i32 noundef %x, <4 x float> noundef %y) #2
+  ret void
+}
+
+define hidden void @v4f32_i32(<4 x float> noundef %x, i32 noundef %y) {
+; CHECK-LABEL: define {{[^@]+}}@v4f32_i32(<4 x float> noundef %x, i32 noundef %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %v4f32_i32.vararg, align 16, addrspace(5)
+; CHECK-NEXT:    call void @llvm.lifetime.start.p5(i64 20, ptr addrspace(5) %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %v4f32_i32.vararg, ptr addrspace(5) %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store <4 x float> %x, ptr addrspace(5) %0, align 16
+; CHECK-NEXT:    %1 = getelementptr inbounds %v4f32_i32.vararg, ptr addrspace(5) %vararg_buffer, i32 0, i32 1
+; CHECK-NEXT:    store i32 %y, ptr addrspace(5) %1, align 4
+; CHECK-NEXT:    call void @vararg(ptr addrspace(5) %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p5(i64 20, ptr addrspace(5) %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(<4 x float> noundef %x, i32 noundef %y) #2
+  ret void
+}
+
+define hidden void @i32_v8f32(i32 noundef %x, <8 x float> noundef %y) {
+; CHECK-LABEL: define {{[^@]+}}@i32_v8f32(i32 noundef %x, <8 x float> noundef %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %i32_v8f32.vararg, align 32, addrspace(5)
+; CHECK-NEXT:    call void @llvm.lifetime.start.p5(i64 64, ptr addrspace(5) %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %i32_v8f32.vararg, ptr addrspace(5) %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store i32 %x, ptr addrspace(5) %0, align 4
+; CHECK-NEXT:    %1 = getelementptr inbounds %i32_v8f32.vararg, ptr addrspace(5) %vararg_buffer, i32 0, i32 2
+; CHECK-NEXT:    store <8 x float> %y, ptr addrspace(5) %1, align 32
+; CHECK-NEXT:    call void @vararg(ptr addrspace(5) %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p5(i64 64, ptr addrspace(5) %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(i32 noundef %x, <8 x float> noundef %y) #2
+  ret void
+}
+
+define hidden void @v8f32_i32(<8 x float> noundef %x, i32 noundef %y) {
+; CHECK-LABEL: define {{[^@]+}}@v8f32_i32(<8 x float> noundef %x, i32 noundef %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %v8f32_i32.vararg, align 32, addrspace(5)
+; CHECK-NEXT:    call void @llvm.lifetime.start.p5(i64 36, ptr addrspace(5) %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %v8f32_i32.vararg, ptr addrspace(5) %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store <8 x float> %x, ptr addrspace(5) %0, align 32
+; CHECK-NEXT:    %1 = getelementptr inbounds %v8f32_i32.vararg, ptr addrspace(5) %vararg_buffer, i32 0, i32 1
+; CHECK-NEXT:    store i32 %y, ptr addrspace(5) %1, align 4
+; CHECK-NEXT:    call void @vararg(ptr addrspace(5) %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p5(i64 36, ptr addrspace(5) %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(<8 x float> noundef %x, i32 noundef %y) #2
+  ret void
+}
+
+define hidden void @i32_v16f32(i32 noundef %x, <16 x float> noundef %y) {
+; CHECK-LABEL: define {{[^@]+}}@i32_v16f32(i32 noundef %x, <16 x float> noundef %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %i32_v16f32.vararg, align 64, addrspace(5)
+; CHECK-NEXT:    call void @llvm.lifetime.start.p5(i64 128, ptr addrspace(5) %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %i32_v16f32.vararg, ptr addrspace(5) %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store i32 %x, ptr addrspace(5) %0, align 4
+; CHECK-NEXT:    %1 = getelementptr inbounds %i32_v16f32.vararg, ptr addrspace(5) %vararg_buffer, i32 0, i32 2
+; CHECK-NEXT:    store <16 x float> %y, ptr addrspace(5) %1, align 64
+; CHECK-NEXT:    call void @vararg(ptr addrspace(5) %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p5(i64 128, ptr addrspace(5) %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(i32 noundef %x, <16 x float> noundef %y) #2
+  ret void
+}
+
+define hidden void @v16f32_i32(<16 x float> noundef %x, i32 noundef %y) {
+; CHECK-LABEL: define {{[^@]+}}@v16f32_i32(<16 x float> noundef %x, i32 noundef %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %v16f32_i32.vararg, align 64, addrspace(5)
+; CHECK-NEXT:    call void @llvm.lifetime.start.p5(i64 68, ptr addrspace(5) %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %v16f32_i32.vararg, ptr addrspace(5) %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store <16 x float> %x, ptr addrspace(5) %0, align 64
+; CHECK-NEXT:    %1 = getelementptr inbounds %v16f32_i32.vararg, ptr addrspace(5) %vararg_buffer, i32 0, i32 1
+; CHECK-NEXT:    store i32 %y, ptr addrspace(5) %1, align 4
+; CHECK-NEXT:    call void @vararg(ptr addrspace(5) %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p5(i64 68, ptr addrspace(5) %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(<16 x float> noundef %x, i32 noundef %y) #2
+  ret void
+}
+
+define hidden void @i32_v32f32(i32 noundef %x, <32 x float> noundef %y) {
+; CHECK-LABEL: define {{[^@]+}}@i32_v32f32(i32 noundef %x, <32 x float> noundef %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %i32_v32f32.vararg, align 128, addrspace(5)
+; CHECK-NEXT:    call void @llvm.lifetime.start.p5(i64 256, ptr addrspace(5) %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %i32_v32f32.vararg, ptr addrspace(5) %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store i32 %x, ptr addrspace(5) %0, align 4
+; CHECK-NEXT:    %1 = getelementptr inbounds %i32_v32f32.vararg, ptr addrspace(5) %vararg_buffer, i32 0, i32 2
+; CHECK-NEXT:    store <32 x float> %y, ptr addrspace(5) %1, align 128
+; CHECK-NEXT:    call void @vararg(ptr addrspace(5) %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p5(i64 256, ptr addrspace(5) %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(i32 noundef %x, <32 x float> noundef %y) #2
+  ret void
+}
+
+define hidden void @v32f32_i32(<32 x float> noundef %x, i32 noundef %y) {
+; CHECK-LABEL: define {{[^@]+}}@v32f32_i32(<32 x float> noundef %x, i32 noundef %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %v32f32_i32.vararg, align 128, addrspace(5)
+; CHECK-NEXT:    call void @llvm.lifetime.start.p5(i64 132, ptr addrspace(5) %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %v32f32_i32.vararg, ptr addrspace(5) %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store <32 x float> %x, ptr addrspace(5) %0, align 128
+; CHECK-NEXT:    %1 = getelementptr inbounds %v32f32_i32.vararg, ptr addrspace(5) %vararg_buffer, i32 0, i32 1
+; CHECK-NEXT:    store i32 %y, ptr addrspace(5) %1, align 4
+; CHECK-NEXT:    call void @vararg(ptr addrspace(5) %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p5(i64 132, ptr addrspace(5) %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(<32 x float> noundef %x, i32 noundef %y) #2
+  ret void
+}
+
+define hidden void @indirect_single_i32(i32 noundef %x) {
+; CHECK-LABEL: define {{[^@]+}}@indirect_single_i32(i32 noundef %x) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %indirect_single_i32.vararg, align 4, addrspace(5)
+; CHECK-NEXT:    %0 = load volatile ptr, ptr addrspacecast (ptr addrspace(1) @vararg_ptr to ptr), align 8, !tbaa !3
+; CHECK-NEXT:    call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) %vararg_buffer)
+; CHECK-NEXT:    %1 = getelementptr inbounds %indirect_single_i32.vararg, ptr addrspace(5) %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store i32 %x, ptr addrspace(5) %1, align 4
+; CHECK-NEXT:    call void %0(ptr addrspace(5) %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  %0 = load volatile ptr, ptr addrspacecast (ptr addrspace(1) @vararg_ptr to ptr), align 8, !tbaa !4
+  tail call void (...) %0(i32 noundef %x) #2
+  ret void
+}
+
+attributes #0 = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) }
+attributes #1 = { nocallback nofree nosync nounwind willreturn }
+attributes #2 = { mustprogress }
+
+!llvm.module.flags = !{!0, !1, !2}
+
+!0 = !{i32 1, !"amdhsa_code_object_version", i32 500}
+!1 = !{i32 1, !"wchar_size", i32 4}
+!2 = !{i32 8, !"PIC Level", i32 2}
+!4 = !{!5, !5, i64 0}
+!5 = !{!"any pointer", !6, i64 0}
+!6 = !{!"omnipotent char", !7, i64 0}
+!7 = !{!"Simple C/C++ TBAA"}
diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
index 0ff5dd3680dfab..ce26565cae8cd1 100644
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
@@ -31,6 +31,7 @@
 ; GCN-O0-NEXT:    AMDGPU Remove Incompatible Functions
 ; GCN-O0-NEXT:    AMDGPU Printf lowering
 ; GCN-O0-NEXT:    Lower ctors and dtors for AMDGPU
+; GCN-O0-NEXT:    Expand variadic functions
 ; GCN-O0-NEXT:    AMDGPU Inline All Functions
 ; GCN-O0-NEXT:    Inliner for always_inline functions
 ; GCN-O0-NEXT:      FunctionPass Manager
@@ -177,6 +178,7 @@
 ; GCN-O1-NEXT:    AMDGPU Remove Incompatible Functions
 ; GCN-O1-NEXT:    AMDGPU Printf lowering
 ; GCN-O1-NEXT:    Lower ctors and dtors for AMDGPU
+; GCN-O1-NEXT:    Expand variadic functions
 ; GCN-O1-NEXT:    AMDGPU Inline All Functions
 ; GCN-O1-NEXT:    Inliner for always_inline functions
 ; GCN-O1-NEXT:      FunctionPass Manager
@@ -452,6 +454,7 @@
 ; GCN-O1-OPTS-NEXT:    AMDGPU Remove Incompatible Functions
 ; GCN-O1-OPTS-NEXT:    AMDGPU Printf lowering
 ; GCN-O1-OPTS-NEXT:    Lower ctors and dtors for AMDGPU
+; GCN-O1-OPTS-NEXT:    Expand variadic functions
 ; GCN-O1-OPTS-NEXT:    AMDGPU Inline All Functions
 ; GCN-O1-OPTS-NEXT:    Inliner for always_inline functions
 ; GCN-O1-OPTS-NEXT:      FunctionPass Manager
@@ -757,6 +760,7 @@
 ; GCN-O2-NEXT:    Lower ctors and dtors for AMDGPU
 ; GCN-O2-NEXT:    FunctionPass Manager
 ; GCN-O2-NEXT:      AMDGPU Image Intrinsic Optimizer
+; GCN-O2-NEXT:    Expand variadic functions
 ; GCN-O2-NEXT:    AMDGPU Inline All Functions
 ; GCN-O2-NEXT:    Inliner for always_inline functions
 ; GCN-O2-NEXT:      FunctionPass Manager
@@ -1066,6 +1070,7 @@
 ; GCN-O3-NEXT:    Lower ctors and dtors for AMDGPU
 ; GCN-O3-NEXT:    FunctionPass Manager
 ; GCN-O3-NEXT:      AMDGPU Image Intrinsic Optimizer
+; GCN-O3-NEXT:    Expand variadic functions
 ; GCN-O3-NEXT:    AMDGPU Inline All Functions
 ; GCN-O3-NEXT:    Inliner for always_inline functions
 ; GCN-O3-NEXT:      FunctionPass Manager
diff --git a/llvm/test/CodeGen/AMDGPU/unsupported-calls.ll b/llvm/test/CodeGen/AMDGPU/unsupported-calls.ll
index 694f444b7747da..3cec8035fc8476 100644
--- a/llvm/test/CodeGen/AMDGPU/unsupported-calls.ll
+++ b/llvm/test/CodeGen/AMDGPU/unsupported-calls.ll
@@ -43,25 +43,6 @@ define i32 @test_tail_call(ptr addrspace(1) %out, ptr addrspace(1) %in) {
   ret i32 %c
 }
 
-declare void @external.varargs(i32, double, i64, ...)
-
-; GCN: error: <unknown>:0:0: in function test_call_varargs void (): unsupported call to variadic function external.varargs
-; R600: in function test_call_varargs{{.*}}: unsupported call to function external.varargs
-define void @test_call_varargs() {
-  call void (i32, double, i64, ...) @external.varargs(i32 42, double 1.0, i64 12, i8 3, i16 1, i32 4, float 1.0, double 2.0)
-  ret void
-}
-
-declare i32 @extern_variadic(...)
-
-; GCN: in function test_tail_call_bitcast_extern_variadic{{.*}}: unsupported required tail call to function extern_variadic
-; R600: in function test_tail_call_bitcast_extern_variadic{{.*}}: unsupported call to function extern_variadic
-define i32 @test_tail_call_bitcast_extern_variadic(<4 x float> %arg0, <4 x float> %arg1, i32 %arg2) {
-  %add = fadd <4 x float> %arg0, %arg1
-  %call = tail call i32 @extern_variadic(<4 x float> %add)
-  ret i32 %call
-}
-
 ; R600: in function test_c_call{{.*}}: unsupported call to function defined_function
 define amdgpu_ps i32 @test_c_call_from_shader() {
   %call = call i32 @defined_function(i32 0)
diff --git a/llvm/test/CodeGen/NVPTX/expand-variadic-call.ll b/llvm/test/CodeGen/NVPTX/expand-variadic-call.ll
new file mode 100644
index 00000000000000..0e444cb6b8bf6c
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/expand-variadic-call.ll
@@ -0,0 +1,468 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: -p --function-signature
+; RUN: opt -S --passes=expand-variadics --expand-variadics-override=lowering < %s | FileCheck %s
+target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
+target triple = "nvptx64-nvidia-cuda"
+
+; Check that the variadic arguments are lowered to the frame locations this target expects
+
+; The types show the call frames
+; CHECK: %single_i32.vararg = type <{ i32 }>
+; CHECK: %single_double.vararg = type <{ double }>
+; CHECK: %single_v4f32.vararg = type <{ <4 x float> }>
+; CHECK: %single_v8f32.vararg = type <{ <8 x float> }>
+; CHECK: %single_v16f32.vararg = type <{ <16 x float> }>
+; CHECK: %single_v32f32.vararg = type <{ <32 x float> }>
+; CHECK: %i32_double.vararg = type <{ i32, [4 x i8], double }>
+; CHECK: %double_i32.vararg = type <{ double, i32 }>
+; CHECK: %i32_v4f32.vararg = type <{ i32, [12 x i8], <4 x float> }>
+; CHECK: %v4f32_i32.vararg = type <{ <4 x float>, i32 }>
+; CHECK: %i32_v8f32.vararg = type <{ i32, [28 x i8], <8 x float> }>
+; CHECK: %v8f32_i32.vararg = type <{ <8 x float>, i32 }>
+; CHECK: %i32_v16f32.vararg = type <{ i32, [60 x i8], <16 x float> }>
+; CHECK: %v16f32_i32.vararg = type <{ <16 x float>, i32 }>
+; CHECK: %i32_v32f32.vararg = type <{ i32, [124 x i8], <32 x float> }>
+; CHECK: %v32f32_i32.vararg = type <{ <32 x float>, i32 }>
+; CHECK: %indirect_single_i32.vararg = type <{ i32 }>
+
+%struct.libcS = type { i8, i16, i32, i64, float, double }
+
+ at vararg_ptr = global ptr @vararg, align 8
+
+define void @copy(ptr noundef %va) {
+; CHECK-LABEL: define {{[^@]+}}@copy(ptr noundef %va) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %va.addr = alloca ptr, align 8
+; CHECK-NEXT:    %cp = alloca ptr, align 8
+; CHECK-NEXT:    store ptr %va, ptr %va.addr, align 8, !tbaa !2
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 8, ptr nonnull %cp) #2
+; CHECK-NEXT:    call void @llvm.memcpy.inline.p0.p0.i32(ptr %cp, ptr %va.addr, i32 8, i1 false)
+; CHECK-NEXT:    %0 = load ptr, ptr %cp, align 8, !tbaa !2
+; CHECK-NEXT:    call void @valist(ptr noundef %0) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr nonnull %cp) #2
+; CHECK-NEXT:    ret void
+;
+entry:
+  %va.addr = alloca ptr, align 8
+  %cp = alloca ptr, align 8
+  store ptr %va, ptr %va.addr, align 8, !tbaa !3
+  call void @llvm.lifetime.start.p0(i64 8, ptr nonnull %cp) #2
+  call void @llvm.va_copy.p0(ptr nonnull %cp, ptr nonnull %va.addr)
+  %0 = load ptr, ptr %cp, align 8, !tbaa !3
+  call void @valist(ptr noundef %0) #2
+  call void @llvm.lifetime.end.p0(i64 8, ptr nonnull %cp) #2
+  ret void
+}
+
+declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #0
+
+declare void @llvm.va_copy.p0(ptr, ptr) #1
+
+declare void @valist(ptr noundef)
+
+declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #0
+
+define void @start_once(...) {
+; CHECK-LABEL: define {{[^@]+}}@start_once(ptr noalias %varargs) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %s = alloca ptr, align 8
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 8, ptr nonnull %s) #2
+; CHECK-NEXT:    store ptr %varargs, ptr %s, align 8
+; CHECK-NEXT:    %0 = load ptr, ptr %s, align 8, !tbaa !2
+; CHECK-NEXT:    call void @valist(ptr noundef %0) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr nonnull %s) #2
+; CHECK-NEXT:    ret void
+;
+entry:
+  %s = alloca ptr, align 8
+  call void @llvm.lifetime.start.p0(i64 8, ptr nonnull %s) #2
+  call void @llvm.va_start.p0(ptr nonnull %s)
+  %0 = load ptr, ptr %s, align 8, !tbaa !3
+  call void @valist(ptr noundef %0) #2
+  call void @llvm.va_end.p0(ptr %s)
+  call void @llvm.lifetime.end.p0(i64 8, ptr nonnull %s) #2
+  ret void
+}
+
+declare void @llvm.va_start.p0(ptr) #1
+
+declare void @llvm.va_end.p0(ptr) #1
+
+define void @start_twice(...) {
+; CHECK-LABEL: define {{[^@]+}}@start_twice(ptr noalias %varargs) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %s0 = alloca ptr, align 8
+; CHECK-NEXT:    %s1 = alloca ptr, align 8
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 8, ptr nonnull %s0) #2
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 8, ptr nonnull %s1) #2
+; CHECK-NEXT:    store ptr %varargs, ptr %s0, align 8
+; CHECK-NEXT:    %0 = load ptr, ptr %s0, align 8, !tbaa !2
+; CHECK-NEXT:    call void @valist(ptr noundef %0) #2
+; CHECK-NEXT:    store ptr %varargs, ptr %s1, align 8
+; CHECK-NEXT:    %1 = load ptr, ptr %s1, align 8, !tbaa !2
+; CHECK-NEXT:    call void @valist(ptr noundef %1) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr nonnull %s1) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr nonnull %s0) #2
+; CHECK-NEXT:    ret void
+;
+entry:
+  %s0 = alloca ptr, align 8
+  %s1 = alloca ptr, align 8
+  call void @llvm.lifetime.start.p0(i64 8, ptr nonnull %s0) #2
+  call void @llvm.lifetime.start.p0(i64 8, ptr nonnull %s1) #2
+  call void @llvm.va_start.p0(ptr nonnull %s0)
+  %0 = load ptr, ptr %s0, align 8, !tbaa !3
+  call void @valist(ptr noundef %0) #2
+  call void @llvm.va_end.p0(ptr %s0)
+  call void @llvm.va_start.p0(ptr nonnull %s1)
+  %1 = load ptr, ptr %s1, align 8, !tbaa !3
+  call void @valist(ptr noundef %1) #2
+  call void @llvm.va_end.p0(ptr %s1)
+  call void @llvm.lifetime.end.p0(i64 8, ptr nonnull %s1) #2
+  call void @llvm.lifetime.end.p0(i64 8, ptr nonnull %s0) #2
+  ret void
+}
+
+define void @single_i32(i32 noundef %x) {
+; CHECK-LABEL: define {{[^@]+}}@single_i32(i32 noundef %x) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %single_i32.vararg, align 4
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %single_i32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store i32 %x, ptr %0, align 4
+; CHECK-NEXT:    call void @vararg(ptr %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 4, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(i32 noundef %x) #2
+  ret void
+}
+
+declare void @vararg(...)
+
+define void @single_double(double noundef %x) {
+; CHECK-LABEL: define {{[^@]+}}@single_double(double noundef %x) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %single_double.vararg, align 8
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 8, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %single_double.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store double %x, ptr %0, align 8
+; CHECK-NEXT:    call void @vararg(ptr %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(double noundef %x) #2
+  ret void
+}
+
+define void @single_v4f32(<4 x float> noundef %x) {
+; CHECK-LABEL: define {{[^@]+}}@single_v4f32(<4 x float> noundef %x) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %single_v4f32.vararg, align 16
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %single_v4f32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store <4 x float> %x, ptr %0, align 16
+; CHECK-NEXT:    call void @vararg(ptr %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(<4 x float> noundef %x) #2
+  ret void
+}
+
+define void @single_v8f32(<8 x float> noundef %x) {
+; CHECK-LABEL: define {{[^@]+}}@single_v8f32(<8 x float> noundef %x) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %single_v8f32.vararg, align 32
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %single_v8f32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store <8 x float> %x, ptr %0, align 32
+; CHECK-NEXT:    call void @vararg(ptr %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(<8 x float> noundef %x) #2
+  ret void
+}
+
+define void @single_v16f32(<16 x float> noundef %x) {
+; CHECK-LABEL: define {{[^@]+}}@single_v16f32(<16 x float> noundef %x) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %single_v16f32.vararg, align 64
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 64, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %single_v16f32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store <16 x float> %x, ptr %0, align 64
+; CHECK-NEXT:    call void @vararg(ptr %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 64, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(<16 x float> noundef %x) #2
+  ret void
+}
+
+define void @single_v32f32(<32 x float> noundef %x) {
+; CHECK-LABEL: define {{[^@]+}}@single_v32f32(<32 x float> noundef %x) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %single_v32f32.vararg, align 128
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 128, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %single_v32f32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store <32 x float> %x, ptr %0, align 128
+; CHECK-NEXT:    call void @vararg(ptr %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 128, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(<32 x float> noundef %x) #2
+  ret void
+}
+
+define void @i32_double(i32 noundef %x, double noundef %y) {
+; CHECK-LABEL: define {{[^@]+}}@i32_double(i32 noundef %x, double noundef %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %i32_double.vararg, align 8
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %i32_double.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store i32 %x, ptr %0, align 4
+; CHECK-NEXT:    %1 = getelementptr inbounds %i32_double.vararg, ptr %vararg_buffer, i32 0, i32 2
+; CHECK-NEXT:    store double %y, ptr %1, align 8
+; CHECK-NEXT:    call void @vararg(ptr %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(i32 noundef %x, double noundef %y) #2
+  ret void
+}
+
+define void @double_i32(double noundef %x, i32 noundef %y) {
+; CHECK-LABEL: define {{[^@]+}}@double_i32(double noundef %x, i32 noundef %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %double_i32.vararg, align 8
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 12, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %double_i32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store double %x, ptr %0, align 8
+; CHECK-NEXT:    %1 = getelementptr inbounds %double_i32.vararg, ptr %vararg_buffer, i32 0, i32 1
+; CHECK-NEXT:    store i32 %y, ptr %1, align 4
+; CHECK-NEXT:    call void @vararg(ptr %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 12, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(double noundef %x, i32 noundef %y) #2
+  ret void
+}
+
+define void @i32_libcS(i32 noundef %x, ptr noundef byval(%struct.libcS) align 8 %y) {
+; CHECK-LABEL: define {{[^@]+}}@i32_libcS(i32 noundef %x, ptr noundef byval(%struct.libcS) align 8 %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %i32_libcS.vararg, align 8
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 40, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %i32_libcS.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store i32 %x, ptr %0, align 4
+; CHECK-NEXT:    %1 = getelementptr inbounds %i32_libcS.vararg, ptr %vararg_buffer, i32 0, i32 2
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr %1, ptr %y, i64 32, i1 false)
+; CHECK-NEXT:    call void @vararg(ptr %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 40, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(i32 noundef %x, ptr noundef nonnull byval(%struct.libcS) align 8 %y) #2
+  ret void
+}
+
+define void @libcS_i32(ptr noundef byval(%struct.libcS) align 8 %x, i32 noundef %y) {
+; CHECK-LABEL: define {{[^@]+}}@libcS_i32(ptr noundef byval(%struct.libcS) align 8 %x, i32 noundef %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %libcS_i32.vararg, align 8
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 36, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %libcS_i32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr %0, ptr %x, i64 32, i1 false)
+; CHECK-NEXT:    %1 = getelementptr inbounds %libcS_i32.vararg, ptr %vararg_buffer, i32 0, i32 1
+; CHECK-NEXT:    store i32 %y, ptr %1, align 4
+; CHECK-NEXT:    call void @vararg(ptr %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 36, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(ptr noundef nonnull byval(%struct.libcS) align 8 %x, i32 noundef %y) #2
+  ret void
+}
+
+define void @i32_v4f32(i32 noundef %x, <4 x float> noundef %y) {
+; CHECK-LABEL: define {{[^@]+}}@i32_v4f32(i32 noundef %x, <4 x float> noundef %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %i32_v4f32.vararg, align 16
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %i32_v4f32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store i32 %x, ptr %0, align 4
+; CHECK-NEXT:    %1 = getelementptr inbounds %i32_v4f32.vararg, ptr %vararg_buffer, i32 0, i32 2
+; CHECK-NEXT:    store <4 x float> %y, ptr %1, align 16
+; CHECK-NEXT:    call void @vararg(ptr %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(i32 noundef %x, <4 x float> noundef %y) #2
+  ret void
+}
+
+define void @v4f32_i32(<4 x float> noundef %x, i32 noundef %y) {
+; CHECK-LABEL: define {{[^@]+}}@v4f32_i32(<4 x float> noundef %x, i32 noundef %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %v4f32_i32.vararg, align 16
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 20, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %v4f32_i32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store <4 x float> %x, ptr %0, align 16
+; CHECK-NEXT:    %1 = getelementptr inbounds %v4f32_i32.vararg, ptr %vararg_buffer, i32 0, i32 1
+; CHECK-NEXT:    store i32 %y, ptr %1, align 4
+; CHECK-NEXT:    call void @vararg(ptr %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 20, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(<4 x float> noundef %x, i32 noundef %y) #2
+  ret void
+}
+
+define void @i32_v8f32(i32 noundef %x, <8 x float> noundef %y) {
+; CHECK-LABEL: define {{[^@]+}}@i32_v8f32(i32 noundef %x, <8 x float> noundef %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %i32_v8f32.vararg, align 32
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 64, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %i32_v8f32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store i32 %x, ptr %0, align 4
+; CHECK-NEXT:    %1 = getelementptr inbounds %i32_v8f32.vararg, ptr %vararg_buffer, i32 0, i32 2
+; CHECK-NEXT:    store <8 x float> %y, ptr %1, align 32
+; CHECK-NEXT:    call void @vararg(ptr %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 64, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(i32 noundef %x, <8 x float> noundef %y) #2
+  ret void
+}
+
+define void @v8f32_i32(<8 x float> noundef %x, i32 noundef %y) {
+; CHECK-LABEL: define {{[^@]+}}@v8f32_i32(<8 x float> noundef %x, i32 noundef %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %v8f32_i32.vararg, align 32
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 36, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %v8f32_i32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store <8 x float> %x, ptr %0, align 32
+; CHECK-NEXT:    %1 = getelementptr inbounds %v8f32_i32.vararg, ptr %vararg_buffer, i32 0, i32 1
+; CHECK-NEXT:    store i32 %y, ptr %1, align 4
+; CHECK-NEXT:    call void @vararg(ptr %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 36, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(<8 x float> noundef %x, i32 noundef %y) #2
+  ret void
+}
+
+define void @i32_v16f32(i32 noundef %x, <16 x float> noundef %y) {
+; CHECK-LABEL: define {{[^@]+}}@i32_v16f32(i32 noundef %x, <16 x float> noundef %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %i32_v16f32.vararg, align 64
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 128, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %i32_v16f32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store i32 %x, ptr %0, align 4
+; CHECK-NEXT:    %1 = getelementptr inbounds %i32_v16f32.vararg, ptr %vararg_buffer, i32 0, i32 2
+; CHECK-NEXT:    store <16 x float> %y, ptr %1, align 64
+; CHECK-NEXT:    call void @vararg(ptr %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 128, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(i32 noundef %x, <16 x float> noundef %y) #2
+  ret void
+}
+
+define void @v16f32_i32(<16 x float> noundef %x, i32 noundef %y) {
+; CHECK-LABEL: define {{[^@]+}}@v16f32_i32(<16 x float> noundef %x, i32 noundef %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %v16f32_i32.vararg, align 64
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 68, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %v16f32_i32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store <16 x float> %x, ptr %0, align 64
+; CHECK-NEXT:    %1 = getelementptr inbounds %v16f32_i32.vararg, ptr %vararg_buffer, i32 0, i32 1
+; CHECK-NEXT:    store i32 %y, ptr %1, align 4
+; CHECK-NEXT:    call void @vararg(ptr %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 68, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(<16 x float> noundef %x, i32 noundef %y) #2
+  ret void
+}
+
+define void @i32_v32f32(i32 noundef %x, <32 x float> noundef %y) {
+; CHECK-LABEL: define {{[^@]+}}@i32_v32f32(i32 noundef %x, <32 x float> noundef %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %i32_v32f32.vararg, align 128
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 256, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %i32_v32f32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store i32 %x, ptr %0, align 4
+; CHECK-NEXT:    %1 = getelementptr inbounds %i32_v32f32.vararg, ptr %vararg_buffer, i32 0, i32 2
+; CHECK-NEXT:    store <32 x float> %y, ptr %1, align 128
+; CHECK-NEXT:    call void @vararg(ptr %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 256, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(i32 noundef %x, <32 x float> noundef %y) #2
+  ret void
+}
+
+define void @v32f32_i32(<32 x float> noundef %x, i32 noundef %y) {
+; CHECK-LABEL: define {{[^@]+}}@v32f32_i32(<32 x float> noundef %x, i32 noundef %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %v32f32_i32.vararg, align 128
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 132, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %v32f32_i32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store <32 x float> %x, ptr %0, align 128
+; CHECK-NEXT:    %1 = getelementptr inbounds %v32f32_i32.vararg, ptr %vararg_buffer, i32 0, i32 1
+; CHECK-NEXT:    store i32 %y, ptr %1, align 4
+; CHECK-NEXT:    call void @vararg(ptr %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 132, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(<32 x float> noundef %x, i32 noundef %y) #2
+  ret void
+}
+
+define void @indirect_single_i32(i32 noundef %x) {
+; CHECK-LABEL: define {{[^@]+}}@indirect_single_i32(i32 noundef %x) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %indirect_single_i32.vararg, align 4
+; CHECK-NEXT:    %0 = load volatile ptr, ptr @vararg_ptr, align 8, !tbaa !2
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr %vararg_buffer)
+; CHECK-NEXT:    %1 = getelementptr inbounds %indirect_single_i32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store i32 %x, ptr %1, align 4
+; CHECK-NEXT:    call void %0(ptr %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 4, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  %0 = load volatile ptr, ptr @vararg_ptr, align 8, !tbaa !3
+  tail call void (...) %0(i32 noundef %x) #2
+  ret void
+}
+
+attributes #0 = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) }
+attributes #1 = { nocallback nofree nosync nounwind willreturn }
+attributes #2 = { mustprogress }
+
+!llvm.module.flags = !{!0, !1}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 7, !"frame-pointer", i32 2}
+!3 = !{!4, !4, i64 0}
+!4 = !{!"any pointer", !5, i64 0}
+!5 = !{!"omnipotent char", !6, i64 0}
+!6 = !{!"Simple C/C++ TBAA"}
diff --git a/llvm/test/CodeGen/X86/expand-variadic-call-i386-darwin.ll b/llvm/test/CodeGen/X86/expand-variadic-call-i386-darwin.ll
new file mode 100644
index 00000000000000..538619c89f2fe3
--- /dev/null
+++ b/llvm/test/CodeGen/X86/expand-variadic-call-i386-darwin.ll
@@ -0,0 +1,449 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: -p --function-signature
+; RUN: opt -S --passes=expand-variadics --expand-variadics-override=lowering < %s | FileCheck %s
+target datalayout = "e-m:o-p:32:32-p270:32:32-p271:32:32-p272:64:64-i128:128-f64:32:64-f80:128-n8:16:32-S128"
+target triple = "i386-apple-macosx10.4.0"
+
+; Check the variadic arguments are lowered to the frame locations this target expects
+
+; The types show the call frames
+; CHECK: %single_i32.vararg = type <{ i32 }>
+; CHECK: %single_double.vararg = type <{ double }>
+; CHECK: %single_v4f32.vararg = type <{ <4 x float> }>
+; CHECK: %single_v8f32.vararg = type <{ <8 x float> }>
+; CHECK: %single_v16f32.vararg = type <{ <16 x float> }>
+; CHECK: %single_v32f32.vararg = type <{ <32 x float> }>
+; CHECK: %i32_double.vararg = type <{ i32, double }>
+; CHECK: %double_i32.vararg = type <{ double, i32 }>
+; CHECK: %i32_v4f32.vararg = type <{ i32, [12 x i8], <4 x float> }>
+; CHECK: %v4f32_i32.vararg = type <{ <4 x float>, i32 }>
+; CHECK: %i32_v8f32.vararg = type <{ i32, [28 x i8], <8 x float> }>
+; CHECK: %v8f32_i32.vararg = type <{ <8 x float>, i32 }>
+; CHECK: %i32_v16f32.vararg = type <{ i32, [60 x i8], <16 x float> }>
+; CHECK: %v16f32_i32.vararg = type <{ <16 x float>, i32 }>
+; CHECK: %i32_v32f32.vararg = type <{ i32, [124 x i8], <32 x float> }>
+; CHECK: %v32f32_i32.vararg = type <{ <32 x float>, i32 }>
+
+%struct.libcS = type { i8, i16, i32, i32, float, double }
+
+define void @copy(ptr noundef %va) {
+; CHECK-LABEL: define {{[^@]+}}@copy(ptr noundef %va) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %va.addr = alloca ptr, align 4
+; CHECK-NEXT:    %cp = alloca ptr, align 4
+; CHECK-NEXT:    store ptr %va, ptr %va.addr, align 4, !tbaa !4
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %cp) #2
+; CHECK-NEXT:    call void @llvm.memcpy.inline.p0.p0.i32(ptr %cp, ptr %va.addr, i32 4, i1 false)
+; CHECK-NEXT:    %0 = load ptr, ptr %cp, align 4, !tbaa !4
+; CHECK-NEXT:    call void @valist(ptr noundef %0) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %cp) #2
+; CHECK-NEXT:    ret void
+;
+entry:
+  %va.addr = alloca ptr, align 4
+  %cp = alloca ptr, align 4
+  store ptr %va, ptr %va.addr, align 4, !tbaa !5
+  call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %cp) #2
+  call void @llvm.va_copy.p0(ptr nonnull %cp, ptr nonnull %va.addr)
+  %0 = load ptr, ptr %cp, align 4, !tbaa !5
+  call void @valist(ptr noundef %0) #2
+  call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %cp) #2
+  ret void
+}
+
+declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #0
+
+declare void @llvm.va_copy.p0(ptr, ptr) #1
+
+declare void @valist(ptr noundef)
+
+declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #0
+
+define void @start_once(...) {
+; CHECK-LABEL: define {{[^@]+}}@start_once(ptr noalias %varargs) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %s = alloca ptr, align 4
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %s) #2
+; CHECK-NEXT:    store ptr %varargs, ptr %s, align 4
+; CHECK-NEXT:    %0 = load ptr, ptr %s, align 4, !tbaa !4
+; CHECK-NEXT:    call void @valist(ptr noundef %0) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %s) #2
+; CHECK-NEXT:    ret void
+;
+entry:
+  %s = alloca ptr, align 4
+  call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %s) #2
+  call void @llvm.va_start.p0(ptr nonnull %s)
+  %0 = load ptr, ptr %s, align 4, !tbaa !5
+  call void @valist(ptr noundef %0) #2
+  call void @llvm.va_end.p0(ptr %s)
+  call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %s) #2
+  ret void
+}
+
+declare void @llvm.va_start.p0(ptr) #1
+
+declare void @llvm.va_end.p0(ptr) #1
+
+define void @start_twice(...) {
+; CHECK-LABEL: define {{[^@]+}}@start_twice(ptr noalias %varargs) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %s0 = alloca ptr, align 4
+; CHECK-NEXT:    %s1 = alloca ptr, align 4
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %s0) #2
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %s1) #2
+; CHECK-NEXT:    store ptr %varargs, ptr %s0, align 4
+; CHECK-NEXT:    %0 = load ptr, ptr %s0, align 4, !tbaa !4
+; CHECK-NEXT:    call void @valist(ptr noundef %0) #2
+; CHECK-NEXT:    store ptr %varargs, ptr %s1, align 4
+; CHECK-NEXT:    %1 = load ptr, ptr %s1, align 4, !tbaa !4
+; CHECK-NEXT:    call void @valist(ptr noundef %1) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %s1) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %s0) #2
+; CHECK-NEXT:    ret void
+;
+entry:
+  %s0 = alloca ptr, align 4
+  %s1 = alloca ptr, align 4
+  call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %s0) #2
+  call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %s1) #2
+  call void @llvm.va_start.p0(ptr nonnull %s0)
+  %0 = load ptr, ptr %s0, align 4, !tbaa !5
+  call void @valist(ptr noundef %0) #2
+  call void @llvm.va_end.p0(ptr %s0)
+  call void @llvm.va_start.p0(ptr nonnull %s1)
+  %1 = load ptr, ptr %s1, align 4, !tbaa !5
+  call void @valist(ptr noundef %1) #2
+  call void @llvm.va_end.p0(ptr %s1)
+  call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %s1) #2
+  call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %s0) #2
+  ret void
+}
+
+define void @single_i32(i32 noundef %x) {
+; CHECK-LABEL: define {{[^@]+}}@single_i32(i32 noundef %x) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %single_i32.vararg, align 16
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %single_i32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store i32 %x, ptr %0, align 4
+; CHECK-NEXT:    call void @vararg(ptr %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 4, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(i32 noundef %x) #2
+  ret void
+}
+
+declare void @vararg(...)
+
+define void @single_double(double noundef %x) {
+; CHECK-LABEL: define {{[^@]+}}@single_double(double noundef %x) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %single_double.vararg, align 16
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 8, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %single_double.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store double %x, ptr %0, align 4
+; CHECK-NEXT:    call void @vararg(ptr %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(double noundef %x) #2
+  ret void
+}
+
+define void @single_v4f32(<4 x float> noundef %x) {
+; CHECK-LABEL: define {{[^@]+}}@single_v4f32(<4 x float> noundef %x) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %single_v4f32.vararg, align 16
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %single_v4f32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store <4 x float> %x, ptr %0, align 16
+; CHECK-NEXT:    call void @vararg(ptr %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(<4 x float> noundef %x) #2
+  ret void
+}
+
+define void @single_v8f32(<8 x float> noundef %x) {
+; CHECK-LABEL: define {{[^@]+}}@single_v8f32(<8 x float> noundef %x) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %single_v8f32.vararg, align 32
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %single_v8f32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store <8 x float> %x, ptr %0, align 32
+; CHECK-NEXT:    call void @vararg(ptr %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(<8 x float> noundef %x) #2
+  ret void
+}
+
+define void @single_v16f32(<16 x float> noundef %x) {
+; CHECK-LABEL: define {{[^@]+}}@single_v16f32(<16 x float> noundef %x) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %single_v16f32.vararg, align 64
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 64, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %single_v16f32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store <16 x float> %x, ptr %0, align 64
+; CHECK-NEXT:    call void @vararg(ptr %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 64, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(<16 x float> noundef %x) #2
+  ret void
+}
+
+define void @single_v32f32(<32 x float> noundef %x) {
+; CHECK-LABEL: define {{[^@]+}}@single_v32f32(<32 x float> noundef %x) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %single_v32f32.vararg, align 128
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 128, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %single_v32f32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store <32 x float> %x, ptr %0, align 128
+; CHECK-NEXT:    call void @vararg(ptr %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 128, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(<32 x float> noundef %x) #2
+  ret void
+}
+
+define void @i32_double(i32 noundef %x, double noundef %y) {
+; CHECK-LABEL: define {{[^@]+}}@i32_double(i32 noundef %x, double noundef %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %i32_double.vararg, align 16
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 12, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %i32_double.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store i32 %x, ptr %0, align 4
+; CHECK-NEXT:    %1 = getelementptr inbounds %i32_double.vararg, ptr %vararg_buffer, i32 0, i32 1
+; CHECK-NEXT:    store double %y, ptr %1, align 4
+; CHECK-NEXT:    call void @vararg(ptr %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 12, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(i32 noundef %x, double noundef %y) #2
+  ret void
+}
+
+define void @double_i32(double noundef %x, i32 noundef %y) {
+; CHECK-LABEL: define {{[^@]+}}@double_i32(double noundef %x, i32 noundef %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %double_i32.vararg, align 16
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 12, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %double_i32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store double %x, ptr %0, align 4
+; CHECK-NEXT:    %1 = getelementptr inbounds %double_i32.vararg, ptr %vararg_buffer, i32 0, i32 1
+; CHECK-NEXT:    store i32 %y, ptr %1, align 4
+; CHECK-NEXT:    call void @vararg(ptr %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 12, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(double noundef %x, i32 noundef %y) #2
+  ret void
+}
+
+define void @i32_libcS(i32 noundef %x, ptr noundef byval(%struct.libcS) align 4 %y) {
+; CHECK-LABEL: define {{[^@]+}}@i32_libcS(i32 noundef %x, ptr noundef byval(%struct.libcS) align 4 %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %i32_libcS.vararg, align 16
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 28, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %i32_libcS.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store i32 %x, ptr %0, align 4
+; CHECK-NEXT:    %1 = getelementptr inbounds %i32_libcS.vararg, ptr %vararg_buffer, i32 0, i32 1
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr %1, ptr %y, i64 24, i1 false)
+; CHECK-NEXT:    call void @vararg(ptr %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 28, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(i32 noundef %x, ptr noundef nonnull byval(%struct.libcS) align 4 %y) #2
+  ret void
+}
+
+define void @libcS_i32(ptr noundef byval(%struct.libcS) align 4 %x, i32 noundef %y) {
+; CHECK-LABEL: define {{[^@]+}}@libcS_i32(ptr noundef byval(%struct.libcS) align 4 %x, i32 noundef %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %libcS_i32.vararg, align 16
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 28, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %libcS_i32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr %0, ptr %x, i64 24, i1 false)
+; CHECK-NEXT:    %1 = getelementptr inbounds %libcS_i32.vararg, ptr %vararg_buffer, i32 0, i32 1
+; CHECK-NEXT:    store i32 %y, ptr %1, align 4
+; CHECK-NEXT:    call void @vararg(ptr %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 28, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(ptr noundef nonnull byval(%struct.libcS) align 4 %x, i32 noundef %y) #2
+  ret void
+}
+
+define void @i32_v4f32(i32 noundef %x, <4 x float> noundef %y) {
+; CHECK-LABEL: define {{[^@]+}}@i32_v4f32(i32 noundef %x, <4 x float> noundef %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %i32_v4f32.vararg, align 16
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %i32_v4f32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store i32 %x, ptr %0, align 4
+; CHECK-NEXT:    %1 = getelementptr inbounds %i32_v4f32.vararg, ptr %vararg_buffer, i32 0, i32 2
+; CHECK-NEXT:    store <4 x float> %y, ptr %1, align 16
+; CHECK-NEXT:    call void @vararg(ptr %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(i32 noundef %x, <4 x float> noundef %y) #2
+  ret void
+}
+
+define void @v4f32_i32(<4 x float> noundef %x, i32 noundef %y) {
+; CHECK-LABEL: define {{[^@]+}}@v4f32_i32(<4 x float> noundef %x, i32 noundef %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %v4f32_i32.vararg, align 16
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 20, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %v4f32_i32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store <4 x float> %x, ptr %0, align 16
+; CHECK-NEXT:    %1 = getelementptr inbounds %v4f32_i32.vararg, ptr %vararg_buffer, i32 0, i32 1
+; CHECK-NEXT:    store i32 %y, ptr %1, align 4
+; CHECK-NEXT:    call void @vararg(ptr %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 20, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(<4 x float> noundef %x, i32 noundef %y) #2
+  ret void
+}
+
+define void @i32_v8f32(i32 noundef %x, <8 x float> noundef %y) {
+; CHECK-LABEL: define {{[^@]+}}@i32_v8f32(i32 noundef %x, <8 x float> noundef %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %i32_v8f32.vararg, align 32
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 64, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %i32_v8f32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store i32 %x, ptr %0, align 4
+; CHECK-NEXT:    %1 = getelementptr inbounds %i32_v8f32.vararg, ptr %vararg_buffer, i32 0, i32 2
+; CHECK-NEXT:    store <8 x float> %y, ptr %1, align 32
+; CHECK-NEXT:    call void @vararg(ptr %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 64, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(i32 noundef %x, <8 x float> noundef %y) #2
+  ret void
+}
+
+define void @v8f32_i32(<8 x float> noundef %x, i32 noundef %y) {
+; CHECK-LABEL: define {{[^@]+}}@v8f32_i32(<8 x float> noundef %x, i32 noundef %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %v8f32_i32.vararg, align 32
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 36, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %v8f32_i32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store <8 x float> %x, ptr %0, align 32
+; CHECK-NEXT:    %1 = getelementptr inbounds %v8f32_i32.vararg, ptr %vararg_buffer, i32 0, i32 1
+; CHECK-NEXT:    store i32 %y, ptr %1, align 4
+; CHECK-NEXT:    call void @vararg(ptr %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 36, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(<8 x float> noundef %x, i32 noundef %y) #2
+  ret void
+}
+
+define void @i32_v16f32(i32 noundef %x, <16 x float> noundef %y) {
+; CHECK-LABEL: define {{[^@]+}}@i32_v16f32(i32 noundef %x, <16 x float> noundef %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %i32_v16f32.vararg, align 64
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 128, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %i32_v16f32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store i32 %x, ptr %0, align 4
+; CHECK-NEXT:    %1 = getelementptr inbounds %i32_v16f32.vararg, ptr %vararg_buffer, i32 0, i32 2
+; CHECK-NEXT:    store <16 x float> %y, ptr %1, align 64
+; CHECK-NEXT:    call void @vararg(ptr %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 128, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(i32 noundef %x, <16 x float> noundef %y) #2
+  ret void
+}
+
+define void @v16f32_i32(<16 x float> noundef %x, i32 noundef %y) {
+; CHECK-LABEL: define {{[^@]+}}@v16f32_i32(<16 x float> noundef %x, i32 noundef %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %v16f32_i32.vararg, align 64
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 68, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %v16f32_i32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store <16 x float> %x, ptr %0, align 64
+; CHECK-NEXT:    %1 = getelementptr inbounds %v16f32_i32.vararg, ptr %vararg_buffer, i32 0, i32 1
+; CHECK-NEXT:    store i32 %y, ptr %1, align 4
+; CHECK-NEXT:    call void @vararg(ptr %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 68, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(<16 x float> noundef %x, i32 noundef %y) #2
+  ret void
+}
+
+define void @i32_v32f32(i32 noundef %x, <32 x float> noundef %y) {
+; CHECK-LABEL: define {{[^@]+}}@i32_v32f32(i32 noundef %x, <32 x float> noundef %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %i32_v32f32.vararg, align 128
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 256, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %i32_v32f32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store i32 %x, ptr %0, align 4
+; CHECK-NEXT:    %1 = getelementptr inbounds %i32_v32f32.vararg, ptr %vararg_buffer, i32 0, i32 2
+; CHECK-NEXT:    store <32 x float> %y, ptr %1, align 128
+; CHECK-NEXT:    call void @vararg(ptr %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 256, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(i32 noundef %x, <32 x float> noundef %y) #2
+  ret void
+}
+
+define void @v32f32_i32(<32 x float> noundef %x, i32 noundef %y) {
+; CHECK-LABEL: define {{[^@]+}}@v32f32_i32(<32 x float> noundef %x, i32 noundef %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %v32f32_i32.vararg, align 128
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 132, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %v32f32_i32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store <32 x float> %x, ptr %0, align 128
+; CHECK-NEXT:    %1 = getelementptr inbounds %v32f32_i32.vararg, ptr %vararg_buffer, i32 0, i32 1
+; CHECK-NEXT:    store i32 %y, ptr %1, align 4
+; CHECK-NEXT:    call void @vararg(ptr %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 132, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(<32 x float> noundef %x, i32 noundef %y) #2
+  ret void
+}
+
+attributes #0 = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) }
+attributes #1 = { nocallback nofree nosync nounwind willreturn }
+attributes #2 = { mustprogress }
+
+!llvm.module.flags = !{!0, !1, !2, !3}
+
+!0 = !{i32 1, !"NumRegisterParameters", i32 0}
+!1 = !{i32 1, !"wchar_size", i32 4}
+!2 = !{i32 8, !"PIC Level", i32 2}
+!3 = !{i32 7, !"frame-pointer", i32 2}
+!5 = !{!6, !6, i64 0}
+!6 = !{!"any pointer", !7, i64 0}
+!7 = !{!"omnipotent char", !8, i64 0}
+!8 = !{!"Simple C/C++ TBAA"}
diff --git a/llvm/test/CodeGen/X86/expand-variadic-call-i386-linux.ll b/llvm/test/CodeGen/X86/expand-variadic-call-i386-linux.ll
new file mode 100644
index 00000000000000..3e06bef4ef5679
--- /dev/null
+++ b/llvm/test/CodeGen/X86/expand-variadic-call-i386-linux.ll
@@ -0,0 +1,449 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: -p --function-signature
+; RUN: opt -S --passes=expand-variadics --expand-variadics-override=lowering < %s | FileCheck %s
+target datalayout = "e-m:e-p:32:32-p270:32:32-p271:32:32-p272:64:64-i128:128-f64:32:64-f80:32-n8:16:32-S128"
+target triple = "i386-unknown-linux-gnu"
+
+; Check the variadic arguments are lowered to the frame locations this target expects
+
+; The types show the call frames
+; CHECK: %single_i32.vararg = type <{ i32 }>
+; CHECK: %single_double.vararg = type <{ double }>
+; CHECK: %single_v4f32.vararg = type <{ <4 x float> }>
+; CHECK: %single_v8f32.vararg = type <{ <8 x float> }>
+; CHECK: %single_v16f32.vararg = type <{ <16 x float> }>
+; CHECK: %single_v32f32.vararg = type <{ <32 x float> }>
+; CHECK: %i32_double.vararg = type <{ i32, double }>
+; CHECK: %double_i32.vararg = type <{ double, i32 }>
+; CHECK: %i32_v4f32.vararg = type <{ i32, [12 x i8], <4 x float> }>
+; CHECK: %v4f32_i32.vararg = type <{ <4 x float>, i32 }>
+; CHECK: %i32_v8f32.vararg = type <{ i32, [28 x i8], <8 x float> }>
+; CHECK: %v8f32_i32.vararg = type <{ <8 x float>, i32 }>
+; CHECK: %i32_v16f32.vararg = type <{ i32, [60 x i8], <16 x float> }>
+; CHECK: %v16f32_i32.vararg = type <{ <16 x float>, i32 }>
+; CHECK: %i32_v32f32.vararg = type <{ i32, [124 x i8], <32 x float> }>
+; CHECK: %v32f32_i32.vararg = type <{ <32 x float>, i32 }>
+
+%struct.libcS = type { i8, i16, i32, i32, float, double }
+
+define void @copy(ptr noundef %va) {
+; CHECK-LABEL: define {{[^@]+}}@copy(ptr noundef %va) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %va.addr = alloca ptr, align 4
+; CHECK-NEXT:    %cp = alloca ptr, align 4
+; CHECK-NEXT:    store ptr %va, ptr %va.addr, align 4, !tbaa !4
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %cp) #2
+; CHECK-NEXT:    call void @llvm.memcpy.inline.p0.p0.i32(ptr %cp, ptr %va.addr, i32 4, i1 false)
+; CHECK-NEXT:    %0 = load ptr, ptr %cp, align 4, !tbaa !4
+; CHECK-NEXT:    call void @valist(ptr noundef %0) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %cp) #2
+; CHECK-NEXT:    ret void
+;
+entry:
+  %va.addr = alloca ptr, align 4
+  %cp = alloca ptr, align 4
+  store ptr %va, ptr %va.addr, align 4, !tbaa !5
+  call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %cp) #2
+  call void @llvm.va_copy.p0(ptr nonnull %cp, ptr nonnull %va.addr)
+  %0 = load ptr, ptr %cp, align 4, !tbaa !5
+  call void @valist(ptr noundef %0) #2
+  call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %cp) #2
+  ret void
+}
+
+declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #0
+
+declare void @llvm.va_copy.p0(ptr, ptr) #1
+
+declare void @valist(ptr noundef)
+
+declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #0
+
+define void @start_once(...) {
+; CHECK-LABEL: define {{[^@]+}}@start_once(ptr noalias %varargs) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %s = alloca ptr, align 4
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %s) #2
+; CHECK-NEXT:    store ptr %varargs, ptr %s, align 4
+; CHECK-NEXT:    %0 = load ptr, ptr %s, align 4, !tbaa !4
+; CHECK-NEXT:    call void @valist(ptr noundef %0) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %s) #2
+; CHECK-NEXT:    ret void
+;
+entry:
+  %s = alloca ptr, align 4
+  call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %s) #2
+  call void @llvm.va_start.p0(ptr nonnull %s)
+  %0 = load ptr, ptr %s, align 4, !tbaa !5
+  call void @valist(ptr noundef %0) #2
+  call void @llvm.va_end.p0(ptr %s)
+  call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %s) #2
+  ret void
+}
+
+declare void @llvm.va_start.p0(ptr) #1
+
+declare void @llvm.va_end.p0(ptr) #1
+
+define void @start_twice(...) {
+; CHECK-LABEL: define {{[^@]+}}@start_twice(ptr noalias %varargs) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %s0 = alloca ptr, align 4
+; CHECK-NEXT:    %s1 = alloca ptr, align 4
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %s0) #2
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %s1) #2
+; CHECK-NEXT:    store ptr %varargs, ptr %s0, align 4
+; CHECK-NEXT:    %0 = load ptr, ptr %s0, align 4, !tbaa !4
+; CHECK-NEXT:    call void @valist(ptr noundef %0) #2
+; CHECK-NEXT:    store ptr %varargs, ptr %s1, align 4
+; CHECK-NEXT:    %1 = load ptr, ptr %s1, align 4, !tbaa !4
+; CHECK-NEXT:    call void @valist(ptr noundef %1) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %s1) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %s0) #2
+; CHECK-NEXT:    ret void
+;
+entry:
+  %s0 = alloca ptr, align 4
+  %s1 = alloca ptr, align 4
+  call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %s0) #2
+  call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %s1) #2
+  call void @llvm.va_start.p0(ptr nonnull %s0)
+  %0 = load ptr, ptr %s0, align 4, !tbaa !5
+  call void @valist(ptr noundef %0) #2
+  call void @llvm.va_end.p0(ptr %s0)
+  call void @llvm.va_start.p0(ptr nonnull %s1)
+  %1 = load ptr, ptr %s1, align 4, !tbaa !5
+  call void @valist(ptr noundef %1) #2
+  call void @llvm.va_end.p0(ptr %s1)
+  call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %s1) #2
+  call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %s0) #2
+  ret void
+}
+
+define void @single_i32(i32 noundef %x) {
+; CHECK-LABEL: define {{[^@]+}}@single_i32(i32 noundef %x) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %single_i32.vararg, align 16
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %single_i32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store i32 %x, ptr %0, align 4
+; CHECK-NEXT:    call void @vararg(ptr %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 4, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(i32 noundef %x) #2
+  ret void
+}
+
+declare void @vararg(...)
+
+define void @single_double(double noundef %x) {
+; CHECK-LABEL: define {{[^@]+}}@single_double(double noundef %x) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %single_double.vararg, align 16
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 8, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %single_double.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store double %x, ptr %0, align 4
+; CHECK-NEXT:    call void @vararg(ptr %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(double noundef %x) #2
+  ret void
+}
+
+define void @single_v4f32(<4 x float> noundef %x) {
+; CHECK-LABEL: define {{[^@]+}}@single_v4f32(<4 x float> noundef %x) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %single_v4f32.vararg, align 16
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %single_v4f32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store <4 x float> %x, ptr %0, align 16
+; CHECK-NEXT:    call void @vararg(ptr %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(<4 x float> noundef %x) #2
+  ret void
+}
+
+define void @single_v8f32(<8 x float> noundef %x) {
+; CHECK-LABEL: define {{[^@]+}}@single_v8f32(<8 x float> noundef %x) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %single_v8f32.vararg, align 32
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %single_v8f32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store <8 x float> %x, ptr %0, align 32
+; CHECK-NEXT:    call void @vararg(ptr %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(<8 x float> noundef %x) #2
+  ret void
+}
+
+define void @single_v16f32(<16 x float> noundef %x) {
+; CHECK-LABEL: define {{[^@]+}}@single_v16f32(<16 x float> noundef %x) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %single_v16f32.vararg, align 64
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 64, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %single_v16f32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store <16 x float> %x, ptr %0, align 64
+; CHECK-NEXT:    call void @vararg(ptr %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 64, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(<16 x float> noundef %x) #2
+  ret void
+}
+
+define void @single_v32f32(<32 x float> noundef %x) {
+; CHECK-LABEL: define {{[^@]+}}@single_v32f32(<32 x float> noundef %x) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %single_v32f32.vararg, align 128
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 128, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %single_v32f32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store <32 x float> %x, ptr %0, align 128
+; CHECK-NEXT:    call void @vararg(ptr %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 128, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(<32 x float> noundef %x) #2
+  ret void
+}
+
+define void @i32_double(i32 noundef %x, double noundef %y) {
+; CHECK-LABEL: define {{[^@]+}}@i32_double(i32 noundef %x, double noundef %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %i32_double.vararg, align 16
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 12, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %i32_double.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store i32 %x, ptr %0, align 4
+; CHECK-NEXT:    %1 = getelementptr inbounds %i32_double.vararg, ptr %vararg_buffer, i32 0, i32 1
+; CHECK-NEXT:    store double %y, ptr %1, align 4
+; CHECK-NEXT:    call void @vararg(ptr %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 12, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(i32 noundef %x, double noundef %y) #2
+  ret void
+}
+
+define void @double_i32(double noundef %x, i32 noundef %y) {
+; CHECK-LABEL: define {{[^@]+}}@double_i32(double noundef %x, i32 noundef %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %double_i32.vararg, align 16
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 12, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %double_i32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store double %x, ptr %0, align 4
+; CHECK-NEXT:    %1 = getelementptr inbounds %double_i32.vararg, ptr %vararg_buffer, i32 0, i32 1
+; CHECK-NEXT:    store i32 %y, ptr %1, align 4
+; CHECK-NEXT:    call void @vararg(ptr %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 12, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(double noundef %x, i32 noundef %y) #2
+  ret void
+}
+
+define void @i32_libcS(i32 noundef %x, ptr noundef byval(%struct.libcS) align 4 %y) {
+; CHECK-LABEL: define {{[^@]+}}@i32_libcS(i32 noundef %x, ptr noundef byval(%struct.libcS) align 4 %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %i32_libcS.vararg, align 16
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 28, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %i32_libcS.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store i32 %x, ptr %0, align 4
+; CHECK-NEXT:    %1 = getelementptr inbounds %i32_libcS.vararg, ptr %vararg_buffer, i32 0, i32 1
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr %1, ptr %y, i64 24, i1 false)
+; CHECK-NEXT:    call void @vararg(ptr %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 28, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(i32 noundef %x, ptr noundef nonnull byval(%struct.libcS) align 4 %y) #2
+  ret void
+}
+
+define void @libcS_i32(ptr noundef byval(%struct.libcS) align 4 %x, i32 noundef %y) {
+; CHECK-LABEL: define {{[^@]+}}@libcS_i32(ptr noundef byval(%struct.libcS) align 4 %x, i32 noundef %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %libcS_i32.vararg, align 16
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 28, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %libcS_i32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr %0, ptr %x, i64 24, i1 false)
+; CHECK-NEXT:    %1 = getelementptr inbounds %libcS_i32.vararg, ptr %vararg_buffer, i32 0, i32 1
+; CHECK-NEXT:    store i32 %y, ptr %1, align 4
+; CHECK-NEXT:    call void @vararg(ptr %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 28, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(ptr noundef nonnull byval(%struct.libcS) align 4 %x, i32 noundef %y) #2
+  ret void
+}
+
+define void @i32_v4f32(i32 noundef %x, <4 x float> noundef %y) {
+; CHECK-LABEL: define {{[^@]+}}@i32_v4f32(i32 noundef %x, <4 x float> noundef %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %i32_v4f32.vararg, align 16
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %i32_v4f32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store i32 %x, ptr %0, align 4
+; CHECK-NEXT:    %1 = getelementptr inbounds %i32_v4f32.vararg, ptr %vararg_buffer, i32 0, i32 2
+; CHECK-NEXT:    store <4 x float> %y, ptr %1, align 16
+; CHECK-NEXT:    call void @vararg(ptr %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(i32 noundef %x, <4 x float> noundef %y) #2
+  ret void
+}
+
+define void @v4f32_i32(<4 x float> noundef %x, i32 noundef %y) {
+; CHECK-LABEL: define {{[^@]+}}@v4f32_i32(<4 x float> noundef %x, i32 noundef %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %v4f32_i32.vararg, align 16
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 20, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %v4f32_i32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store <4 x float> %x, ptr %0, align 16
+; CHECK-NEXT:    %1 = getelementptr inbounds %v4f32_i32.vararg, ptr %vararg_buffer, i32 0, i32 1
+; CHECK-NEXT:    store i32 %y, ptr %1, align 4
+; CHECK-NEXT:    call void @vararg(ptr %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 20, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(<4 x float> noundef %x, i32 noundef %y) #2
+  ret void
+}
+
+define void @i32_v8f32(i32 noundef %x, <8 x float> noundef %y) {
+; CHECK-LABEL: define {{[^@]+}}@i32_v8f32(i32 noundef %x, <8 x float> noundef %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %i32_v8f32.vararg, align 32
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 64, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %i32_v8f32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store i32 %x, ptr %0, align 4
+; CHECK-NEXT:    %1 = getelementptr inbounds %i32_v8f32.vararg, ptr %vararg_buffer, i32 0, i32 2
+; CHECK-NEXT:    store <8 x float> %y, ptr %1, align 32
+; CHECK-NEXT:    call void @vararg(ptr %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 64, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(i32 noundef %x, <8 x float> noundef %y) #2
+  ret void
+}
+
+define void @v8f32_i32(<8 x float> noundef %x, i32 noundef %y) {
+; CHECK-LABEL: define {{[^@]+}}@v8f32_i32(<8 x float> noundef %x, i32 noundef %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %v8f32_i32.vararg, align 32
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 36, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %v8f32_i32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store <8 x float> %x, ptr %0, align 32
+; CHECK-NEXT:    %1 = getelementptr inbounds %v8f32_i32.vararg, ptr %vararg_buffer, i32 0, i32 1
+; CHECK-NEXT:    store i32 %y, ptr %1, align 4
+; CHECK-NEXT:    call void @vararg(ptr %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 36, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(<8 x float> noundef %x, i32 noundef %y) #2
+  ret void
+}
+
+define void @i32_v16f32(i32 noundef %x, <16 x float> noundef %y) {
+; CHECK-LABEL: define {{[^@]+}}@i32_v16f32(i32 noundef %x, <16 x float> noundef %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %i32_v16f32.vararg, align 64
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 128, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %i32_v16f32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store i32 %x, ptr %0, align 4
+; CHECK-NEXT:    %1 = getelementptr inbounds %i32_v16f32.vararg, ptr %vararg_buffer, i32 0, i32 2
+; CHECK-NEXT:    store <16 x float> %y, ptr %1, align 64
+; CHECK-NEXT:    call void @vararg(ptr %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 128, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(i32 noundef %x, <16 x float> noundef %y) #2
+  ret void
+}
+
+define void @v16f32_i32(<16 x float> noundef %x, i32 noundef %y) {
+; CHECK-LABEL: define {{[^@]+}}@v16f32_i32(<16 x float> noundef %x, i32 noundef %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %v16f32_i32.vararg, align 64
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 68, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %v16f32_i32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store <16 x float> %x, ptr %0, align 64
+; CHECK-NEXT:    %1 = getelementptr inbounds %v16f32_i32.vararg, ptr %vararg_buffer, i32 0, i32 1
+; CHECK-NEXT:    store i32 %y, ptr %1, align 4
+; CHECK-NEXT:    call void @vararg(ptr %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 68, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(<16 x float> noundef %x, i32 noundef %y) #2
+  ret void
+}
+
+define void @i32_v32f32(i32 noundef %x, <32 x float> noundef %y) {
+; CHECK-LABEL: define {{[^@]+}}@i32_v32f32(i32 noundef %x, <32 x float> noundef %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %i32_v32f32.vararg, align 128
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 256, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %i32_v32f32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store i32 %x, ptr %0, align 4
+; CHECK-NEXT:    %1 = getelementptr inbounds %i32_v32f32.vararg, ptr %vararg_buffer, i32 0, i32 2
+; CHECK-NEXT:    store <32 x float> %y, ptr %1, align 128
+; CHECK-NEXT:    call void @vararg(ptr %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 256, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(i32 noundef %x, <32 x float> noundef %y) #2
+  ret void
+}
+
+define void @v32f32_i32(<32 x float> noundef %x, i32 noundef %y) {
+; CHECK-LABEL: define {{[^@]+}}@v32f32_i32(<32 x float> noundef %x, i32 noundef %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %v32f32_i32.vararg, align 128
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 132, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %v32f32_i32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store <32 x float> %x, ptr %0, align 128
+; CHECK-NEXT:    %1 = getelementptr inbounds %v32f32_i32.vararg, ptr %vararg_buffer, i32 0, i32 1
+; CHECK-NEXT:    store i32 %y, ptr %1, align 4
+; CHECK-NEXT:    call void @vararg(ptr %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 132, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(<32 x float> noundef %x, i32 noundef %y) #2
+  ret void
+}
+
+attributes #0 = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) }
+attributes #1 = { nocallback nofree nosync nounwind willreturn }
+attributes #2 = { mustprogress }
+
+!llvm.module.flags = !{!0, !1, !2, !3}
+
+!0 = !{i32 1, !"NumRegisterParameters", i32 0}
+!1 = !{i32 1, !"wchar_size", i32 4}
+!2 = !{i32 8, !"PIC Level", i32 2}
+!3 = !{i32 7, !"PIE Level", i32 2}
+!5 = !{!6, !6, i64 0}
+!6 = !{!"any pointer", !7, i64 0}
+!7 = !{!"omnipotent char", !8, i64 0}
+!8 = !{!"Simple C/C++ TBAA"}
diff --git a/llvm/test/CodeGen/X86/expand-variadic-call-i686-msvc.ll b/llvm/test/CodeGen/X86/expand-variadic-call-i686-msvc.ll
new file mode 100644
index 00000000000000..564fc783a92656
--- /dev/null
+++ b/llvm/test/CodeGen/X86/expand-variadic-call-i686-msvc.ll
@@ -0,0 +1,467 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: -p --function-signature
+; RUN: opt -S --passes=expand-variadics --expand-variadics-override=lowering < %s | FileCheck %s
+target datalayout = "e-m:x-p:32:32-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32-a:0:32-S32"
+target triple = "i686-unknown-windows-msvc19.33.0"
+
+; Check the variadic arguments are lowered to the locations this target expects
+
+; The types show the call frames
+; CHECK: %single_i32.vararg = type <{ i32 }>
+; CHECK: %single_double.vararg = type <{ double }>
+; CHECK: %single_v4f32.vararg = type <{ <4 x float> }>
+; CHECK: %single_v8f32.vararg = type <{ <8 x float> }>
+; CHECK: %single_v16f32.vararg = type <{ <16 x float> }>
+; CHECK: %single_v32f32.vararg = type <{ ptr }>
+; CHECK: %i32_double.vararg = type <{ i32, [4 x i8], double }>
+; CHECK: %double_i32.vararg = type <{ double, i32 }>
+; CHECK: %i32_v4f32.vararg = type <{ i32, [12 x i8], <4 x float> }>
+; CHECK: %v4f32_i32.vararg = type <{ <4 x float>, i32 }>
+; CHECK: %i32_v8f32.vararg = type <{ i32, [28 x i8], <8 x float> }>
+; CHECK: %v8f32_i32.vararg = type <{ <8 x float>, i32 }>
+; CHECK: %i32_v16f32.vararg = type <{ i32, [60 x i8], <16 x float> }>
+; CHECK: %v16f32_i32.vararg = type <{ <16 x float>, i32 }>
+; CHECK: %i32_v32f32.vararg = type <{ i32, ptr }>
+; CHECK: %v32f32_i32.vararg = type <{ ptr, i32 }>
+
+%struct.libcS = type { i8, i16, i32, i32, float, double }
+
+define void @copy(ptr noundef %va) {
+; CHECK-LABEL: define {{[^@]+}}@copy(ptr noundef %va) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %va.addr = alloca ptr, align 4
+; CHECK-NEXT:    %cp = alloca ptr, align 4
+; CHECK-NEXT:    store ptr %va, ptr %va.addr, align 4, !tbaa !3
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %cp) #2
+; CHECK-NEXT:    call void @llvm.memcpy.inline.p0.p0.i32(ptr %cp, ptr %va.addr, i32 4, i1 false)
+; CHECK-NEXT:    %0 = load ptr, ptr %cp, align 4, !tbaa !3
+; CHECK-NEXT:    call void @valist(ptr noundef %0) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %cp) #2
+; CHECK-NEXT:    ret void
+;
+entry:
+  %va.addr = alloca ptr, align 4
+  %cp = alloca ptr, align 4
+  store ptr %va, ptr %va.addr, align 4, !tbaa !4
+  call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %cp) #2
+  call void @llvm.va_copy.p0(ptr nonnull %cp, ptr nonnull %va.addr)
+  %0 = load ptr, ptr %cp, align 4, !tbaa !4
+  call void @valist(ptr noundef %0) #2
+  call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %cp) #2
+  ret void
+}
+
+declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #0
+
+declare void @llvm.va_copy.p0(ptr, ptr) #1
+
+declare void @valist(ptr noundef)
+
+declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #0
+
+define void @start_once(...) {
+; CHECK-LABEL: define {{[^@]+}}@start_once(ptr noalias %varargs) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %s = alloca ptr, align 4
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %s) #2
+; CHECK-NEXT:    store ptr %varargs, ptr %s, align 4
+; CHECK-NEXT:    %0 = load ptr, ptr %s, align 4, !tbaa !3
+; CHECK-NEXT:    call void @valist(ptr noundef %0) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %s) #2
+; CHECK-NEXT:    ret void
+;
+entry:
+  %s = alloca ptr, align 4
+  call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %s) #2
+  call void @llvm.va_start.p0(ptr nonnull %s)
+  %0 = load ptr, ptr %s, align 4, !tbaa !4
+  call void @valist(ptr noundef %0) #2
+  call void @llvm.va_end.p0(ptr %s)
+  call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %s) #2
+  ret void
+}
+
+declare void @llvm.va_start.p0(ptr) #1
+
+declare void @llvm.va_end.p0(ptr) #1
+
+define void @start_twice(...) {
+; CHECK-LABEL: define {{[^@]+}}@start_twice(ptr noalias %varargs) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %s0 = alloca ptr, align 4
+; CHECK-NEXT:    %s1 = alloca ptr, align 4
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %s0) #2
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %s1) #2
+; CHECK-NEXT:    store ptr %varargs, ptr %s0, align 4
+; CHECK-NEXT:    %0 = load ptr, ptr %s0, align 4, !tbaa !3
+; CHECK-NEXT:    call void @valist(ptr noundef %0) #2
+; CHECK-NEXT:    store ptr %varargs, ptr %s1, align 4
+; CHECK-NEXT:    %1 = load ptr, ptr %s1, align 4, !tbaa !3
+; CHECK-NEXT:    call void @valist(ptr noundef %1) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %s1) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %s0) #2
+; CHECK-NEXT:    ret void
+;
+entry:
+  %s0 = alloca ptr, align 4
+  %s1 = alloca ptr, align 4
+  call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %s0) #2
+  call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %s1) #2
+  call void @llvm.va_start.p0(ptr nonnull %s0)
+  %0 = load ptr, ptr %s0, align 4, !tbaa !4
+  call void @valist(ptr noundef %0) #2
+  call void @llvm.va_end.p0(ptr %s0)
+  call void @llvm.va_start.p0(ptr nonnull %s1)
+  %1 = load ptr, ptr %s1, align 4, !tbaa !4
+  call void @valist(ptr noundef %1) #2
+  call void @llvm.va_end.p0(ptr %s1)
+  call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %s1) #2
+  call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %s0) #2
+  ret void
+}
+
+define void @single_i32(i32 noundef %x) {
+; CHECK-LABEL: define {{[^@]+}}@single_i32(i32 noundef %x) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %single_i32.vararg, align 4
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %single_i32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store i32 %x, ptr %0, align 4
+; CHECK-NEXT:    call void @vararg(ptr %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 4, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(i32 noundef %x) #2
+  ret void
+}
+
+declare void @vararg(...)
+
+define void @single_double(double noundef %x) {
+; CHECK-LABEL: define {{[^@]+}}@single_double(double noundef %x) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %single_double.vararg, align 8
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 8, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %single_double.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store double %x, ptr %0, align 8
+; CHECK-NEXT:    call void @vararg(ptr %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(double noundef %x) #2
+  ret void
+}
+
+define void @single_v4f32(<4 x float> inreg noundef %x) {
+; CHECK-LABEL: define {{[^@]+}}@single_v4f32(<4 x float> inreg noundef %x) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %single_v4f32.vararg, align 16
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %single_v4f32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store <4 x float> %x, ptr %0, align 16
+; CHECK-NEXT:    call void @vararg(ptr %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(<4 x float> inreg noundef %x) #2
+  ret void
+}
+
+define void @single_v8f32(<8 x float> inreg noundef %x) {
+; CHECK-LABEL: define {{[^@]+}}@single_v8f32(<8 x float> inreg noundef %x) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %single_v8f32.vararg, align 32
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %single_v8f32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store <8 x float> %x, ptr %0, align 32
+; CHECK-NEXT:    call void @vararg(ptr %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(<8 x float> inreg noundef %x) #2
+  ret void
+}
+
+define void @single_v16f32(<16 x float> inreg noundef %x) {
+; CHECK-LABEL: define {{[^@]+}}@single_v16f32(<16 x float> inreg noundef %x) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %single_v16f32.vararg, align 64
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 64, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %single_v16f32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store <16 x float> %x, ptr %0, align 64
+; CHECK-NEXT:    call void @vararg(ptr %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 64, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(<16 x float> inreg noundef %x) #2
+  ret void
+}
+
+define void @single_v32f32(ptr nocapture noundef readonly %0) {
+; CHECK-LABEL: define {{[^@]+}}@single_v32f32(ptr nocapture noundef readonly %0) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %indirect-arg-temp = alloca <32 x float>, align 128
+; CHECK-NEXT:    %vararg_buffer = alloca %single_v32f32.vararg, align 4
+; CHECK-NEXT:    %x = load <32 x float>, ptr %0, align 128, !tbaa !7
+; CHECK-NEXT:    store <32 x float> %x, ptr %indirect-arg-temp, align 128, !tbaa !7
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr %vararg_buffer)
+; CHECK-NEXT:    %1 = getelementptr inbounds %single_v32f32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store ptr %indirect-arg-temp, ptr %1, align 4
+; CHECK-NEXT:    call void @vararg(ptr %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 4, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  %indirect-arg-temp = alloca <32 x float>, align 128
+  %x = load <32 x float>, ptr %0, align 128, !tbaa !8
+  store <32 x float> %x, ptr %indirect-arg-temp, align 128, !tbaa !8
+  call void (...) @vararg(ptr noundef nonnull %indirect-arg-temp) #2
+  ret void
+}
+
+define void @i32_double(i32 noundef %x, double noundef %y) {
+; CHECK-LABEL: define {{[^@]+}}@i32_double(i32 noundef %x, double noundef %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %i32_double.vararg, align 8
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %i32_double.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store i32 %x, ptr %0, align 4
+; CHECK-NEXT:    %1 = getelementptr inbounds %i32_double.vararg, ptr %vararg_buffer, i32 0, i32 2
+; CHECK-NEXT:    store double %y, ptr %1, align 8
+; CHECK-NEXT:    call void @vararg(ptr %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(i32 noundef %x, double noundef %y) #2
+  ret void
+}
+
+define void @double_i32(double noundef %x, i32 noundef %y) {
+; CHECK-LABEL: define {{[^@]+}}@double_i32(double noundef %x, i32 noundef %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %double_i32.vararg, align 8
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 12, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %double_i32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store double %x, ptr %0, align 8
+; CHECK-NEXT:    %1 = getelementptr inbounds %double_i32.vararg, ptr %vararg_buffer, i32 0, i32 1
+; CHECK-NEXT:    store i32 %y, ptr %1, align 4
+; CHECK-NEXT:    call void @vararg(ptr %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 12, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(double noundef %x, i32 noundef %y) #2
+  ret void
+}
+
+define void @i32_libcS(i32 noundef %x, ptr nocapture noundef readonly byval(%struct.libcS) align 4 %0) {
+; CHECK-LABEL: define {{[^@]+}}@i32_libcS(i32 noundef %x, ptr nocapture noundef readonly byval(%struct.libcS) align 4 %0) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %i32_libcS.vararg, align 8
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr %vararg_buffer)
+; CHECK-NEXT:    %1 = getelementptr inbounds %i32_libcS.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store i32 %x, ptr %1, align 4
+; CHECK-NEXT:    %2 = getelementptr inbounds %i32_libcS.vararg, ptr %vararg_buffer, i32 0, i32 2
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr %2, ptr %0, i64 24, i1 false)
+; CHECK-NEXT:    call void @vararg(ptr %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(i32 noundef %x, ptr noundef nonnull byval(%struct.libcS) align 4 %0) #2
+  ret void
+}
+
+define void @libcS_i32(ptr nocapture noundef readonly byval(%struct.libcS) align 4 %0, i32 noundef %y) {
+; CHECK-LABEL: define {{[^@]+}}@libcS_i32(ptr nocapture noundef readonly byval(%struct.libcS) align 4 %0, i32 noundef %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %libcS_i32.vararg, align 8
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 28, ptr %vararg_buffer)
+; CHECK-NEXT:    %1 = getelementptr inbounds %libcS_i32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr %1, ptr %0, i64 24, i1 false)
+; CHECK-NEXT:    %2 = getelementptr inbounds %libcS_i32.vararg, ptr %vararg_buffer, i32 0, i32 1
+; CHECK-NEXT:    store i32 %y, ptr %2, align 4
+; CHECK-NEXT:    call void @vararg(ptr %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 28, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(ptr noundef nonnull byval(%struct.libcS) align 4 %0, i32 noundef %y) #2
+  ret void
+}
+
+define void @i32_v4f32(i32 noundef %x, <4 x float> inreg noundef %y) {
+; CHECK-LABEL: define {{[^@]+}}@i32_v4f32(i32 noundef %x, <4 x float> inreg noundef %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %i32_v4f32.vararg, align 16
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %i32_v4f32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store i32 %x, ptr %0, align 4
+; CHECK-NEXT:    %1 = getelementptr inbounds %i32_v4f32.vararg, ptr %vararg_buffer, i32 0, i32 2
+; CHECK-NEXT:    store <4 x float> %y, ptr %1, align 16
+; CHECK-NEXT:    call void @vararg(ptr %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(i32 noundef %x, <4 x float> inreg noundef %y) #2
+  ret void
+}
+
+define void @v4f32_i32(<4 x float> inreg noundef %x, i32 noundef %y) {
+; CHECK-LABEL: define {{[^@]+}}@v4f32_i32(<4 x float> inreg noundef %x, i32 noundef %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %v4f32_i32.vararg, align 16
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 20, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %v4f32_i32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store <4 x float> %x, ptr %0, align 16
+; CHECK-NEXT:    %1 = getelementptr inbounds %v4f32_i32.vararg, ptr %vararg_buffer, i32 0, i32 1
+; CHECK-NEXT:    store i32 %y, ptr %1, align 4
+; CHECK-NEXT:    call void @vararg(ptr %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 20, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(<4 x float> inreg noundef %x, i32 noundef %y) #2
+  ret void
+}
+
+define void @i32_v8f32(i32 noundef %x, <8 x float> inreg noundef %y) {
+; CHECK-LABEL: define {{[^@]+}}@i32_v8f32(i32 noundef %x, <8 x float> inreg noundef %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %i32_v8f32.vararg, align 32
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 64, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %i32_v8f32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store i32 %x, ptr %0, align 4
+; CHECK-NEXT:    %1 = getelementptr inbounds %i32_v8f32.vararg, ptr %vararg_buffer, i32 0, i32 2
+; CHECK-NEXT:    store <8 x float> %y, ptr %1, align 32
+; CHECK-NEXT:    call void @vararg(ptr %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 64, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(i32 noundef %x, <8 x float> inreg noundef %y) #2
+  ret void
+}
+
+define void @v8f32_i32(<8 x float> inreg noundef %x, i32 noundef %y) {
+; CHECK-LABEL: define {{[^@]+}}@v8f32_i32(<8 x float> inreg noundef %x, i32 noundef %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %v8f32_i32.vararg, align 32
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 36, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %v8f32_i32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store <8 x float> %x, ptr %0, align 32
+; CHECK-NEXT:    %1 = getelementptr inbounds %v8f32_i32.vararg, ptr %vararg_buffer, i32 0, i32 1
+; CHECK-NEXT:    store i32 %y, ptr %1, align 4
+; CHECK-NEXT:    call void @vararg(ptr %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 36, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(<8 x float> inreg noundef %x, i32 noundef %y) #2
+  ret void
+}
+
+define void @i32_v16f32(i32 noundef %x, <16 x float> inreg noundef %y) {
+; CHECK-LABEL: define {{[^@]+}}@i32_v16f32(i32 noundef %x, <16 x float> inreg noundef %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %i32_v16f32.vararg, align 64
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 128, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %i32_v16f32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store i32 %x, ptr %0, align 4
+; CHECK-NEXT:    %1 = getelementptr inbounds %i32_v16f32.vararg, ptr %vararg_buffer, i32 0, i32 2
+; CHECK-NEXT:    store <16 x float> %y, ptr %1, align 64
+; CHECK-NEXT:    call void @vararg(ptr %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 128, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(i32 noundef %x, <16 x float> inreg noundef %y) #2
+  ret void
+}
+
+define void @v16f32_i32(<16 x float> inreg noundef %x, i32 noundef %y) {
+; CHECK-LABEL: define {{[^@]+}}@v16f32_i32(<16 x float> inreg noundef %x, i32 noundef %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %v16f32_i32.vararg, align 64
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 68, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %v16f32_i32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store <16 x float> %x, ptr %0, align 64
+; CHECK-NEXT:    %1 = getelementptr inbounds %v16f32_i32.vararg, ptr %vararg_buffer, i32 0, i32 1
+; CHECK-NEXT:    store i32 %y, ptr %1, align 4
+; CHECK-NEXT:    call void @vararg(ptr %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 68, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(<16 x float> inreg noundef %x, i32 noundef %y) #2
+  ret void
+}
+
+define void @i32_v32f32(i32 noundef %x, ptr nocapture noundef readonly %0) {
+; CHECK-LABEL: define {{[^@]+}}@i32_v32f32(i32 noundef %x, ptr nocapture noundef readonly %0) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %indirect-arg-temp = alloca <32 x float>, align 128
+; CHECK-NEXT:    %vararg_buffer = alloca %i32_v32f32.vararg, align 4
+; CHECK-NEXT:    %y = load <32 x float>, ptr %0, align 128, !tbaa !7
+; CHECK-NEXT:    store <32 x float> %y, ptr %indirect-arg-temp, align 128, !tbaa !7
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 8, ptr %vararg_buffer)
+; CHECK-NEXT:    %1 = getelementptr inbounds %i32_v32f32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store i32 %x, ptr %1, align 4
+; CHECK-NEXT:    %2 = getelementptr inbounds %i32_v32f32.vararg, ptr %vararg_buffer, i32 0, i32 1
+; CHECK-NEXT:    store ptr %indirect-arg-temp, ptr %2, align 4
+; CHECK-NEXT:    call void @vararg(ptr %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  %indirect-arg-temp = alloca <32 x float>, align 128
+  %y = load <32 x float>, ptr %0, align 128, !tbaa !8
+  store <32 x float> %y, ptr %indirect-arg-temp, align 128, !tbaa !8
+  call void (...) @vararg(i32 noundef %x, ptr noundef nonnull %indirect-arg-temp) #2
+  ret void
+}
+
+define void @v32f32_i32(ptr nocapture noundef readonly %0, i32 noundef %y) {
+; CHECK-LABEL: define {{[^@]+}}@v32f32_i32(ptr nocapture noundef readonly %0, i32 noundef %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %indirect-arg-temp = alloca <32 x float>, align 128
+; CHECK-NEXT:    %vararg_buffer = alloca %v32f32_i32.vararg, align 4
+; CHECK-NEXT:    %x = load <32 x float>, ptr %0, align 128, !tbaa !7
+; CHECK-NEXT:    store <32 x float> %x, ptr %indirect-arg-temp, align 128, !tbaa !7
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 8, ptr %vararg_buffer)
+; CHECK-NEXT:    %1 = getelementptr inbounds %v32f32_i32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store ptr %indirect-arg-temp, ptr %1, align 4
+; CHECK-NEXT:    %2 = getelementptr inbounds %v32f32_i32.vararg, ptr %vararg_buffer, i32 0, i32 1
+; CHECK-NEXT:    store i32 %y, ptr %2, align 4
+; CHECK-NEXT:    call void @vararg(ptr %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  %indirect-arg-temp = alloca <32 x float>, align 128
+  %x = load <32 x float>, ptr %0, align 128, !tbaa !8
+  store <32 x float> %x, ptr %indirect-arg-temp, align 128, !tbaa !8
+  call void (...) @vararg(ptr noundef nonnull %indirect-arg-temp, i32 noundef %y) #2
+  ret void
+}
+
+attributes #0 = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) }
+attributes #1 = { nocallback nofree nosync nounwind willreturn }
+attributes #2 = { mustprogress }
+
+!llvm.module.flags = !{!0, !1, !2}
+
+!0 = !{i32 1, !"NumRegisterParameters", i32 0}
+!1 = !{i32 1, !"wchar_size", i32 2}
+!2 = !{i32 1, !"MaxTLSAlign", i32 65536}
+!4 = !{!5, !5, i64 0}
+!5 = !{!"any pointer", !6, i64 0}
+!6 = !{!"omnipotent char", !7, i64 0}
+!7 = !{!"Simple C/C++ TBAA"}
+!8 = !{!6, !6, i64 0}
diff --git a/llvm/test/CodeGen/X86/expand-variadic-call-x64-darwin.ll b/llvm/test/CodeGen/X86/expand-variadic-call-x64-darwin.ll
new file mode 100644
index 00000000000000..b1437edbe9a5a0
--- /dev/null
+++ b/llvm/test/CodeGen/X86/expand-variadic-call-x64-darwin.ll
@@ -0,0 +1,688 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: -p --function-signature
+; RUN: opt -S --passes=expand-variadics --expand-variadics-override=lowering < %s | FileCheck %s
+target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.4.0"
+
+; Check the variables are lowered to the locations this target expects
+
+; The types show the call frames
+; CHECK: %single_i32.vararg = type <{ i32 }>
+; CHECK: %single_double.vararg = type <{ double }>
+; CHECK: %single_v4f32.vararg = type <{ <4 x float> }>
+; CHECK: %single_v8f32.vararg = type <{ <8 x float> }>
+; CHECK: %single_v16f32.vararg = type <{ <16 x float> }>
+; CHECK: %single_v32f32.vararg = type <{ <32 x float> }>
+; CHECK: %i32_double.vararg = type <{ i32, [4 x i8], double }>
+; CHECK: %double_i32.vararg = type <{ double, i32 }>
+; CHECK: %i32_v4f32.vararg = type <{ i32, [4 x i8], <4 x float> }>
+; CHECK: %v4f32_i32.vararg = type <{ <4 x float>, i32 }>
+; CHECK: %i32_v8f32.vararg = type <{ i32, [4 x i8], <8 x float> }>
+; CHECK: %v8f32_i32.vararg = type <{ <8 x float>, i32 }>
+; CHECK: %i32_v16f32.vararg = type <{ i32, [4 x i8], <16 x float> }>
+; CHECK: %v16f32_i32.vararg = type <{ <16 x float>, i32 }>
+; CHECK: %i32_v32f32.vararg = type <{ i32, [4 x i8], <32 x float> }>
+; CHECK: %v32f32_i32.vararg = type <{ <32 x float>, i32 }>
+
+%struct.__va_list_tag = type { i32, i32, ptr, ptr }
+%struct.libcS = type { i8, i16, i32, i64, float, double }
+
+define void @copy(ptr noundef %va) {
+; CHECK-LABEL: define {{[^@]+}}@copy(ptr noundef %va) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %cp = alloca [1 x %struct.__va_list_tag], align 16
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr nonnull %cp) #2
+; CHECK-NEXT:    call void @llvm.memcpy.inline.p0.p0.i32(ptr %cp, ptr %va, i32 24, i1 false)
+; CHECK-NEXT:    call void @valist(ptr noundef nonnull %cp) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr nonnull %cp) #2
+; CHECK-NEXT:    ret void
+;
+entry:
+  %cp = alloca [1 x %struct.__va_list_tag], align 16
+  call void @llvm.lifetime.start.p0(i64 24, ptr nonnull %cp) #2
+  call void @llvm.va_copy.p0(ptr nonnull %cp, ptr %va)
+  call void @valist(ptr noundef nonnull %cp) #2
+  call void @llvm.lifetime.end.p0(i64 24, ptr nonnull %cp) #2
+  ret void
+}
+
+declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #0
+
+declare void @llvm.va_copy.p0(ptr, ptr) #1
+
+declare void @valist(ptr noundef)
+
+declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #0
+
+define void @start_once(...) {
+; CHECK-LABEL: define {{[^@]+}}@start_once(ptr noalias %varargs) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %s = alloca [1 x %struct.__va_list_tag], align 16
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr nonnull %s) #2
+; CHECK-NEXT:    call void @llvm.memcpy.inline.p0.p0.i32(ptr %s, ptr %varargs, i32 24, i1 false)
+; CHECK-NEXT:    call void @valist(ptr noundef nonnull %s) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr nonnull %s) #2
+; CHECK-NEXT:    ret void
+;
+entry:
+  %s = alloca [1 x %struct.__va_list_tag], align 16
+  call void @llvm.lifetime.start.p0(i64 24, ptr nonnull %s) #2
+  call void @llvm.va_start.p0(ptr nonnull %s)
+  call void @valist(ptr noundef nonnull %s) #2
+  call void @llvm.va_end.p0(ptr %s)
+  call void @llvm.lifetime.end.p0(i64 24, ptr nonnull %s) #2
+  ret void
+}
+
+declare void @llvm.va_start.p0(ptr) #1
+
+declare void @llvm.va_end.p0(ptr) #1
+
+define void @start_twice(...) {
+; CHECK-LABEL: define {{[^@]+}}@start_twice(ptr noalias %varargs) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %s0 = alloca [1 x %struct.__va_list_tag], align 16
+; CHECK-NEXT:    %s1 = alloca [1 x %struct.__va_list_tag], align 16
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr nonnull %s0) #2
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr nonnull %s1) #2
+; CHECK-NEXT:    call void @llvm.memcpy.inline.p0.p0.i32(ptr %s0, ptr %varargs, i32 24, i1 false)
+; CHECK-NEXT:    call void @valist(ptr noundef nonnull %s0) #2
+; CHECK-NEXT:    call void @llvm.memcpy.inline.p0.p0.i32(ptr %s1, ptr %varargs, i32 24, i1 false)
+; CHECK-NEXT:    call void @valist(ptr noundef nonnull %s1) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr nonnull %s1) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr nonnull %s0) #2
+; CHECK-NEXT:    ret void
+;
+entry:
+  %s0 = alloca [1 x %struct.__va_list_tag], align 16
+  %s1 = alloca [1 x %struct.__va_list_tag], align 16
+  call void @llvm.lifetime.start.p0(i64 24, ptr nonnull %s0) #2
+  call void @llvm.lifetime.start.p0(i64 24, ptr nonnull %s1) #2
+  call void @llvm.va_start.p0(ptr nonnull %s0)
+  call void @valist(ptr noundef nonnull %s0) #2
+  call void @llvm.va_end.p0(ptr %s0)
+  call void @llvm.va_start.p0(ptr nonnull %s1)
+  call void @valist(ptr noundef nonnull %s1) #2
+  call void @llvm.va_end.p0(ptr %s1)
+  call void @llvm.lifetime.end.p0(i64 24, ptr nonnull %s1) #2
+  call void @llvm.lifetime.end.p0(i64 24, ptr nonnull %s0) #2
+  ret void
+}
+
+define void @single_i32(i32 noundef %x) {
+; CHECK-LABEL: define {{[^@]+}}@single_i32(i32 noundef %x) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %single_i32.vararg, align 16
+; CHECK-NEXT:    %va_list = alloca [1 x { i32, i32, ptr, ptr }], align 8
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %single_i32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store i32 %x, ptr %0, align 4
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr %va_list)
+; CHECK-NEXT:    %gp_offset = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 0
+; CHECK-NEXT:    store i32 48, ptr %gp_offset, align 4
+; CHECK-NEXT:    %fp_offset = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 1
+; CHECK-NEXT:    store i32 176, ptr %fp_offset, align 4
+; CHECK-NEXT:    %overfow_arg_area = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 2
+; CHECK-NEXT:    store ptr %vararg_buffer, ptr %overfow_arg_area, align 8
+; CHECK-NEXT:    %reg_save_area = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 3
+; CHECK-NEXT:    store ptr null, ptr %reg_save_area, align 8
+; CHECK-NEXT:    call void @vararg(ptr %va_list) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr %va_list)
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 4, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(i32 noundef %x) #2
+  ret void
+}
+
+declare void @vararg(...)
+
+define void @single_double(double noundef %x) {
+; CHECK-LABEL: define {{[^@]+}}@single_double(double noundef %x) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %single_double.vararg, align 16
+; CHECK-NEXT:    %va_list = alloca [1 x { i32, i32, ptr, ptr }], align 8
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 8, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %single_double.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store double %x, ptr %0, align 8
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr %va_list)
+; CHECK-NEXT:    %gp_offset = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 0
+; CHECK-NEXT:    store i32 48, ptr %gp_offset, align 4
+; CHECK-NEXT:    %fp_offset = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 1
+; CHECK-NEXT:    store i32 176, ptr %fp_offset, align 4
+; CHECK-NEXT:    %overfow_arg_area = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 2
+; CHECK-NEXT:    store ptr %vararg_buffer, ptr %overfow_arg_area, align 8
+; CHECK-NEXT:    %reg_save_area = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 3
+; CHECK-NEXT:    store ptr null, ptr %reg_save_area, align 8
+; CHECK-NEXT:    call void @vararg(ptr %va_list) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr %va_list)
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(double noundef %x) #2
+  ret void
+}
+
+define void @single_v4f32(<4 x float> noundef %x) {
+; CHECK-LABEL: define {{[^@]+}}@single_v4f32(<4 x float> noundef %x) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %single_v4f32.vararg, align 16
+; CHECK-NEXT:    %va_list = alloca [1 x { i32, i32, ptr, ptr }], align 8
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %single_v4f32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store <4 x float> %x, ptr %0, align 16
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr %va_list)
+; CHECK-NEXT:    %gp_offset = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 0
+; CHECK-NEXT:    store i32 48, ptr %gp_offset, align 4
+; CHECK-NEXT:    %fp_offset = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 1
+; CHECK-NEXT:    store i32 176, ptr %fp_offset, align 4
+; CHECK-NEXT:    %overfow_arg_area = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 2
+; CHECK-NEXT:    store ptr %vararg_buffer, ptr %overfow_arg_area, align 8
+; CHECK-NEXT:    %reg_save_area = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 3
+; CHECK-NEXT:    store ptr null, ptr %reg_save_area, align 8
+; CHECK-NEXT:    call void @vararg(ptr %va_list) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr %va_list)
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(<4 x float> noundef %x) #2
+  ret void
+}
+
+define void @single_v8f32(ptr nocapture noundef readonly byval(<8 x float>) align 16 %0) {
+; CHECK-LABEL: define {{[^@]+}}@single_v8f32(ptr nocapture noundef readonly byval(<8 x float>) align 16 %0) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %indirect-arg-temp = alloca <8 x float>, align 16
+; CHECK-NEXT:    %vararg_buffer = alloca %single_v8f32.vararg, align 16
+; CHECK-NEXT:    %va_list = alloca [1 x { i32, i32, ptr, ptr }], align 8
+; CHECK-NEXT:    %x = load <8 x float>, ptr %0, align 16, !tbaa !3
+; CHECK-NEXT:    store <8 x float> %x, ptr %indirect-arg-temp, align 16, !tbaa !3
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr %vararg_buffer)
+; CHECK-NEXT:    %1 = getelementptr inbounds %single_v8f32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr %1, ptr %indirect-arg-temp, i64 32, i1 false)
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr %va_list)
+; CHECK-NEXT:    %gp_offset = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 0
+; CHECK-NEXT:    store i32 48, ptr %gp_offset, align 4
+; CHECK-NEXT:    %fp_offset = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 1
+; CHECK-NEXT:    store i32 176, ptr %fp_offset, align 4
+; CHECK-NEXT:    %overfow_arg_area = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 2
+; CHECK-NEXT:    store ptr %vararg_buffer, ptr %overfow_arg_area, align 8
+; CHECK-NEXT:    %reg_save_area = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 3
+; CHECK-NEXT:    store ptr null, ptr %reg_save_area, align 8
+; CHECK-NEXT:    call void @vararg(ptr %va_list) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr %va_list)
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  %indirect-arg-temp = alloca <8 x float>, align 16
+  %x = load <8 x float>, ptr %0, align 16, !tbaa !4
+  store <8 x float> %x, ptr %indirect-arg-temp, align 16, !tbaa !4
+  tail call void (...) @vararg(ptr noundef nonnull byval(<8 x float>) align 16 %indirect-arg-temp) #2
+  ret void
+}
+
+define void @single_v16f32(ptr nocapture noundef readonly byval(<16 x float>) align 16 %0) {
+; CHECK-LABEL: define {{[^@]+}}@single_v16f32(ptr nocapture noundef readonly byval(<16 x float>) align 16 %0) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %indirect-arg-temp = alloca <16 x float>, align 16
+; CHECK-NEXT:    %vararg_buffer = alloca %single_v16f32.vararg, align 16
+; CHECK-NEXT:    %va_list = alloca [1 x { i32, i32, ptr, ptr }], align 8
+; CHECK-NEXT:    %x = load <16 x float>, ptr %0, align 16, !tbaa !3
+; CHECK-NEXT:    store <16 x float> %x, ptr %indirect-arg-temp, align 16, !tbaa !3
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 64, ptr %vararg_buffer)
+; CHECK-NEXT:    %1 = getelementptr inbounds %single_v16f32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr %1, ptr %indirect-arg-temp, i64 64, i1 false)
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr %va_list)
+; CHECK-NEXT:    %gp_offset = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 0
+; CHECK-NEXT:    store i32 48, ptr %gp_offset, align 4
+; CHECK-NEXT:    %fp_offset = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 1
+; CHECK-NEXT:    store i32 176, ptr %fp_offset, align 4
+; CHECK-NEXT:    %overfow_arg_area = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 2
+; CHECK-NEXT:    store ptr %vararg_buffer, ptr %overfow_arg_area, align 8
+; CHECK-NEXT:    %reg_save_area = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 3
+; CHECK-NEXT:    store ptr null, ptr %reg_save_area, align 8
+; CHECK-NEXT:    call void @vararg(ptr %va_list) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr %va_list)
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 64, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  %indirect-arg-temp = alloca <16 x float>, align 16
+  %x = load <16 x float>, ptr %0, align 16, !tbaa !4
+  store <16 x float> %x, ptr %indirect-arg-temp, align 16, !tbaa !4
+  tail call void (...) @vararg(ptr noundef nonnull byval(<16 x float>) align 16 %indirect-arg-temp) #2
+  ret void
+}
+
+define void @single_v32f32(ptr nocapture noundef readonly byval(<32 x float>) align 16 %0) {
+; CHECK-LABEL: define {{[^@]+}}@single_v32f32(ptr nocapture noundef readonly byval(<32 x float>) align 16 %0) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %indirect-arg-temp = alloca <32 x float>, align 16
+; CHECK-NEXT:    %vararg_buffer = alloca %single_v32f32.vararg, align 16
+; CHECK-NEXT:    %va_list = alloca [1 x { i32, i32, ptr, ptr }], align 8
+; CHECK-NEXT:    %x = load <32 x float>, ptr %0, align 16, !tbaa !3
+; CHECK-NEXT:    store <32 x float> %x, ptr %indirect-arg-temp, align 16, !tbaa !3
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 128, ptr %vararg_buffer)
+; CHECK-NEXT:    %1 = getelementptr inbounds %single_v32f32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr %1, ptr %indirect-arg-temp, i64 128, i1 false)
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr %va_list)
+; CHECK-NEXT:    %gp_offset = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 0
+; CHECK-NEXT:    store i32 48, ptr %gp_offset, align 4
+; CHECK-NEXT:    %fp_offset = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 1
+; CHECK-NEXT:    store i32 176, ptr %fp_offset, align 4
+; CHECK-NEXT:    %overfow_arg_area = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 2
+; CHECK-NEXT:    store ptr %vararg_buffer, ptr %overfow_arg_area, align 8
+; CHECK-NEXT:    %reg_save_area = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 3
+; CHECK-NEXT:    store ptr null, ptr %reg_save_area, align 8
+; CHECK-NEXT:    call void @vararg(ptr %va_list) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr %va_list)
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 128, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  %indirect-arg-temp = alloca <32 x float>, align 16
+  %x = load <32 x float>, ptr %0, align 16, !tbaa !4
+  store <32 x float> %x, ptr %indirect-arg-temp, align 16, !tbaa !4
+  tail call void (...) @vararg(ptr noundef nonnull byval(<32 x float>) align 16 %indirect-arg-temp) #2
+  ret void
+}
+
+define void @i32_double(i32 noundef %x, double noundef %y) {
+; CHECK-LABEL: define {{[^@]+}}@i32_double(i32 noundef %x, double noundef %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %i32_double.vararg, align 16
+; CHECK-NEXT:    %va_list = alloca [1 x { i32, i32, ptr, ptr }], align 8
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %i32_double.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store i32 %x, ptr %0, align 4
+; CHECK-NEXT:    %1 = getelementptr inbounds %i32_double.vararg, ptr %vararg_buffer, i32 0, i32 2
+; CHECK-NEXT:    store double %y, ptr %1, align 8
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr %va_list)
+; CHECK-NEXT:    %gp_offset = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 0
+; CHECK-NEXT:    store i32 48, ptr %gp_offset, align 4
+; CHECK-NEXT:    %fp_offset = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 1
+; CHECK-NEXT:    store i32 176, ptr %fp_offset, align 4
+; CHECK-NEXT:    %overfow_arg_area = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 2
+; CHECK-NEXT:    store ptr %vararg_buffer, ptr %overfow_arg_area, align 8
+; CHECK-NEXT:    %reg_save_area = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 3
+; CHECK-NEXT:    store ptr null, ptr %reg_save_area, align 8
+; CHECK-NEXT:    call void @vararg(ptr %va_list) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr %va_list)
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(i32 noundef %x, double noundef %y) #2
+  ret void
+}
+
+define void @double_i32(double noundef %x, i32 noundef %y) {
+; CHECK-LABEL: define {{[^@]+}}@double_i32(double noundef %x, i32 noundef %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %double_i32.vararg, align 16
+; CHECK-NEXT:    %va_list = alloca [1 x { i32, i32, ptr, ptr }], align 8
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 12, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %double_i32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store double %x, ptr %0, align 8
+; CHECK-NEXT:    %1 = getelementptr inbounds %double_i32.vararg, ptr %vararg_buffer, i32 0, i32 1
+; CHECK-NEXT:    store i32 %y, ptr %1, align 4
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr %va_list)
+; CHECK-NEXT:    %gp_offset = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 0
+; CHECK-NEXT:    store i32 48, ptr %gp_offset, align 4
+; CHECK-NEXT:    %fp_offset = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 1
+; CHECK-NEXT:    store i32 176, ptr %fp_offset, align 4
+; CHECK-NEXT:    %overfow_arg_area = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 2
+; CHECK-NEXT:    store ptr %vararg_buffer, ptr %overfow_arg_area, align 8
+; CHECK-NEXT:    %reg_save_area = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 3
+; CHECK-NEXT:    store ptr null, ptr %reg_save_area, align 8
+; CHECK-NEXT:    call void @vararg(ptr %va_list) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr %va_list)
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 12, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(double noundef %x, i32 noundef %y) #2
+  ret void
+}
+
+define void @i32_libcS(i32 noundef %x, ptr noundef byval(%struct.libcS) align 8 %y) {
+; CHECK-LABEL: define {{[^@]+}}@i32_libcS(i32 noundef %x, ptr noundef byval(%struct.libcS) align 8 %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %i32_libcS.vararg, align 16
+; CHECK-NEXT:    %va_list = alloca [1 x { i32, i32, ptr, ptr }], align 8
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 40, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %i32_libcS.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store i32 %x, ptr %0, align 4
+; CHECK-NEXT:    %1 = getelementptr inbounds %i32_libcS.vararg, ptr %vararg_buffer, i32 0, i32 2
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr %1, ptr %y, i64 32, i1 false)
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr %va_list)
+; CHECK-NEXT:    %gp_offset = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 0
+; CHECK-NEXT:    store i32 48, ptr %gp_offset, align 4
+; CHECK-NEXT:    %fp_offset = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 1
+; CHECK-NEXT:    store i32 176, ptr %fp_offset, align 4
+; CHECK-NEXT:    %overfow_arg_area = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 2
+; CHECK-NEXT:    store ptr %vararg_buffer, ptr %overfow_arg_area, align 8
+; CHECK-NEXT:    %reg_save_area = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 3
+; CHECK-NEXT:    store ptr null, ptr %reg_save_area, align 8
+; CHECK-NEXT:    call void @vararg(ptr %va_list) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr %va_list)
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 40, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(i32 noundef %x, ptr noundef nonnull byval(%struct.libcS) align 8 %y) #2
+  ret void
+}
+
+define void @libcS_i32(ptr noundef byval(%struct.libcS) align 8 %x, i32 noundef %y) {
+; CHECK-LABEL: define {{[^@]+}}@libcS_i32(ptr noundef byval(%struct.libcS) align 8 %x, i32 noundef %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %libcS_i32.vararg, align 16
+; CHECK-NEXT:    %va_list = alloca [1 x { i32, i32, ptr, ptr }], align 8
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 36, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %libcS_i32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr %0, ptr %x, i64 32, i1 false)
+; CHECK-NEXT:    %1 = getelementptr inbounds %libcS_i32.vararg, ptr %vararg_buffer, i32 0, i32 1
+; CHECK-NEXT:    store i32 %y, ptr %1, align 4
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr %va_list)
+; CHECK-NEXT:    %gp_offset = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 0
+; CHECK-NEXT:    store i32 48, ptr %gp_offset, align 4
+; CHECK-NEXT:    %fp_offset = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 1
+; CHECK-NEXT:    store i32 176, ptr %fp_offset, align 4
+; CHECK-NEXT:    %overfow_arg_area = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 2
+; CHECK-NEXT:    store ptr %vararg_buffer, ptr %overfow_arg_area, align 8
+; CHECK-NEXT:    %reg_save_area = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 3
+; CHECK-NEXT:    store ptr null, ptr %reg_save_area, align 8
+; CHECK-NEXT:    call void @vararg(ptr %va_list) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr %va_list)
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 36, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(ptr noundef nonnull byval(%struct.libcS) align 8 %x, i32 noundef %y) #2
+  ret void
+}
+
+define void @i32_v4f32(i32 noundef %x, <4 x float> noundef %y) {
+; CHECK-LABEL: define {{[^@]+}}@i32_v4f32(i32 noundef %x, <4 x float> noundef %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %i32_v4f32.vararg, align 16
+; CHECK-NEXT:    %va_list = alloca [1 x { i32, i32, ptr, ptr }], align 8
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %i32_v4f32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store i32 %x, ptr %0, align 4
+; CHECK-NEXT:    %1 = getelementptr inbounds %i32_v4f32.vararg, ptr %vararg_buffer, i32 0, i32 2
+; CHECK-NEXT:    store <4 x float> %y, ptr %1, align 16
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr %va_list)
+; CHECK-NEXT:    %gp_offset = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 0
+; CHECK-NEXT:    store i32 48, ptr %gp_offset, align 4
+; CHECK-NEXT:    %fp_offset = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 1
+; CHECK-NEXT:    store i32 176, ptr %fp_offset, align 4
+; CHECK-NEXT:    %overfow_arg_area = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 2
+; CHECK-NEXT:    store ptr %vararg_buffer, ptr %overfow_arg_area, align 8
+; CHECK-NEXT:    %reg_save_area = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 3
+; CHECK-NEXT:    store ptr null, ptr %reg_save_area, align 8
+; CHECK-NEXT:    call void @vararg(ptr %va_list) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr %va_list)
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(i32 noundef %x, <4 x float> noundef %y) #2
+  ret void
+}
+
+define void @v4f32_i32(<4 x float> noundef %x, i32 noundef %y) {
+; CHECK-LABEL: define {{[^@]+}}@v4f32_i32(<4 x float> noundef %x, i32 noundef %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %v4f32_i32.vararg, align 16
+; CHECK-NEXT:    %va_list = alloca [1 x { i32, i32, ptr, ptr }], align 8
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 20, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %v4f32_i32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store <4 x float> %x, ptr %0, align 16
+; CHECK-NEXT:    %1 = getelementptr inbounds %v4f32_i32.vararg, ptr %vararg_buffer, i32 0, i32 1
+; CHECK-NEXT:    store i32 %y, ptr %1, align 4
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr %va_list)
+; CHECK-NEXT:    %gp_offset = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 0
+; CHECK-NEXT:    store i32 48, ptr %gp_offset, align 4
+; CHECK-NEXT:    %fp_offset = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 1
+; CHECK-NEXT:    store i32 176, ptr %fp_offset, align 4
+; CHECK-NEXT:    %overfow_arg_area = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 2
+; CHECK-NEXT:    store ptr %vararg_buffer, ptr %overfow_arg_area, align 8
+; CHECK-NEXT:    %reg_save_area = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 3
+; CHECK-NEXT:    store ptr null, ptr %reg_save_area, align 8
+; CHECK-NEXT:    call void @vararg(ptr %va_list) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr %va_list)
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 20, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(<4 x float> noundef %x, i32 noundef %y) #2
+  ret void
+}
+
+define void @i32_v8f32(i32 noundef %x, ptr nocapture noundef readonly byval(<8 x float>) align 16 %0) {
+; CHECK-LABEL: define {{[^@]+}}@i32_v8f32(i32 noundef %x, ptr nocapture noundef readonly byval(<8 x float>) align 16 %0) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %indirect-arg-temp = alloca <8 x float>, align 16
+; CHECK-NEXT:    %vararg_buffer = alloca %i32_v8f32.vararg, align 16
+; CHECK-NEXT:    %va_list = alloca [1 x { i32, i32, ptr, ptr }], align 8
+; CHECK-NEXT:    %y = load <8 x float>, ptr %0, align 16, !tbaa !3
+; CHECK-NEXT:    store <8 x float> %y, ptr %indirect-arg-temp, align 16, !tbaa !3
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 40, ptr %vararg_buffer)
+; CHECK-NEXT:    %1 = getelementptr inbounds %i32_v8f32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store i32 %x, ptr %1, align 4
+; CHECK-NEXT:    %2 = getelementptr inbounds %i32_v8f32.vararg, ptr %vararg_buffer, i32 0, i32 2
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr %2, ptr %indirect-arg-temp, i64 32, i1 false)
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr %va_list)
+; CHECK-NEXT:    %gp_offset = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 0
+; CHECK-NEXT:    store i32 48, ptr %gp_offset, align 4
+; CHECK-NEXT:    %fp_offset = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 1
+; CHECK-NEXT:    store i32 176, ptr %fp_offset, align 4
+; CHECK-NEXT:    %overfow_arg_area = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 2
+; CHECK-NEXT:    store ptr %vararg_buffer, ptr %overfow_arg_area, align 8
+; CHECK-NEXT:    %reg_save_area = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 3
+; CHECK-NEXT:    store ptr null, ptr %reg_save_area, align 8
+; CHECK-NEXT:    call void @vararg(ptr %va_list) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr %va_list)
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 40, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  %indirect-arg-temp = alloca <8 x float>, align 16
+  %y = load <8 x float>, ptr %0, align 16, !tbaa !4
+  store <8 x float> %y, ptr %indirect-arg-temp, align 16, !tbaa !4
+  tail call void (...) @vararg(i32 noundef %x, ptr noundef nonnull byval(<8 x float>) align 16 %indirect-arg-temp) #2
+  ret void
+}
+
+; byval <8 x float> followed by i32: the vector is memcpy'd into slot 0 of the
+; packed 36-byte frame (32 vector + 4 int) and the i32 stored after it; the
+; synthesized va_list points every read at the overflow area (gp/fp offsets maxed).
+; NOTE(review): "overfow_arg_area" is misspelled but matches the value name the
+; pass emits — fix the pass and regenerate these checks together.
+define void @v8f32_i32(ptr nocapture noundef readonly byval(<8 x float>) align 16 %0, i32 noundef %y) {
+; CHECK-LABEL: define {{[^@]+}}@v8f32_i32(ptr nocapture noundef readonly byval(<8 x float>) align 16 %0, i32 noundef %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %indirect-arg-temp = alloca <8 x float>, align 16
+; CHECK-NEXT:    %vararg_buffer = alloca %v8f32_i32.vararg, align 16
+; CHECK-NEXT:    %va_list = alloca [1 x { i32, i32, ptr, ptr }], align 8
+; CHECK-NEXT:    %x = load <8 x float>, ptr %0, align 16, !tbaa !3
+; CHECK-NEXT:    store <8 x float> %x, ptr %indirect-arg-temp, align 16, !tbaa !3
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 36, ptr %vararg_buffer)
+; CHECK-NEXT:    %1 = getelementptr inbounds %v8f32_i32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr %1, ptr %indirect-arg-temp, i64 32, i1 false)
+; CHECK-NEXT:    %2 = getelementptr inbounds %v8f32_i32.vararg, ptr %vararg_buffer, i32 0, i32 1
+; CHECK-NEXT:    store i32 %y, ptr %2, align 4
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr %va_list)
+; CHECK-NEXT:    %gp_offset = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 0
+; CHECK-NEXT:    store i32 48, ptr %gp_offset, align 4
+; CHECK-NEXT:    %fp_offset = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 1
+; CHECK-NEXT:    store i32 176, ptr %fp_offset, align 4
+; CHECK-NEXT:    %overfow_arg_area = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 2
+; CHECK-NEXT:    store ptr %vararg_buffer, ptr %overfow_arg_area, align 8
+; CHECK-NEXT:    %reg_save_area = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 3
+; CHECK-NEXT:    store ptr null, ptr %reg_save_area, align 8
+; CHECK-NEXT:    call void @vararg(ptr %va_list) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr %va_list)
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 36, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  %indirect-arg-temp = alloca <8 x float>, align 16
+  %x = load <8 x float>, ptr %0, align 16, !tbaa !4
+  store <8 x float> %x, ptr %indirect-arg-temp, align 16, !tbaa !4
+  tail call void (...) @vararg(ptr noundef nonnull byval(<8 x float>) align 16 %indirect-arg-temp, i32 noundef %y) #2
+  ret void
+}
+
+; i32 then byval <16 x float>: the int lands in member 0, member 1 is padding
+; (gap between field indices 0 and 2), and the vector is memcpy'd into member 2
+; of the 72-byte frame.
+define void @i32_v16f32(i32 noundef %x, ptr nocapture noundef readonly byval(<16 x float>) align 16 %0) {
+; CHECK-LABEL: define {{[^@]+}}@i32_v16f32(i32 noundef %x, ptr nocapture noundef readonly byval(<16 x float>) align 16 %0) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %indirect-arg-temp = alloca <16 x float>, align 16
+; CHECK-NEXT:    %vararg_buffer = alloca %i32_v16f32.vararg, align 16
+; CHECK-NEXT:    %va_list = alloca [1 x { i32, i32, ptr, ptr }], align 8
+; CHECK-NEXT:    %y = load <16 x float>, ptr %0, align 16, !tbaa !3
+; CHECK-NEXT:    store <16 x float> %y, ptr %indirect-arg-temp, align 16, !tbaa !3
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 72, ptr %vararg_buffer)
+; CHECK-NEXT:    %1 = getelementptr inbounds %i32_v16f32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store i32 %x, ptr %1, align 4
+; CHECK-NEXT:    %2 = getelementptr inbounds %i32_v16f32.vararg, ptr %vararg_buffer, i32 0, i32 2
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr %2, ptr %indirect-arg-temp, i64 64, i1 false)
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr %va_list)
+; CHECK-NEXT:    %gp_offset = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 0
+; CHECK-NEXT:    store i32 48, ptr %gp_offset, align 4
+; CHECK-NEXT:    %fp_offset = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 1
+; CHECK-NEXT:    store i32 176, ptr %fp_offset, align 4
+; CHECK-NEXT:    %overfow_arg_area = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 2
+; CHECK-NEXT:    store ptr %vararg_buffer, ptr %overfow_arg_area, align 8
+; CHECK-NEXT:    %reg_save_area = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 3
+; CHECK-NEXT:    store ptr null, ptr %reg_save_area, align 8
+; CHECK-NEXT:    call void @vararg(ptr %va_list) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr %va_list)
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 72, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  %indirect-arg-temp = alloca <16 x float>, align 16
+  %y = load <16 x float>, ptr %0, align 16, !tbaa !4
+  store <16 x float> %y, ptr %indirect-arg-temp, align 16, !tbaa !4
+  tail call void (...) @vararg(i32 noundef %x, ptr noundef nonnull byval(<16 x float>) align 16 %indirect-arg-temp) #2
+  ret void
+}
+
+; byval <16 x float> followed by i32: vector memcpy'd at slot 0, i32 stored
+; directly after in the packed 68-byte (64 + 4) frame — no padding when the
+; larger member comes first.
+define void @v16f32_i32(ptr nocapture noundef readonly byval(<16 x float>) align 16 %0, i32 noundef %y) {
+; CHECK-LABEL: define {{[^@]+}}@v16f32_i32(ptr nocapture noundef readonly byval(<16 x float>) align 16 %0, i32 noundef %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %indirect-arg-temp = alloca <16 x float>, align 16
+; CHECK-NEXT:    %vararg_buffer = alloca %v16f32_i32.vararg, align 16
+; CHECK-NEXT:    %va_list = alloca [1 x { i32, i32, ptr, ptr }], align 8
+; CHECK-NEXT:    %x = load <16 x float>, ptr %0, align 16, !tbaa !3
+; CHECK-NEXT:    store <16 x float> %x, ptr %indirect-arg-temp, align 16, !tbaa !3
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 68, ptr %vararg_buffer)
+; CHECK-NEXT:    %1 = getelementptr inbounds %v16f32_i32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr %1, ptr %indirect-arg-temp, i64 64, i1 false)
+; CHECK-NEXT:    %2 = getelementptr inbounds %v16f32_i32.vararg, ptr %vararg_buffer, i32 0, i32 1
+; CHECK-NEXT:    store i32 %y, ptr %2, align 4
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr %va_list)
+; CHECK-NEXT:    %gp_offset = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 0
+; CHECK-NEXT:    store i32 48, ptr %gp_offset, align 4
+; CHECK-NEXT:    %fp_offset = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 1
+; CHECK-NEXT:    store i32 176, ptr %fp_offset, align 4
+; CHECK-NEXT:    %overfow_arg_area = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 2
+; CHECK-NEXT:    store ptr %vararg_buffer, ptr %overfow_arg_area, align 8
+; CHECK-NEXT:    %reg_save_area = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 3
+; CHECK-NEXT:    store ptr null, ptr %reg_save_area, align 8
+; CHECK-NEXT:    call void @vararg(ptr %va_list) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr %va_list)
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 68, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  %indirect-arg-temp = alloca <16 x float>, align 16
+  %x = load <16 x float>, ptr %0, align 16, !tbaa !4
+  store <16 x float> %x, ptr %indirect-arg-temp, align 16, !tbaa !4
+  tail call void (...) @vararg(ptr noundef nonnull byval(<16 x float>) align 16 %indirect-arg-temp, i32 noundef %y) #2
+  ret void
+}
+
+; i32 then byval <32 x float>: int in member 0, 4-byte pad (member 1), 128-byte
+; vector memcpy'd into member 2; frame size 136.
+define void @i32_v32f32(i32 noundef %x, ptr nocapture noundef readonly byval(<32 x float>) align 16 %0) {
+; CHECK-LABEL: define {{[^@]+}}@i32_v32f32(i32 noundef %x, ptr nocapture noundef readonly byval(<32 x float>) align 16 %0) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %indirect-arg-temp = alloca <32 x float>, align 16
+; CHECK-NEXT:    %vararg_buffer = alloca %i32_v32f32.vararg, align 16
+; CHECK-NEXT:    %va_list = alloca [1 x { i32, i32, ptr, ptr }], align 8
+; CHECK-NEXT:    %y = load <32 x float>, ptr %0, align 16, !tbaa !3
+; CHECK-NEXT:    store <32 x float> %y, ptr %indirect-arg-temp, align 16, !tbaa !3
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 136, ptr %vararg_buffer)
+; CHECK-NEXT:    %1 = getelementptr inbounds %i32_v32f32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store i32 %x, ptr %1, align 4
+; CHECK-NEXT:    %2 = getelementptr inbounds %i32_v32f32.vararg, ptr %vararg_buffer, i32 0, i32 2
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr %2, ptr %indirect-arg-temp, i64 128, i1 false)
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr %va_list)
+; CHECK-NEXT:    %gp_offset = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 0
+; CHECK-NEXT:    store i32 48, ptr %gp_offset, align 4
+; CHECK-NEXT:    %fp_offset = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 1
+; CHECK-NEXT:    store i32 176, ptr %fp_offset, align 4
+; CHECK-NEXT:    %overfow_arg_area = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 2
+; CHECK-NEXT:    store ptr %vararg_buffer, ptr %overfow_arg_area, align 8
+; CHECK-NEXT:    %reg_save_area = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 3
+; CHECK-NEXT:    store ptr null, ptr %reg_save_area, align 8
+; CHECK-NEXT:    call void @vararg(ptr %va_list) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr %va_list)
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 136, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  %indirect-arg-temp = alloca <32 x float>, align 16
+  %y = load <32 x float>, ptr %0, align 16, !tbaa !4
+  store <32 x float> %y, ptr %indirect-arg-temp, align 16, !tbaa !4
+  tail call void (...) @vararg(i32 noundef %x, ptr noundef nonnull byval(<32 x float>) align 16 %indirect-arg-temp) #2
+  ret void
+}
+
+; byval <32 x float> followed by i32: 128-byte vector memcpy'd at slot 0, the
+; i32 packed right behind it (frame size 132, no padding).
+define void @v32f32_i32(ptr nocapture noundef readonly byval(<32 x float>) align 16 %0, i32 noundef %y) {
+; CHECK-LABEL: define {{[^@]+}}@v32f32_i32(ptr nocapture noundef readonly byval(<32 x float>) align 16 %0, i32 noundef %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %indirect-arg-temp = alloca <32 x float>, align 16
+; CHECK-NEXT:    %vararg_buffer = alloca %v32f32_i32.vararg, align 16
+; CHECK-NEXT:    %va_list = alloca [1 x { i32, i32, ptr, ptr }], align 8
+; CHECK-NEXT:    %x = load <32 x float>, ptr %0, align 16, !tbaa !3
+; CHECK-NEXT:    store <32 x float> %x, ptr %indirect-arg-temp, align 16, !tbaa !3
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 132, ptr %vararg_buffer)
+; CHECK-NEXT:    %1 = getelementptr inbounds %v32f32_i32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr %1, ptr %indirect-arg-temp, i64 128, i1 false)
+; CHECK-NEXT:    %2 = getelementptr inbounds %v32f32_i32.vararg, ptr %vararg_buffer, i32 0, i32 1
+; CHECK-NEXT:    store i32 %y, ptr %2, align 4
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr %va_list)
+; CHECK-NEXT:    %gp_offset = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 0
+; CHECK-NEXT:    store i32 48, ptr %gp_offset, align 4
+; CHECK-NEXT:    %fp_offset = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 1
+; CHECK-NEXT:    store i32 176, ptr %fp_offset, align 4
+; CHECK-NEXT:    %overfow_arg_area = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 2
+; CHECK-NEXT:    store ptr %vararg_buffer, ptr %overfow_arg_area, align 8
+; CHECK-NEXT:    %reg_save_area = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 3
+; CHECK-NEXT:    store ptr null, ptr %reg_save_area, align 8
+; CHECK-NEXT:    call void @vararg(ptr %va_list) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr %va_list)
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 132, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  %indirect-arg-temp = alloca <32 x float>, align 16
+  %x = load <32 x float>, ptr %0, align 16, !tbaa !4
+  store <32 x float> %x, ptr %indirect-arg-temp, align 16, !tbaa !4
+  tail call void (...) @vararg(ptr noundef nonnull byval(<32 x float>) align 16 %indirect-arg-temp, i32 noundef %y) #2
+  ret void
+}
+
+attributes #0 = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) }
+attributes #1 = { nocallback nofree nosync nounwind willreturn }
+attributes #2 = { mustprogress }
+
+!llvm.module.flags = !{!0, !1, !2}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 8, !"PIC Level", i32 2}
+!2 = !{i32 7, !"frame-pointer", i32 2}
+!4 = !{!5, !5, i64 0}
+!5 = !{!"omnipotent char", !6, i64 0}
+!6 = !{!"Simple C/C++ TBAA"}
diff --git a/llvm/test/CodeGen/X86/expand-variadic-call-x64-linux.ll b/llvm/test/CodeGen/X86/expand-variadic-call-x64-linux.ll
new file mode 100644
index 00000000000000..c87f5844ecf80c
--- /dev/null
+++ b/llvm/test/CodeGen/X86/expand-variadic-call-x64-linux.ll
@@ -0,0 +1,688 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: -p --function-signature
+; RUN: opt -S --passes=expand-variadics --expand-variadics-override=lowering < %s | FileCheck %s
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Check the variables are lowered to the locations this target expects
+
+; The types show the call frames
+; CHECK: %single_i32.vararg = type <{ i32 }>
+; CHECK: %single_double.vararg = type <{ double }>
+; CHECK: %single_v4f32.vararg = type <{ <4 x float> }>
+; CHECK: %single_v8f32.vararg = type <{ <8 x float> }>
+; CHECK: %single_v16f32.vararg = type <{ <16 x float> }>
+; CHECK: %single_v32f32.vararg = type <{ <32 x float> }>
+; CHECK: %i32_double.vararg = type <{ i32, [4 x i8], double }>
+; CHECK: %double_i32.vararg = type <{ double, i32 }>
+; CHECK: %i32_v4f32.vararg = type <{ i32, [4 x i8], <4 x float> }>
+; CHECK: %v4f32_i32.vararg = type <{ <4 x float>, i32 }>
+; CHECK: %i32_v8f32.vararg = type <{ i32, [4 x i8], <8 x float> }>
+; CHECK: %v8f32_i32.vararg = type <{ <8 x float>, i32 }>
+; CHECK: %i32_v16f32.vararg = type <{ i32, [4 x i8], <16 x float> }>
+; CHECK: %v16f32_i32.vararg = type <{ <16 x float>, i32 }>
+; CHECK: %i32_v32f32.vararg = type <{ i32, [4 x i8], <32 x float> }>
+; CHECK: %v32f32_i32.vararg = type <{ <32 x float>, i32 }>
+
+%struct.__va_list_tag = type { i32, i32, ptr, ptr }
+%struct.libcS = type { i8, i16, i32, i64, float, double }
+
+; va_copy lowering: llvm.va_copy is replaced by an inline 24-byte memcpy of the
+; x86-64 __va_list_tag ({i32, i32, ptr, ptr}); lifetime markers are untouched.
+define void @copy(ptr noundef %va) {
+; CHECK-LABEL: define {{[^@]+}}@copy(ptr noundef %va) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %cp = alloca [1 x %struct.__va_list_tag], align 16
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr nonnull %cp) #2
+; CHECK-NEXT:    call void @llvm.memcpy.inline.p0.p0.i32(ptr %cp, ptr %va, i32 24, i1 false)
+; CHECK-NEXT:    call void @valist(ptr noundef nonnull %cp) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr nonnull %cp) #2
+; CHECK-NEXT:    ret void
+;
+entry:
+  %cp = alloca [1 x %struct.__va_list_tag], align 16
+  call void @llvm.lifetime.start.p0(i64 24, ptr nonnull %cp) #2
+  call void @llvm.va_copy.p0(ptr nonnull %cp, ptr %va)
+  call void @valist(ptr noundef nonnull %cp) #2
+  call void @llvm.lifetime.end.p0(i64 24, ptr nonnull %cp) #2
+  ret void
+}
+
+declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #0
+
+declare void @llvm.va_copy.p0(ptr, ptr) #1
+
+declare void @valist(ptr noundef)
+
+declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #0
+
+; Variadic definition rewrite: `(...)` becomes a single `ptr noalias %varargs`
+; parameter, va_start turns into a memcpy from %varargs into the local va_list,
+; and va_end is deleted outright (no counterpart in the CHECK lines).
+define void @start_once(...) {
+; CHECK-LABEL: define {{[^@]+}}@start_once(ptr noalias %varargs) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %s = alloca [1 x %struct.__va_list_tag], align 16
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr nonnull %s) #2
+; CHECK-NEXT:    call void @llvm.memcpy.inline.p0.p0.i32(ptr %s, ptr %varargs, i32 24, i1 false)
+; CHECK-NEXT:    call void @valist(ptr noundef nonnull %s) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr nonnull %s) #2
+; CHECK-NEXT:    ret void
+;
+entry:
+  %s = alloca [1 x %struct.__va_list_tag], align 16
+  call void @llvm.lifetime.start.p0(i64 24, ptr nonnull %s) #2
+  call void @llvm.va_start.p0(ptr nonnull %s)
+  call void @valist(ptr noundef nonnull %s) #2
+  call void @llvm.va_end.p0(ptr %s)
+  call void @llvm.lifetime.end.p0(i64 24, ptr nonnull %s) #2
+  ret void
+}
+
+declare void @llvm.va_start.p0(ptr) #1
+
+declare void @llvm.va_end.p0(ptr) #1
+
+; Two independent va_start/va_end pairs in one function: each becomes its own
+; memcpy from the same synthesized %varargs parameter, so restarting iteration
+; is just re-copying the snapshot.
+define void @start_twice(...) {
+; CHECK-LABEL: define {{[^@]+}}@start_twice(ptr noalias %varargs) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %s0 = alloca [1 x %struct.__va_list_tag], align 16
+; CHECK-NEXT:    %s1 = alloca [1 x %struct.__va_list_tag], align 16
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr nonnull %s0) #2
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr nonnull %s1) #2
+; CHECK-NEXT:    call void @llvm.memcpy.inline.p0.p0.i32(ptr %s0, ptr %varargs, i32 24, i1 false)
+; CHECK-NEXT:    call void @valist(ptr noundef nonnull %s0) #2
+; CHECK-NEXT:    call void @llvm.memcpy.inline.p0.p0.i32(ptr %s1, ptr %varargs, i32 24, i1 false)
+; CHECK-NEXT:    call void @valist(ptr noundef nonnull %s1) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr nonnull %s1) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr nonnull %s0) #2
+; CHECK-NEXT:    ret void
+;
+entry:
+  %s0 = alloca [1 x %struct.__va_list_tag], align 16
+  %s1 = alloca [1 x %struct.__va_list_tag], align 16
+  call void @llvm.lifetime.start.p0(i64 24, ptr nonnull %s0) #2
+  call void @llvm.lifetime.start.p0(i64 24, ptr nonnull %s1) #2
+  call void @llvm.va_start.p0(ptr nonnull %s0)
+  call void @valist(ptr noundef nonnull %s0) #2
+  call void @llvm.va_end.p0(ptr %s0)
+  call void @llvm.va_start.p0(ptr nonnull %s1)
+  call void @valist(ptr noundef nonnull %s1) #2
+  call void @llvm.va_end.p0(ptr %s1)
+  call void @llvm.lifetime.end.p0(i64 24, ptr nonnull %s1) #2
+  call void @llvm.lifetime.end.p0(i64 24, ptr nonnull %s0) #2
+  ret void
+}
+
+; Simplest call-site lowering: one i32 vararg is stored into a 4-byte frame and
+; the call is rewritten to pass a va_list whose gp_offset(48)/fp_offset(176)
+; are saturated so every va_arg reads from the overflow area (the frame).
+; NOTE(review): "overfow_arg_area" is misspelled; it matches the name emitted by
+; the pass, so correct both together and regenerate the checks.
+define void @single_i32(i32 noundef %x) {
+; CHECK-LABEL: define {{[^@]+}}@single_i32(i32 noundef %x) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %single_i32.vararg, align 16
+; CHECK-NEXT:    %va_list = alloca [1 x { i32, i32, ptr, ptr }], align 8
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %single_i32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store i32 %x, ptr %0, align 4
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr %va_list)
+; CHECK-NEXT:    %gp_offset = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 0
+; CHECK-NEXT:    store i32 48, ptr %gp_offset, align 4
+; CHECK-NEXT:    %fp_offset = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 1
+; CHECK-NEXT:    store i32 176, ptr %fp_offset, align 4
+; CHECK-NEXT:    %overfow_arg_area = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 2
+; CHECK-NEXT:    store ptr %vararg_buffer, ptr %overfow_arg_area, align 8
+; CHECK-NEXT:    %reg_save_area = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 3
+; CHECK-NEXT:    store ptr null, ptr %reg_save_area, align 8
+; CHECK-NEXT:    call void @vararg(ptr %va_list) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr %va_list)
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 4, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(i32 noundef %x) #2
+  ret void
+}
+
+declare void @vararg(...)
+
+; Single double vararg: 8-byte frame, stored at natural alignment; the fp_offset
+; of 176 ensures it is fetched from the overflow area, not the (null) register
+; save area.
+define void @single_double(double noundef %x) {
+; CHECK-LABEL: define {{[^@]+}}@single_double(double noundef %x) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %single_double.vararg, align 16
+; CHECK-NEXT:    %va_list = alloca [1 x { i32, i32, ptr, ptr }], align 8
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 8, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %single_double.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store double %x, ptr %0, align 8
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr %va_list)
+; CHECK-NEXT:    %gp_offset = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 0
+; CHECK-NEXT:    store i32 48, ptr %gp_offset, align 4
+; CHECK-NEXT:    %fp_offset = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 1
+; CHECK-NEXT:    store i32 176, ptr %fp_offset, align 4
+; CHECK-NEXT:    %overfow_arg_area = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 2
+; CHECK-NEXT:    store ptr %vararg_buffer, ptr %overfow_arg_area, align 8
+; CHECK-NEXT:    %reg_save_area = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 3
+; CHECK-NEXT:    store ptr null, ptr %reg_save_area, align 8
+; CHECK-NEXT:    call void @vararg(ptr %va_list) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr %va_list)
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(double noundef %x) #2
+  ret void
+}
+
+; <4 x float> is small enough to be passed directly (no byval): stored straight
+; into the 16-byte frame at align 16.
+define void @single_v4f32(<4 x float> noundef %x) {
+; CHECK-LABEL: define {{[^@]+}}@single_v4f32(<4 x float> noundef %x) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %single_v4f32.vararg, align 16
+; CHECK-NEXT:    %va_list = alloca [1 x { i32, i32, ptr, ptr }], align 8
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %single_v4f32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store <4 x float> %x, ptr %0, align 16
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr %va_list)
+; CHECK-NEXT:    %gp_offset = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 0
+; CHECK-NEXT:    store i32 48, ptr %gp_offset, align 4
+; CHECK-NEXT:    %fp_offset = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 1
+; CHECK-NEXT:    store i32 176, ptr %fp_offset, align 4
+; CHECK-NEXT:    %overfow_arg_area = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 2
+; CHECK-NEXT:    store ptr %vararg_buffer, ptr %overfow_arg_area, align 8
+; CHECK-NEXT:    %reg_save_area = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 3
+; CHECK-NEXT:    store ptr null, ptr %reg_save_area, align 8
+; CHECK-NEXT:    call void @vararg(ptr %va_list) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr %va_list)
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(<4 x float> noundef %x) #2
+  ret void
+}
+
+; <8 x float> arrives byval at align 32 and is memcpy'd into the frame.
+; NOTE(review): the frame alloca is only align 16 while the byval argument
+; demands align 32 — presumably acceptable because the x64 overflow arg area
+; guarantees at most 8/16-byte alignment, but worth confirming against the ABI.
+define void @single_v8f32(ptr nocapture noundef readonly byval(<8 x float>) align 32 %0) {
+; CHECK-LABEL: define {{[^@]+}}@single_v8f32(ptr nocapture noundef readonly byval(<8 x float>) align 32 %0) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %indirect-arg-temp = alloca <8 x float>, align 32
+; CHECK-NEXT:    %vararg_buffer = alloca %single_v8f32.vararg, align 16
+; CHECK-NEXT:    %va_list = alloca [1 x { i32, i32, ptr, ptr }], align 8
+; CHECK-NEXT:    %x = load <8 x float>, ptr %0, align 32, !tbaa !3
+; CHECK-NEXT:    store <8 x float> %x, ptr %indirect-arg-temp, align 32, !tbaa !3
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr %vararg_buffer)
+; CHECK-NEXT:    %1 = getelementptr inbounds %single_v8f32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr %1, ptr %indirect-arg-temp, i64 32, i1 false)
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr %va_list)
+; CHECK-NEXT:    %gp_offset = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 0
+; CHECK-NEXT:    store i32 48, ptr %gp_offset, align 4
+; CHECK-NEXT:    %fp_offset = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 1
+; CHECK-NEXT:    store i32 176, ptr %fp_offset, align 4
+; CHECK-NEXT:    %overfow_arg_area = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 2
+; CHECK-NEXT:    store ptr %vararg_buffer, ptr %overfow_arg_area, align 8
+; CHECK-NEXT:    %reg_save_area = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 3
+; CHECK-NEXT:    store ptr null, ptr %reg_save_area, align 8
+; CHECK-NEXT:    call void @vararg(ptr %va_list) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr %va_list)
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  %indirect-arg-temp = alloca <8 x float>, align 32
+  %x = load <8 x float>, ptr %0, align 32, !tbaa !4
+  store <8 x float> %x, ptr %indirect-arg-temp, align 32, !tbaa !4
+  tail call void (...) @vararg(ptr noundef nonnull byval(<8 x float>) align 32 %indirect-arg-temp) #2
+  ret void
+}
+
+; <16 x float> byval at align 64: same shape as single_v8f32 with a 64-byte
+; frame and memcpy (frame alloca still align 16, below the byval alignment).
+define void @single_v16f32(ptr nocapture noundef readonly byval(<16 x float>) align 64 %0) {
+; CHECK-LABEL: define {{[^@]+}}@single_v16f32(ptr nocapture noundef readonly byval(<16 x float>) align 64 %0) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %indirect-arg-temp = alloca <16 x float>, align 64
+; CHECK-NEXT:    %vararg_buffer = alloca %single_v16f32.vararg, align 16
+; CHECK-NEXT:    %va_list = alloca [1 x { i32, i32, ptr, ptr }], align 8
+; CHECK-NEXT:    %x = load <16 x float>, ptr %0, align 64, !tbaa !3
+; CHECK-NEXT:    store <16 x float> %x, ptr %indirect-arg-temp, align 64, !tbaa !3
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 64, ptr %vararg_buffer)
+; CHECK-NEXT:    %1 = getelementptr inbounds %single_v16f32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr %1, ptr %indirect-arg-temp, i64 64, i1 false)
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr %va_list)
+; CHECK-NEXT:    %gp_offset = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 0
+; CHECK-NEXT:    store i32 48, ptr %gp_offset, align 4
+; CHECK-NEXT:    %fp_offset = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 1
+; CHECK-NEXT:    store i32 176, ptr %fp_offset, align 4
+; CHECK-NEXT:    %overfow_arg_area = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 2
+; CHECK-NEXT:    store ptr %vararg_buffer, ptr %overfow_arg_area, align 8
+; CHECK-NEXT:    %reg_save_area = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 3
+; CHECK-NEXT:    store ptr null, ptr %reg_save_area, align 8
+; CHECK-NEXT:    call void @vararg(ptr %va_list) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr %va_list)
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 64, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  %indirect-arg-temp = alloca <16 x float>, align 64
+  %x = load <16 x float>, ptr %0, align 64, !tbaa !4
+  store <16 x float> %x, ptr %indirect-arg-temp, align 64, !tbaa !4
+  tail call void (...) @vararg(ptr noundef nonnull byval(<16 x float>) align 64 %indirect-arg-temp) #2
+  ret void
+}
+
+; <32 x float> byval at align 128: largest vector case, 128-byte frame memcpy'd
+; wholesale (frame alloca remains align 16).
+define void @single_v32f32(ptr nocapture noundef readonly byval(<32 x float>) align 128 %0) {
+; CHECK-LABEL: define {{[^@]+}}@single_v32f32(ptr nocapture noundef readonly byval(<32 x float>) align 128 %0) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %indirect-arg-temp = alloca <32 x float>, align 128
+; CHECK-NEXT:    %vararg_buffer = alloca %single_v32f32.vararg, align 16
+; CHECK-NEXT:    %va_list = alloca [1 x { i32, i32, ptr, ptr }], align 8
+; CHECK-NEXT:    %x = load <32 x float>, ptr %0, align 128, !tbaa !3
+; CHECK-NEXT:    store <32 x float> %x, ptr %indirect-arg-temp, align 128, !tbaa !3
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 128, ptr %vararg_buffer)
+; CHECK-NEXT:    %1 = getelementptr inbounds %single_v32f32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr %1, ptr %indirect-arg-temp, i64 128, i1 false)
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr %va_list)
+; CHECK-NEXT:    %gp_offset = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 0
+; CHECK-NEXT:    store i32 48, ptr %gp_offset, align 4
+; CHECK-NEXT:    %fp_offset = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 1
+; CHECK-NEXT:    store i32 176, ptr %fp_offset, align 4
+; CHECK-NEXT:    %overfow_arg_area = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 2
+; CHECK-NEXT:    store ptr %vararg_buffer, ptr %overfow_arg_area, align 8
+; CHECK-NEXT:    %reg_save_area = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 3
+; CHECK-NEXT:    store ptr null, ptr %reg_save_area, align 8
+; CHECK-NEXT:    call void @vararg(ptr %va_list) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr %va_list)
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 128, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  %indirect-arg-temp = alloca <32 x float>, align 128
+  %x = load <32 x float>, ptr %0, align 128, !tbaa !4
+  store <32 x float> %x, ptr %indirect-arg-temp, align 128, !tbaa !4
+  tail call void (...) @vararg(ptr noundef nonnull byval(<32 x float>) align 128 %indirect-arg-temp) #2
+  ret void
+}
+
+; i32 then double: padding is required (frame is <{ i32, [4 x i8], double }>,
+; 16 bytes), so the double store targets field index 2, skipping the pad.
+define void @i32_double(i32 noundef %x, double noundef %y) {
+; CHECK-LABEL: define {{[^@]+}}@i32_double(i32 noundef %x, double noundef %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %i32_double.vararg, align 16
+; CHECK-NEXT:    %va_list = alloca [1 x { i32, i32, ptr, ptr }], align 8
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %i32_double.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store i32 %x, ptr %0, align 4
+; CHECK-NEXT:    %1 = getelementptr inbounds %i32_double.vararg, ptr %vararg_buffer, i32 0, i32 2
+; CHECK-NEXT:    store double %y, ptr %1, align 8
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr %va_list)
+; CHECK-NEXT:    %gp_offset = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 0
+; CHECK-NEXT:    store i32 48, ptr %gp_offset, align 4
+; CHECK-NEXT:    %fp_offset = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 1
+; CHECK-NEXT:    store i32 176, ptr %fp_offset, align 4
+; CHECK-NEXT:    %overfow_arg_area = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 2
+; CHECK-NEXT:    store ptr %vararg_buffer, ptr %overfow_arg_area, align 8
+; CHECK-NEXT:    %reg_save_area = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 3
+; CHECK-NEXT:    store ptr null, ptr %reg_save_area, align 8
+; CHECK-NEXT:    call void @vararg(ptr %va_list) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr %va_list)
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(i32 noundef %x, double noundef %y) #2
+  ret void
+}
+
+; double then i32: reversed order needs no padding — the packed frame is 12
+; bytes (8 + 4), double at field 0, i32 at field 1.
+define void @double_i32(double noundef %x, i32 noundef %y) {
+; CHECK-LABEL: define {{[^@]+}}@double_i32(double noundef %x, i32 noundef %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %double_i32.vararg, align 16
+; CHECK-NEXT:    %va_list = alloca [1 x { i32, i32, ptr, ptr }], align 8
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 12, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %double_i32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store double %x, ptr %0, align 8
+; CHECK-NEXT:    %1 = getelementptr inbounds %double_i32.vararg, ptr %vararg_buffer, i32 0, i32 1
+; CHECK-NEXT:    store i32 %y, ptr %1, align 4
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr %va_list)
+; CHECK-NEXT:    %gp_offset = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 0
+; CHECK-NEXT:    store i32 48, ptr %gp_offset, align 4
+; CHECK-NEXT:    %fp_offset = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 1
+; CHECK-NEXT:    store i32 176, ptr %fp_offset, align 4
+; CHECK-NEXT:    %overfow_arg_area = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 2
+; CHECK-NEXT:    store ptr %vararg_buffer, ptr %overfow_arg_area, align 8
+; CHECK-NEXT:    %reg_save_area = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 3
+; CHECK-NEXT:    store ptr null, ptr %reg_save_area, align 8
+; CHECK-NEXT:    call void @vararg(ptr %va_list) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr %va_list)
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 12, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(double noundef %x, i32 noundef %y) #2
+  ret void
+}
+
+; i32 followed by a byval struct (%struct.libcS, 32 bytes): the struct is
+; memcpy'd into field index 2 of the 40-byte frame (index 1 is the 4-byte pad
+; after the leading i32).
+define void @i32_libcS(i32 noundef %x, ptr noundef byval(%struct.libcS) align 8 %y) {
+; CHECK-LABEL: define {{[^@]+}}@i32_libcS(i32 noundef %x, ptr noundef byval(%struct.libcS) align 8 %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %i32_libcS.vararg, align 16
+; CHECK-NEXT:    %va_list = alloca [1 x { i32, i32, ptr, ptr }], align 8
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 40, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %i32_libcS.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store i32 %x, ptr %0, align 4
+; CHECK-NEXT:    %1 = getelementptr inbounds %i32_libcS.vararg, ptr %vararg_buffer, i32 0, i32 2
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr %1, ptr %y, i64 32, i1 false)
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr %va_list)
+; CHECK-NEXT:    %gp_offset = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 0
+; CHECK-NEXT:    store i32 48, ptr %gp_offset, align 4
+; CHECK-NEXT:    %fp_offset = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 1
+; CHECK-NEXT:    store i32 176, ptr %fp_offset, align 4
+; CHECK-NEXT:    %overfow_arg_area = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 2
+; CHECK-NEXT:    store ptr %vararg_buffer, ptr %overfow_arg_area, align 8
+; CHECK-NEXT:    %reg_save_area = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 3
+; CHECK-NEXT:    store ptr null, ptr %reg_save_area, align 8
+; CHECK-NEXT:    call void @vararg(ptr %va_list) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr %va_list)
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 40, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(i32 noundef %x, ptr noundef nonnull byval(%struct.libcS) align 8 %y) #2
+  ret void
+}
+
+define void @libcS_i32(ptr noundef byval(%struct.libcS) align 8 %x, i32 noundef %y) {
+; CHECK-LABEL: define {{[^@]+}}@libcS_i32(ptr noundef byval(%struct.libcS) align 8 %x, i32 noundef %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %libcS_i32.vararg, align 16
+; CHECK-NEXT:    %va_list = alloca [1 x { i32, i32, ptr, ptr }], align 8
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 36, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %libcS_i32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr %0, ptr %x, i64 32, i1 false)
+; CHECK-NEXT:    %1 = getelementptr inbounds %libcS_i32.vararg, ptr %vararg_buffer, i32 0, i32 1
+; CHECK-NEXT:    store i32 %y, ptr %1, align 4
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr %va_list)
+; CHECK-NEXT:    %gp_offset = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 0
+; CHECK-NEXT:    store i32 48, ptr %gp_offset, align 4
+; CHECK-NEXT:    %fp_offset = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 1
+; CHECK-NEXT:    store i32 176, ptr %fp_offset, align 4
+; CHECK-NEXT:    %overfow_arg_area = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 2
+; CHECK-NEXT:    store ptr %vararg_buffer, ptr %overfow_arg_area, align 8
+; CHECK-NEXT:    %reg_save_area = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 3
+; CHECK-NEXT:    store ptr null, ptr %reg_save_area, align 8
+; CHECK-NEXT:    call void @vararg(ptr %va_list) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr %va_list)
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 36, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(ptr noundef nonnull byval(%struct.libcS) align 8 %x, i32 noundef %y) #2
+  ret void
+}
+
+define void @i32_v4f32(i32 noundef %x, <4 x float> noundef %y) {
+; CHECK-LABEL: define {{[^@]+}}@i32_v4f32(i32 noundef %x, <4 x float> noundef %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %i32_v4f32.vararg, align 16
+; CHECK-NEXT:    %va_list = alloca [1 x { i32, i32, ptr, ptr }], align 8
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %i32_v4f32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store i32 %x, ptr %0, align 4
+; CHECK-NEXT:    %1 = getelementptr inbounds %i32_v4f32.vararg, ptr %vararg_buffer, i32 0, i32 2
+; CHECK-NEXT:    store <4 x float> %y, ptr %1, align 16
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr %va_list)
+; CHECK-NEXT:    %gp_offset = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 0
+; CHECK-NEXT:    store i32 48, ptr %gp_offset, align 4
+; CHECK-NEXT:    %fp_offset = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 1
+; CHECK-NEXT:    store i32 176, ptr %fp_offset, align 4
+; CHECK-NEXT:    %overfow_arg_area = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 2
+; CHECK-NEXT:    store ptr %vararg_buffer, ptr %overfow_arg_area, align 8
+; CHECK-NEXT:    %reg_save_area = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 3
+; CHECK-NEXT:    store ptr null, ptr %reg_save_area, align 8
+; CHECK-NEXT:    call void @vararg(ptr %va_list) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr %va_list)
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(i32 noundef %x, <4 x float> noundef %y) #2
+  ret void
+}
+
+define void @v4f32_i32(<4 x float> noundef %x, i32 noundef %y) {
+; CHECK-LABEL: define {{[^@]+}}@v4f32_i32(<4 x float> noundef %x, i32 noundef %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %v4f32_i32.vararg, align 16
+; CHECK-NEXT:    %va_list = alloca [1 x { i32, i32, ptr, ptr }], align 8
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 20, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %v4f32_i32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store <4 x float> %x, ptr %0, align 16
+; CHECK-NEXT:    %1 = getelementptr inbounds %v4f32_i32.vararg, ptr %vararg_buffer, i32 0, i32 1
+; CHECK-NEXT:    store i32 %y, ptr %1, align 4
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr %va_list)
+; CHECK-NEXT:    %gp_offset = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 0
+; CHECK-NEXT:    store i32 48, ptr %gp_offset, align 4
+; CHECK-NEXT:    %fp_offset = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 1
+; CHECK-NEXT:    store i32 176, ptr %fp_offset, align 4
+; CHECK-NEXT:    %overfow_arg_area = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 2
+; CHECK-NEXT:    store ptr %vararg_buffer, ptr %overfow_arg_area, align 8
+; CHECK-NEXT:    %reg_save_area = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 3
+; CHECK-NEXT:    store ptr null, ptr %reg_save_area, align 8
+; CHECK-NEXT:    call void @vararg(ptr %va_list) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr %va_list)
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 20, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(<4 x float> noundef %x, i32 noundef %y) #2
+  ret void
+}
+
+define void @i32_v8f32(i32 noundef %x, ptr nocapture noundef readonly byval(<8 x float>) align 32 %0) {
+; CHECK-LABEL: define {{[^@]+}}@i32_v8f32(i32 noundef %x, ptr nocapture noundef readonly byval(<8 x float>) align 32 %0) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %indirect-arg-temp = alloca <8 x float>, align 32
+; CHECK-NEXT:    %vararg_buffer = alloca %i32_v8f32.vararg, align 16
+; CHECK-NEXT:    %va_list = alloca [1 x { i32, i32, ptr, ptr }], align 8
+; CHECK-NEXT:    %y = load <8 x float>, ptr %0, align 32, !tbaa !3
+; CHECK-NEXT:    store <8 x float> %y, ptr %indirect-arg-temp, align 32, !tbaa !3
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 40, ptr %vararg_buffer)
+; CHECK-NEXT:    %1 = getelementptr inbounds %i32_v8f32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store i32 %x, ptr %1, align 4
+; CHECK-NEXT:    %2 = getelementptr inbounds %i32_v8f32.vararg, ptr %vararg_buffer, i32 0, i32 2
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr %2, ptr %indirect-arg-temp, i64 32, i1 false)
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr %va_list)
+; CHECK-NEXT:    %gp_offset = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 0
+; CHECK-NEXT:    store i32 48, ptr %gp_offset, align 4
+; CHECK-NEXT:    %fp_offset = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 1
+; CHECK-NEXT:    store i32 176, ptr %fp_offset, align 4
+; CHECK-NEXT:    %overfow_arg_area = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 2
+; CHECK-NEXT:    store ptr %vararg_buffer, ptr %overfow_arg_area, align 8
+; CHECK-NEXT:    %reg_save_area = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 3
+; CHECK-NEXT:    store ptr null, ptr %reg_save_area, align 8
+; CHECK-NEXT:    call void @vararg(ptr %va_list) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr %va_list)
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 40, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  %indirect-arg-temp = alloca <8 x float>, align 32
+  %y = load <8 x float>, ptr %0, align 32, !tbaa !4
+  store <8 x float> %y, ptr %indirect-arg-temp, align 32, !tbaa !4
+  tail call void (...) @vararg(i32 noundef %x, ptr noundef nonnull byval(<8 x float>) align 32 %indirect-arg-temp) #2
+  ret void
+}
+
+define void @v8f32_i32(ptr nocapture noundef readonly byval(<8 x float>) align 32 %0, i32 noundef %y) {
+; CHECK-LABEL: define {{[^@]+}}@v8f32_i32(ptr nocapture noundef readonly byval(<8 x float>) align 32 %0, i32 noundef %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %indirect-arg-temp = alloca <8 x float>, align 32
+; CHECK-NEXT:    %vararg_buffer = alloca %v8f32_i32.vararg, align 16
+; CHECK-NEXT:    %va_list = alloca [1 x { i32, i32, ptr, ptr }], align 8
+; CHECK-NEXT:    %x = load <8 x float>, ptr %0, align 32, !tbaa !3
+; CHECK-NEXT:    store <8 x float> %x, ptr %indirect-arg-temp, align 32, !tbaa !3
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 36, ptr %vararg_buffer)
+; CHECK-NEXT:    %1 = getelementptr inbounds %v8f32_i32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr %1, ptr %indirect-arg-temp, i64 32, i1 false)
+; CHECK-NEXT:    %2 = getelementptr inbounds %v8f32_i32.vararg, ptr %vararg_buffer, i32 0, i32 1
+; CHECK-NEXT:    store i32 %y, ptr %2, align 4
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr %va_list)
+; CHECK-NEXT:    %gp_offset = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 0
+; CHECK-NEXT:    store i32 48, ptr %gp_offset, align 4
+; CHECK-NEXT:    %fp_offset = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 1
+; CHECK-NEXT:    store i32 176, ptr %fp_offset, align 4
+; CHECK-NEXT:    %overfow_arg_area = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 2
+; CHECK-NEXT:    store ptr %vararg_buffer, ptr %overfow_arg_area, align 8
+; CHECK-NEXT:    %reg_save_area = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 3
+; CHECK-NEXT:    store ptr null, ptr %reg_save_area, align 8
+; CHECK-NEXT:    call void @vararg(ptr %va_list) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr %va_list)
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 36, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  %indirect-arg-temp = alloca <8 x float>, align 32
+  %x = load <8 x float>, ptr %0, align 32, !tbaa !4
+  store <8 x float> %x, ptr %indirect-arg-temp, align 32, !tbaa !4
+  tail call void (...) @vararg(ptr noundef nonnull byval(<8 x float>) align 32 %indirect-arg-temp, i32 noundef %y) #2
+  ret void
+}
+
+define void @i32_v16f32(i32 noundef %x, ptr nocapture noundef readonly byval(<16 x float>) align 64 %0) {
+; CHECK-LABEL: define {{[^@]+}}@i32_v16f32(i32 noundef %x, ptr nocapture noundef readonly byval(<16 x float>) align 64 %0) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %indirect-arg-temp = alloca <16 x float>, align 64
+; CHECK-NEXT:    %vararg_buffer = alloca %i32_v16f32.vararg, align 16
+; CHECK-NEXT:    %va_list = alloca [1 x { i32, i32, ptr, ptr }], align 8
+; CHECK-NEXT:    %y = load <16 x float>, ptr %0, align 64, !tbaa !3
+; CHECK-NEXT:    store <16 x float> %y, ptr %indirect-arg-temp, align 64, !tbaa !3
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 72, ptr %vararg_buffer)
+; CHECK-NEXT:    %1 = getelementptr inbounds %i32_v16f32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store i32 %x, ptr %1, align 4
+; CHECK-NEXT:    %2 = getelementptr inbounds %i32_v16f32.vararg, ptr %vararg_buffer, i32 0, i32 2
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr %2, ptr %indirect-arg-temp, i64 64, i1 false)
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr %va_list)
+; CHECK-NEXT:    %gp_offset = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 0
+; CHECK-NEXT:    store i32 48, ptr %gp_offset, align 4
+; CHECK-NEXT:    %fp_offset = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 1
+; CHECK-NEXT:    store i32 176, ptr %fp_offset, align 4
+; CHECK-NEXT:    %overfow_arg_area = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 2
+; CHECK-NEXT:    store ptr %vararg_buffer, ptr %overfow_arg_area, align 8
+; CHECK-NEXT:    %reg_save_area = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 3
+; CHECK-NEXT:    store ptr null, ptr %reg_save_area, align 8
+; CHECK-NEXT:    call void @vararg(ptr %va_list) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr %va_list)
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 72, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  %indirect-arg-temp = alloca <16 x float>, align 64
+  %y = load <16 x float>, ptr %0, align 64, !tbaa !4
+  store <16 x float> %y, ptr %indirect-arg-temp, align 64, !tbaa !4
+  tail call void (...) @vararg(i32 noundef %x, ptr noundef nonnull byval(<16 x float>) align 64 %indirect-arg-temp) #2
+  ret void
+}
+
+define void @v16f32_i32(ptr nocapture noundef readonly byval(<16 x float>) align 64 %0, i32 noundef %y) {
+; CHECK-LABEL: define {{[^@]+}}@v16f32_i32(ptr nocapture noundef readonly byval(<16 x float>) align 64 %0, i32 noundef %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %indirect-arg-temp = alloca <16 x float>, align 64
+; CHECK-NEXT:    %vararg_buffer = alloca %v16f32_i32.vararg, align 16
+; CHECK-NEXT:    %va_list = alloca [1 x { i32, i32, ptr, ptr }], align 8
+; CHECK-NEXT:    %x = load <16 x float>, ptr %0, align 64, !tbaa !3
+; CHECK-NEXT:    store <16 x float> %x, ptr %indirect-arg-temp, align 64, !tbaa !3
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 68, ptr %vararg_buffer)
+; CHECK-NEXT:    %1 = getelementptr inbounds %v16f32_i32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr %1, ptr %indirect-arg-temp, i64 64, i1 false)
+; CHECK-NEXT:    %2 = getelementptr inbounds %v16f32_i32.vararg, ptr %vararg_buffer, i32 0, i32 1
+; CHECK-NEXT:    store i32 %y, ptr %2, align 4
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr %va_list)
+; CHECK-NEXT:    %gp_offset = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 0
+; CHECK-NEXT:    store i32 48, ptr %gp_offset, align 4
+; CHECK-NEXT:    %fp_offset = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 1
+; CHECK-NEXT:    store i32 176, ptr %fp_offset, align 4
+; CHECK-NEXT:    %overfow_arg_area = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 2
+; CHECK-NEXT:    store ptr %vararg_buffer, ptr %overfow_arg_area, align 8
+; CHECK-NEXT:    %reg_save_area = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 3
+; CHECK-NEXT:    store ptr null, ptr %reg_save_area, align 8
+; CHECK-NEXT:    call void @vararg(ptr %va_list) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr %va_list)
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 68, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  %indirect-arg-temp = alloca <16 x float>, align 64
+  %x = load <16 x float>, ptr %0, align 64, !tbaa !4
+  store <16 x float> %x, ptr %indirect-arg-temp, align 64, !tbaa !4
+  tail call void (...) @vararg(ptr noundef nonnull byval(<16 x float>) align 64 %indirect-arg-temp, i32 noundef %y) #2
+  ret void
+}
+
+define void @i32_v32f32(i32 noundef %x, ptr nocapture noundef readonly byval(<32 x float>) align 128 %0) {
+; CHECK-LABEL: define {{[^@]+}}@i32_v32f32(i32 noundef %x, ptr nocapture noundef readonly byval(<32 x float>) align 128 %0) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %indirect-arg-temp = alloca <32 x float>, align 128
+; CHECK-NEXT:    %vararg_buffer = alloca %i32_v32f32.vararg, align 16
+; CHECK-NEXT:    %va_list = alloca [1 x { i32, i32, ptr, ptr }], align 8
+; CHECK-NEXT:    %y = load <32 x float>, ptr %0, align 128, !tbaa !3
+; CHECK-NEXT:    store <32 x float> %y, ptr %indirect-arg-temp, align 128, !tbaa !3
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 136, ptr %vararg_buffer)
+; CHECK-NEXT:    %1 = getelementptr inbounds %i32_v32f32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store i32 %x, ptr %1, align 4
+; CHECK-NEXT:    %2 = getelementptr inbounds %i32_v32f32.vararg, ptr %vararg_buffer, i32 0, i32 2
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr %2, ptr %indirect-arg-temp, i64 128, i1 false)
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr %va_list)
+; CHECK-NEXT:    %gp_offset = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 0
+; CHECK-NEXT:    store i32 48, ptr %gp_offset, align 4
+; CHECK-NEXT:    %fp_offset = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 1
+; CHECK-NEXT:    store i32 176, ptr %fp_offset, align 4
+; CHECK-NEXT:    %overfow_arg_area = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 2
+; CHECK-NEXT:    store ptr %vararg_buffer, ptr %overfow_arg_area, align 8
+; CHECK-NEXT:    %reg_save_area = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 3
+; CHECK-NEXT:    store ptr null, ptr %reg_save_area, align 8
+; CHECK-NEXT:    call void @vararg(ptr %va_list) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr %va_list)
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 136, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  %indirect-arg-temp = alloca <32 x float>, align 128
+  %y = load <32 x float>, ptr %0, align 128, !tbaa !4
+  store <32 x float> %y, ptr %indirect-arg-temp, align 128, !tbaa !4
+  tail call void (...) @vararg(i32 noundef %x, ptr noundef nonnull byval(<32 x float>) align 128 %indirect-arg-temp) #2
+  ret void
+}
+
+define void @v32f32_i32(ptr nocapture noundef readonly byval(<32 x float>) align 128 %0, i32 noundef %y) {
+; CHECK-LABEL: define {{[^@]+}}@v32f32_i32(ptr nocapture noundef readonly byval(<32 x float>) align 128 %0, i32 noundef %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %indirect-arg-temp = alloca <32 x float>, align 128
+; CHECK-NEXT:    %vararg_buffer = alloca %v32f32_i32.vararg, align 16
+; CHECK-NEXT:    %va_list = alloca [1 x { i32, i32, ptr, ptr }], align 8
+; CHECK-NEXT:    %x = load <32 x float>, ptr %0, align 128, !tbaa !3
+; CHECK-NEXT:    store <32 x float> %x, ptr %indirect-arg-temp, align 128, !tbaa !3
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 132, ptr %vararg_buffer)
+; CHECK-NEXT:    %1 = getelementptr inbounds %v32f32_i32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr %1, ptr %indirect-arg-temp, i64 128, i1 false)
+; CHECK-NEXT:    %2 = getelementptr inbounds %v32f32_i32.vararg, ptr %vararg_buffer, i32 0, i32 1
+; CHECK-NEXT:    store i32 %y, ptr %2, align 4
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr %va_list)
+; CHECK-NEXT:    %gp_offset = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 0
+; CHECK-NEXT:    store i32 48, ptr %gp_offset, align 4
+; CHECK-NEXT:    %fp_offset = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 1
+; CHECK-NEXT:    store i32 176, ptr %fp_offset, align 4
+; CHECK-NEXT:    %overfow_arg_area = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 2
+; CHECK-NEXT:    store ptr %vararg_buffer, ptr %overfow_arg_area, align 8
+; CHECK-NEXT:    %reg_save_area = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 3
+; CHECK-NEXT:    store ptr null, ptr %reg_save_area, align 8
+; CHECK-NEXT:    call void @vararg(ptr %va_list) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr %va_list)
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 132, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  %indirect-arg-temp = alloca <32 x float>, align 128
+  %x = load <32 x float>, ptr %0, align 128, !tbaa !4
+  store <32 x float> %x, ptr %indirect-arg-temp, align 128, !tbaa !4
+  tail call void (...) @vararg(ptr noundef nonnull byval(<32 x float>) align 128 %indirect-arg-temp, i32 noundef %y) #2
+  ret void
+}
+
+attributes #0 = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) }
+attributes #1 = { nocallback nofree nosync nounwind willreturn }
+attributes #2 = { mustprogress }
+
+!llvm.module.flags = !{!0, !1, !2}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 8, !"PIC Level", i32 2}
+!2 = !{i32 7, !"PIE Level", i32 2}
+!4 = !{!5, !5, i64 0}
+!5 = !{!"omnipotent char", !6, i64 0}
+!6 = !{!"Simple C/C++ TBAA"}
diff --git a/llvm/test/Other/new-pm-defaults.ll b/llvm/test/Other/new-pm-defaults.ll
index 51fb93daa4dfa6..54c593c98ab669 100644
--- a/llvm/test/Other/new-pm-defaults.ll
+++ b/llvm/test/Other/new-pm-defaults.ll
@@ -125,6 +125,7 @@
 ; CHECK-O-NEXT: Running analysis: OuterAnalysisManagerProxy
 ; CHECK-EP-PEEPHOLE-NEXT: Running pass: NoOpFunctionPass
 ; CHECK-O-NEXT: Running pass: SimplifyCFGPass
+; CHECK-O-NEXT: Running pass: ExpandVariadicsPass
 ; CHECK-O-NEXT: Running pass: AlwaysInlinerPass
 ; CHECK-O-NEXT: Running analysis: ProfileSummaryAnalysis
 ; CHECK-O-NEXT: Running pass: ModuleInlinerWrapperPass
diff --git a/llvm/test/Other/new-pm-thinlto-postlink-defaults.ll b/llvm/test/Other/new-pm-thinlto-postlink-defaults.ll
index 064362eabbf839..fa9c0cc7708e60 100644
--- a/llvm/test/Other/new-pm-thinlto-postlink-defaults.ll
+++ b/llvm/test/Other/new-pm-thinlto-postlink-defaults.ll
@@ -61,6 +61,7 @@
 ; CHECK-O-NEXT: Running analysis: TypeBasedAA
 ; CHECK-O-NEXT: Running analysis: OuterAnalysisManagerProxy
 ; CHECK-O-NEXT: Running pass: SimplifyCFGPass
+; CHECK-O-NEXT: Running pass: ExpandVariadicsPass
 ; CHECK-O-NEXT: Running pass: AlwaysInlinerPass
 ; CHECK-PRELINK-O-NEXT: Running analysis: ProfileSummaryAnalysis
 ; CHECK-O-NEXT: Running pass: ModuleInlinerWrapperPass
diff --git a/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll b/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll
index 19a44867e434ac..8bd372f4c06910 100644
--- a/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll
+++ b/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll
@@ -50,6 +50,7 @@
 ; CHECK-O-NEXT: Running analysis: LoopAnalysis on foo
 ; CHECK-O-NEXT: Running analysis: PostDominatorTreeAnalysis on foo
 ; CHECK-O-NEXT: Running pass: SimplifyCFGPass
+; CHECK-O-NEXT: Running pass: ExpandVariadicsPass
 ; CHECK-O-NEXT: Running pass: AlwaysInlinerPass
 ; CHECK-O-NEXT: Running pass: ModuleInlinerWrapperPass
 ; CHECK-O-NEXT: Running analysis: InlineAdvisorAnalysis
diff --git a/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll b/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll
index ac80a31d8fd4bc..dd99340e60c0d3 100644
--- a/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll
+++ b/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll
@@ -58,7 +58,7 @@
 ; CHECK-O-NEXT: Running analysis: LoopAnalysis on foo
 ; CHECK-O-NEXT: Running analysis: PostDominatorTreeAnalysis on foo
 ; CHECK-O-NEXT: Running pass: SimplifyCFGPass on foo
-
+; CHECK-O-NEXT: Running pass: ExpandVariadicsPass
 ; CHECK-O-NEXT: Running pass: AlwaysInlinerPass
 ; CHECK-O-NEXT: Running pass: ModuleInlinerWrapperPass
 ; CHECK-O-NEXT: Running analysis: InlineAdvisorAnalysis
diff --git a/llvm/test/Other/new-pm-thinlto-prelink-defaults.ll b/llvm/test/Other/new-pm-thinlto-prelink-defaults.ll
index 6486639e07b49c..d2a53cea68c37b 100644
--- a/llvm/test/Other/new-pm-thinlto-prelink-defaults.ll
+++ b/llvm/test/Other/new-pm-thinlto-prelink-defaults.ll
@@ -92,6 +92,7 @@
 ; CHECK-O-NEXT: Running analysis: TypeBasedAA
 ; CHECK-O-NEXT: Running analysis: OuterAnalysisManagerProxy
 ; CHECK-O-NEXT: Running pass: SimplifyCFGPass
+; CHECK-O-NEXT: Running pass: ExpandVariadicsPass
 ; CHECK-O-NEXT: Running pass: AlwaysInlinerPass
 ; CHECK-O-NEXT: Running analysis: ProfileSummaryAnalysis
 ; CHECK-O-NEXT: Running pass: ModuleInlinerWrapperPass
diff --git a/llvm/test/Other/new-pm-thinlto-prelink-pgo-defaults.ll b/llvm/test/Other/new-pm-thinlto-prelink-pgo-defaults.ll
index 09f9f0f48baddb..cc110b01fb2445 100644
--- a/llvm/test/Other/new-pm-thinlto-prelink-pgo-defaults.ll
+++ b/llvm/test/Other/new-pm-thinlto-prelink-pgo-defaults.ll
@@ -83,6 +83,7 @@
 ; CHECK-O-NEXT: Running pass: PGOIndirectCallPromotion on
 ; CHECK-O-NEXT: Running analysis: InnerAnalysisManagerProxy
 ; CHECK-O-NEXT: Running analysis: OptimizationRemarkEmitterAnalysis on foo
+; CHECK-O-NEXT: Running pass: ExpandVariadicsPass
 ; CHECK-O-NEXT: Running pass: AlwaysInlinerPass
 ; CHECK-O-NEXT: Running pass: ModuleInlinerWrapperPass
 ; CHECK-O-NEXT: Running analysis: InlineAdvisorAnalysis
diff --git a/llvm/test/Other/new-pm-thinlto-prelink-samplepgo-defaults.ll b/llvm/test/Other/new-pm-thinlto-prelink-samplepgo-defaults.ll
index 47bdbfd2d357d4..d2c547e04085e3 100644
--- a/llvm/test/Other/new-pm-thinlto-prelink-samplepgo-defaults.ll
+++ b/llvm/test/Other/new-pm-thinlto-prelink-samplepgo-defaults.ll
@@ -63,6 +63,7 @@
 ; CHECK-O-NEXT: Running analysis: LoopAnalysis on foo
 ; CHECK-O-NEXT: Running analysis: PostDominatorTreeAnalysis on foo
 ; CHECK-O-NEXT: Running pass: SimplifyCFGPass on foo
+; CHECK-O-NEXT: Running pass: ExpandVariadicsPass
 ; CHECK-O-NEXT: Running pass: AlwaysInlinerPass
 ; CHECK-O-NEXT: Running pass: ModuleInlinerWrapperPass
 ; CHECK-O-NEXT: Running analysis: InlineAdvisorAnalysis
diff --git a/llvm/test/Transforms/ExpandVariadics/expand-va-intrinsic-outliner.ll b/llvm/test/Transforms/ExpandVariadics/expand-va-intrinsic-outliner.ll
new file mode 100644
index 00000000000000..59d8ceed14f64d
--- /dev/null
+++ b/llvm/test/Transforms/ExpandVariadics/expand-va-intrinsic-outliner.ll
@@ -0,0 +1,86 @@
+; RUN: opt -mtriple=i386-unknown-linux-gnu -S --passes=expand-variadics --expand-variadics-override=optimize < %s | FileCheck %s -check-prefix=X86
+; RUN: opt -mtriple=x86_64-unknown-linux-gnu -S --passes=expand-variadics --expand-variadics-override=optimize < %s | FileCheck %s -check-prefix=X64
+
+
+declare void @llvm.va_start(ptr)
+declare void @llvm.va_end(ptr)
+
+declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture)
+declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture)
+
+declare void @sink_valist(ptr)
+declare void @sink_i32(i32)
+
+%struct.__va_list_tag = type { i32, i32, ptr, ptr }
+
+;; Simple function is split into two functions
+; X86-LABEL: define internal void @x86_non_inlinable.valist(
+; X86:       entry:
+; X86:       %va = alloca ptr, align 4
+; X86:       call void @sink_i32(i32 0)
+; X86:       store ptr %varargs, ptr %va, align 4
+; X86:       %0 = load ptr, ptr %va, align 4
+; X86:       call void @sink_valist(ptr noundef %0)
+; X86:       ret void
+; X86:     }
+; X86-LABEL: define void @x86_non_inlinable(
+; X86:       entry:
+; X86:       %va_list = alloca ptr, align 4
+; X86:       call void @llvm.va_start.p0(ptr %va_list)
+; X86:       tail call void @x86_non_inlinable.valist(ptr %va_list)
+; X86:       ret void
+; X86:       }
+define void @x86_non_inlinable(...)  {
+entry:
+  %va = alloca ptr, align 4
+  call void @sink_i32(i32 0)
+  call void @llvm.va_start.p0(ptr nonnull %va)
+  %0 = load ptr, ptr %va, align 4
+  call void @sink_valist(ptr noundef %0)
+  ret void
+}
+
+; TODO: Add CHECK lines verifying that this call site is rewritten to build a va_list and call @x86_non_inlinable.valist.
+define void @x86_caller(i32 %x) {
+  call void (...) @x86_non_inlinable(i32 %x)
+  ret void
+}
+
+
+;; As above, but for x64 - the different va_list type means a missing load.
+; X64-LABEL: define internal void @x64_non_inlinable.valist(
+; X64:       entry:
+; X64:       %va = alloca [1 x %struct.__va_list_tag], align 16
+; X64:       call void @sink_i32(i32 0)
+; X64:       call void @llvm.memcpy.inline.p0.p0.i32(ptr %va, ptr %varargs, i32 24, i1 false)
+; X64:       call void @sink_valist(ptr noundef %va)
+; X64:       ret void
+; X64:     }
+; X64-LABEL: define void @x64_non_inlinable(
+; X64:       entry:
+; X64:       %va_list = alloca [1 x { i32, i32, ptr, ptr }], align 8
+; X64:       call void @llvm.va_start.p0(ptr %va_list)
+; X64:       tail call void @x64_non_inlinable.valist(ptr %va_list)
+; X64:       ret void
+; X64:       }
+define void @x64_non_inlinable(...)  {
+entry:
+  %va = alloca [1 x %struct.__va_list_tag], align 16
+  call void @sink_i32(i32 0)
+  call void @llvm.va_start.p0(ptr nonnull %va)
+  call void @sink_valist(ptr noundef %va)
+  ret void
+}
+
+
+; TODO: Add CHECK lines verifying this function is left unchanged; it has no known callers, so it must not be split.
+define void @no_known_callers(...)  {
+entry:
+  %va = alloca ptr, align 4
+  call void @sink_i32(i32 0)
+  call void @llvm.va_start.p0(ptr nonnull %va)
+  %0 = load ptr, ptr %va, align 4
+  call void @sink_valist(ptr noundef %0)
+  ret void
+}
+
diff --git a/llvm/test/Transforms/ExpandVariadics/expand-va-intrinsic-split-linkage.ll b/llvm/test/Transforms/ExpandVariadics/expand-va-intrinsic-split-linkage.ll
new file mode 100644
index 00000000000000..9e96bfaccd4c39
--- /dev/null
+++ b/llvm/test/Transforms/ExpandVariadics/expand-va-intrinsic-split-linkage.ll
@@ -0,0 +1,225 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: -p --function-signature
+; RUN: opt -mtriple=i386-unknown-linux-gnu -S --passes=expand-variadics --expand-variadics-override=optimize < %s | FileCheck %s --check-prefixes=OPT
+; RUN: opt -mtriple=i386-unknown-linux-gnu -S --passes=expand-variadics --expand-variadics-override=lowering < %s | FileCheck %s --check-prefixes=ABI
+
+; Split variadic functions into two functions:
+; - one equivalent to the original, same symbol etc
+; - one implementing the contents of the original but taking a valist
+; IR here is applicable to any target that uses a ptr for valist
+;
+; Defines a function with each linkage (in the order of the llvm documentation).
+; If split applies it does the same transform to each.
+; Whether split applies depends on whether the ABI is being changed or not - e.g. a weak
+; function is not normally useful to split as the contents cannot be called from elsewhere.
+; If the ABI is being rewritten then the function is still converted. Call sites tested elsewhere.
+
+; Note: update_test_checks.py does not emit checks for declares
+
+declare void @sink_valist(ptr)
+declare void @llvm.va_start(ptr)
+declare void @llvm.va_end(ptr)
+
+declare void @decl_simple(...)
+define void @defn_simple(...) { ; external linkage: split into @defn_simple.valist at optimize, ABI-rewritten at lowering (checks below)
+; OPT-LABEL: define {{[^@]+}}@defn_simple(...) {
+; OPT-NEXT:  entry:
+; OPT-NEXT:    %va_list = alloca ptr, align 4
+; OPT-NEXT:    call void @llvm.va_start.p0(ptr %va_list)
+; OPT-NEXT:    tail call void @defn_simple.valist(ptr %va_list)
+; OPT-NEXT:    ret void
+;
+; ABI-LABEL: define {{[^@]+}}@defn_simple(ptr noalias %varargs) {
+; ABI-NEXT:    %va = alloca ptr, align 4
+; ABI-NEXT:    store ptr %varargs, ptr %va, align 4
+; ABI-NEXT:    call void @sink_valist(ptr %va)
+; ABI-NEXT:    ret void
+;
+  %va = alloca ptr, align 4
+  call void @llvm.va_start(ptr %va)
+  call void @sink_valist(ptr %va)
+  call void @llvm.va_end(ptr %va) ; elided in both checked outputs
+  ret void
+}
+
+; no declare for private
+define private void @defn_private_simple(...) { ; private linkage: split at optimize, ABI-rewritten at lowering (checks below)
+; OPT-LABEL: define {{[^@]+}}@defn_private_simple(...) {
+; OPT-NEXT:  entry:
+; OPT-NEXT:    %va_list = alloca ptr, align 4
+; OPT-NEXT:    call void @llvm.va_start.p0(ptr %va_list)
+; OPT-NEXT:    tail call void @defn_private_simple.valist(ptr %va_list)
+; OPT-NEXT:    ret void
+;
+; ABI-LABEL: define {{[^@]+}}@defn_private_simple(ptr noalias %varargs) {
+; ABI-NEXT:    %va = alloca ptr, align 4
+; ABI-NEXT:    store ptr %varargs, ptr %va, align 4
+; ABI-NEXT:    call void @sink_valist(ptr %va)
+; ABI-NEXT:    ret void
+;
+  %va = alloca ptr, align 4
+  call void @llvm.va_start(ptr %va)
+  call void @sink_valist(ptr %va)
+  call void @llvm.va_end(ptr %va) ; elided in both checked outputs
+  ret void
+}
+
+; no declare for internal
+define internal void @defn_internal_simple(...) { ; internal linkage: split at optimize, ABI-rewritten at lowering (checks below)
+; OPT-LABEL: define {{[^@]+}}@defn_internal_simple(...) {
+; OPT-NEXT:  entry:
+; OPT-NEXT:    %va_list = alloca ptr, align 4
+; OPT-NEXT:    call void @llvm.va_start.p0(ptr %va_list)
+; OPT-NEXT:    tail call void @defn_internal_simple.valist(ptr %va_list)
+; OPT-NEXT:    ret void
+;
+; ABI-LABEL: define {{[^@]+}}@defn_internal_simple(ptr noalias %varargs) {
+; ABI-NEXT:    %va = alloca ptr, align 4
+; ABI-NEXT:    store ptr %varargs, ptr %va, align 4
+; ABI-NEXT:    call void @sink_valist(ptr %va)
+; ABI-NEXT:    ret void
+;
+  %va = alloca ptr, align 4
+  call void @llvm.va_start(ptr %va)
+  call void @sink_valist(ptr %va)
+  call void @llvm.va_end(ptr %va) ; elided in both checked outputs
+  ret void
+}
+
+; no declare for available_externally
+define available_externally void @available_externally_simple(...) { ; available_externally: split at optimize, ABI-rewritten at lowering (checks below)
+; OPT-LABEL: define {{[^@]+}}@available_externally_simple(...) {
+; OPT-NEXT:  entry:
+; OPT-NEXT:    %va_list = alloca ptr, align 4
+; OPT-NEXT:    call void @llvm.va_start.p0(ptr %va_list)
+; OPT-NEXT:    tail call void @available_externally_simple.valist(ptr %va_list)
+; OPT-NEXT:    ret void
+;
+; ABI-LABEL: define {{[^@]+}}@available_externally_simple(ptr noalias %varargs) {
+; ABI-NEXT:    %va = alloca ptr, align 4
+; ABI-NEXT:    store ptr %varargs, ptr %va, align 4
+; ABI-NEXT:    call void @sink_valist(ptr %va)
+; ABI-NEXT:    ret void
+;
+  %va = alloca ptr, align 4
+  call void @llvm.va_start(ptr %va)
+  call void @sink_valist(ptr %va)
+  call void @llvm.va_end(ptr %va) ; elided in both checked outputs
+  ret void
+}
+
+; no declare for linkonce
+define linkonce void @defn_linkonce_simple(...) { ; linkonce: no .valist split at optimize (checks keep the body; only va_end is removed); lowering still rewrites the ABI
+; OPT-LABEL: define {{[^@]+}}@defn_linkonce_simple(...) {
+; OPT-NEXT:    %va = alloca ptr, align 4
+; OPT-NEXT:    call void @llvm.va_start.p0(ptr %va)
+; OPT-NEXT:    call void @sink_valist(ptr %va)
+; OPT-NEXT:    ret void
+;
+; ABI-LABEL: define {{[^@]+}}@defn_linkonce_simple(ptr noalias %varargs) {
+; ABI-NEXT:    %va = alloca ptr, align 4
+; ABI-NEXT:    store ptr %varargs, ptr %va, align 4
+; ABI-NEXT:    call void @sink_valist(ptr %va)
+; ABI-NEXT:    ret void
+;
+  %va = alloca ptr, align 4
+  call void @llvm.va_start(ptr %va)
+  call void @sink_valist(ptr %va)
+  call void @llvm.va_end(ptr %va) ; elided in both checked outputs
+  ret void
+}
+
+; no declare for weak
+define weak void @defn_weak_simple(...) { ; weak: no .valist split at optimize (checks keep the body; only va_end is removed); lowering still rewrites the ABI
+; OPT-LABEL: define {{[^@]+}}@defn_weak_simple(...) {
+; OPT-NEXT:    %va = alloca ptr, align 4
+; OPT-NEXT:    call void @llvm.va_start.p0(ptr %va)
+; OPT-NEXT:    call void @sink_valist(ptr %va)
+; OPT-NEXT:    ret void
+;
+; ABI-LABEL: define {{[^@]+}}@defn_weak_simple(ptr noalias %varargs) {
+; ABI-NEXT:    %va = alloca ptr, align 4
+; ABI-NEXT:    store ptr %varargs, ptr %va, align 4
+; ABI-NEXT:    call void @sink_valist(ptr %va)
+; ABI-NEXT:    ret void
+;
+  %va = alloca ptr, align 4
+  call void @llvm.va_start(ptr %va)
+  call void @sink_valist(ptr %va)
+  call void @llvm.va_end(ptr %va) ; elided in both checked outputs
+  ret void
+}
+
+; common is not applicable to functions
+; appending is not applicable to functions
+
+declare extern_weak void @decl_extern_weak_simple(...)
+; no define for extern_weak
+
+; no declare for linkonce_odr
+define linkonce_odr void @defn_linkonce_odr_simple(...) { ; linkonce_odr: split at optimize (unlike plain linkonce above), ABI-rewritten at lowering
+; OPT-LABEL: define {{[^@]+}}@defn_linkonce_odr_simple(...) {
+; OPT-NEXT:  entry:
+; OPT-NEXT:    %va_list = alloca ptr, align 4
+; OPT-NEXT:    call void @llvm.va_start.p0(ptr %va_list)
+; OPT-NEXT:    tail call void @defn_linkonce_odr_simple.valist(ptr %va_list)
+; OPT-NEXT:    ret void
+;
+; ABI-LABEL: define {{[^@]+}}@defn_linkonce_odr_simple(ptr noalias %varargs) {
+; ABI-NEXT:    %va = alloca ptr, align 4
+; ABI-NEXT:    store ptr %varargs, ptr %va, align 4
+; ABI-NEXT:    call void @sink_valist(ptr %va)
+; ABI-NEXT:    ret void
+;
+  %va = alloca ptr, align 4
+  call void @llvm.va_start(ptr %va)
+  call void @sink_valist(ptr %va)
+  call void @llvm.va_end(ptr %va) ; elided in both checked outputs
+  ret void
+}
+
+; no declare for weak_odr
+define weak_odr void @defn_weak_odr_simple(...) { ; weak_odr: split at optimize (unlike plain weak above), ABI-rewritten at lowering
+; OPT-LABEL: define {{[^@]+}}@defn_weak_odr_simple(...) {
+; OPT-NEXT:  entry:
+; OPT-NEXT:    %va_list = alloca ptr, align 4
+; OPT-NEXT:    call void @llvm.va_start.p0(ptr %va_list)
+; OPT-NEXT:    tail call void @defn_weak_odr_simple.valist(ptr %va_list)
+; OPT-NEXT:    ret void
+;
+; ABI-LABEL: define {{[^@]+}}@defn_weak_odr_simple(ptr noalias %varargs) {
+; ABI-NEXT:    %va = alloca ptr, align 4
+; ABI-NEXT:    store ptr %varargs, ptr %va, align 4
+; ABI-NEXT:    call void @sink_valist(ptr %va)
+; ABI-NEXT:    ret void
+;
+  %va = alloca ptr, align 4
+  call void @llvm.va_start(ptr %va)
+  call void @sink_valist(ptr %va)
+  call void @llvm.va_end(ptr %va) ; elided in both checked outputs
+  ret void
+}
+
+declare external void @decl_external_simple(...)
+define external void @defn_external_simple(...) { ; explicit external linkage: split at optimize, ABI-rewritten at lowering (checks below)
+; OPT-LABEL: define {{[^@]+}}@defn_external_simple(...) {
+; OPT-NEXT:  entry:
+; OPT-NEXT:    %va_list = alloca ptr, align 4
+; OPT-NEXT:    call void @llvm.va_start.p0(ptr %va_list)
+; OPT-NEXT:    tail call void @defn_external_simple.valist(ptr %va_list)
+; OPT-NEXT:    ret void
+;
+; ABI-LABEL: define {{[^@]+}}@defn_external_simple(ptr noalias %varargs) {
+; ABI-NEXT:    %va = alloca ptr, align 4
+; ABI-NEXT:    store ptr %varargs, ptr %va, align 4
+; ABI-NEXT:    call void @sink_valist(ptr %va)
+; ABI-NEXT:    ret void
+;
+  %va = alloca ptr, align 4
+  call void @llvm.va_start(ptr %va)
+  call void @sink_valist(ptr %va)
+  call void @llvm.va_end(ptr %va) ; elided in both checked outputs
+  ret void
+}
+
+
+
diff --git a/llvm/test/Transforms/ExpandVariadics/expand-va-intrinsic-split-simple.ll b/llvm/test/Transforms/ExpandVariadics/expand-va-intrinsic-split-simple.ll
new file mode 100644
index 00000000000000..2866aade4813bb
--- /dev/null
+++ b/llvm/test/Transforms/ExpandVariadics/expand-va-intrinsic-split-simple.ll
@@ -0,0 +1,121 @@
+; RUN: opt -mtriple=i386-unknown-linux-gnu -S --passes=expand-variadics --expand-variadics-override=optimize < %s | FileCheck %s
+
+; i386 uses a void* for va_arg
+; amdgpu should be the same codegen, nvptx slightly different alignment on the va_arg
+
+; Examples are variadic functions that return the first or the second of an int and a double
+; Split the functions into an internal equivalent that takes a va_list and an ABI-preserving wrapper
+
+define i32 @variadic_int_double_get_firstz(...) { ; returns the first (i32) vararg; split into .valist clone + wrapper (checks below)
+entry:
+  %va = alloca ptr, align 4 ; i386 va_list is a single pointer
+  call void @llvm.va_start.p0(ptr nonnull %va)
+  %argp.cur = load ptr, ptr %va, align 4
+  %argp.next = getelementptr inbounds i8, ptr %argp.cur, i32 4 ; advance past the 4-byte i32
+  store ptr %argp.next, ptr %va, align 4
+  %0 = load i32, ptr %argp.cur, align 4 ; first vararg
+  call void @llvm.va_end.p0(ptr %va)
+  ret i32 %0
+}
+
+; CHECK-LABEL: define internal i32 @variadic_int_double_get_firstz.valist(ptr noalias %varargs) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:   %va = alloca ptr, align 4
+; CHECK-NEXT:   store ptr %varargs, ptr %va, align 4
+; CHECK-NEXT:   %argp.cur = load ptr, ptr %va, align 4
+; CHECK-NEXT:   %argp.next = getelementptr inbounds i8, ptr %argp.cur, i32 4
+; CHECK-NEXT:   store ptr %argp.next, ptr %va, align 4
+; CHECK-NEXT:   %0 = load i32, ptr %argp.cur, align 4
+; CHECK-NEXT:   ret i32 %0
+; CHECK-NEXT:  }
+
+; CHECK-LABEL: define i32 @variadic_int_double_get_firstz(...) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %va_list = alloca ptr, align 4
+; CHECK-NEXT:    call void @llvm.va_start.p0(ptr %va_list)
+; CHECK-NEXT:    %0 = tail call i32 @variadic_int_double_get_firstz.valist(ptr %va_list)
+; CHECK-NEXT:    ret i32 %0
+; CHECK-NEXT:  }
+
+define double @variadic_int_double_get_secondz(...) { ; returns the second (double) vararg; split into .valist clone + wrapper (checks below)
+entry:
+  %va = alloca ptr, align 4 ; i386 va_list is a single pointer
+  call void @llvm.va_start.p0(ptr nonnull %va)
+  %argp.cur = load ptr, ptr %va, align 4
+  %argp.next = getelementptr inbounds i8, ptr %argp.cur, i32 4 ; skip the 4-byte i32 to reach the double
+  %argp.next2 = getelementptr inbounds i8, ptr %argp.cur, i32 12 ; advance past i32 + 8-byte double
+  store ptr %argp.next2, ptr %va, align 4
+  %0 = load double, ptr %argp.next, align 4 ; double is only 4-byte aligned on the i386 stack
+  call void @llvm.va_end.p0(ptr %va)
+  ret double %0
+}
+
+; CHECK-LABEL: define internal double @variadic_int_double_get_secondz.valist(ptr noalias %varargs) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %va = alloca ptr, align 4
+; CHECK-NEXT:    store ptr %varargs, ptr %va, align 4
+; CHECK-NEXT:    %argp.cur = load ptr, ptr %va, align 4
+; CHECK-NEXT:    %argp.next = getelementptr inbounds i8, ptr %argp.cur, i32 4
+; CHECK-NEXT:    %argp.next2 = getelementptr inbounds i8, ptr %argp.cur, i32 12
+; CHECK-NEXT:    store ptr %argp.next2, ptr %va, align 4
+; CHECK-NEXT:    %0 = load double, ptr %argp.next, align 4
+; CHECK-NEXT:    ret double %0
+; CHECK-NEXT:  }
+
+; CHECK-LABEL: define double @variadic_int_double_get_secondz(...) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %va_list = alloca ptr, align 4
+; CHECK-NEXT:    call void @llvm.va_start.p0(ptr %va_list)
+; CHECK-NEXT:    %0 = tail call double @variadic_int_double_get_secondz.valist(ptr %va_list)
+; CHECK-NEXT:    ret double %0
+; CHECK-NEXT:  }
+
+
+; CHECK-LABEL: @variadic_can_get_firstIidEEbT_T0_(i32 %x, double %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %variadic_can_get_firstIidEEbT_T0_.vararg, align 16
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 12, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %variadic_can_get_firstIidEEbT_T0_.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store i32 %x, ptr %0, align 4
+; CHECK-NEXT:    %1 = getelementptr inbounds %variadic_can_get_firstIidEEbT_T0_.vararg, ptr %vararg_buffer, i32 0, i32 1
+; CHECK-NEXT:    store double %y, ptr %1, align 4
+; CHECK-NEXT:    %call = call i32 @variadic_int_double_get_firstz.valist(ptr %vararg_buffer)
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 12, ptr %vararg_buffer)
+; CHECK-NEXT:    %cmp.i = icmp eq i32 %call, %x
+; CHECK-NEXT:    ret i1 %cmp.i
+; CHECK-NEXT:  }
+
+define zeroext i1 @variadic_can_get_firstIidEEbT_T0_(i32 %x, double %y) { ; caller: rewritten to pack {i32, double} into a vararg buffer and call the .valist clone (checks above)
+entry:
+  %call = call i32 (...) @variadic_int_double_get_firstz(i32 %x, double %y)
+  %cmp.i = icmp eq i32 %call, %x ; round-trip: first vararg should come back unchanged
+  ret i1 %cmp.i
+}
+
+; CHECK-LABEL: @variadic_can_get_secondIidEEbT_T0_(i32 %x, double %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %variadic_can_get_secondIidEEbT_T0_.vararg, align 16
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 12, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %variadic_can_get_secondIidEEbT_T0_.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store i32 %x, ptr %0, align 4
+; CHECK-NEXT:    %1 = getelementptr inbounds %variadic_can_get_secondIidEEbT_T0_.vararg, ptr %vararg_buffer, i32 0, i32 1
+; CHECK-NEXT:    store double %y, ptr %1, align 4
+; CHECK-NEXT:    %call = call double @variadic_int_double_get_secondz.valist(ptr %vararg_buffer)
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 12, ptr %vararg_buffer)
+; CHECK-NEXT:    %cmp.i = fcmp oeq double %call, %y
+; CHECK-NEXT:    ret i1 %cmp.i
+; CHECK-NEXT:  }
+
+define zeroext i1 @variadic_can_get_secondIidEEbT_T0_(i32 %x, double %y) { ; caller: rewritten to pack {i32, double} into a vararg buffer and call the .valist clone (checks above)
+entry:
+  %call = call double (...) @variadic_int_double_get_secondz(i32 %x, double %y)
+  %cmp.i = fcmp oeq double %call, %y ; round-trip: second vararg should come back unchanged
+  ret i1 %cmp.i
+}
+
+; Declaration unchanged
+; CHECK: declare void @variadic_without_callers(...)
+declare void @variadic_without_callers(...)
+
+declare void @llvm.va_start.p0(ptr)
+declare void @llvm.va_end.p0(ptr)
diff --git a/llvm/utils/gn/secondary/llvm/lib/Transforms/IPO/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Transforms/IPO/BUILD.gn
index 0d134c7bdffb70..bcf2ea7510568d 100644
--- a/llvm/utils/gn/secondary/llvm/lib/Transforms/IPO/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/lib/Transforms/IPO/BUILD.gn
@@ -33,6 +33,7 @@ static_library("IPO") {
     "DeadArgumentElimination.cpp",
     "ElimAvailExtern.cpp",
     "EmbedBitcodePass.cpp",
+    "ExpandVariadics.cpp",
     "ExtractGV.cpp",
     "ForceFunctionAttrs.cpp",
     "FunctionAttrs.cpp",



More information about the cfe-commits mailing list