[clang] [llvm] [WIP] Expand variadic functions in IR (PR #89007)

Jon Chesterfield via cfe-commits cfe-commits at lists.llvm.org
Thu Apr 18 14:22:36 PDT 2024


https://github.com/JonChesterfield updated https://github.com/llvm/llvm-project/pull/89007

From cb3aba6c8da4697c58b988e63f25cdadc479aaa8 Mon Sep 17 00:00:00 2001
From: Jon Chesterfield <jonathanchesterfield at gmail.com>
Date: Wed, 17 Apr 2024 00:32:12 +0100
Subject: [PATCH 1/3] [WIP] Expand variadic functions in IR

---
 clang/lib/CodeGen/ABIInfoImpl.cpp             |   13 +-
 clang/lib/CodeGen/Targets/AMDGPU.cpp          |    8 +-
 clang/lib/CodeGen/Targets/NVPTX.cpp           |    8 +-
 clang/test/CodeGen/expand-variadic-call.c     |  314 +++++
 clang/test/CodeGen/variadic-wrapper-removal.c |   85 ++
 .../CodeGenCXX/inline-then-fold-variadics.cpp |  247 ++++
 llvm/include/llvm/InitializePasses.h          |    1 +
 .../llvm/Transforms/IPO/ExpandVariadics.h     |   43 +
 llvm/lib/Passes/PassBuilder.cpp               |    1 +
 llvm/lib/Passes/PassBuilderPipelines.cpp      |    4 +
 llvm/lib/Passes/PassRegistry.def              |    1 +
 llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def |    1 +
 .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp |    3 +
 llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp  |    4 +
 llvm/lib/Transforms/IPO/CMakeLists.txt        |    1 +
 llvm/lib/Transforms/IPO/ExpandVariadics.cpp   | 1056 +++++++++++++++++
 .../CodeGen/AMDGPU/expand-variadic-call.ll    |  499 ++++++++
 llvm/test/CodeGen/AMDGPU/llc-pipeline.ll      |    5 +
 llvm/test/CodeGen/AMDGPU/unsupported-calls.ll |   19 -
 .../CodeGen/NVPTX/expand-variadic-call.ll     |  468 ++++++++
 .../X86/expand-variadic-call-i386-darwin.ll   |  449 +++++++
 .../X86/expand-variadic-call-i386-linux.ll    |  449 +++++++
 .../X86/expand-variadic-call-i686-msvc.ll     |  467 ++++++++
 .../X86/expand-variadic-call-x64-darwin.ll    |  688 +++++++++++
 .../X86/expand-variadic-call-x64-linux.ll     |  688 +++++++++++
 llvm/test/Other/new-pm-defaults.ll            |    1 +
 .../Other/new-pm-thinlto-postlink-defaults.ll |    1 +
 .../new-pm-thinlto-postlink-pgo-defaults.ll   |    1 +
 ...-pm-thinlto-postlink-samplepgo-defaults.ll |    2 +-
 .../Other/new-pm-thinlto-prelink-defaults.ll  |    1 +
 .../new-pm-thinlto-prelink-pgo-defaults.ll    |    1 +
 ...w-pm-thinlto-prelink-samplepgo-defaults.ll |    1 +
 .../expand-va-intrinsic-outliner.ll           |   86 ++
 .../expand-va-intrinsic-split-linkage.ll      |  225 ++++
 .../expand-va-intrinsic-split-simple.ll       |  121 ++
 .../llvm/lib/Transforms/IPO/BUILD.gn          |    1 +
 36 files changed, 5939 insertions(+), 24 deletions(-)
 create mode 100644 clang/test/CodeGen/expand-variadic-call.c
 create mode 100644 clang/test/CodeGen/variadic-wrapper-removal.c
 create mode 100644 clang/test/CodeGenCXX/inline-then-fold-variadics.cpp
 create mode 100644 llvm/include/llvm/Transforms/IPO/ExpandVariadics.h
 create mode 100644 llvm/lib/Transforms/IPO/ExpandVariadics.cpp
 create mode 100644 llvm/test/CodeGen/AMDGPU/expand-variadic-call.ll
 create mode 100644 llvm/test/CodeGen/NVPTX/expand-variadic-call.ll
 create mode 100644 llvm/test/CodeGen/X86/expand-variadic-call-i386-darwin.ll
 create mode 100644 llvm/test/CodeGen/X86/expand-variadic-call-i386-linux.ll
 create mode 100644 llvm/test/CodeGen/X86/expand-variadic-call-i686-msvc.ll
 create mode 100644 llvm/test/CodeGen/X86/expand-variadic-call-x64-darwin.ll
 create mode 100644 llvm/test/CodeGen/X86/expand-variadic-call-x64-linux.ll
 create mode 100644 llvm/test/Transforms/ExpandVariadics/expand-va-intrinsic-outliner.ll
 create mode 100644 llvm/test/Transforms/ExpandVariadics/expand-va-intrinsic-split-linkage.ll
 create mode 100644 llvm/test/Transforms/ExpandVariadics/expand-va-intrinsic-split-simple.ll

diff --git a/clang/lib/CodeGen/ABIInfoImpl.cpp b/clang/lib/CodeGen/ABIInfoImpl.cpp
index 3e34d82cb399ba..c4a1829ab8863f 100644
--- a/clang/lib/CodeGen/ABIInfoImpl.cpp
+++ b/clang/lib/CodeGen/ABIInfoImpl.cpp
@@ -154,11 +154,20 @@ llvm::Value *CodeGen::emitRoundPointerUpToAlignment(CodeGenFunction &CGF,
                                                     llvm::Value *Ptr,
                                                     CharUnits Align) {
   // OverflowArgArea = (OverflowArgArea + Align - 1) & -Align;
+  Ptr = CGF.Builder.CreateAddrSpaceCast(Ptr, CGF.AllocaInt8PtrTy,
+                                        Ptr->getName() + ".addrcast");
   llvm::Value *RoundUp = CGF.Builder.CreateConstInBoundsGEP1_32(
       CGF.Builder.getInt8Ty(), Ptr, Align.getQuantity() - 1);
+
+  // ptrmask is sensitive to the bitwidth of the mask: it must match the
+  // pointer's index type size, which is not necessarily IntPtrTy.
+  unsigned IndexTypeSize =
+      CGF.CGM.getDataLayout().getIndexTypeSizeInBits(RoundUp->getType());
+  llvm::IntegerType *MaskType =
+      llvm::IntegerType::get(CGF.getLLVMContext(), IndexTypeSize);
+
   return CGF.Builder.CreateIntrinsic(
-      llvm::Intrinsic::ptrmask, {CGF.AllocaInt8PtrTy, CGF.IntPtrTy},
-      {RoundUp, llvm::ConstantInt::get(CGF.IntPtrTy, -Align.getQuantity())},
+      llvm::Intrinsic::ptrmask, {CGF.AllocaInt8PtrTy, MaskType},
+      {RoundUp, llvm::ConstantInt::get(MaskType, -Align.getQuantity())},
       nullptr, Ptr->getName() + ".aligned");
 }
 
diff --git a/clang/lib/CodeGen/Targets/AMDGPU.cpp b/clang/lib/CodeGen/Targets/AMDGPU.cpp
index 44e86c0b40f686..e274741a6ee652 100644
--- a/clang/lib/CodeGen/Targets/AMDGPU.cpp
+++ b/clang/lib/CodeGen/Targets/AMDGPU.cpp
@@ -115,7 +115,13 @@ void AMDGPUABIInfo::computeInfo(CGFunctionInfo &FI) const {
 
 Address AMDGPUABIInfo::EmitVAArg(CodeGenFunction &CGF, Address VAListAddr,
                                  QualType Ty) const {
-  llvm_unreachable("AMDGPU does not support varargs");
+  const bool IsIndirect = false;
+  const bool AllowHigherAlign = true;
+  // We would rather not naturally align values, but splitting {char, short}
+  // into two separate arguments makes that difficult.
+  return emitVoidPtrVAArg(CGF, VAListAddr, Ty, IsIndirect,
+                          getContext().getTypeInfoInChars(Ty),
+                          CharUnits::fromQuantity(1), AllowHigherAlign);
 }
 
 ABIArgInfo AMDGPUABIInfo::classifyReturnType(QualType RetTy) const {
diff --git a/clang/lib/CodeGen/Targets/NVPTX.cpp b/clang/lib/CodeGen/Targets/NVPTX.cpp
index 7dce5042c3dc20..d6c256e5e6f7b9 100644
--- a/clang/lib/CodeGen/Targets/NVPTX.cpp
+++ b/clang/lib/CodeGen/Targets/NVPTX.cpp
@@ -215,7 +215,13 @@ void NVPTXABIInfo::computeInfo(CGFunctionInfo &FI) const {
 
 Address NVPTXABIInfo::EmitVAArg(CodeGenFunction &CGF, Address VAListAddr,
                                 QualType Ty) const {
-  llvm_unreachable("NVPTX does not support varargs");
+  // TODO: Work out to what extent this correlates with ptx.
+  // Doubles get passed with 8 byte alignment and C promotes smaller integer
+  // types to int. Printf doesn't really do structs, so it is hard to guess
+  // what the right thing is for those.
+  return emitVoidPtrVAArg(CGF, VAListAddr, Ty, false,
+                          getContext().getTypeInfoInChars(Ty),
+                          CharUnits::fromQuantity(4), true);
 }
 
 void NVPTXTargetCodeGenInfo::setTargetAttributes(
diff --git a/clang/test/CodeGen/expand-variadic-call.c b/clang/test/CodeGen/expand-variadic-call.c
new file mode 100644
index 00000000000000..baff54544207ea
--- /dev/null
+++ b/clang/test/CodeGen/expand-variadic-call.c
@@ -0,0 +1,314 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+
+// REQUIRES: x86-registered-target
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -target-cpu x86-64-v4 -std=c23 -O1 -ffreestanding -mllvm --expand-variadics-override=disable -emit-llvm -o - %s | FileCheck %s
+
+// This test sanity-checks calling a variadic function with the expansion transform disabled.
+// The IR test cases {arch}/expand-variadic-call-*.ll correspond to IR generated from this test case.
+
+typedef __builtin_va_list va_list;
+#define va_copy(dest, src) __builtin_va_copy(dest, src)
+#define va_start(ap, ...) __builtin_va_start(ap, 0)
+#define va_end(ap) __builtin_va_end(ap)
+#define va_arg(ap, type) __builtin_va_arg(ap, type)
+
+// 32 bit x86 alignment uses getTypeStackAlign for special cases.
+// Whitebox testing: needs a type of size >= 16, either a simd or a struct
+// containing a simd. The darwin vector abi should force 4 byte alignment;
+// linux vectors with align 16/32/64 return that alignment.
+
+
+// Might want various copy/end style constructs in a separate test
+
+void vararg(...);
+void valist(va_list);
+
+// CHECK-LABEL: @copy(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[CP:%.*]] = alloca [1 x %struct.__va_list_tag], align 16
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr nonnull [[CP]]) #[[ATTR7:[0-9]+]]
+// CHECK-NEXT:    call void @llvm.va_copy.p0(ptr nonnull [[CP]], ptr [[VA:%.*]])
+// CHECK-NEXT:    call void @valist(ptr noundef nonnull [[CP]]) #[[ATTR8:[0-9]+]]
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr nonnull [[CP]]) #[[ATTR7]]
+// CHECK-NEXT:    ret void
+//
+void copy(va_list va)
+{
+  va_list cp;
+  va_copy(cp, va);
+  valist(cp);
+}
+
+// CHECK-LABEL: @start_once(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[S:%.*]] = alloca [1 x %struct.__va_list_tag], align 16
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr nonnull [[S]]) #[[ATTR7]]
+// CHECK-NEXT:    call void @llvm.va_start.p0(ptr nonnull [[S]])
+// CHECK-NEXT:    call void @valist(ptr noundef nonnull [[S]]) #[[ATTR8]]
+// CHECK-NEXT:    call void @llvm.va_end.p0(ptr [[S]])
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr nonnull [[S]]) #[[ATTR7]]
+// CHECK-NEXT:    ret void
+//
+void start_once(...)
+{
+  va_list s;
+  va_start(s);
+  valist(s);
+  va_end(s);
+}
+
+// CHECK-LABEL: @start_twice(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[S0:%.*]] = alloca [1 x %struct.__va_list_tag], align 16
+// CHECK-NEXT:    [[S1:%.*]] = alloca [1 x %struct.__va_list_tag], align 16
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr nonnull [[S0]]) #[[ATTR7]]
+// CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr nonnull [[S1]]) #[[ATTR7]]
+// CHECK-NEXT:    call void @llvm.va_start.p0(ptr nonnull [[S0]])
+// CHECK-NEXT:    call void @valist(ptr noundef nonnull [[S0]]) #[[ATTR8]]
+// CHECK-NEXT:    call void @llvm.va_end.p0(ptr [[S0]])
+// CHECK-NEXT:    call void @llvm.va_start.p0(ptr nonnull [[S1]])
+// CHECK-NEXT:    call void @valist(ptr noundef nonnull [[S1]]) #[[ATTR8]]
+// CHECK-NEXT:    call void @llvm.va_end.p0(ptr [[S1]])
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr nonnull [[S1]]) #[[ATTR7]]
+// CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr nonnull [[S0]]) #[[ATTR7]]
+// CHECK-NEXT:    ret void
+//
+void start_twice(...)
+{
+  va_list s0,s1;
+  va_start(s0);
+  valist(s0);
+  va_end(s0);
+  va_start(s1);
+  valist(s1);
+  va_end(s1);
+}
+
+// Vectors with alignment 16/32/64 are natively aligned on linux x86.
+// v32f32 would be an m1024 type, larger than anything x64 defines at the time of writing.
+typedef int i32;
+typedef float v4f32 __attribute__((__vector_size__(16), __aligned__(16)));
+typedef float v8f32 __attribute__((__vector_size__(32), __aligned__(32)));
+typedef float v16f32 __attribute__((__vector_size__(64), __aligned__(64)));
+typedef float v32f32 __attribute__((__vector_size__(128), __aligned__(128)));
+
+
+// Pass a single value to wrapped() via vararg(...)
+// CHECK-LABEL: @single_i32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    tail call void (...) @vararg(i32 noundef [[X:%.*]]) #[[ATTR8]]
+// CHECK-NEXT:    ret void
+//
+void single_i32(i32 x)
+{
+  vararg(x);
+}
+
+
+// CHECK-LABEL: @single_double(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    tail call void (...) @vararg(double noundef [[X:%.*]]) #[[ATTR8]]
+// CHECK-NEXT:    ret void
+//
+void single_double(double x)
+{
+  vararg(x);
+}
+
+// CHECK-LABEL: @single_v4f32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    tail call void (...) @vararg(<4 x float> noundef [[X:%.*]]) #[[ATTR8]]
+// CHECK-NEXT:    ret void
+//
+void single_v4f32(v4f32 x)
+{
+  vararg(x);
+}
+
+// CHECK-LABEL: @single_v8f32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    tail call void (...) @vararg(<8 x float> noundef [[X:%.*]]) #[[ATTR8]]
+// CHECK-NEXT:    ret void
+//
+void single_v8f32(v8f32 x)
+{
+  vararg(x);
+}
+
+// CHECK-LABEL: @single_v16f32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    tail call void (...) @vararg(<16 x float> noundef [[X:%.*]]) #[[ATTR8]]
+// CHECK-NEXT:    ret void
+//
+void single_v16f32(v16f32 x)
+{
+  vararg(x);
+}
+
+// CHECK-LABEL: @single_v32f32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[INDIRECT_ARG_TEMP:%.*]] = alloca <32 x float>, align 128
+// CHECK-NEXT:    [[X:%.*]] = load <32 x float>, ptr [[TMP0:%.*]], align 128, !tbaa [[TBAA2:![0-9]+]]
+// CHECK-NEXT:    store <32 x float> [[X]], ptr [[INDIRECT_ARG_TEMP]], align 128, !tbaa [[TBAA2]]
+// CHECK-NEXT:    tail call void (...) @vararg(ptr noundef nonnull byval(<32 x float>) align 128 [[INDIRECT_ARG_TEMP]]) #[[ATTR8]]
+// CHECK-NEXT:    ret void
+//
+void single_v32f32(v32f32 x)
+{
+  vararg(x);
+}
+
+
+
+// CHECK-LABEL: @i32_double(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    tail call void (...) @vararg(i32 noundef [[X:%.*]], double noundef [[Y:%.*]]) #[[ATTR8]]
+// CHECK-NEXT:    ret void
+//
+void i32_double(i32 x, double y)
+{
+  vararg(x, y);
+}
+
+// CHECK-LABEL: @double_i32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    tail call void (...) @vararg(double noundef [[X:%.*]], i32 noundef [[Y:%.*]]) #[[ATTR8]]
+// CHECK-NEXT:    ret void
+//
+void double_i32(double x, i32 y)
+{
+  vararg(x, y);
+}
+
+
+// A struct used by libc variadic tests
+
+typedef struct {
+  char c;
+  short s;
+  int i;
+  long l;
+  float f;
+  double d;
+}  libcS;
+
+// CHECK-LABEL: @i32_libcS(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    tail call void (...) @vararg(i32 noundef [[X:%.*]], ptr noundef nonnull byval([[STRUCT_LIBCS:%.*]]) align 8 [[Y:%.*]]) #[[ATTR8]]
+// CHECK-NEXT:    ret void
+//
+void i32_libcS(i32 x, libcS y)
+{
+  vararg(x, y);
+}
+
+// CHECK-LABEL: @libcS_i32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    tail call void (...) @vararg(ptr noundef nonnull byval([[STRUCT_LIBCS:%.*]]) align 8 [[X:%.*]], i32 noundef [[Y:%.*]]) #[[ATTR8]]
+// CHECK-NEXT:    ret void
+//
+void libcS_i32(libcS x, i32 y)
+{
+  vararg(x, y);
+}
+
+
+// CHECK-LABEL: @i32_v4f32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    tail call void (...) @vararg(i32 noundef [[X:%.*]], <4 x float> noundef [[Y:%.*]]) #[[ATTR8]]
+// CHECK-NEXT:    ret void
+//
+void i32_v4f32(i32 x, v4f32 y)
+{
+  vararg(x, y);
+}
+
+// CHECK-LABEL: @v4f32_i32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    tail call void (...) @vararg(<4 x float> noundef [[X:%.*]], i32 noundef [[Y:%.*]]) #[[ATTR8]]
+// CHECK-NEXT:    ret void
+//
+void v4f32_i32(v4f32 x, i32 y)
+{
+  vararg(x, y);
+}
+
+// CHECK-LABEL: @i32_v8f32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    tail call void (...) @vararg(i32 noundef [[X:%.*]], <8 x float> noundef [[Y:%.*]]) #[[ATTR8]]
+// CHECK-NEXT:    ret void
+//
+void i32_v8f32(i32 x, v8f32 y)
+{
+  vararg(x, y);
+}
+
+// CHECK-LABEL: @v8f32_i32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    tail call void (...) @vararg(<8 x float> noundef [[X:%.*]], i32 noundef [[Y:%.*]]) #[[ATTR8]]
+// CHECK-NEXT:    ret void
+//
+void v8f32_i32(v8f32 x, i32 y)
+{
+  vararg(x, y);
+}
+
+// CHECK-LABEL: @i32_v16f32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    tail call void (...) @vararg(i32 noundef [[X:%.*]], <16 x float> noundef [[Y:%.*]]) #[[ATTR8]]
+// CHECK-NEXT:    ret void
+//
+void i32_v16f32(i32 x, v16f32 y)
+{
+  vararg(x, y);
+}
+
+// CHECK-LABEL: @v16f32_i32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    tail call void (...) @vararg(<16 x float> noundef [[X:%.*]], i32 noundef [[Y:%.*]]) #[[ATTR8]]
+// CHECK-NEXT:    ret void
+//
+void v16f32_i32(v16f32 x, i32 y)
+{
+  vararg(x, y);
+}
+
+// CHECK-LABEL: @i32_v32f32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[INDIRECT_ARG_TEMP:%.*]] = alloca <32 x float>, align 128
+// CHECK-NEXT:    [[Y:%.*]] = load <32 x float>, ptr [[TMP0:%.*]], align 128, !tbaa [[TBAA2]]
+// CHECK-NEXT:    store <32 x float> [[Y]], ptr [[INDIRECT_ARG_TEMP]], align 128, !tbaa [[TBAA2]]
+// CHECK-NEXT:    tail call void (...) @vararg(i32 noundef [[X:%.*]], ptr noundef nonnull byval(<32 x float>) align 128 [[INDIRECT_ARG_TEMP]]) #[[ATTR8]]
+// CHECK-NEXT:    ret void
+//
+void i32_v32f32(i32 x, v32f32 y)
+{
+  vararg(x, y);
+}
+
+// CHECK-LABEL: @v32f32_i32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[INDIRECT_ARG_TEMP:%.*]] = alloca <32 x float>, align 128
+// CHECK-NEXT:    [[X:%.*]] = load <32 x float>, ptr [[TMP0:%.*]], align 128, !tbaa [[TBAA2]]
+// CHECK-NEXT:    store <32 x float> [[X]], ptr [[INDIRECT_ARG_TEMP]], align 128, !tbaa [[TBAA2]]
+// CHECK-NEXT:    tail call void (...) @vararg(ptr noundef nonnull byval(<32 x float>) align 128 [[INDIRECT_ARG_TEMP]], i32 noundef [[Y:%.*]]) #[[ATTR8]]
+// CHECK-NEXT:    ret void
+//
+void v32f32_i32(v32f32 x, i32 y)
+{
+  vararg(x, y);
+}
+
+#if __AMDGPU__ || __NVPTX__
+
+
+void (*volatile vararg_ptr)(...) = &vararg;
+
+void indirect_single_i32(i32 x)
+{
+  vararg_ptr(x);
+}
+
+
+#endif
diff --git a/clang/test/CodeGen/variadic-wrapper-removal.c b/clang/test/CodeGen/variadic-wrapper-removal.c
new file mode 100644
index 00000000000000..13499612f0757f
--- /dev/null
+++ b/clang/test/CodeGen/variadic-wrapper-removal.c
@@ -0,0 +1,85 @@
+// REQUIRES: x86-registered-target
+// RUN: %clang_cc1 -triple i386-unknown-linux-gnu -O1 -emit-llvm -o - %s | opt --passes='module(expand-variadics,inline)' -S | FileCheck %s
+// RUN: %clang_cc1 -triple=x86_64-linux-gnu -O1 -emit-llvm -o - %s | opt --passes='module(expand-variadics,inline)' -S | FileCheck %s
+
+// Neither arm arch is implemented yet; these RUN lines are kept as a reminder.
+// armv6 is a ptr as far as the struct is concerned, but possibly also a
+// [1 x i32] passed by value; that seems inconsistent, so maybe leave 32 bit
+// arm alone for now.
+// aarch64 is a struct of five things passed using byval memcpy.
+
+// R-N: %clang_cc1 -triple=armv6-none--eabi -O1 -emit-llvm -o - %s | opt --passes=expand-variadics -S | FileCheck %s
+// R-N: %clang_cc1 -triple=aarch64-none-linux-gnu -O1 -emit-llvm -o - %s | opt --passes=expand-variadics -S | FileCheck %s
+
+
+
+// expand-variadics rewrites calls to variadic functions into calls to
+// equivalent functions that take a va_list argument. A property of the
+// implementation is that said "equivalent function" may be a pre-existing one.
+// This is equivalent to inlining a sufficiently simple variadic wrapper.
+
+#include <stdarg.h>
+
+typedef int FILE; // close enough for this test
+
+// fprintf is sometimes implemented as a call to vfprintf. That fits the
+// pattern the transform pass recognises - given an implementation of fprintf
+// in the IR module, calls to it can be rewritten into calls to vfprintf.
+
+// CHECK-LABEL: define{{.*}} i32 @fprintf(
+// CHECK-LABEL: define{{.*}} i32 @call_fprintf(
+// CHECK-NOT:   @fprintf
+// CHECK:       @vfprintf
+int vfprintf(FILE *restrict f, const char *restrict fmt, va_list ap);
+int fprintf(FILE *restrict f, const char *restrict fmt, ...)
+{
+  int ret;
+  va_list ap;
+  va_start(ap, fmt);
+  ret = vfprintf(f, fmt, ap);
+  va_end(ap);
+  return ret;
+}
+int call_fprintf(FILE *f)
+{
+  int x = 42;
+  double y = 3.14;
+  return fprintf(f, "int %d dbl %g\n", x, y);
+}
+
+// Void return type is also OK
+
+// CHECK-LABEL: define{{.*}} void @no_result(
+// CHECK-LABEL: define{{.*}} void @call_no_result(
+// CHECK-NOT:   @no_result
+// CHECK:       @vno_result
+void vno_result(const char * fmt, va_list);
+void no_result(const char * fmt, ...)
+{
+  va_list ap;
+  va_start(ap, fmt);
+  vno_result(fmt, ap);
+  va_end(ap);
+}
+void call_no_result(FILE *f)
+{
+  int x = 101;
+  no_result("", x);
+}
+
+// The va_end in the forwarding implementation is optional when it is a no-op.
+
+// CHECK-LABEL: define{{.*}} i32 @no_vaend(
+// CHECK-LABEL: define{{.*}} i32 @call_no_vaend(
+// CHECK-NOT:   @no_vaend
+// CHECK:       @vno_vaend
+int vno_vaend(int x, va_list);
+int no_vaend(int x, ...)
+{
+  va_list ap;
+  va_start(ap, x);
+  return vno_vaend(x, ap);
+}
+int call_no_vaend(int x)
+{
+  return no_vaend(x, 10, 2.5f);
+}
diff --git a/clang/test/CodeGenCXX/inline-then-fold-variadics.cpp b/clang/test/CodeGenCXX/inline-then-fold-variadics.cpp
new file mode 100644
index 00000000000000..c7b5863b2632a6
--- /dev/null
+++ b/clang/test/CodeGenCXX/inline-then-fold-variadics.cpp
@@ -0,0 +1,247 @@
+// RUN: %clang_cc1 -triple i386-unknown-linux-gnu -Wno-varargs -O1 -emit-llvm -o - %s | opt --passes=expand-variadics | opt -S -O1 | FileCheck %s --check-prefixes=CHECK,X86Linux
+
+
+// RUN: %clang_cc1 -triple x86_64-linux-gnu -Wno-varargs -O1 -emit-llvm -o - %s | opt --passes=expand-variadics | opt -S -O1 | FileCheck %s --check-prefixes=CHECK,X64SystemV
+
+
+// RUN: %clang_cc1 -triple i386-apple-darwin -Wno-varargs -O1 -emit-llvm -o - %s | opt --passes=expand-variadics | opt -S -O1 | FileCheck %s --check-prefixes=CHECK,X86Darwin
+
+// RUN: %clang_cc1 -triple x86_64-apple-darwin -Wno-varargs -O1 -emit-llvm -o - %s | opt --passes=expand-variadics | opt -S -O1 | FileCheck %s --check-prefixes=CHECK,X64SystemV
+
+
+// The clang test suite has _lots_ of windows related triples in it
+// 'x86_64-pc-windows-msvc|i686-windows-msvc|thumbv7-windows|aarch64-windows|i686-windows|x86_64-windows|x86_64-unknown-windows-msvc|i386-windows-pc|x86_64--windows-msvc|i686--windows-msvc|x86_64-unknown-windows-gnu|i686-unknown-windows-msvc|i686-unknown-windows-gnu|arm64ec-pc-windows-msvc19.20.0|i686-pc-windows-msvc19.14.0|i686-pc-windows|x86_64--windows-gnu|i686--windows-gnu|thumbv7--windows|i386-windows|x86_64-unknown-windows-pc|i686--windows|x86_64--windows|i686-w64-windows-gnu'
+
+// Might be detecting an inconsistency - maybe different alignment
+// Presently failing on an unusual calling convention
+
+// i686 windows emits suboptimal codegen: sroa removes a field from a struct,
+// which misaligns another field, which blocks store/load forwarding.
+// RUN: %clang_cc1 -triple i686-windows-msvc -Wno-varargs -O1 -emit-llvm -o - %s | opt --passes=expand-variadics | opt -S -O1 | FileCheck %s --check-prefixes=CHECK,X86Windows
+
+
+// 64 bit windows va_arg passes most types indirectly, but the call instruction uses the types by value
+// ___: %clang_cc1 -triple x86_64-pc-windows-msvc -Wno-varargs -O1 -emit-llvm -o - %s | opt --passes=expand-variadics | opt -S -O1 | FileCheck %s --check-prefixes=CHECK
+
+
+
+// amdgpu emits a sequence of addrspace casts that aren't folded yet
+// todo: match it anyway
+// R-N: %clang_cc1 -triple amdgcn-amd-amdhsa -Wno-varargs -O1 -emit-llvm -o - %s | opt --passes=expand-variadics | opt -S -O1 | FileCheck %s
+
+// Requires the instcombine patch that hasn't landed yet
+// RUN: %clang_cc1 -triple nvptx64-nvidia-cuda -Wno-varargs -O1 -emit-llvm -o - %s | opt --passes=expand-variadics | opt -S -O1 | FileCheck %s
+
+
+
+
+
+// Not yet implemented on arm
+// Also there are various x86 variants that should be in the triple
+
+// Checks for consistency between clang and expand-variadics:
+// 1. Use clang to lower va_arg
+// 2. Use expand-variadics to lower the rest of the variadic operations
+// 3. Use opt -O1 to simplify the functions to ret %arg
+// The simplification to ret %arg will fail when the two are not consistent, modulo bugs elsewhere.
+
+#include <stdarg.h>
+
+template <typename X, typename Y>
+static X first(...) {
+  va_list va;
+  __builtin_va_start(va, 0);
+  X r = va_arg(va, X);
+  va_end(va);
+  return r;
+}
+
+template <typename X, typename Y>
+static Y second(...) {
+  va_list va;
+  __builtin_va_start(va, 0);
+  va_arg(va, X);
+  Y r = va_arg(va, Y);
+  va_end(va);
+  return r;
+}
+
+typedef float float4 __attribute__((__vector_size__(16), __aligned__(16)));
+typedef float float8 __attribute__((__vector_size__(32), __aligned__(32)));
+typedef float float16 __attribute__((__vector_size__(64), __aligned__(64)));
+typedef float float32 __attribute__((__vector_size__(128), __aligned__(128)));
+
+
+extern "C"
+{
+// CHECK-LABEL: define{{.*}} i32 @first_i32_i32(i32{{.*}} %x, i32{{.*}} %y)
+// CHECK:       entry:
+// CHECK:       ret i32 %x
+int first_i32_i32(int x, int y)
+{
+  return first<int,int>(x, y);
+}
+
+// CHECK-LABEL: define{{.*}} i32 @second_i32_i32(i32{{.*}} %x, i32{{.*}} %y)
+// CHECK:       entry:
+// CHECK:       ret i32 %y
+int second_i32_i32(int x, int y)
+{
+  return second<int,int>(x, y);
+}
+}
+
+// Permutations of an int and a double
+extern "C"
+{
+// CHECK-LABEL: define{{.*}} i32 @first_i32_f64(i32{{.*}} %x, double{{.*}} %y)
+// CHECK:       entry:
+// CHECK:       ret i32 %x
+int first_i32_f64(int x, double y)
+{
+  return first<int,double>(x, y);
+}
+  
+// CHECK-LABEL: define{{.*}} double @second_i32_f64(i32{{.*}} %x, double{{.*}} %y)
+// CHECK:       entry:
+
+// X86Linux:    ret double %y
+// X64SystemV:  ret double %y
+// X86Darwin:   ret double %y
+// X86Windows:  [[TMP0:%.*]] = alloca <{ [4 x i8], double }>, align 4
+// X86Windows:  [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 4
+// X86Windows:  store double %y, ptr [[TMP1]], align 4
+// X86Windows:  [[TMP2:%.*]] = load double, ptr [[TMP0]], align 4
+// X86Windows:  ret double [[TMP2]]
+double second_i32_f64(int x, double y)
+{
+  return second<int,double>(x, y);
+}
+
+// CHECK-LABEL: define{{.*}} double @first_f64_i32(double{{.*}} %x, i32{{.*}} %y)
+// CHECK:       entry:
+// CHECK:       ret double %x
+double first_f64_i32(double x, int y)
+{
+  return first<double,int>(x, y);
+}
+
+// CHECK-LABEL: define{{.*}} i32 @second_f64_i32(double{{.*}} %x, i32{{.*}} %y)
+// CHECK:       entry:
+// CHECK:       ret i32 %y
+int second_f64_i32(double x, int y)
+{
+  return second<double,int>(x, y);
+}   
+}
+
+
+// Permutations of an int and a float4
+extern "C"
+{
+
+// CHECK-LABEL: define{{.*}} i32 @first_i32_v4f32(i32{{.*}} %x, ptr{{.*}} %y)
+// CHECK:       entry:
+// CHECK:       ret i32 %x
+int first_i32_v4f32(int x, float4 * y)
+{
+  return first<int,float4>(x, *y);
+}
+  
+// CHECK-LABEL: define{{.*}} void @second_i32_v4f32(i32{{.*}} %x, ptr{{.*}} %y, ptr{{.*}} %r)
+// CHECK:       entry:
+// X86Linux:    [[TMP0:%.*]] = load <4 x float>, ptr %y, align 16
+// X86Linux:    store <4 x float> [[TMP0]], ptr %r, align 16
+// X64SystemV:  [[TMP0:%.*]] = load <4 x float>, ptr %y, align 16
+// X64SystemV:  store <4 x float> [[TMP0]], ptr %r, align 16
+// X86Darwin:   [[TMP0:%.*]] = load <2 x i64>, ptr %y, align 16
+// X86Darwin:   store <2 x i64> [[TMP0]], ptr %r, align 16
+// X86Windows:  [[TMP0:%.*]] = alloca <{ [12 x i8], <4 x float> }>, align 4
+// X86Windows:  [[TMP1:%.*]] = load <4 x float>, ptr %y, align 16
+// X86Windows:  [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 12
+// X86Windows:  store <4 x float> [[TMP1]], ptr [[TMP2]], align 4
+// X86Windows:  [[TMP3:%.*]] = load <4 x float>, ptr [[TMP0]], align 4
+// X86Windows:  store <4 x float> [[TMP3]], ptr %r, align 16
+// CHECK:       ret void
+void second_i32_v4f32(int x, float4 * y, float4* r)
+{
+  *r = second<int,float4>(x, *y);
+}
+
+    
+// CHECK-LABEL: define{{.*}} void @first_v4f32_i32(ptr{{.*}} %x, i32{{.*}} %y, ptr{{.*}} %r)
+// CHECK:       entry:
+// X86Linux:    [[TMP0:%.*]] = load <4 x float>, ptr %x, align 16
+// X86Linux:    store <4 x float> [[TMP0]], ptr %r, align 16
+// X64SystemV:  [[TMP0:%.*]] = load <4 x float>, ptr %x, align 16
+// X64SystemV:  store <4 x float> [[TMP0]], ptr %r, align 16
+// X86Darwin:   [[TMP0:%.*]] = load <2 x i64>, ptr %x, align 16
+// X86Darwin:   store <2 x i64> [[TMP0]], ptr %r, align 16
+// CHECK:       ret void
+  void first_v4f32_i32(float4* x, int y, float4* r)
+{
+ *r =first<float4,int>(*x, y);
+}
+
+// CHECK-LABEL: define{{.*}} i32 @second_v4f32_i32(ptr{{.*}} %x, i32{{.*}} %y)
+// CHECK:       entry:
+// CHECK:       ret i32 %y
+int second_v4f32_i32(float4* x, int y)
+{
+  return second<float4,int>(*x, y);
+}
+
+}
+
+// A large struct with awkwardly aligned fields
+
+typedef struct {
+  char c;
+  short s;
+  int i;
+  long l;
+  float f;
+  double d;
+} libcS;
+
+extern "C"
+{
+
+// CHECK-LABEL: define{{.*}} i32 @first_i32_libcS(i32{{.*}} %x, ptr{{.*}} %y)
+// CHECK:       entry:
+// CHECK:       ret i32 %x
+int first_i32_libcS(int x, libcS * y)
+{
+  return first<int,libcS>(x, *y);
+}
+  
+// CHECK-LABEL: define{{.*}} void @second_i32_libcS(i32{{.*}} %x, ptr{{.*}} %y, ptr{{.*}} %r)
+// CHECK:       entry:
+// CHECK:       ret void
+void second_i32_libcS(int x, libcS * y, libcS* r)
+{
+  *r = second<int,libcS>(x, *y);
+}
+
+    
+// CHECK-LABEL: define{{.*}} void @first_libcS_i32(ptr{{.*}} %x, i32{{.*}} %y, ptr{{.*}} %r)
+// CHECK:       entry:
+
+  void first_libcS_i32(libcS* x, int y, libcS* r)
+{
+ *r =first<libcS,int>(*x, y);
+}
+
+// CHECK-LABEL: define{{.*}} i32 @second_libcS_i32(ptr{{.*}} %x, i32{{.*}} %y)
+// CHECK:       entry:
+// CHECK:       ret i32 %y
+int second_libcS_i32(libcS* x, int y)
+{
+  return second<libcS,int>(*x, y);
+}
+
+  
+}
+
+
+
+            
diff --git a/llvm/include/llvm/InitializePasses.h b/llvm/include/llvm/InitializePasses.h
index 9ba75d491c1c9c..5da681781da975 100644
--- a/llvm/include/llvm/InitializePasses.h
+++ b/llvm/include/llvm/InitializePasses.h
@@ -106,6 +106,7 @@ void initializeExpandLargeDivRemLegacyPassPass(PassRegistry&);
 void initializeExpandMemCmpLegacyPassPass(PassRegistry &);
 void initializeExpandPostRAPass(PassRegistry&);
 void initializeExpandReductionsPass(PassRegistry&);
+void initializeExpandVariadicsPass(PassRegistry &);
 void initializeExpandVectorPredicationPass(PassRegistry &);
 void initializeExternalAAWrapperPassPass(PassRegistry&);
 void initializeFEntryInserterPass(PassRegistry&);
diff --git a/llvm/include/llvm/Transforms/IPO/ExpandVariadics.h b/llvm/include/llvm/Transforms/IPO/ExpandVariadics.h
new file mode 100644
index 00000000000000..67fa746813ea0e
--- /dev/null
+++ b/llvm/include/llvm/Transforms/IPO/ExpandVariadics.h
@@ -0,0 +1,43 @@
+//===- ExpandVariadics.h - expand variadic functions ------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
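+//
+// This pass rewrites variadic functions and known calls to them into
+// equivalents that take a va_list, either as an optimisation or as the
+// target's lowering of variadics. It is registered with the new pass manager
+// as the module pass "expand-variadics", e.g. opt -passes=expand-variadics.
+//
+//===----------------------------------------------------------------------===//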
+#ifndef LLVM_TRANSFORMS_IPO_EXPANDVARIADICS_H
+#define LLVM_TRANSFORMS_IPO_EXPANDVARIADICS_H
+
+#include "llvm/IR/PassManager.h"
+
+namespace llvm {
+
+class Module;
+class ModulePass;
+class OptimizationLevel;
+
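+// Mirrors the -expand-variadics-override command line values:
+//   unspecified - use the implementation defaults
+//   disable     - disable the pass entirely
+//   optimize    - optimise without changing the ABI
+//   lowering    - change the variadic calling convention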
+enum class ExpandVariadicsMode {
+  unspecified,
+  disable,
+  optimize,
+  lowering,
+};
+
+class ExpandVariadicsPass : public PassInfoMixin<ExpandVariadicsPass> {
+  const ExpandVariadicsMode ConstructedMode;
+
+public:
+  // Operates under passed mode unless overridden on commandline
+  ExpandVariadicsPass(ExpandVariadicsMode ConstructedMode);
+
+  // Chooses disable or optimize based on optimization level
+  ExpandVariadicsPass(OptimizationLevel Level);
+
+  PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
+};
+
+ModulePass *createExpandVariadicsPass(ExpandVariadicsMode);
+
+} // end namespace llvm
+
+#endif // LLVM_TRANSFORMS_IPO_EXPANDVARIADICS_H
diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp
index 8d408ca2363a98..ada05770461151 100644
--- a/llvm/lib/Passes/PassBuilder.cpp
+++ b/llvm/lib/Passes/PassBuilder.cpp
@@ -135,6 +135,7 @@
 #include "llvm/Transforms/IPO/DeadArgumentElimination.h"
 #include "llvm/Transforms/IPO/ElimAvailExtern.h"
 #include "llvm/Transforms/IPO/EmbedBitcodePass.h"
+#include "llvm/Transforms/IPO/ExpandVariadics.h"
 #include "llvm/Transforms/IPO/ForceFunctionAttrs.h"
 #include "llvm/Transforms/IPO/FunctionAttrs.h"
 #include "llvm/Transforms/IPO/FunctionImport.h"
diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp
index 3bb2ce0ae3460b..b1a306e15fa3c0 100644
--- a/llvm/lib/Passes/PassBuilderPipelines.cpp
+++ b/llvm/lib/Passes/PassBuilderPipelines.cpp
@@ -48,6 +48,7 @@
 #include "llvm/Transforms/IPO/DeadArgumentElimination.h"
 #include "llvm/Transforms/IPO/ElimAvailExtern.h"
 #include "llvm/Transforms/IPO/EmbedBitcodePass.h"
+#include "llvm/Transforms/IPO/ExpandVariadics.h"
 #include "llvm/Transforms/IPO/ForceFunctionAttrs.h"
 #include "llvm/Transforms/IPO/FunctionAttrs.h"
 #include "llvm/Transforms/IPO/GlobalDCE.h"
@@ -1172,6 +1173,9 @@ PassBuilder::buildModuleSimplificationPipeline(OptimizationLevel Level,
   if (EnablePGOForceFunctionAttrs)
     MPM.addPass(PGOForceFunctionAttrsPass(PGOOpt->ColdOptType));
 
+  // ExpandVariadics rewrites calls to variadic functions into a form the
+  // function inliner can work with, so run it before the inliner passes.
+  MPM.addPass(ExpandVariadicsPass(Level));
+
   MPM.addPass(AlwaysInlinerPass(/*InsertLifetimeIntrinsics=*/true));
 
   if (EnableModuleInliner)
diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def
index 2fbc7f7d88ba39..d7d29383f9846d 100644
--- a/llvm/lib/Passes/PassRegistry.def
+++ b/llvm/lib/Passes/PassRegistry.def
@@ -59,6 +59,7 @@ MODULE_PASS("dot-callgraph", CallGraphDOTPrinterPass())
 MODULE_PASS("dxil-upgrade", DXILUpgradePass())
 MODULE_PASS("elim-avail-extern", EliminateAvailableExternallyPass())
 MODULE_PASS("extract-blocks", BlockExtractorPass({}, false))
+MODULE_PASS("expand-variadics", ExpandVariadicsPass(OptimizationLevel::O0))
 MODULE_PASS("forceattrs", ForceFunctionAttrsPass())
 MODULE_PASS("function-import", FunctionImportPass())
 MODULE_PASS("globalopt", GlobalOptPass())
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
index 90f36fadf35903..70d634edc35e11 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
@@ -24,6 +24,7 @@ MODULE_PASS("amdgpu-lower-ctor-dtor", AMDGPUCtorDtorLoweringPass())
 MODULE_PASS("amdgpu-lower-module-lds", AMDGPULowerModuleLDSPass(*this))
 MODULE_PASS("amdgpu-printf-runtime-binding", AMDGPUPrintfRuntimeBindingPass())
 MODULE_PASS("amdgpu-unify-metadata", AMDGPUUnifyMetadataPass())
+MODULE_PASS("expand-variadics", ExpandVariadicsPass(ExpandVariadicsMode::lowering))
 #undef MODULE_PASS
 
 #ifndef FUNCTION_PASS
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index f7e552177d6f50..99ae8a17134a78 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -54,6 +54,7 @@
 #include "llvm/Transforms/HipStdPar/HipStdPar.h"
 #include "llvm/Transforms/IPO.h"
 #include "llvm/Transforms/IPO/AlwaysInliner.h"
+#include "llvm/Transforms/IPO/ExpandVariadics.h"
 #include "llvm/Transforms/IPO/GlobalDCE.h"
 #include "llvm/Transforms/IPO/Internalize.h"
 #include "llvm/Transforms/Scalar.h"
@@ -967,6 +968,8 @@ void AMDGPUPassConfig::addIRPasses() {
   if (isPassEnabled(EnableImageIntrinsicOptimizer))
     addPass(createAMDGPUImageIntrinsicOptimizerPass(&TM));
 
+  addPass(createExpandVariadicsPass(ExpandVariadicsMode::lowering));
+
   // Function calls are not supported, so make sure we inline everything.
   addPass(createAMDGPUAlwaysInlinePass());
   addPass(createAlwaysInlinerLegacyPass());
diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
index 78f48652c9920f..e5db0114592877 100644
--- a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
@@ -33,6 +33,7 @@
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetOptions.h"
 #include "llvm/TargetParser/Triple.h"
+#include "llvm/Transforms/IPO/ExpandVariadics.h"
 #include "llvm/Transforms/Scalar.h"
 #include "llvm/Transforms/Scalar/GVN.h"
 #include "llvm/Transforms/Vectorize/LoadStoreVectorizer.h"
@@ -323,6 +324,9 @@ void NVPTXPassConfig::addIRPasses() {
       AAR.addAAResult(WrapperPass->getResult());
   }));
 
+  // Should run before anything (else!) that adjusts calling conventions
+  addPass(createExpandVariadicsPass(ExpandVariadicsMode::lowering));
+
   // NVVMReflectPass is added in addEarlyAsPossiblePasses, so hopefully running
   // it here does nothing.  But since we need it for correctness when lowering
   // to NVPTX, run it here too, in case whoever built our pass pipeline didn't
diff --git a/llvm/lib/Transforms/IPO/CMakeLists.txt b/llvm/lib/Transforms/IPO/CMakeLists.txt
index 5fbdbc3a014f9a..92a9697720efd4 100644
--- a/llvm/lib/Transforms/IPO/CMakeLists.txt
+++ b/llvm/lib/Transforms/IPO/CMakeLists.txt
@@ -12,6 +12,7 @@ add_llvm_component_library(LLVMipo
   DeadArgumentElimination.cpp
   ElimAvailExtern.cpp
   EmbedBitcodePass.cpp
+  ExpandVariadics.cpp
   ExtractGV.cpp
   ForceFunctionAttrs.cpp
   FunctionAttrs.cpp
diff --git a/llvm/lib/Transforms/IPO/ExpandVariadics.cpp b/llvm/lib/Transforms/IPO/ExpandVariadics.cpp
new file mode 100644
index 00000000000000..e85f91e4060c49
--- /dev/null
+++ b/llvm/lib/Transforms/IPO/ExpandVariadics.cpp
@@ -0,0 +1,1056 @@
+//===-- ExpandVariadicsPass.cpp --------------------------------*- C++ -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This is an optimization pass for variadic functions. If called from codegen,
+// it can serve as the implementation of variadic functions for a given target.
+//
+// The strategy is to turn the ... part of a variadic function into a va_list
+// and fix up the call sites. This is completely effective if the calling
+// convention can declare that to be the right thing, e.g. on GPUs or where
+// the application is wholly statically linked. In the usual case, it will
+// replace known calls to known variadic functions with calls that are amenable
+// to inlining and other optimisations.
+//
+// The target-dependent parts are in class VariadicABIInfo. Enabling a new
+// target means adding a case to VariadicABIInfo::create() along with tests.
+// This will be especially simple if the va_list representation is a char*.
+//
+// The majority of the plumbing is splitting the variadic function into a
+// single basic block that packs the variadic arguments into a va_list and
+// a second function that does the work of the original. The target specific
+// part is packing arguments into a contiguous buffer that the clang expansion
+// of va_arg will do the right thing with.
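+//
+// For example (illustrative names, not the pass's actual scheme),
+//
+//   declare void @vararg(...)
+//   call void (...) @vararg(i32 1, double 2.0)
+//
+// becomes roughly
+//
+//   declare void @vararg.valist(ptr)     ; the function doing the real work
+//   %frame = alloca { i32, double }      ; packed frame built at the call site
+//   ; ...store 1 and 2.0 into %frame...
+//   call void @vararg.valist(ptr %frame)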
+//
+// The aggregate effect is to unblock other transforms, most critically the
+// general purpose inliner. Known calls to variadic functions become zero cost.
+//
+// Consistency with clang is primarily tested by emitting va_arg using clang
+// then expanding the variadic functions using this pass, followed by trying
+// to constant fold the functions to no-ops.
+//
+// Target specific behaviour is tested in IR - mainly checking that values are
+// put into positions in call frames that make sense for that particular target.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/IPO/ExpandVariadics.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Passes/OptimizationLevel.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/TargetParser/Triple.h"
+
+#include <cstdio>
+
+#define DEBUG_TYPE "expand-variadics"
+
+using namespace llvm;
+
+cl::opt<ExpandVariadicsMode> ExpandVariadicsModeOption(
+    DEBUG_TYPE "-override", cl::desc("Override the behaviour of " DEBUG_TYPE),
+    cl::init(ExpandVariadicsMode::unspecified),
+    cl::values(clEnumValN(ExpandVariadicsMode::unspecified, "unspecified",
+                          "Use the implementation defaults"),
+               clEnumValN(ExpandVariadicsMode::disable, "disable",
+                          "Disable the pass entirely"),
+               clEnumValN(ExpandVariadicsMode::optimize, "optimize",
+                          "Optimise without changing ABI"),
+               clEnumValN(ExpandVariadicsMode::lowering, "lowering",
+                          "Change variadic calling convention")));
+
+namespace {
+
+// Module implements getFunction() which returns nullptr on missing declaration
+// and getOrInsertFunction which creates one when absent. Intrinsics.h
+// implements getDeclaration which creates one when missing. This should be
+// changed to be consistent with Module()'s naming. Implementing as a local
+// function here in the meantime to decouple from that process.
+Function *getPreexistingDeclaration(Module *M, Intrinsic::ID id,
+                                    ArrayRef<Type *> Tys = std::nullopt) {
+  auto *FT = Intrinsic::getType(M->getContext(), id, Tys);
+  return M->getFunction(Tys.empty() ? Intrinsic::getName(id)
+                                    : Intrinsic::getName(id, Tys, M, FT));
+}
+
+// Lots of targets use a void* pointed at a buffer for va_list.
+// Some use more complicated iterator constructs. Type erase that
+// so the rest of the pass can operate on either.
+// Virtual functions are used where different targets want different
+// behaviour, non-virtual where all implemented targets presently behave the
+// same.
+struct VAListInterface {
+  virtual ~VAListInterface() {}
+
+  // Whether a valist instance is passed by value or by address
+  // I.e. does it need to be alloca'ed and stored into, or can
+  // it be passed directly in a SSA register
+  virtual bool passedInSSARegister() = 0;
+
+  // The type of a va_list iterator object
+  virtual Type *vaListType(LLVMContext &Ctx) = 0;
+
+  // The type of a va_list as a function argument as lowered by C
+  virtual Type *vaListParameterType(Module &M) = 0;
+
+  // Initialise an allocated va_list object to point to an already
+  // initialised contiguous memory region.
+  // Return the value to pass as the va_list argument
+  virtual Value *initializeVAList(LLVMContext &Ctx, IRBuilder<> &Builder,
+                                  AllocaInst *, Value * /*buffer*/) = 0;
+
+  // Simple lowering suffices for va_end, va_copy for current targets
+  bool vaEndIsNop() { return true; }
+  bool vaCopyIsMemcpy() { return true; }
+};
+
+// The majority case - a void* into an alloca
+struct VoidPtr final : public VAListInterface {
+  bool passedInSSARegister() override { return true; }
+
+  Type *vaListType(LLVMContext &Ctx) override {
+    return PointerType::getUnqual(Ctx);
+  }
+
+  Type *vaListParameterType(Module &M) override {
+    return PointerType::getUnqual(M.getContext());
+  }
+
+  Value *initializeVAList(LLVMContext &Ctx, IRBuilder<> &Builder,
+                          AllocaInst * /*va_list*/, Value *buffer) override {
+    return buffer;
+  }
+};
+
+struct VoidPtrAllocaAddrspace final : public VAListInterface {
+
+  bool passedInSSARegister() override { return true; }
+
+  Type *vaListType(LLVMContext &Ctx) override {
+    return PointerType::getUnqual(Ctx);
+  }
+
+  Type *vaListParameterType(Module &M) override {
+    const DataLayout &DL = M.getDataLayout();
+    return DL.getAllocaPtrType(M.getContext());
+  }
+
+  Value *initializeVAList(LLVMContext &Ctx, IRBuilder<> &Builder,
+                          AllocaInst * /*va_list*/, Value *buffer) override {
+    return buffer;
+  }
+};
+
+// SystemV as used by X64 Linux and others
+struct SystemV final : public VAListInterface {
+  bool passedInSSARegister() override { return false; }
+
+  Type *vaListType(LLVMContext &Ctx) override {
+    auto I32 = Type::getInt32Ty(Ctx);
+    auto Ptr = PointerType::getUnqual(Ctx);
+    return ArrayType::get(StructType::get(Ctx, {I32, I32, Ptr, Ptr}), 1);
+  }
+
+  Type *vaListParameterType(Module &M) override {
+    return PointerType::getUnqual(M.getContext());
+  }
+
+  Value *initializeVAList(LLVMContext &Ctx, IRBuilder<> &Builder,
+                          AllocaInst *VaList, Value *VoidBuffer) override {
+    assert(VaList->getAllocatedType() == vaListType(Ctx));
+
+    Type *VaListTy = vaListType(Ctx);
+
+    Type *I32 = Type::getInt32Ty(Ctx);
+    Type *I64 = Type::getInt64Ty(Ctx);
+
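+    // All the variadic arguments are already packed into one contiguous
+    // buffer, so mark the register save areas as exhausted (gp_offset =
+    // 6 GPRs * 8 bytes = 48, fp_offset = 48 + 8 XMM registers * 16 bytes =
+    // 176) and leave reg_save_area null; va_arg then always reads from
+    // overflow_arg_area.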
+    Value *Idxs[3] = {
+        ConstantInt::get(I64, 0),
+        ConstantInt::get(I32, 0),
+        nullptr,
+    };
+
+    Idxs[2] = ConstantInt::get(I32, 0);
+    Builder.CreateStore(
+        ConstantInt::get(I32, 48),
+        Builder.CreateInBoundsGEP(VaListTy, VaList, Idxs, "gp_offset"));
+
+    Idxs[2] = ConstantInt::get(I32, 1);
+    Builder.CreateStore(
+        ConstantInt::get(I32, 6 * 8 + 8 * 16),
+        Builder.CreateInBoundsGEP(VaListTy, VaList, Idxs, "fp_offset"));
+
+    Idxs[2] = ConstantInt::get(I32, 2);
+    Builder.CreateStore(
+        VoidBuffer,
+        Builder.CreateInBoundsGEP(VaListTy, VaList, Idxs, "overfow_arg_area"));
+
+    Idxs[2] = ConstantInt::get(I32, 3);
+    Builder.CreateStore(
+        ConstantPointerNull::get(PointerType::getUnqual(Ctx)),
+        Builder.CreateInBoundsGEP(VaListTy, VaList, Idxs, "reg_save_area"));
+
+    return VaList;
+  }
+};
+
+class VariadicABIInfo {
+
+  VariadicABIInfo(uint32_t MinAlign, uint32_t MaxAlign,
+                  std::unique_ptr<VAListInterface> VAList)
+      : MinAlign(MinAlign), MaxAlign(MaxAlign), VAList(std::move(VAList)) {}
+
+  template <typename T>
+  static VariadicABIInfo create(uint32_t MinAlign, uint32_t MaxAlign) {
+    return {MinAlign, MaxAlign, std::make_unique<T>()};
+  }
+
+public:
+  const uint32_t MinAlign;
+  const uint32_t MaxAlign;
+  std::unique_ptr<VAListInterface> VAList;
+
+  VariadicABIInfo() : VariadicABIInfo(0, 0, nullptr) {}
+  explicit operator bool() const { return static_cast<bool>(VAList); }
+
+  VariadicABIInfo(VariadicABIInfo &&Self)
+      : MinAlign(Self.MinAlign), MaxAlign(Self.MaxAlign),
+        VAList(Self.VAList.release()) {}
+
+  VariadicABIInfo &operator=(VariadicABIInfo &&Other) {
+    this->~VariadicABIInfo();
+    new (this) VariadicABIInfo(std::move(Other));
+    return *this;
+  }
+
+  static VariadicABIInfo create(llvm::Triple const &Triple) {
+    const bool IsLinuxABI = Triple.isOSLinux() || Triple.isOSCygMing();
+
+    switch (Triple.getArch()) {
+
+    case Triple::r600:
+    case Triple::amdgcn: {
+      return create<VoidPtrAllocaAddrspace>(1, 0);
+    }
+
+    case Triple::nvptx:
+    case Triple::nvptx64: {
+      return create<VoidPtr>(4, 0);
+    }
+
+    case Triple::x86: {
+      // These seem to all fall out the same, despite getTypeStackAlign
+      // implying otherwise.
+
+      if (Triple.isOSDarwin()) {
+        // X86_32ABIInfo::getTypeStackAlignInBytes is misleading for this.
+        // The slotSize(4) implies a minimum alignment.
+        // The AllowHigherAlign = true means there is no maximum alignment.
+
+        return create<VoidPtr>(4, 0);
+      }
+      if (Triple.getOS() == llvm::Triple::Win32) {
+        return create<VoidPtr>(4, 0);
+      }
+
+      if (IsLinuxABI) {
+        return create<VoidPtr>(4, 0);
+      }
+
+      break;
+    }
+
+    case Triple::x86_64: {
+      if (Triple.isWindowsMSVCEnvironment() || Triple.isOSWindows()) {
+        // The va_arg emitted for x64 msvc passes > 8 byte values by pointer,
+        // however the variadic call instruction created does not, e.g.
+        // a <4 x f32> will be passed as itself, not as a pointer or byval.
+        // Postponing resolution of that for now.
+        // Expected min/max align of 8.
+        return {};
+      }
+
+      // SystemV X64 documented behaviour:
+      // Slots are at least eight byte aligned and at most 16 byte aligned.
+      // If the type needs more than sixteen byte alignment, it still only gets
+      // that much alignment on the stack.
+      // X64 behaviour in clang:
+      // Slots are at least eight byte aligned and at most naturally aligned.
+      // The values used here match clang, not the ABI docs.
+
+      if (Triple.isOSDarwin()) {
+        return create<SystemV>(8, 8);
+      }
+
+      if (IsLinuxABI) {
+        return create<SystemV>(8, 8);
+      }
+
+      break;
+    }
+
+    default:
+      break;
+    }
+
+    return {};
+  }
+};
+
+class ExpandVariadics : public ModulePass {
+
+  // The pass construction sets the default (optimize when called from middle
+  // end, lowering when called from the backend). The command line variable
+  // overrides that. This is useful for testing and debugging. It also allows
+  // building an application with variadic functions wholly removed if one
+  // has sufficient control over the dependencies, e.g. a statically linked
+  // clang that has no variadic function calls remaining in the binary.
+  static ExpandVariadicsMode
+  withCommandLineOverride(ExpandVariadicsMode LLVMRequested) {
+    ExpandVariadicsMode UserRequested = ExpandVariadicsModeOption;
+    return (UserRequested == ExpandVariadicsMode::unspecified) ? LLVMRequested
+                                                               : UserRequested;
+  }
+
+public:
+  static char ID;
+  const ExpandVariadicsMode Mode;
+  VariadicABIInfo ABI;
+
+  ExpandVariadics(ExpandVariadicsMode Mode)
+      : ModulePass(ID), Mode(withCommandLineOverride(Mode)) {}
+  StringRef getPassName() const override { return "Expand variadic functions"; }
+
+  // Rewrite a variadic call site
+  bool expandCall(Module &M, IRBuilder<> &Builder, CallBase *CB, FunctionType *,
+                  Function *NF);
+
+  // Given a variadic function, return a function taking a va_list that can be
+  // called instead of the original. Mutates F.
+  Function *deriveInlinableVariadicFunctionPair(Module &M, IRBuilder<> &Builder,
+                                                Function &F);
+
+  bool runOnFunction(Module &M, IRBuilder<> &Builder, Function *F);
+
+  // Entry point
+  bool runOnModule(Module &M) override;
+
+  bool rewriteABI() { return Mode == ExpandVariadicsMode::lowering; }
+
+  void memcpyVAListPointers(const DataLayout &DL, IRBuilder<> &Builder,
+                            Value *Dst, Value *Src) {
+    auto &Ctx = Builder.getContext();
+    Type *VaListTy = ABI.VAList->vaListType(Ctx);
+    uint64_t Size = DL.getTypeAllocSize(VaListTy).getFixedValue();
+    // todo: on amdgcn this should be in terms of addrspace 5
+    Builder.CreateMemCpyInline(Dst, {}, Src, {},
+                               ConstantInt::get(Type::getInt32Ty(Ctx), Size));
+  }
+
+  bool expandVAIntrinsicCall(IRBuilder<> &Builder, const DataLayout &DL,
+                             VAStartInst *Inst);
+
+  bool expandVAIntrinsicCall(IRBuilder<> &, const DataLayout &,
+                             VAEndInst *Inst);
+
+  bool expandVAIntrinsicCall(IRBuilder<> &Builder, const DataLayout &DL,
+                             VACopyInst *Inst);
+
+  template <Intrinsic::ID ID, typename InstructionType>
+  bool expandIntrinsicUsers(Module &M, IRBuilder<> &Builder,
+                            PointerType *ArgType) {
+    bool Changed = false;
+    const DataLayout &DL = M.getDataLayout();
+    if (Function *Intrinsic = getPreexistingDeclaration(&M, ID, {ArgType})) {
+      for (User *U : Intrinsic->users()) {
+        if (auto *I = dyn_cast<InstructionType>(U)) {
+          Changed |= expandVAIntrinsicCall(Builder, DL, I);
+        }
+      }
+      if (Intrinsic->use_empty())
+        Intrinsic->eraseFromParent();
+    }
+    return Changed;
+  }
+
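+  // The type of the equivalent function that takes a trailing va_list
+  // parameter in place of the ellipsis, e.g. i32 (i32, ...) becomes
+  // i32 (i32, ptr) on targets whose va_list lowers to a pointer.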
+  FunctionType *inlinableVariadicFunctionType(Module &M, FunctionType *FTy) {
+    SmallVector<Type *> ArgTypes(FTy->param_begin(), FTy->param_end());
+    ArgTypes.push_back(ABI.VAList->vaListParameterType(M));
+    return FunctionType::get(FTy->getReturnType(), ArgTypes,
+                             /*IsVarArgs*/ false);
+  }
+
+  static ConstantInt *sizeOfAlloca(LLVMContext &Ctx, const DataLayout &DL,
+                                   AllocaInst *Alloced) {
+    Type *AllocaType = Alloced->getAllocatedType();
+    TypeSize AllocaTypeSize = DL.getTypeAllocSize(AllocaType);
+    uint64_t AsInt = AllocaTypeSize.getFixedValue();
+    return ConstantInt::get(Type::getInt64Ty(Ctx), AsInt);
+  }
+
+  static SmallSet<unsigned, 2> supportedAddressSpaces(const DataLayout &DL) {
+    // FIXME: It looks like a module can contain arbitrary integers for address
+    // spaces in which case we might need to check _lots_ of cases. Maybe add a
+    // rule to the verifier that the vastart/vaend intrinsics can have arguments
+    // in 0 or in allocaaddrspace but nowhere else
+    SmallSet<unsigned, 2> Set;
+    Set.insert(0); // things tend to end up in zero
+    Set.insert(
+        DL.getAllocaAddrSpace()); // the argument should be in alloca addrspace
+    return Set;
+  }
+
+  // this could be partially target specific
+  bool expansionApplicableToFunction(Module &M, Function *F) {
+    if (F->isIntrinsic() || !F->isVarArg() ||
+        F->hasFnAttribute(Attribute::Naked)) {
+      return false;
+    }
+
+    // TODO: work out what to do with the cs_chain functions documented as
+    // non-variadic that are variadic in some lit tests
+    if (F->getCallingConv() != CallingConv::C)
+      return false;
+
+    if (!rewriteABI()) {
+      // e.g. can't replace a weak function unless changing the original symbol
+      if (GlobalValue::isInterposableLinkage(F->getLinkage())) {
+        return false;
+      }
+    }
+
+    if (!rewriteABI()) {
+      // If optimising, err on the side of leaving things alone
+      for (const Use &U : F->uses()) {
+        const auto *CB = dyn_cast<CallBase>(U.getUser());
+
+        if (!CB)
+          return false;
+
+        if (CB->isMustTailCall())
+          return false;
+
+        if (!CB->isCallee(&U) ||
+            CB->getFunctionType() != F->getFunctionType()) {
+          return false;
+        }
+      }
+    }
+
+    // Branch funnels look like variadic functions but aren't:
+    //
+    // define hidden void @__typeid_typeid1_0_branch_funnel(ptr nest %0, ...) {
+    //  musttail call void (...) @llvm.icall.branch.funnel(ptr %0, ptr @vt1_1,
+    //  ptr @vf1_1, ...) ret void
+    // }
+    //
+    // %1 = call i32 @__typeid_typeid1_0_branch_funnel(ptr nest %vtable, ptr
+    // %obj, i32 1)
+    //
+    // If this function contains a branch funnel intrinsic, don't transform it.
+
+    if (Function *Funnel =
+            getPreexistingDeclaration(&M, Intrinsic::icall_branch_funnel)) {
+      for (const User *U : Funnel->users()) {
+        if (auto *I = dyn_cast<CallBase>(U)) {
+          if (F == I->getFunction()) {
+            return false;
+          }
+        }
+      }
+    }
+
+    return true;
+  }
+
+  bool callinstRewritable(CallBase *CB) {
+    if (CallInst *CI = dyn_cast<CallInst>(CB)) {
+      if (CI->isMustTailCall()) {
+        // Cannot expand musttail calls
+        if (rewriteABI()) {
+          // Todo: Sema?
+          report_fatal_error("Cannot lower musttail variadic call");
+        } else {
+          return false;
+        }
+      }
+    }
+
+    return true;
+  }
+
+  class ExpandedCallFrame {
+    // Helper for constructing an alloca instance containing the arguments bound
+    // to the variadic ... parameter, rearranged to allow indexing through a
+    // va_list iterator
+    //
+    // The awkward memory layout is to allow direct access to a contiguous array
+    // of types for the conversion to a struct type
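+    //
+    // For example, with 8 byte slots, {i32 1, double 2.0} is laid out as the
+    // packed struct { i32, [4 x i8], double }, the [4 x i8] being explicit
+    // padding so the double sits at an 8 byte boundary.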
+    enum { N = 4 };
+    SmallVector<Type *, N> FieldTypes;
+    enum Tag { IsByVal, NotByVal, Padding };
+    SmallVector<std::pair<Value *, Tag>, N> Fields;
+
+    template <Tag tag> void append(Type *T, Value *V) {
+      FieldTypes.push_back(T);
+      Fields.push_back({V, tag});
+    }
+
+  public:
+    void value(Type *T, Value *V) { append<NotByVal>(T, V); }
+
+    void byVal(Type *T, Value *V) { append<IsByVal>(T, V); }
+
+    void padding(LLVMContext &Ctx, uint64_t By) {
+      append<Padding>(ArrayType::get(Type::getInt8Ty(Ctx), By), nullptr);
+    }
+
+    size_t size() const { return FieldTypes.size(); }
+    bool empty() const { return FieldTypes.empty(); }
+
+    StructType *asStruct(LLVMContext &Ctx, StringRef Name) {
+      const bool IsPacked = true;
+      return StructType::create(Ctx, FieldTypes,
+                                (Twine(Name) + ".vararg").str(), IsPacked);
+    }
+
+    void initialiseStructAlloca(const DataLayout &DL, IRBuilder<> &Builder,
+                                AllocaInst *Alloced) {
+
+      StructType *VarargsTy = cast<StructType>(Alloced->getAllocatedType());
+
+      for (size_t I = 0; I < size(); I++) {
+        auto [V, tag] = Fields[I];
+        if (!V)
+          continue;
+
+        auto R = Builder.CreateStructGEP(VarargsTy, Alloced, I);
+        if (tag == IsByVal) {
+          Type *ByValType = FieldTypes[I];
+          Builder.CreateMemCpy(R, {}, V, {},
+                               DL.getTypeAllocSize(ByValType).getFixedValue());
+        } else {
+          Builder.CreateStore(V, R);
+        }
+      }
+    }
+  };
+};
+
+bool ExpandVariadics::runOnModule(Module &M) {
+  bool Changed = false;
+  if (Mode == ExpandVariadicsMode::disable)
+    return Changed;
+
+  llvm::Triple Triple(M.getTargetTriple());
+
+  if (Triple.getArch() == Triple::UnknownArch) {
+    // If we don't know the triple, we can't lower varargs
+    return false;
+  }
+
+  ABI = VariadicABIInfo::create(Triple);
+  if (!ABI) {
+    if (Mode == ExpandVariadicsMode::lowering) {
+      report_fatal_error(
+          "Requested variadic lowering is unimplemented on this target");
+    }
+    return Changed;
+  }
+
+  const DataLayout &DL = M.getDataLayout();
+  auto &Ctx = M.getContext();
+  IRBuilder<> Builder(Ctx);
+
+  // At pass input, va_start intrinsics only occur in variadic functions, as
+  // checked by the IR verifier.
+
+  // The lowering pass needs to run on all variadic functions.
+  // The optimise mode could run on only those that call va_start, in
+  // exchange for additional bookkeeping to avoid transforming the same
+  // function multiple times when it contains multiple va_start calls.
+  // That compile-time optimisation is left for a later patch.
+  for (Function &F : llvm::make_early_inc_range(M))
+    Changed |= runOnFunction(M, Builder, &F);
+
+  // After runOnFunction, all known calls to known variadic functions have been
+  // replaced. va_start intrinsics are presently (and invalidly!) only present
+  // in functions that used to be variadic and have now been mutated to take a
+  // va_list instead. If lowering, as opposed to optimising, calls to unknown
+  // variadic functions have also been replaced.
+
+  // Warning: Intrinsics acting on other ones are missed
+  auto CandidateAddressSpaces = supportedAddressSpaces(DL);
+
+  for (unsigned Addrspace : CandidateAddressSpaces) {
+    PointerType *ArgType = PointerType::get(Ctx, Addrspace);
+    Changed |= expandIntrinsicUsers<Intrinsic::vastart, VAStartInst>(M, Builder,
+                                                                     ArgType);
+    Changed |=
+        expandIntrinsicUsers<Intrinsic::vaend, VAEndInst>(M, Builder, ArgType);
+    Changed |= expandIntrinsicUsers<Intrinsic::vacopy, VACopyInst>(M, Builder,
+                                                                   ArgType);
+  }
+
+  // Variadic intrinsics are now gone. The va_start calls have been replaced
+  // with the equivalent of a va_copy from the newly appended va_list argument,
+  // and va_end and va_copy have been removed. All that remains is for the
+  // lowering pass to find indirect calls and rewrite those as well.
+
+  if (Mode == ExpandVariadicsMode::lowering) {
+    for (Function &F : llvm::make_early_inc_range(M)) {
+      if (F.isDeclaration())
+        continue;
+
+      // Now we need to track down indirect calls. Those can't be found by
+      // walking the uses of variadic functions; we have to crawl the
+      // instruction stream. Fortunately this is only necessary for the ABI
+      // rewrite case.
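+      //
+      // For example (see @indirect_single_i32 in the tests), a call such as
+      //   %0 = load volatile ptr, ptr @vararg_ptr
+      //   tail call void (...) %0(i32 %x)
+      // is rewritten to pass a pointer to the packed call frame instead:
+      //   call void %0(ptr %vararg_buffer)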
+      for (BasicBlock &BB : F) {
+        for (Instruction &I : llvm::make_early_inc_range(BB)) {
+          if (CallBase *CB = dyn_cast<CallBase>(&I)) {
+            if (CB->isIndirectCall()) {
+              FunctionType *FTy = CB->getFunctionType();
+              if (FTy->isVarArg()) {
+                Changed |= expandCall(M, Builder, CB, FTy, 0);
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+
+  return Changed;
+}
+
+bool ExpandVariadics::runOnFunction(Module &M, IRBuilder<> &Builder,
+                                    Function *F) {
+  bool Changed = false;
+
+  // This check might be too coarse - there are probably cases where
+  // splitting a function is bad but it's usable without splitting
+  if (!expansionApplicableToFunction(M, F))
+    return false;
+
+  // TODO: Leave "thunk" attribute functions alone?
+
+  // Needs more tests than this (weak linkage etc.). Some of the checks live
+  // in expansionApplicableToFunction.
+  if (F->isDeclaration() && !rewriteABI()) {
+    return false;
+  }
+
+  // TODO: Is the lazy construction here still useful?
+  Function *Equivalent = deriveInlinableVariadicFunctionPair(M, Builder, *F);
+
+  for (User *U : llvm::make_early_inc_range(F->users())) {
+    // TODO: A test where the call instruction takes a variadic function as
+    // a parameter other than the one it is calling
+    if (CallBase *CB = dyn_cast<CallBase>(U)) {
+      Value *calledOperand = CB->getCalledOperand();
+      if (F == calledOperand) {
+        Changed |= expandCall(M, Builder, CB, F->getFunctionType(), Equivalent);
+      }
+    }
+  }
+
+  if (rewriteABI()) {
+    // No direct calls remain to F, remaining uses are things like address
+    // escaping, modulo errors in this implementation.
+    for (User *U : llvm::make_early_inc_range(F->users()))
+      if (CallBase *CB = dyn_cast<CallBase>(U)) {
+        Value *calledOperand = CB->getCalledOperand();
+        if (F == calledOperand) {
+          report_fatal_error(
+              "ExpandVA abi requires eliminating call uses first\n");
+        }
+      }
+
+    Changed = true;
+    // Converting the original variadic function in-place into the equivalent
+    // one.
+    Equivalent->setLinkage(F->getLinkage());
+    Equivalent->setVisibility(F->getVisibility());
+    Equivalent->takeName(F);
+
+    // Indirect calls still need to be patched up.
+    // DeadArgumentElimination bitcasts it; TODO: check block addresses
+    F->replaceAllUsesWith(Equivalent);
+    F->eraseFromParent();
+  }
+
+  return Changed;
+}
+
+Function *ExpandVariadics::deriveInlinableVariadicFunctionPair(
+    Module &M, IRBuilder<> &Builder, Function &F) {
+  // The purpose here is to split the variadic function F into two functions.
+  // One is a variadic function that bundles the passed arguments into a
+  // va_list and passes it to the second function. The second function does
+  // whatever the original F does, except that it takes a va_list instead of
+  // the ...
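+  //
+  // Sketch (the va_list type and address space are target-dependent):
+  //
+  //   define i32 @f(i32 %x, ...)
+  //
+  // becomes, roughly,
+  //
+  //   define internal i32 @f.valist(i32 %x, ptr noalias %varargs)
+  //   define i32 @f(i32 %x, ...) {
+  //     %va = alloca <va_list type>
+  //     call void @llvm.va_start(ptr %va)
+  //     %r = tail call i32 @f.valist(i32 %x, ptr %va)
+  //     ret i32 %r
+  //   }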
+
+  assert(expansionApplicableToFunction(M, &F));
+
+  auto &Ctx = M.getContext();
+  const DataLayout &DL = M.getDataLayout();
+
+  // The returned function's isDeclaration() is equal to F.isDeclaration(),
+  // but that invariant is not maintained at every point within this function.
+  const bool FunctionIsDefinition = !F.isDeclaration();
+
+  FunctionType *FTy = F.getFunctionType();
+  SmallVector<Type *> ArgTypes(FTy->param_begin(), FTy->param_end());
+  ArgTypes.push_back(ABI.VAList->vaListParameterType(M));
+
+  FunctionType *NFTy = inlinableVariadicFunctionType(M, F.getFunctionType());
+  Function *NF = Function::Create(NFTy, F.getLinkage(), F.getAddressSpace());
+
+  // Note - same attribute handling as DeadArgumentElimination
+  NF->copyAttributesFrom(&F);
+  NF->setComdat(F.getComdat()); // beware weak
+  F.getParent()->getFunctionList().insert(F.getIterator(), NF);
+  NF->setName(F.getName() + ".valist");
+  NF->IsNewDbgInfoFormat = F.IsNewDbgInfoFormat;
+
+  // The new function has default visibility and internal linkage.
+  // Visibility needs to be set before linkage to avoid an assert in
+  // setVisibility.
+  NF->setVisibility(GlobalValue::DefaultVisibility);
+  NF->setLinkage(GlobalValue::InternalLinkage);
+
+  AttrBuilder ParamAttrs(Ctx);
+  ParamAttrs.addAttribute(Attribute::NoAlias);
+
+  // TODO: When can the va_list argument have addAlignmentAttr called on it?
+  // It improves codegen a lot in the non-inlined case. Probably target
+  // specific.
+
+  AttributeList Attrs = NF->getAttributes();
+  Attrs = Attrs.addParamAttributes(Ctx, NFTy->getNumParams() - 1, ParamAttrs);
+  NF->setAttributes(Attrs);
+
+  // Splice the implementation into the new function with minimal changes
+  if (FunctionIsDefinition) {
+    NF->splice(NF->begin(), &F);
+
+    auto NewArg = NF->arg_begin();
+    for (Argument &Arg : F.args()) {
+      Arg.replaceAllUsesWith(NewArg);
+      NewArg->setName(Arg.getName()); // takeName without killing the old one
+      ++NewArg;
+    }
+    NewArg->setName("varargs");
+  }
+
+  SmallVector<std::pair<unsigned, MDNode *>, 1> MDs;
+  F.getAllMetadata(MDs);
+  for (auto [KindID, Node] : MDs)
+    NF->addMetadata(KindID, *Node);
+
+  if (FunctionIsDefinition) {
+    // The blocks have been stolen so it's now a declaration
+    assert(F.isDeclaration());
+    Type *VaListTy = ABI.VAList->vaListType(Ctx);
+
+    auto *BB = BasicBlock::Create(Ctx, "entry", &F);
+    Builder.SetInsertPoint(BB);
+
+    Value *VaListInstance = Builder.CreateAlloca(VaListTy, nullptr, "va_list");
+
+    Builder.CreateIntrinsic(Intrinsic::vastart, {DL.getAllocaPtrType(Ctx)},
+                            {VaListInstance});
+
+    SmallVector<Value *> Args;
+    for (Argument &A : F.args())
+      Args.push_back(&A);
+
+    // Shall we put the extra arg in alloca addrspace? Probably yes
+    VaListInstance = Builder.CreatePointerBitCastOrAddrSpaceCast(
+        VaListInstance, ABI.VAList->vaListParameterType(M));
+    Args.push_back(VaListInstance);
+
+    CallInst *Result = Builder.CreateCall(NF, Args);
+    Result->setTailCallKind(CallInst::TCK_Tail);
+
+    assert(ABI.VAList->vaEndIsNop()); // If this changes, insert a va_end here
+
+    if (Result->getType()->isVoidTy())
+      Builder.CreateRetVoid();
+    else
+      Builder.CreateRet(Result);
+  }
+
+  assert(F.isDeclaration() == NF->isDeclaration());
+
+  return NF;
+}
+
+bool ExpandVariadics::expandCall(Module &M, IRBuilder<> &Builder, CallBase *CB,
+                                 FunctionType *VarargFunctionType,
+                                 Function *NF) {
+  bool Changed = false;
+  const DataLayout &DL = M.getDataLayout();
+
+  if (!callinstRewritable(CB)) {
+    return Changed;
+  }
+
+  // This is something of a problem because the call instruction's idea of the
+  // function type doesn't necessarily match reality, before or after this
+  // pass. Since the plan here is to build a new instruction there is no
+  // particular benefit to trying to preserve an incorrect initial type.
+  // If the types don't match and we aren't changing the ABI, leave it alone
+  // in case someone is deliberately doing dubious type punning through
+  // varargs.
+  FunctionType *FuncType = CB->getFunctionType();
+  if (FuncType != VarargFunctionType) {
+    if (!rewriteABI()) {
+      return Changed;
+    }
+    FuncType = VarargFunctionType;
+  }
+
+  auto &Ctx = CB->getContext();
+
+  // Align the struct on ABI.MinAlign to start with
+  Align MaxFieldAlign(ABI.MinAlign ? ABI.MinAlign : 1);
+
+  // The strategy here is to allocate a call frame containing the variadic
+  // arguments laid out such that a target specific va_list can be initialised
+  // with it, such that target specific va_arg instructions will correctly
+  // iterate over it. Primarily this means getting the alignment right.
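+  //
+  // For example (as in the accompanying tests), lowering
+  //   call void (...) @vararg(i32 %x, double %y)
+  // produces a packed frame type, named after the calling function, with
+  // explicit padding so the double sits at its natural 8-byte alignment:
+  //   %i32_double.vararg = type <{ i32, [4 x i8], double }>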
+
+  ExpandedCallFrame Frame;
+
+  uint64_t CurrentOffset = 0;
+  for (unsigned I = FuncType->getNumParams(), E = CB->arg_size(); I < E; ++I) {
+    Value *ArgVal = CB->getArgOperand(I);
+    bool IsByVal = CB->paramHasAttr(I, Attribute::ByVal);
+    Type *ArgType = IsByVal ? CB->getParamByValType(I) : ArgVal->getType();
+    Align DataAlign = DL.getABITypeAlign(ArgType);
+
+    uint64_t DataAlignV = DataAlign.value();
+
+    // Currently using 0 as a sentinel to mean ignored
+    if (ABI.MinAlign && DataAlignV < ABI.MinAlign)
+      DataAlignV = ABI.MinAlign;
+    if (ABI.MaxAlign && DataAlignV > ABI.MaxAlign)
+      DataAlignV = ABI.MaxAlign;
+
+    DataAlign = Align(DataAlignV);
+    MaxFieldAlign = std::max(MaxFieldAlign, DataAlign);
+
+    if (uint64_t Rem = CurrentOffset % DataAlignV) {
+      // Inject explicit padding to deal with alignment requirements
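+      // (e.g. an i32 at offset 4 followed by an 8-byte aligned double gets
+      // [4 x i8] of padding)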
+      uint64_t Padding = DataAlignV - Rem;
+      Frame.padding(Ctx, Padding);
+      CurrentOffset += Padding;
+    }
+
+    if (IsByVal) {
+      Frame.byVal(ArgType, ArgVal);
+    } else {
+      Frame.value(ArgType, ArgVal);
+    }
+    CurrentOffset += DL.getTypeAllocSize(ArgType).getFixedValue();
+  }
+
+  if (Frame.empty()) {
+    // Not passing any arguments, so hopefully va_arg won't try to read any.
+    // Create a single-byte frame containing nothing to point the va_list
+    // instance at, as that is less special-casey in the compiler and probably
+    // easier to interpret in a debugger.
+    Frame.padding(Ctx, 1);
+  }
+
+  Function *CBF = CB->getParent()->getParent();
+
+  StructType *VarargsTy = Frame.asStruct(Ctx, CBF->getName());
+
+  // Put the alloca to hold the variadic args in the entry basic block.
+  // The clumsy construction is to set the alignment on the instance.
+  Builder.SetInsertPointPastAllocas(CBF);
+
+  // The struct instance needs to be at least MaxFieldAlign for the alignment of
+  // the fields to be correct at runtime. Use the native stack alignment instead
+  // if that's greater as that tends to give better codegen.
+  Align AllocaAlign = MaxFieldAlign;
+  if (DL.exceedsNaturalStackAlignment(Align(1024))) {
+    // TODO: DL.getStackAlignment could return a MaybeAlign instead of
+    // asserting
+    AllocaAlign = std::max(AllocaAlign, DL.getStackAlignment());
+  }
+
+  AllocaInst *Alloced = Builder.Insert(
+      new AllocaInst(VarargsTy, DL.getAllocaAddrSpace(), nullptr, AllocaAlign),
+      "vararg_buffer");
+  Changed = true;
+  assert(Alloced->getAllocatedType() == VarargsTy);
+
+  // Initialise the fields in the struct
+  Builder.SetInsertPoint(CB);
+
+  Builder.CreateLifetimeStart(Alloced, sizeOfAlloca(Ctx, DL, Alloced));
+
+  Frame.initialiseStructAlloca(DL, Builder, Alloced);
+
+  unsigned NumArgs = FuncType->getNumParams();
+
+  SmallVector<Value *> Args;
+  Args.assign(CB->arg_begin(), CB->arg_begin() + NumArgs);
+
+  // Initialise a va_list pointing to that struct and pass it as the last
+  // argument
+  AllocaInst *VaList = nullptr;
+  {
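+    // If the target passes its va_list in an SSA register (e.g. as a bare
+    // pointer), no stack slot is needed; otherwise allocate one so the
+    // va_list can be passed by pointer.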
+    if (!ABI.VAList->passedInSSARegister()) {
+      Type *VaListTy = ABI.VAList->vaListType(Ctx);
+      Builder.SetInsertPointPastAllocas(CBF);
+      VaList = Builder.CreateAlloca(VaListTy, nullptr, "va_list");
+      Builder.SetInsertPoint(CB);
+      Builder.CreateLifetimeStart(VaList, sizeOfAlloca(Ctx, DL, VaList));
+    }
+    Args.push_back(ABI.VAList->initializeVAList(Ctx, Builder, VaList, Alloced));
+  }
+
+  // Attributes excluding any on the vararg arguments
+  AttributeList PAL = CB->getAttributes();
+  if (!PAL.isEmpty()) {
+    SmallVector<AttributeSet, 8> ArgAttrs;
+    for (unsigned ArgNo = 0; ArgNo < NumArgs; ArgNo++)
+      ArgAttrs.push_back(PAL.getParamAttrs(ArgNo));
+    PAL =
+        AttributeList::get(Ctx, PAL.getFnAttrs(), PAL.getRetAttrs(), ArgAttrs);
+  }
+
+  SmallVector<OperandBundleDef, 1> OpBundles;
+  CB->getOperandBundlesAsDefs(OpBundles);
+
+  CallBase *NewCB = nullptr;
+  // TODO, other instructions? Haven't managed to write variadic inline asm yet
+  if (CallInst *CI = dyn_cast<CallInst>(CB)) {
+
+    Value *Dst = NF ? NF : CI->getCalledOperand();
+    FunctionType *NFTy = inlinableVariadicFunctionType(M, VarargFunctionType);
+
+    NewCB = CallInst::Create(NFTy, Dst, Args, OpBundles, "", CI);
+
+    CallInst::TailCallKind TCK = CI->getTailCallKind();
+    assert(TCK != CallInst::TCK_MustTail); // guarded at prologue
+
+    // It doesn't get to be a tail call any more. We might want to guard this
+    // on the arch; x64 and aarch64 document that varargs can't be tail called
+    // anyway. Not totally convinced this is necessary, but dead store
+    // elimination will discard the stores to the alloca and pass
+    // uninitialised memory along instead when the call is marked tail.
+    if (TCK == CallInst::TCK_Tail) {
+      TCK = CallInst::TCK_None;
+    }
+    CI->setTailCallKind(TCK);
+
+  } else if (InvokeInst *II = dyn_cast<InvokeInst>(CB)) {
+    assert(NF);
+    NewCB = InvokeInst::Create(NF, II->getNormalDest(), II->getUnwindDest(),
+                               Args, OpBundles, "", CB);
+  } else {
+    report_fatal_error("Unimplemented variadic lowering for this CallBase");
+  }
+
+  if (VaList)
+    Builder.CreateLifetimeEnd(VaList, sizeOfAlloca(Ctx, DL, VaList));
+
+  Builder.CreateLifetimeEnd(Alloced, sizeOfAlloca(Ctx, DL, Alloced));
+
+  NewCB->setAttributes(PAL);
+  NewCB->takeName(CB);
+  NewCB->setCallingConv(CB->getCallingConv());
+
+  NewCB->setDebugLoc(DebugLoc());
+
+  // This may be upsetting the debug handling (DISubprogram attached to more
+  // than one function). Need to move the metadata rather than copy it?
+  NewCB->copyMetadata(*CB, {LLVMContext::MD_prof, LLVMContext::MD_dbg});
+
+  CB->replaceAllUsesWith(NewCB);
+  CB->eraseFromParent();
+  return Changed;
+}
+
+bool ExpandVariadics::expandVAIntrinsicCall(IRBuilder<> &Builder,
+                                            const DataLayout &DL,
+                                            VAStartInst *Inst) {
+  Function *ContainingFunction = Inst->getFunction();
+  if (ContainingFunction->isVarArg())
+    return false;
+
+  // The last argument is a vaListParameterType
+  Argument *PassedVaList =
+      ContainingFunction->getArg(ContainingFunction->arg_size() - 1);
+
+  // va_start takes a pointer to a va_list, e.g. one on the stack
+  Value *VaStartArg = Inst->getArgList();
+
+  Builder.SetInsertPoint(Inst);
+  if (ABI.VAList->passedInSSARegister()) {
+    Builder.CreateStore(PassedVaList, VaStartArg);
+  } else {
+    // src and dst are both pointers
+    memcpyVAListPointers(DL, Builder, VaStartArg, PassedVaList);
+  }
+
+  Inst->eraseFromParent();
+  return true;
+}
+
+bool ExpandVariadics::expandVAIntrinsicCall(IRBuilder<> &, const DataLayout &,
+                                            VAEndInst *Inst) {
+  // A no-op on all the architectures implemented so far
+  Inst->eraseFromParent();
+  return true;
+}
+
+bool ExpandVariadics::expandVAIntrinsicCall(IRBuilder<> &Builder,
+                                            const DataLayout &DL,
+                                            VACopyInst *Inst) {
+  Builder.SetInsertPoint(Inst);
+  memcpyVAListPointers(DL, Builder, Inst->getDest(), Inst->getSrc());
+  Inst->eraseFromParent();
+  return true;
+}
+
+} // namespace
+
+char ExpandVariadics::ID = 0;
+
+INITIALIZE_PASS(ExpandVariadics, DEBUG_TYPE, "Expand variadic functions", false,
+                false)
+
+ModulePass *llvm::createExpandVariadicsPass(ExpandVariadicsMode Mode) {
+  return new ExpandVariadics(Mode);
+}
+
+PreservedAnalyses ExpandVariadicsPass::run(Module &M, ModuleAnalysisManager &) {
+  return ExpandVariadics(ConstructedMode).runOnModule(M)
+             ? PreservedAnalyses::none()
+             : PreservedAnalyses::all();
+}
+
+ExpandVariadicsPass::ExpandVariadicsPass(OptimizationLevel Level)
+    : ExpandVariadicsPass(Level == OptimizationLevel::O0
+                              ? ExpandVariadicsMode::disable
+                              : ExpandVariadicsMode::optimize) {}
+
+ExpandVariadicsPass::ExpandVariadicsPass(ExpandVariadicsMode Mode)
+    : ConstructedMode(Mode) {}
diff --git a/llvm/test/CodeGen/AMDGPU/expand-variadic-call.ll b/llvm/test/CodeGen/AMDGPU/expand-variadic-call.ll
new file mode 100644
index 00000000000000..b270bb5bfb74b0
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/expand-variadic-call.ll
@@ -0,0 +1,499 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: -p --function-signature
+; RUN: opt -S --passes=expand-variadics --expand-variadics-override=lowering < %s | FileCheck %s
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-p9:192:256:256:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8:9"
+target triple = "amdgcn-amd-amdhsa"
+
+; Check the variadic arguments are lowered to the locations this target expects
+
+; The types show the call frames
+; CHECK: %single_i32.vararg = type <{ i32 }>
+; CHECK: %single_double.vararg = type <{ double }>
+; CHECK: %single_v4f32.vararg = type <{ <4 x float> }>
+; CHECK: %single_v8f32.vararg = type <{ <8 x float> }>
+; CHECK: %single_v16f32.vararg = type <{ <16 x float> }>
+; CHECK: %single_v32f32.vararg = type <{ <32 x float> }>
+; CHECK: %i32_double.vararg = type <{ i32, [4 x i8], double }>
+; CHECK: %double_i32.vararg = type <{ double, i32 }>
+; CHECK: %i32_libcS.vararg = type <{ i32, i8, [1 x i8], i16, i32, [4 x i8], i64, float, [4 x i8], double }>
+; CHECK: %libcS_i32.vararg = type <{ i8, [1 x i8], i16, i32, i64, float, [4 x i8], double, i32 }>
+; CHECK: %i32_v4f32.vararg = type <{ i32, [12 x i8], <4 x float> }>
+; CHECK: %v4f32_i32.vararg = type <{ <4 x float>, i32 }>
+; CHECK: %i32_v8f32.vararg = type <{ i32, [28 x i8], <8 x float> }>
+; CHECK: %v8f32_i32.vararg = type <{ <8 x float>, i32 }>
+; CHECK: %i32_v16f32.vararg = type <{ i32, [60 x i8], <16 x float> }>
+; CHECK: %v16f32_i32.vararg = type <{ <16 x float>, i32 }>
+; CHECK: %i32_v32f32.vararg = type <{ i32, [124 x i8], <32 x float> }>
+; CHECK: %v32f32_i32.vararg = type <{ <32 x float>, i32 }>
+; CHECK: %indirect_single_i32.vararg = type <{ i32 }>
+
+ at vararg_ptr = hidden addrspace(1) global ptr @vararg, align 8
+
+define hidden void @copy(ptr noundef %va) {
+; CHECK-LABEL: define {{[^@]+}}@copy(ptr noundef %va) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %va.addr = alloca ptr, align 8, addrspace(5)
+; CHECK-NEXT:    %cp = alloca ptr, align 8, addrspace(5)
+; CHECK-NEXT:    %va.addr.ascast = addrspacecast ptr addrspace(5) %va.addr to ptr
+; CHECK-NEXT:    %cp.ascast = addrspacecast ptr addrspace(5) %cp to ptr
+; CHECK-NEXT:    store ptr %va, ptr addrspace(5) %va.addr, align 8, !tbaa !3
+; CHECK-NEXT:    call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) %cp) #2
+; CHECK-NEXT:    call void @llvm.memcpy.inline.p0.p0.i32(ptr %cp.ascast, ptr %va.addr.ascast, i32 8, i1 false)
+; CHECK-NEXT:    %0 = load ptr, ptr addrspace(5) %cp, align 8, !tbaa !3
+; CHECK-NEXT:    call void @valist(ptr noundef %0) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) %cp) #2
+; CHECK-NEXT:    ret void
+;
+entry:
+  %va.addr = alloca ptr, align 8, addrspace(5)
+  %cp = alloca ptr, align 8, addrspace(5)
+  %va.addr.ascast = addrspacecast ptr addrspace(5) %va.addr to ptr
+  %cp.ascast = addrspacecast ptr addrspace(5) %cp to ptr
+  store ptr %va, ptr addrspace(5) %va.addr, align 8, !tbaa !4
+  call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) %cp) #2
+  call void @llvm.va_copy.p0(ptr %cp.ascast, ptr nonnull %va.addr.ascast)
+  %0 = load ptr, ptr addrspace(5) %cp, align 8, !tbaa !4
+  call void @valist(ptr noundef %0) #2
+  call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) %cp) #2
+  ret void
+}
+
+declare void @llvm.lifetime.start.p5(i64 immarg, ptr addrspace(5) nocapture) #0
+
+declare void @llvm.va_copy.p0(ptr, ptr) #1
+
+declare hidden void @valist(ptr noundef)
+
+declare void @llvm.lifetime.end.p5(i64 immarg, ptr addrspace(5) nocapture) #0
+
+define hidden void @start_once(...) {
+; CHECK-LABEL: define {{[^@]+}}@start_once(ptr addrspace(5) noalias %varargs) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %s = alloca ptr, align 8, addrspace(5)
+; CHECK-NEXT:    %s.ascast = addrspacecast ptr addrspace(5) %s to ptr
+; CHECK-NEXT:    call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) %s) #2
+; CHECK-NEXT:    store ptr addrspace(5) %varargs, ptr %s.ascast, align 4
+; CHECK-NEXT:    %0 = load ptr, ptr addrspace(5) %s, align 8, !tbaa !3
+; CHECK-NEXT:    call void @valist(ptr noundef %0) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) %s) #2
+; CHECK-NEXT:    ret void
+;
+entry:
+  %s = alloca ptr, align 8, addrspace(5)
+  %s.ascast = addrspacecast ptr addrspace(5) %s to ptr
+  call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) %s) #2
+  call void @llvm.va_start.p0(ptr %s.ascast)
+  %0 = load ptr, ptr addrspace(5) %s, align 8, !tbaa !4
+  call void @valist(ptr noundef %0) #2
+  call void @llvm.va_end.p0(ptr %s.ascast)
+  call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) %s) #2
+  ret void
+}
+
+declare void @llvm.va_start.p0(ptr) #1
+
+declare void @llvm.va_end.p0(ptr) #1
+
+define hidden void @start_twice(...) {
+; CHECK-LABEL: define {{[^@]+}}@start_twice(ptr addrspace(5) noalias %varargs) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %s0 = alloca ptr, align 8, addrspace(5)
+; CHECK-NEXT:    %s1 = alloca ptr, align 8, addrspace(5)
+; CHECK-NEXT:    %s0.ascast = addrspacecast ptr addrspace(5) %s0 to ptr
+; CHECK-NEXT:    %s1.ascast = addrspacecast ptr addrspace(5) %s1 to ptr
+; CHECK-NEXT:    call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) %s0) #2
+; CHECK-NEXT:    call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) %s1) #2
+; CHECK-NEXT:    store ptr addrspace(5) %varargs, ptr %s0.ascast, align 4
+; CHECK-NEXT:    %0 = load ptr, ptr addrspace(5) %s0, align 8, !tbaa !3
+; CHECK-NEXT:    call void @valist(ptr noundef %0) #2
+; CHECK-NEXT:    store ptr addrspace(5) %varargs, ptr %s1.ascast, align 4
+; CHECK-NEXT:    %1 = load ptr, ptr addrspace(5) %s1, align 8, !tbaa !3
+; CHECK-NEXT:    call void @valist(ptr noundef %1) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) %s1) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) %s0) #2
+; CHECK-NEXT:    ret void
+;
+entry:
+  %s0 = alloca ptr, align 8, addrspace(5)
+  %s1 = alloca ptr, align 8, addrspace(5)
+  %s0.ascast = addrspacecast ptr addrspace(5) %s0 to ptr
+  %s1.ascast = addrspacecast ptr addrspace(5) %s1 to ptr
+  call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) %s0) #2
+  call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) %s1) #2
+  call void @llvm.va_start.p0(ptr %s0.ascast)
+  %0 = load ptr, ptr addrspace(5) %s0, align 8, !tbaa !4
+  call void @valist(ptr noundef %0) #2
+  call void @llvm.va_end.p0(ptr %s0.ascast)
+  call void @llvm.va_start.p0(ptr %s1.ascast)
+  %1 = load ptr, ptr addrspace(5) %s1, align 8, !tbaa !4
+  call void @valist(ptr noundef %1) #2
+  call void @llvm.va_end.p0(ptr %s1.ascast)
+  call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) %s1) #2
+  call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) %s0) #2
+  ret void
+}
+
+define hidden void @single_i32(i32 noundef %x) {
+; CHECK-LABEL: define {{[^@]+}}@single_i32(i32 noundef %x) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %single_i32.vararg, align 4, addrspace(5)
+; CHECK-NEXT:    call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %single_i32.vararg, ptr addrspace(5) %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store i32 %x, ptr addrspace(5) %0, align 4
+; CHECK-NEXT:    call void @vararg(ptr addrspace(5) %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(i32 noundef %x) #2
+  ret void
+}
+
+declare hidden void @vararg(...)
+
+define hidden void @single_double(double noundef %x) {
+; CHECK-LABEL: define {{[^@]+}}@single_double(double noundef %x) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %single_double.vararg, align 8, addrspace(5)
+; CHECK-NEXT:    call void @llvm.lifetime.start.p5(i64 8, ptr addrspace(5) %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %single_double.vararg, ptr addrspace(5) %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store double %x, ptr addrspace(5) %0, align 8
+; CHECK-NEXT:    call void @vararg(ptr addrspace(5) %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p5(i64 8, ptr addrspace(5) %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(double noundef %x) #2
+  ret void
+}
+
+define hidden void @single_v4f32(<4 x float> noundef %x) {
+; CHECK-LABEL: define {{[^@]+}}@single_v4f32(<4 x float> noundef %x) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %single_v4f32.vararg, align 16, addrspace(5)
+; CHECK-NEXT:    call void @llvm.lifetime.start.p5(i64 16, ptr addrspace(5) %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %single_v4f32.vararg, ptr addrspace(5) %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store <4 x float> %x, ptr addrspace(5) %0, align 16
+; CHECK-NEXT:    call void @vararg(ptr addrspace(5) %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p5(i64 16, ptr addrspace(5) %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(<4 x float> noundef %x) #2
+  ret void
+}
+
+define hidden void @single_v8f32(<8 x float> noundef %x) {
+; CHECK-LABEL: define {{[^@]+}}@single_v8f32(<8 x float> noundef %x) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %single_v8f32.vararg, align 32, addrspace(5)
+; CHECK-NEXT:    call void @llvm.lifetime.start.p5(i64 32, ptr addrspace(5) %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %single_v8f32.vararg, ptr addrspace(5) %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store <8 x float> %x, ptr addrspace(5) %0, align 32
+; CHECK-NEXT:    call void @vararg(ptr addrspace(5) %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p5(i64 32, ptr addrspace(5) %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(<8 x float> noundef %x) #2
+  ret void
+}
+
+define hidden void @single_v16f32(<16 x float> noundef %x) {
+; CHECK-LABEL: define {{[^@]+}}@single_v16f32(<16 x float> noundef %x) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %single_v16f32.vararg, align 64, addrspace(5)
+; CHECK-NEXT:    call void @llvm.lifetime.start.p5(i64 64, ptr addrspace(5) %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %single_v16f32.vararg, ptr addrspace(5) %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store <16 x float> %x, ptr addrspace(5) %0, align 64
+; CHECK-NEXT:    call void @vararg(ptr addrspace(5) %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p5(i64 64, ptr addrspace(5) %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(<16 x float> noundef %x) #2
+  ret void
+}
+
+define hidden void @single_v32f32(<32 x float> noundef %x) {
+; CHECK-LABEL: define {{[^@]+}}@single_v32f32(<32 x float> noundef %x) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %single_v32f32.vararg, align 128, addrspace(5)
+; CHECK-NEXT:    call void @llvm.lifetime.start.p5(i64 128, ptr addrspace(5) %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %single_v32f32.vararg, ptr addrspace(5) %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store <32 x float> %x, ptr addrspace(5) %0, align 128
+; CHECK-NEXT:    call void @vararg(ptr addrspace(5) %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p5(i64 128, ptr addrspace(5) %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(<32 x float> noundef %x) #2
+  ret void
+}
+
+define hidden void @i32_double(i32 noundef %x, double noundef %y) {
+; CHECK-LABEL: define {{[^@]+}}@i32_double(i32 noundef %x, double noundef %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %i32_double.vararg, align 8, addrspace(5)
+; CHECK-NEXT:    call void @llvm.lifetime.start.p5(i64 16, ptr addrspace(5) %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %i32_double.vararg, ptr addrspace(5) %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store i32 %x, ptr addrspace(5) %0, align 4
+; CHECK-NEXT:    %1 = getelementptr inbounds %i32_double.vararg, ptr addrspace(5) %vararg_buffer, i32 0, i32 2
+; CHECK-NEXT:    store double %y, ptr addrspace(5) %1, align 8
+; CHECK-NEXT:    call void @vararg(ptr addrspace(5) %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p5(i64 16, ptr addrspace(5) %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(i32 noundef %x, double noundef %y) #2
+  ret void
+}
+
+define hidden void @double_i32(double noundef %x, i32 noundef %y) {
+; CHECK-LABEL: define {{[^@]+}}@double_i32(double noundef %x, i32 noundef %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %double_i32.vararg, align 8, addrspace(5)
+; CHECK-NEXT:    call void @llvm.lifetime.start.p5(i64 12, ptr addrspace(5) %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %double_i32.vararg, ptr addrspace(5) %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store double %x, ptr addrspace(5) %0, align 8
+; CHECK-NEXT:    %1 = getelementptr inbounds %double_i32.vararg, ptr addrspace(5) %vararg_buffer, i32 0, i32 1
+; CHECK-NEXT:    store i32 %y, ptr addrspace(5) %1, align 4
+; CHECK-NEXT:    call void @vararg(ptr addrspace(5) %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p5(i64 12, ptr addrspace(5) %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(double noundef %x, i32 noundef %y) #2
+  ret void
+}
+
+define hidden void @i32_libcS(i32 noundef %x, i8 %y.coerce0, i16 %y.coerce1, i32 %y.coerce2, i64 %y.coerce3, float %y.coerce4, double %y.coerce5) {
+; CHECK-LABEL: define {{[^@]+}}@i32_libcS(i32 noundef %x, i8 %y.coerce0, i16 %y.coerce1, i32 %y.coerce2, i64 %y.coerce3, float %y.coerce4, double %y.coerce5) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %i32_libcS.vararg, align 8, addrspace(5)
+; CHECK-NEXT:    call void @llvm.lifetime.start.p5(i64 40, ptr addrspace(5) %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %i32_libcS.vararg, ptr addrspace(5) %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store i32 %x, ptr addrspace(5) %0, align 4
+; CHECK-NEXT:    %1 = getelementptr inbounds %i32_libcS.vararg, ptr addrspace(5) %vararg_buffer, i32 0, i32 1
+; CHECK-NEXT:    store i8 %y.coerce0, ptr addrspace(5) %1, align 1
+; CHECK-NEXT:    %2 = getelementptr inbounds %i32_libcS.vararg, ptr addrspace(5) %vararg_buffer, i32 0, i32 3
+; CHECK-NEXT:    store i16 %y.coerce1, ptr addrspace(5) %2, align 2
+; CHECK-NEXT:    %3 = getelementptr inbounds %i32_libcS.vararg, ptr addrspace(5) %vararg_buffer, i32 0, i32 4
+; CHECK-NEXT:    store i32 %y.coerce2, ptr addrspace(5) %3, align 4
+; CHECK-NEXT:    %4 = getelementptr inbounds %i32_libcS.vararg, ptr addrspace(5) %vararg_buffer, i32 0, i32 6
+; CHECK-NEXT:    store i64 %y.coerce3, ptr addrspace(5) %4, align 8
+; CHECK-NEXT:    %5 = getelementptr inbounds %i32_libcS.vararg, ptr addrspace(5) %vararg_buffer, i32 0, i32 7
+; CHECK-NEXT:    store float %y.coerce4, ptr addrspace(5) %5, align 4
+; CHECK-NEXT:    %6 = getelementptr inbounds %i32_libcS.vararg, ptr addrspace(5) %vararg_buffer, i32 0, i32 9
+; CHECK-NEXT:    store double %y.coerce5, ptr addrspace(5) %6, align 8
+; CHECK-NEXT:    call void @vararg(ptr addrspace(5) %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p5(i64 40, ptr addrspace(5) %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(i32 noundef %x, i8 %y.coerce0, i16 %y.coerce1, i32 %y.coerce2, i64 %y.coerce3, float %y.coerce4, double %y.coerce5) #2
+  ret void
+}
+
+define hidden void @libcS_i32(i8 %x.coerce0, i16 %x.coerce1, i32 %x.coerce2, i64 %x.coerce3, float %x.coerce4, double %x.coerce5, i32 noundef %y) {
+; CHECK-LABEL: define {{[^@]+}}@libcS_i32(i8 %x.coerce0, i16 %x.coerce1, i32 %x.coerce2, i64 %x.coerce3, float %x.coerce4, double %x.coerce5, i32 noundef %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %libcS_i32.vararg, align 8, addrspace(5)
+; CHECK-NEXT:    call void @llvm.lifetime.start.p5(i64 36, ptr addrspace(5) %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %libcS_i32.vararg, ptr addrspace(5) %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store i8 %x.coerce0, ptr addrspace(5) %0, align 1
+; CHECK-NEXT:    %1 = getelementptr inbounds %libcS_i32.vararg, ptr addrspace(5) %vararg_buffer, i32 0, i32 2
+; CHECK-NEXT:    store i16 %x.coerce1, ptr addrspace(5) %1, align 2
+; CHECK-NEXT:    %2 = getelementptr inbounds %libcS_i32.vararg, ptr addrspace(5) %vararg_buffer, i32 0, i32 3
+; CHECK-NEXT:    store i32 %x.coerce2, ptr addrspace(5) %2, align 4
+; CHECK-NEXT:    %3 = getelementptr inbounds %libcS_i32.vararg, ptr addrspace(5) %vararg_buffer, i32 0, i32 4
+; CHECK-NEXT:    store i64 %x.coerce3, ptr addrspace(5) %3, align 8
+; CHECK-NEXT:    %4 = getelementptr inbounds %libcS_i32.vararg, ptr addrspace(5) %vararg_buffer, i32 0, i32 5
+; CHECK-NEXT:    store float %x.coerce4, ptr addrspace(5) %4, align 4
+; CHECK-NEXT:    %5 = getelementptr inbounds %libcS_i32.vararg, ptr addrspace(5) %vararg_buffer, i32 0, i32 7
+; CHECK-NEXT:    store double %x.coerce5, ptr addrspace(5) %5, align 8
+; CHECK-NEXT:    %6 = getelementptr inbounds %libcS_i32.vararg, ptr addrspace(5) %vararg_buffer, i32 0, i32 8
+; CHECK-NEXT:    store i32 %y, ptr addrspace(5) %6, align 4
+; CHECK-NEXT:    call void @vararg(ptr addrspace(5) %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p5(i64 36, ptr addrspace(5) %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(i8 %x.coerce0, i16 %x.coerce1, i32 %x.coerce2, i64 %x.coerce3, float %x.coerce4, double %x.coerce5, i32 noundef %y) #2
+  ret void
+}
+
+define hidden void @i32_v4f32(i32 noundef %x, <4 x float> noundef %y) {
+; CHECK-LABEL: define {{[^@]+}}@i32_v4f32(i32 noundef %x, <4 x float> noundef %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %i32_v4f32.vararg, align 16, addrspace(5)
+; CHECK-NEXT:    call void @llvm.lifetime.start.p5(i64 32, ptr addrspace(5) %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %i32_v4f32.vararg, ptr addrspace(5) %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store i32 %x, ptr addrspace(5) %0, align 4
+; CHECK-NEXT:    %1 = getelementptr inbounds %i32_v4f32.vararg, ptr addrspace(5) %vararg_buffer, i32 0, i32 2
+; CHECK-NEXT:    store <4 x float> %y, ptr addrspace(5) %1, align 16
+; CHECK-NEXT:    call void @vararg(ptr addrspace(5) %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p5(i64 32, ptr addrspace(5) %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(i32 noundef %x, <4 x float> noundef %y) #2
+  ret void
+}
+
+define hidden void @v4f32_i32(<4 x float> noundef %x, i32 noundef %y) {
+; CHECK-LABEL: define {{[^@]+}}@v4f32_i32(<4 x float> noundef %x, i32 noundef %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %v4f32_i32.vararg, align 16, addrspace(5)
+; CHECK-NEXT:    call void @llvm.lifetime.start.p5(i64 20, ptr addrspace(5) %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %v4f32_i32.vararg, ptr addrspace(5) %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store <4 x float> %x, ptr addrspace(5) %0, align 16
+; CHECK-NEXT:    %1 = getelementptr inbounds %v4f32_i32.vararg, ptr addrspace(5) %vararg_buffer, i32 0, i32 1
+; CHECK-NEXT:    store i32 %y, ptr addrspace(5) %1, align 4
+; CHECK-NEXT:    call void @vararg(ptr addrspace(5) %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p5(i64 20, ptr addrspace(5) %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(<4 x float> noundef %x, i32 noundef %y) #2
+  ret void
+}
+
+define hidden void @i32_v8f32(i32 noundef %x, <8 x float> noundef %y) {
+; CHECK-LABEL: define {{[^@]+}}@i32_v8f32(i32 noundef %x, <8 x float> noundef %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %i32_v8f32.vararg, align 32, addrspace(5)
+; CHECK-NEXT:    call void @llvm.lifetime.start.p5(i64 64, ptr addrspace(5) %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %i32_v8f32.vararg, ptr addrspace(5) %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store i32 %x, ptr addrspace(5) %0, align 4
+; CHECK-NEXT:    %1 = getelementptr inbounds %i32_v8f32.vararg, ptr addrspace(5) %vararg_buffer, i32 0, i32 2
+; CHECK-NEXT:    store <8 x float> %y, ptr addrspace(5) %1, align 32
+; CHECK-NEXT:    call void @vararg(ptr addrspace(5) %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p5(i64 64, ptr addrspace(5) %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(i32 noundef %x, <8 x float> noundef %y) #2
+  ret void
+}
+
+define hidden void @v8f32_i32(<8 x float> noundef %x, i32 noundef %y) {
+; CHECK-LABEL: define {{[^@]+}}@v8f32_i32(<8 x float> noundef %x, i32 noundef %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %v8f32_i32.vararg, align 32, addrspace(5)
+; CHECK-NEXT:    call void @llvm.lifetime.start.p5(i64 36, ptr addrspace(5) %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %v8f32_i32.vararg, ptr addrspace(5) %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store <8 x float> %x, ptr addrspace(5) %0, align 32
+; CHECK-NEXT:    %1 = getelementptr inbounds %v8f32_i32.vararg, ptr addrspace(5) %vararg_buffer, i32 0, i32 1
+; CHECK-NEXT:    store i32 %y, ptr addrspace(5) %1, align 4
+; CHECK-NEXT:    call void @vararg(ptr addrspace(5) %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p5(i64 36, ptr addrspace(5) %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(<8 x float> noundef %x, i32 noundef %y) #2
+  ret void
+}
+
+define hidden void @i32_v16f32(i32 noundef %x, <16 x float> noundef %y) {
+; CHECK-LABEL: define {{[^@]+}}@i32_v16f32(i32 noundef %x, <16 x float> noundef %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %i32_v16f32.vararg, align 64, addrspace(5)
+; CHECK-NEXT:    call void @llvm.lifetime.start.p5(i64 128, ptr addrspace(5) %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %i32_v16f32.vararg, ptr addrspace(5) %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store i32 %x, ptr addrspace(5) %0, align 4
+; CHECK-NEXT:    %1 = getelementptr inbounds %i32_v16f32.vararg, ptr addrspace(5) %vararg_buffer, i32 0, i32 2
+; CHECK-NEXT:    store <16 x float> %y, ptr addrspace(5) %1, align 64
+; CHECK-NEXT:    call void @vararg(ptr addrspace(5) %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p5(i64 128, ptr addrspace(5) %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(i32 noundef %x, <16 x float> noundef %y) #2
+  ret void
+}
+
+define hidden void @v16f32_i32(<16 x float> noundef %x, i32 noundef %y) {
+; CHECK-LABEL: define {{[^@]+}}@v16f32_i32(<16 x float> noundef %x, i32 noundef %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %v16f32_i32.vararg, align 64, addrspace(5)
+; CHECK-NEXT:    call void @llvm.lifetime.start.p5(i64 68, ptr addrspace(5) %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %v16f32_i32.vararg, ptr addrspace(5) %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store <16 x float> %x, ptr addrspace(5) %0, align 64
+; CHECK-NEXT:    %1 = getelementptr inbounds %v16f32_i32.vararg, ptr addrspace(5) %vararg_buffer, i32 0, i32 1
+; CHECK-NEXT:    store i32 %y, ptr addrspace(5) %1, align 4
+; CHECK-NEXT:    call void @vararg(ptr addrspace(5) %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p5(i64 68, ptr addrspace(5) %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(<16 x float> noundef %x, i32 noundef %y) #2
+  ret void
+}
+
+define hidden void @i32_v32f32(i32 noundef %x, <32 x float> noundef %y) {
+; CHECK-LABEL: define {{[^@]+}}@i32_v32f32(i32 noundef %x, <32 x float> noundef %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %i32_v32f32.vararg, align 128, addrspace(5)
+; CHECK-NEXT:    call void @llvm.lifetime.start.p5(i64 256, ptr addrspace(5) %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %i32_v32f32.vararg, ptr addrspace(5) %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store i32 %x, ptr addrspace(5) %0, align 4
+; CHECK-NEXT:    %1 = getelementptr inbounds %i32_v32f32.vararg, ptr addrspace(5) %vararg_buffer, i32 0, i32 2
+; CHECK-NEXT:    store <32 x float> %y, ptr addrspace(5) %1, align 128
+; CHECK-NEXT:    call void @vararg(ptr addrspace(5) %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p5(i64 256, ptr addrspace(5) %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(i32 noundef %x, <32 x float> noundef %y) #2
+  ret void
+}
+
+define hidden void @v32f32_i32(<32 x float> noundef %x, i32 noundef %y) {
+; CHECK-LABEL: define {{[^@]+}}@v32f32_i32(<32 x float> noundef %x, i32 noundef %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %v32f32_i32.vararg, align 128, addrspace(5)
+; CHECK-NEXT:    call void @llvm.lifetime.start.p5(i64 132, ptr addrspace(5) %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %v32f32_i32.vararg, ptr addrspace(5) %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store <32 x float> %x, ptr addrspace(5) %0, align 128
+; CHECK-NEXT:    %1 = getelementptr inbounds %v32f32_i32.vararg, ptr addrspace(5) %vararg_buffer, i32 0, i32 1
+; CHECK-NEXT:    store i32 %y, ptr addrspace(5) %1, align 4
+; CHECK-NEXT:    call void @vararg(ptr addrspace(5) %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p5(i64 132, ptr addrspace(5) %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(<32 x float> noundef %x, i32 noundef %y) #2
+  ret void
+}
+
+define hidden void @indirect_single_i32(i32 noundef %x) {
+; CHECK-LABEL: define {{[^@]+}}@indirect_single_i32(i32 noundef %x) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %indirect_single_i32.vararg, align 4, addrspace(5)
+; CHECK-NEXT:    %0 = load volatile ptr, ptr addrspacecast (ptr addrspace(1) @vararg_ptr to ptr), align 8, !tbaa !3
+; CHECK-NEXT:    call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) %vararg_buffer)
+; CHECK-NEXT:    %1 = getelementptr inbounds %indirect_single_i32.vararg, ptr addrspace(5) %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store i32 %x, ptr addrspace(5) %1, align 4
+; CHECK-NEXT:    call void %0(ptr addrspace(5) %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  %0 = load volatile ptr, ptr addrspacecast (ptr addrspace(1) @vararg_ptr to ptr), align 8, !tbaa !4
+  tail call void (...) %0(i32 noundef %x) #2
+  ret void
+}
+
+attributes #0 = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) }
+attributes #1 = { nocallback nofree nosync nounwind willreturn }
+attributes #2 = { mustprogress }
+
+!llvm.module.flags = !{!0, !1, !2}
+
+!0 = !{i32 1, !"amdhsa_code_object_version", i32 500}
+!1 = !{i32 1, !"wchar_size", i32 4}
+!2 = !{i32 8, !"PIC Level", i32 2}
+!4 = !{!5, !5, i64 0}
+!5 = !{!"any pointer", !6, i64 0}
+!6 = !{!"omnipotent char", !7, i64 0}
+!7 = !{!"Simple C/C++ TBAA"}
diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
index 0ff5dd3680dfab..ce26565cae8cd1 100644
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
@@ -31,6 +31,7 @@
 ; GCN-O0-NEXT:    AMDGPU Remove Incompatible Functions
 ; GCN-O0-NEXT:    AMDGPU Printf lowering
 ; GCN-O0-NEXT:    Lower ctors and dtors for AMDGPU
+; GCN-O0-NEXT:    Expand variadic functions
 ; GCN-O0-NEXT:    AMDGPU Inline All Functions
 ; GCN-O0-NEXT:    Inliner for always_inline functions
 ; GCN-O0-NEXT:      FunctionPass Manager
@@ -177,6 +178,7 @@
 ; GCN-O1-NEXT:    AMDGPU Remove Incompatible Functions
 ; GCN-O1-NEXT:    AMDGPU Printf lowering
 ; GCN-O1-NEXT:    Lower ctors and dtors for AMDGPU
+; GCN-O1-NEXT:    Expand variadic functions
 ; GCN-O1-NEXT:    AMDGPU Inline All Functions
 ; GCN-O1-NEXT:    Inliner for always_inline functions
 ; GCN-O1-NEXT:      FunctionPass Manager
@@ -452,6 +454,7 @@
 ; GCN-O1-OPTS-NEXT:    AMDGPU Remove Incompatible Functions
 ; GCN-O1-OPTS-NEXT:    AMDGPU Printf lowering
 ; GCN-O1-OPTS-NEXT:    Lower ctors and dtors for AMDGPU
+; GCN-O1-OPTS-NEXT:    Expand variadic functions
 ; GCN-O1-OPTS-NEXT:    AMDGPU Inline All Functions
 ; GCN-O1-OPTS-NEXT:    Inliner for always_inline functions
 ; GCN-O1-OPTS-NEXT:      FunctionPass Manager
@@ -757,6 +760,7 @@
 ; GCN-O2-NEXT:    Lower ctors and dtors for AMDGPU
 ; GCN-O2-NEXT:    FunctionPass Manager
 ; GCN-O2-NEXT:      AMDGPU Image Intrinsic Optimizer
+; GCN-O2-NEXT:    Expand variadic functions
 ; GCN-O2-NEXT:    AMDGPU Inline All Functions
 ; GCN-O2-NEXT:    Inliner for always_inline functions
 ; GCN-O2-NEXT:      FunctionPass Manager
@@ -1066,6 +1070,7 @@
 ; GCN-O3-NEXT:    Lower ctors and dtors for AMDGPU
 ; GCN-O3-NEXT:    FunctionPass Manager
 ; GCN-O3-NEXT:      AMDGPU Image Intrinsic Optimizer
+; GCN-O3-NEXT:    Expand variadic functions
 ; GCN-O3-NEXT:    AMDGPU Inline All Functions
 ; GCN-O3-NEXT:    Inliner for always_inline functions
 ; GCN-O3-NEXT:      FunctionPass Manager
diff --git a/llvm/test/CodeGen/AMDGPU/unsupported-calls.ll b/llvm/test/CodeGen/AMDGPU/unsupported-calls.ll
index 694f444b7747da..3cec8035fc8476 100644
--- a/llvm/test/CodeGen/AMDGPU/unsupported-calls.ll
+++ b/llvm/test/CodeGen/AMDGPU/unsupported-calls.ll
@@ -43,25 +43,6 @@ define i32 @test_tail_call(ptr addrspace(1) %out, ptr addrspace(1) %in) {
   ret i32 %c
 }
 
-declare void @external.varargs(i32, double, i64, ...)
-
-; GCN: error: <unknown>:0:0: in function test_call_varargs void (): unsupported call to variadic function external.varargs
-; R600: in function test_call_varargs{{.*}}: unsupported call to function external.varargs
-define void @test_call_varargs() {
-  call void (i32, double, i64, ...) @external.varargs(i32 42, double 1.0, i64 12, i8 3, i16 1, i32 4, float 1.0, double 2.0)
-  ret void
-}
-
-declare i32 @extern_variadic(...)
-
-; GCN: in function test_tail_call_bitcast_extern_variadic{{.*}}: unsupported required tail call to function extern_variadic
-; R600: in function test_tail_call_bitcast_extern_variadic{{.*}}: unsupported call to function extern_variadic
-define i32 @test_tail_call_bitcast_extern_variadic(<4 x float> %arg0, <4 x float> %arg1, i32 %arg2) {
-  %add = fadd <4 x float> %arg0, %arg1
-  %call = tail call i32 @extern_variadic(<4 x float> %add)
-  ret i32 %call
-}
-
 ; R600: in function test_c_call{{.*}}: unsupported call to function defined_function
 define amdgpu_ps i32 @test_c_call_from_shader() {
   %call = call i32 @defined_function(i32 0)
diff --git a/llvm/test/CodeGen/NVPTX/expand-variadic-call.ll b/llvm/test/CodeGen/NVPTX/expand-variadic-call.ll
new file mode 100644
index 00000000000000..0e444cb6b8bf6c
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/expand-variadic-call.ll
@@ -0,0 +1,468 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: -p --function-signature
+; RUN: opt -S --passes=expand-variadics --expand-variadics-override=lowering < %s | FileCheck %s
+target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
+target triple = "nvptx64-nvidia-cuda"
+
+; Check the variadic arguments are lowered to the locations this target expects
+
+; The types show the call frames
+; CHECK: %single_i32.vararg = type <{ i32 }>
+; CHECK: %single_double.vararg = type <{ double }>
+; CHECK: %single_v4f32.vararg = type <{ <4 x float> }>
+; CHECK: %single_v8f32.vararg = type <{ <8 x float> }>
+; CHECK: %single_v16f32.vararg = type <{ <16 x float> }>
+; CHECK: %single_v32f32.vararg = type <{ <32 x float> }>
+; CHECK: %i32_double.vararg = type <{ i32, [4 x i8], double }>
+; CHECK: %double_i32.vararg = type <{ double, i32 }>
+; CHECK: %i32_v4f32.vararg = type <{ i32, [12 x i8], <4 x float> }>
+; CHECK: %v4f32_i32.vararg = type <{ <4 x float>, i32 }>
+; CHECK: %i32_v8f32.vararg = type <{ i32, [28 x i8], <8 x float> }>
+; CHECK: %v8f32_i32.vararg = type <{ <8 x float>, i32 }>
+; CHECK: %i32_v16f32.vararg = type <{ i32, [60 x i8], <16 x float> }>
+; CHECK: %v16f32_i32.vararg = type <{ <16 x float>, i32 }>
+; CHECK: %i32_v32f32.vararg = type <{ i32, [124 x i8], <32 x float> }>
+; CHECK: %v32f32_i32.vararg = type <{ <32 x float>, i32 }>
+; CHECK: %indirect_single_i32.vararg = type <{ i32 }>
+
+%struct.libcS = type { i8, i16, i32, i64, float, double }
+
+ at vararg_ptr = global ptr @vararg, align 8
+
+define void @copy(ptr noundef %va) {
+; CHECK-LABEL: define {{[^@]+}}@copy(ptr noundef %va) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %va.addr = alloca ptr, align 8
+; CHECK-NEXT:    %cp = alloca ptr, align 8
+; CHECK-NEXT:    store ptr %va, ptr %va.addr, align 8, !tbaa !2
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 8, ptr nonnull %cp) #2
+; CHECK-NEXT:    call void @llvm.memcpy.inline.p0.p0.i32(ptr %cp, ptr %va.addr, i32 8, i1 false)
+; CHECK-NEXT:    %0 = load ptr, ptr %cp, align 8, !tbaa !2
+; CHECK-NEXT:    call void @valist(ptr noundef %0) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr nonnull %cp) #2
+; CHECK-NEXT:    ret void
+;
+entry:
+  %va.addr = alloca ptr, align 8
+  %cp = alloca ptr, align 8
+  store ptr %va, ptr %va.addr, align 8, !tbaa !3
+  call void @llvm.lifetime.start.p0(i64 8, ptr nonnull %cp) #2
+  call void @llvm.va_copy.p0(ptr nonnull %cp, ptr nonnull %va.addr)
+  %0 = load ptr, ptr %cp, align 8, !tbaa !3
+  call void @valist(ptr noundef %0) #2
+  call void @llvm.lifetime.end.p0(i64 8, ptr nonnull %cp) #2
+  ret void
+}
+
+declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #0
+
+declare void @llvm.va_copy.p0(ptr, ptr) #1
+
+declare void @valist(ptr noundef)
+
+declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #0
+
+define void @start_once(...) {
+; CHECK-LABEL: define {{[^@]+}}@start_once(ptr noalias %varargs) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %s = alloca ptr, align 8
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 8, ptr nonnull %s) #2
+; CHECK-NEXT:    store ptr %varargs, ptr %s, align 8
+; CHECK-NEXT:    %0 = load ptr, ptr %s, align 8, !tbaa !2
+; CHECK-NEXT:    call void @valist(ptr noundef %0) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr nonnull %s) #2
+; CHECK-NEXT:    ret void
+;
+entry:
+  %s = alloca ptr, align 8
+  call void @llvm.lifetime.start.p0(i64 8, ptr nonnull %s) #2
+  call void @llvm.va_start.p0(ptr nonnull %s)
+  %0 = load ptr, ptr %s, align 8, !tbaa !3
+  call void @valist(ptr noundef %0) #2
+  call void @llvm.va_end.p0(ptr %s)
+  call void @llvm.lifetime.end.p0(i64 8, ptr nonnull %s) #2
+  ret void
+}
+
+declare void @llvm.va_start.p0(ptr) #1
+
+declare void @llvm.va_end.p0(ptr) #1
+
+define void @start_twice(...) {
+; CHECK-LABEL: define {{[^@]+}}@start_twice(ptr noalias %varargs) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %s0 = alloca ptr, align 8
+; CHECK-NEXT:    %s1 = alloca ptr, align 8
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 8, ptr nonnull %s0) #2
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 8, ptr nonnull %s1) #2
+; CHECK-NEXT:    store ptr %varargs, ptr %s0, align 8
+; CHECK-NEXT:    %0 = load ptr, ptr %s0, align 8, !tbaa !2
+; CHECK-NEXT:    call void @valist(ptr noundef %0) #2
+; CHECK-NEXT:    store ptr %varargs, ptr %s1, align 8
+; CHECK-NEXT:    %1 = load ptr, ptr %s1, align 8, !tbaa !2
+; CHECK-NEXT:    call void @valist(ptr noundef %1) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr nonnull %s1) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr nonnull %s0) #2
+; CHECK-NEXT:    ret void
+;
+entry:
+  %s0 = alloca ptr, align 8
+  %s1 = alloca ptr, align 8
+  call void @llvm.lifetime.start.p0(i64 8, ptr nonnull %s0) #2
+  call void @llvm.lifetime.start.p0(i64 8, ptr nonnull %s1) #2
+  call void @llvm.va_start.p0(ptr nonnull %s0)
+  %0 = load ptr, ptr %s0, align 8, !tbaa !3
+  call void @valist(ptr noundef %0) #2
+  call void @llvm.va_end.p0(ptr %s0)
+  call void @llvm.va_start.p0(ptr nonnull %s1)
+  %1 = load ptr, ptr %s1, align 8, !tbaa !3
+  call void @valist(ptr noundef %1) #2
+  call void @llvm.va_end.p0(ptr %s1)
+  call void @llvm.lifetime.end.p0(i64 8, ptr nonnull %s1) #2
+  call void @llvm.lifetime.end.p0(i64 8, ptr nonnull %s0) #2
+  ret void
+}
+
+define void @single_i32(i32 noundef %x) {
+; CHECK-LABEL: define {{[^@]+}}@single_i32(i32 noundef %x) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %single_i32.vararg, align 4
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %single_i32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store i32 %x, ptr %0, align 4
+; CHECK-NEXT:    call void @vararg(ptr %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 4, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(i32 noundef %x) #2
+  ret void
+}
+
+declare void @vararg(...)
+
+define void @single_double(double noundef %x) {
+; CHECK-LABEL: define {{[^@]+}}@single_double(double noundef %x) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %single_double.vararg, align 8
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 8, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %single_double.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store double %x, ptr %0, align 8
+; CHECK-NEXT:    call void @vararg(ptr %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(double noundef %x) #2
+  ret void
+}
+
+define void @single_v4f32(<4 x float> noundef %x) {
+; CHECK-LABEL: define {{[^@]+}}@single_v4f32(<4 x float> noundef %x) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %single_v4f32.vararg, align 16
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %single_v4f32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store <4 x float> %x, ptr %0, align 16
+; CHECK-NEXT:    call void @vararg(ptr %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(<4 x float> noundef %x) #2
+  ret void
+}
+
+define void @single_v8f32(<8 x float> noundef %x) {
+; CHECK-LABEL: define {{[^@]+}}@single_v8f32(<8 x float> noundef %x) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %single_v8f32.vararg, align 32
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %single_v8f32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store <8 x float> %x, ptr %0, align 32
+; CHECK-NEXT:    call void @vararg(ptr %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(<8 x float> noundef %x) #2
+  ret void
+}
+
+define void @single_v16f32(<16 x float> noundef %x) {
+; CHECK-LABEL: define {{[^@]+}}@single_v16f32(<16 x float> noundef %x) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %single_v16f32.vararg, align 64
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 64, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %single_v16f32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store <16 x float> %x, ptr %0, align 64
+; CHECK-NEXT:    call void @vararg(ptr %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 64, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(<16 x float> noundef %x) #2
+  ret void
+}
+
+define void @single_v32f32(<32 x float> noundef %x) {
+; CHECK-LABEL: define {{[^@]+}}@single_v32f32(<32 x float> noundef %x) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %single_v32f32.vararg, align 128
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 128, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %single_v32f32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store <32 x float> %x, ptr %0, align 128
+; CHECK-NEXT:    call void @vararg(ptr %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 128, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(<32 x float> noundef %x) #2
+  ret void
+}
+
+define void @i32_double(i32 noundef %x, double noundef %y) {
+; CHECK-LABEL: define {{[^@]+}}@i32_double(i32 noundef %x, double noundef %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %i32_double.vararg, align 8
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %i32_double.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store i32 %x, ptr %0, align 4
+; CHECK-NEXT:    %1 = getelementptr inbounds %i32_double.vararg, ptr %vararg_buffer, i32 0, i32 2
+; CHECK-NEXT:    store double %y, ptr %1, align 8
+; CHECK-NEXT:    call void @vararg(ptr %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(i32 noundef %x, double noundef %y) #2
+  ret void
+}
+
+define void @double_i32(double noundef %x, i32 noundef %y) {
+; CHECK-LABEL: define {{[^@]+}}@double_i32(double noundef %x, i32 noundef %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %double_i32.vararg, align 8
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 12, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %double_i32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store double %x, ptr %0, align 8
+; CHECK-NEXT:    %1 = getelementptr inbounds %double_i32.vararg, ptr %vararg_buffer, i32 0, i32 1
+; CHECK-NEXT:    store i32 %y, ptr %1, align 4
+; CHECK-NEXT:    call void @vararg(ptr %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 12, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(double noundef %x, i32 noundef %y) #2
+  ret void
+}
+
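+; byval struct arguments are copied into the frame with llvm.memcpy rather
+; than stored by value.
+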
+define void @i32_libcS(i32 noundef %x, ptr noundef byval(%struct.libcS) align 8 %y) {
+; CHECK-LABEL: define {{[^@]+}}@i32_libcS(i32 noundef %x, ptr noundef byval(%struct.libcS) align 8 %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %i32_libcS.vararg, align 8
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 40, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %i32_libcS.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store i32 %x, ptr %0, align 4
+; CHECK-NEXT:    %1 = getelementptr inbounds %i32_libcS.vararg, ptr %vararg_buffer, i32 0, i32 2
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr %1, ptr %y, i64 32, i1 false)
+; CHECK-NEXT:    call void @vararg(ptr %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 40, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(i32 noundef %x, ptr noundef nonnull byval(%struct.libcS) align 8 %y) #2
+  ret void
+}
+
+define void @libcS_i32(ptr noundef byval(%struct.libcS) align 8 %x, i32 noundef %y) {
+; CHECK-LABEL: define {{[^@]+}}@libcS_i32(ptr noundef byval(%struct.libcS) align 8 %x, i32 noundef %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %libcS_i32.vararg, align 8
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 36, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %libcS_i32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr %0, ptr %x, i64 32, i1 false)
+; CHECK-NEXT:    %1 = getelementptr inbounds %libcS_i32.vararg, ptr %vararg_buffer, i32 0, i32 1
+; CHECK-NEXT:    store i32 %y, ptr %1, align 4
+; CHECK-NEXT:    call void @vararg(ptr %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 36, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(ptr noundef nonnull byval(%struct.libcS) align 8 %x, i32 noundef %y) #2
+  ret void
+}
+
+define void @i32_v4f32(i32 noundef %x, <4 x float> noundef %y) {
+; CHECK-LABEL: define {{[^@]+}}@i32_v4f32(i32 noundef %x, <4 x float> noundef %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %i32_v4f32.vararg, align 16
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %i32_v4f32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store i32 %x, ptr %0, align 4
+; CHECK-NEXT:    %1 = getelementptr inbounds %i32_v4f32.vararg, ptr %vararg_buffer, i32 0, i32 2
+; CHECK-NEXT:    store <4 x float> %y, ptr %1, align 16
+; CHECK-NEXT:    call void @vararg(ptr %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(i32 noundef %x, <4 x float> noundef %y) #2
+  ret void
+}
+
+define void @v4f32_i32(<4 x float> noundef %x, i32 noundef %y) {
+; CHECK-LABEL: define {{[^@]+}}@v4f32_i32(<4 x float> noundef %x, i32 noundef %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %v4f32_i32.vararg, align 16
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 20, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %v4f32_i32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store <4 x float> %x, ptr %0, align 16
+; CHECK-NEXT:    %1 = getelementptr inbounds %v4f32_i32.vararg, ptr %vararg_buffer, i32 0, i32 1
+; CHECK-NEXT:    store i32 %y, ptr %1, align 4
+; CHECK-NEXT:    call void @vararg(ptr %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 20, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(<4 x float> noundef %x, i32 noundef %y) #2
+  ret void
+}
+
+define void @i32_v8f32(i32 noundef %x, <8 x float> noundef %y) {
+; CHECK-LABEL: define {{[^@]+}}@i32_v8f32(i32 noundef %x, <8 x float> noundef %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %i32_v8f32.vararg, align 32
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 64, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %i32_v8f32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store i32 %x, ptr %0, align 4
+; CHECK-NEXT:    %1 = getelementptr inbounds %i32_v8f32.vararg, ptr %vararg_buffer, i32 0, i32 2
+; CHECK-NEXT:    store <8 x float> %y, ptr %1, align 32
+; CHECK-NEXT:    call void @vararg(ptr %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 64, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(i32 noundef %x, <8 x float> noundef %y) #2
+  ret void
+}
+
+define void @v8f32_i32(<8 x float> noundef %x, i32 noundef %y) {
+; CHECK-LABEL: define {{[^@]+}}@v8f32_i32(<8 x float> noundef %x, i32 noundef %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %v8f32_i32.vararg, align 32
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 36, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %v8f32_i32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store <8 x float> %x, ptr %0, align 32
+; CHECK-NEXT:    %1 = getelementptr inbounds %v8f32_i32.vararg, ptr %vararg_buffer, i32 0, i32 1
+; CHECK-NEXT:    store i32 %y, ptr %1, align 4
+; CHECK-NEXT:    call void @vararg(ptr %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 36, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(<8 x float> noundef %x, i32 noundef %y) #2
+  ret void
+}
+
+define void @i32_v16f32(i32 noundef %x, <16 x float> noundef %y) {
+; CHECK-LABEL: define {{[^@]+}}@i32_v16f32(i32 noundef %x, <16 x float> noundef %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %i32_v16f32.vararg, align 64
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 128, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %i32_v16f32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store i32 %x, ptr %0, align 4
+; CHECK-NEXT:    %1 = getelementptr inbounds %i32_v16f32.vararg, ptr %vararg_buffer, i32 0, i32 2
+; CHECK-NEXT:    store <16 x float> %y, ptr %1, align 64
+; CHECK-NEXT:    call void @vararg(ptr %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 128, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(i32 noundef %x, <16 x float> noundef %y) #2
+  ret void
+}
+
+define void @v16f32_i32(<16 x float> noundef %x, i32 noundef %y) {
+; CHECK-LABEL: define {{[^@]+}}@v16f32_i32(<16 x float> noundef %x, i32 noundef %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %v16f32_i32.vararg, align 64
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 68, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %v16f32_i32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store <16 x float> %x, ptr %0, align 64
+; CHECK-NEXT:    %1 = getelementptr inbounds %v16f32_i32.vararg, ptr %vararg_buffer, i32 0, i32 1
+; CHECK-NEXT:    store i32 %y, ptr %1, align 4
+; CHECK-NEXT:    call void @vararg(ptr %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 68, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(<16 x float> noundef %x, i32 noundef %y) #2
+  ret void
+}
+
+define void @i32_v32f32(i32 noundef %x, <32 x float> noundef %y) {
+; CHECK-LABEL: define {{[^@]+}}@i32_v32f32(i32 noundef %x, <32 x float> noundef %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %i32_v32f32.vararg, align 128
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 256, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %i32_v32f32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store i32 %x, ptr %0, align 4
+; CHECK-NEXT:    %1 = getelementptr inbounds %i32_v32f32.vararg, ptr %vararg_buffer, i32 0, i32 2
+; CHECK-NEXT:    store <32 x float> %y, ptr %1, align 128
+; CHECK-NEXT:    call void @vararg(ptr %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 256, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(i32 noundef %x, <32 x float> noundef %y) #2
+  ret void
+}
+
+define void @v32f32_i32(<32 x float> noundef %x, i32 noundef %y) {
+; CHECK-LABEL: define {{[^@]+}}@v32f32_i32(<32 x float> noundef %x, i32 noundef %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %v32f32_i32.vararg, align 128
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 132, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %v32f32_i32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store <32 x float> %x, ptr %0, align 128
+; CHECK-NEXT:    %1 = getelementptr inbounds %v32f32_i32.vararg, ptr %vararg_buffer, i32 0, i32 1
+; CHECK-NEXT:    store i32 %y, ptr %1, align 4
+; CHECK-NEXT:    call void @vararg(ptr %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 132, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(<32 x float> noundef %x, i32 noundef %y) #2
+  ret void
+}
+
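+; Indirect calls through a variadic function pointer are rewritten the same
+; way: the callee is loaded as before, then called with the packed frame.
+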
+define void @indirect_single_i32(i32 noundef %x) {
+; CHECK-LABEL: define {{[^@]+}}@indirect_single_i32(i32 noundef %x) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %indirect_single_i32.vararg, align 4
+; CHECK-NEXT:    %0 = load volatile ptr, ptr @vararg_ptr, align 8, !tbaa !2
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr %vararg_buffer)
+; CHECK-NEXT:    %1 = getelementptr inbounds %indirect_single_i32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store i32 %x, ptr %1, align 4
+; CHECK-NEXT:    call void %0(ptr %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 4, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  %0 = load volatile ptr, ptr @vararg_ptr, align 8, !tbaa !3
+  tail call void (...) %0(i32 noundef %x) #2
+  ret void
+}
+
+attributes #0 = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) }
+attributes #1 = { nocallback nofree nosync nounwind willreturn }
+attributes #2 = { mustprogress }
+
+!llvm.module.flags = !{!0, !1}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 7, !"frame-pointer", i32 2}
+!3 = !{!4, !4, i64 0}
+!4 = !{!"any pointer", !5, i64 0}
+!5 = !{!"omnipotent char", !6, i64 0}
+!6 = !{!"Simple C/C++ TBAA"}
diff --git a/llvm/test/CodeGen/X86/expand-variadic-call-i386-darwin.ll b/llvm/test/CodeGen/X86/expand-variadic-call-i386-darwin.ll
new file mode 100644
index 00000000000000..538619c89f2fe3
--- /dev/null
+++ b/llvm/test/CodeGen/X86/expand-variadic-call-i386-darwin.ll
@@ -0,0 +1,449 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: -p --function-signature
+; RUN: opt -S --passes=expand-variadics --expand-variadics-override=lowering < %s | FileCheck %s
+target datalayout = "e-m:o-p:32:32-p270:32:32-p271:32:32-p272:64:64-i128:128-f64:32:64-f80:128-n8:16:32-S128"
+target triple = "i386-apple-macosx10.4.0"
+
+; Check that the variadic arguments are lowered to the locations this target expects.
+
+; These struct types describe the lowered call frames:
+; CHECK: %single_i32.vararg = type <{ i32 }>
+; CHECK: %single_double.vararg = type <{ double }>
+; CHECK: %single_v4f32.vararg = type <{ <4 x float> }>
+; CHECK: %single_v8f32.vararg = type <{ <8 x float> }>
+; CHECK: %single_v16f32.vararg = type <{ <16 x float> }>
+; CHECK: %single_v32f32.vararg = type <{ <32 x float> }>
+; CHECK: %i32_double.vararg = type <{ i32, double }>
+; CHECK: %double_i32.vararg = type <{ double, i32 }>
+; CHECK: %i32_v4f32.vararg = type <{ i32, [12 x i8], <4 x float> }>
+; CHECK: %v4f32_i32.vararg = type <{ <4 x float>, i32 }>
+; CHECK: %i32_v8f32.vararg = type <{ i32, [28 x i8], <8 x float> }>
+; CHECK: %v8f32_i32.vararg = type <{ <8 x float>, i32 }>
+; CHECK: %i32_v16f32.vararg = type <{ i32, [60 x i8], <16 x float> }>
+; CHECK: %v16f32_i32.vararg = type <{ <16 x float>, i32 }>
+; CHECK: %i32_v32f32.vararg = type <{ i32, [124 x i8], <32 x float> }>
+; CHECK: %v32f32_i32.vararg = type <{ <32 x float>, i32 }>
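+;
+; For example, in %i32_v4f32.vararg the i32 occupies bytes 0-3 and the
+; [12 x i8] pads the frame out to the 16 byte alignment of <4 x float>, so
+; the vector begins at byte 16. Note also that doubles are stored with
+; align 4 on this target, matching the f64:32:64 datalayout.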
+
+%struct.libcS = type { i8, i16, i32, i32, float, double }
+
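+; On this 32 bit target va_copy lowers to a 4 byte llvm.memcpy.inline of the
+; va_list slot.
+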
+define void @copy(ptr noundef %va) {
+; CHECK-LABEL: define {{[^@]+}}@copy(ptr noundef %va) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %va.addr = alloca ptr, align 4
+; CHECK-NEXT:    %cp = alloca ptr, align 4
+; CHECK-NEXT:    store ptr %va, ptr %va.addr, align 4, !tbaa !4
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %cp) #2
+; CHECK-NEXT:    call void @llvm.memcpy.inline.p0.p0.i32(ptr %cp, ptr %va.addr, i32 4, i1 false)
+; CHECK-NEXT:    %0 = load ptr, ptr %cp, align 4, !tbaa !4
+; CHECK-NEXT:    call void @valist(ptr noundef %0) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %cp) #2
+; CHECK-NEXT:    ret void
+;
+entry:
+  %va.addr = alloca ptr, align 4
+  %cp = alloca ptr, align 4
+  store ptr %va, ptr %va.addr, align 4, !tbaa !5
+  call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %cp) #2
+  call void @llvm.va_copy.p0(ptr nonnull %cp, ptr nonnull %va.addr)
+  %0 = load ptr, ptr %cp, align 4, !tbaa !5
+  call void @valist(ptr noundef %0) #2
+  call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %cp) #2
+  ret void
+}
+
+declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #0
+
+declare void @llvm.va_copy.p0(ptr, ptr) #1
+
+declare void @valist(ptr noundef)
+
+declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #0
+
+define void @start_once(...) {
+; CHECK-LABEL: define {{[^@]+}}@start_once(ptr noalias %varargs) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %s = alloca ptr, align 4
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %s) #2
+; CHECK-NEXT:    store ptr %varargs, ptr %s, align 4
+; CHECK-NEXT:    %0 = load ptr, ptr %s, align 4, !tbaa !4
+; CHECK-NEXT:    call void @valist(ptr noundef %0) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %s) #2
+; CHECK-NEXT:    ret void
+;
+entry:
+  %s = alloca ptr, align 4
+  call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %s) #2
+  call void @llvm.va_start.p0(ptr nonnull %s)
+  %0 = load ptr, ptr %s, align 4, !tbaa !5
+  call void @valist(ptr noundef %0) #2
+  call void @llvm.va_end.p0(ptr %s)
+  call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %s) #2
+  ret void
+}
+
+declare void @llvm.va_start.p0(ptr) #1
+
+declare void @llvm.va_end.p0(ptr) #1
+
+define void @start_twice(...) {
+; CHECK-LABEL: define {{[^@]+}}@start_twice(ptr noalias %varargs) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %s0 = alloca ptr, align 4
+; CHECK-NEXT:    %s1 = alloca ptr, align 4
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %s0) #2
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %s1) #2
+; CHECK-NEXT:    store ptr %varargs, ptr %s0, align 4
+; CHECK-NEXT:    %0 = load ptr, ptr %s0, align 4, !tbaa !4
+; CHECK-NEXT:    call void @valist(ptr noundef %0) #2
+; CHECK-NEXT:    store ptr %varargs, ptr %s1, align 4
+; CHECK-NEXT:    %1 = load ptr, ptr %s1, align 4, !tbaa !4
+; CHECK-NEXT:    call void @valist(ptr noundef %1) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %s1) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %s0) #2
+; CHECK-NEXT:    ret void
+;
+entry:
+  %s0 = alloca ptr, align 4
+  %s1 = alloca ptr, align 4
+  call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %s0) #2
+  call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %s1) #2
+  call void @llvm.va_start.p0(ptr nonnull %s0)
+  %0 = load ptr, ptr %s0, align 4, !tbaa !5
+  call void @valist(ptr noundef %0) #2
+  call void @llvm.va_end.p0(ptr %s0)
+  call void @llvm.va_start.p0(ptr nonnull %s1)
+  %1 = load ptr, ptr %s1, align 4, !tbaa !5
+  call void @valist(ptr noundef %1) #2
+  call void @llvm.va_end.p0(ptr %s1)
+  call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %s1) #2
+  call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %s0) #2
+  ret void
+}
+
+define void @single_i32(i32 noundef %x) {
+; CHECK-LABEL: define {{[^@]+}}@single_i32(i32 noundef %x) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %single_i32.vararg, align 16
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %single_i32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store i32 %x, ptr %0, align 4
+; CHECK-NEXT:    call void @vararg(ptr %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 4, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(i32 noundef %x) #2
+  ret void
+}
+
+declare void @vararg(...)
+
+define void @single_double(double noundef %x) {
+; CHECK-LABEL: define {{[^@]+}}@single_double(double noundef %x) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %single_double.vararg, align 16
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 8, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %single_double.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store double %x, ptr %0, align 4
+; CHECK-NEXT:    call void @vararg(ptr %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(double noundef %x) #2
+  ret void
+}
+
+define void @single_v4f32(<4 x float> noundef %x) {
+; CHECK-LABEL: define {{[^@]+}}@single_v4f32(<4 x float> noundef %x) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %single_v4f32.vararg, align 16
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %single_v4f32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store <4 x float> %x, ptr %0, align 16
+; CHECK-NEXT:    call void @vararg(ptr %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(<4 x float> noundef %x) #2
+  ret void
+}
+
+define void @single_v8f32(<8 x float> noundef %x) {
+; CHECK-LABEL: define {{[^@]+}}@single_v8f32(<8 x float> noundef %x) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %single_v8f32.vararg, align 32
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %single_v8f32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store <8 x float> %x, ptr %0, align 32
+; CHECK-NEXT:    call void @vararg(ptr %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(<8 x float> noundef %x) #2
+  ret void
+}
+
+define void @single_v16f32(<16 x float> noundef %x) {
+; CHECK-LABEL: define {{[^@]+}}@single_v16f32(<16 x float> noundef %x) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %single_v16f32.vararg, align 64
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 64, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %single_v16f32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store <16 x float> %x, ptr %0, align 64
+; CHECK-NEXT:    call void @vararg(ptr %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 64, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(<16 x float> noundef %x) #2
+  ret void
+}
+
+define void @single_v32f32(<32 x float> noundef %x) {
+; CHECK-LABEL: define {{[^@]+}}@single_v32f32(<32 x float> noundef %x) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %single_v32f32.vararg, align 128
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 128, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %single_v32f32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store <32 x float> %x, ptr %0, align 128
+; CHECK-NEXT:    call void @vararg(ptr %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 128, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(<32 x float> noundef %x) #2
+  ret void
+}
+
+define void @i32_double(i32 noundef %x, double noundef %y) {
+; CHECK-LABEL: define {{[^@]+}}@i32_double(i32 noundef %x, double noundef %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %i32_double.vararg, align 16
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 12, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %i32_double.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store i32 %x, ptr %0, align 4
+; CHECK-NEXT:    %1 = getelementptr inbounds %i32_double.vararg, ptr %vararg_buffer, i32 0, i32 1
+; CHECK-NEXT:    store double %y, ptr %1, align 4
+; CHECK-NEXT:    call void @vararg(ptr %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 12, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(i32 noundef %x, double noundef %y) #2
+  ret void
+}
+
+define void @double_i32(double noundef %x, i32 noundef %y) {
+; CHECK-LABEL: define {{[^@]+}}@double_i32(double noundef %x, i32 noundef %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %double_i32.vararg, align 16
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 12, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %double_i32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store double %x, ptr %0, align 4
+; CHECK-NEXT:    %1 = getelementptr inbounds %double_i32.vararg, ptr %vararg_buffer, i32 0, i32 1
+; CHECK-NEXT:    store i32 %y, ptr %1, align 4
+; CHECK-NEXT:    call void @vararg(ptr %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 12, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(double noundef %x, i32 noundef %y) #2
+  ret void
+}
+
+define void @i32_libcS(i32 noundef %x, ptr noundef byval(%struct.libcS) align 4 %y) {
+; CHECK-LABEL: define {{[^@]+}}@i32_libcS(i32 noundef %x, ptr noundef byval(%struct.libcS) align 4 %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %i32_libcS.vararg, align 16
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 28, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %i32_libcS.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store i32 %x, ptr %0, align 4
+; CHECK-NEXT:    %1 = getelementptr inbounds %i32_libcS.vararg, ptr %vararg_buffer, i32 0, i32 1
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr %1, ptr %y, i64 24, i1 false)
+; CHECK-NEXT:    call void @vararg(ptr %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 28, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(i32 noundef %x, ptr noundef nonnull byval(%struct.libcS) align 4 %y) #2
+  ret void
+}
+
+define void @libcS_i32(ptr noundef byval(%struct.libcS) align 4 %x, i32 noundef %y) {
+; CHECK-LABEL: define {{[^@]+}}@libcS_i32(ptr noundef byval(%struct.libcS) align 4 %x, i32 noundef %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %libcS_i32.vararg, align 16
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 28, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %libcS_i32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr %0, ptr %x, i64 24, i1 false)
+; CHECK-NEXT:    %1 = getelementptr inbounds %libcS_i32.vararg, ptr %vararg_buffer, i32 0, i32 1
+; CHECK-NEXT:    store i32 %y, ptr %1, align 4
+; CHECK-NEXT:    call void @vararg(ptr %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 28, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(ptr noundef nonnull byval(%struct.libcS) align 4 %x, i32 noundef %y) #2
+  ret void
+}
+
+define void @i32_v4f32(i32 noundef %x, <4 x float> noundef %y) {
+; CHECK-LABEL: define {{[^@]+}}@i32_v4f32(i32 noundef %x, <4 x float> noundef %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %i32_v4f32.vararg, align 16
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %i32_v4f32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store i32 %x, ptr %0, align 4
+; CHECK-NEXT:    %1 = getelementptr inbounds %i32_v4f32.vararg, ptr %vararg_buffer, i32 0, i32 2
+; CHECK-NEXT:    store <4 x float> %y, ptr %1, align 16
+; CHECK-NEXT:    call void @vararg(ptr %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(i32 noundef %x, <4 x float> noundef %y) #2
+  ret void
+}
+
+define void @v4f32_i32(<4 x float> noundef %x, i32 noundef %y) {
+; CHECK-LABEL: define {{[^@]+}}@v4f32_i32(<4 x float> noundef %x, i32 noundef %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %v4f32_i32.vararg, align 16
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 20, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %v4f32_i32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store <4 x float> %x, ptr %0, align 16
+; CHECK-NEXT:    %1 = getelementptr inbounds %v4f32_i32.vararg, ptr %vararg_buffer, i32 0, i32 1
+; CHECK-NEXT:    store i32 %y, ptr %1, align 4
+; CHECK-NEXT:    call void @vararg(ptr %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 20, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(<4 x float> noundef %x, i32 noundef %y) #2
+  ret void
+}
+
+define void @i32_v8f32(i32 noundef %x, <8 x float> noundef %y) {
+; CHECK-LABEL: define {{[^@]+}}@i32_v8f32(i32 noundef %x, <8 x float> noundef %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %i32_v8f32.vararg, align 32
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 64, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %i32_v8f32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store i32 %x, ptr %0, align 4
+; CHECK-NEXT:    %1 = getelementptr inbounds %i32_v8f32.vararg, ptr %vararg_buffer, i32 0, i32 2
+; CHECK-NEXT:    store <8 x float> %y, ptr %1, align 32
+; CHECK-NEXT:    call void @vararg(ptr %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 64, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(i32 noundef %x, <8 x float> noundef %y) #2
+  ret void
+}
+
+define void @v8f32_i32(<8 x float> noundef %x, i32 noundef %y) {
+; CHECK-LABEL: define {{[^@]+}}@v8f32_i32(<8 x float> noundef %x, i32 noundef %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %v8f32_i32.vararg, align 32
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 36, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %v8f32_i32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store <8 x float> %x, ptr %0, align 32
+; CHECK-NEXT:    %1 = getelementptr inbounds %v8f32_i32.vararg, ptr %vararg_buffer, i32 0, i32 1
+; CHECK-NEXT:    store i32 %y, ptr %1, align 4
+; CHECK-NEXT:    call void @vararg(ptr %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 36, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(<8 x float> noundef %x, i32 noundef %y) #2
+  ret void
+}
+
+define void @i32_v16f32(i32 noundef %x, <16 x float> noundef %y) {
+; CHECK-LABEL: define {{[^@]+}}@i32_v16f32(i32 noundef %x, <16 x float> noundef %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %i32_v16f32.vararg, align 64
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 128, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %i32_v16f32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store i32 %x, ptr %0, align 4
+; CHECK-NEXT:    %1 = getelementptr inbounds %i32_v16f32.vararg, ptr %vararg_buffer, i32 0, i32 2
+; CHECK-NEXT:    store <16 x float> %y, ptr %1, align 64
+; CHECK-NEXT:    call void @vararg(ptr %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 128, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(i32 noundef %x, <16 x float> noundef %y) #2
+  ret void
+}
+
+define void @v16f32_i32(<16 x float> noundef %x, i32 noundef %y) {
+; CHECK-LABEL: define {{[^@]+}}@v16f32_i32(<16 x float> noundef %x, i32 noundef %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %v16f32_i32.vararg, align 64
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 68, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %v16f32_i32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store <16 x float> %x, ptr %0, align 64
+; CHECK-NEXT:    %1 = getelementptr inbounds %v16f32_i32.vararg, ptr %vararg_buffer, i32 0, i32 1
+; CHECK-NEXT:    store i32 %y, ptr %1, align 4
+; CHECK-NEXT:    call void @vararg(ptr %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 68, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(<16 x float> noundef %x, i32 noundef %y) #2
+  ret void
+}
+
+define void @i32_v32f32(i32 noundef %x, <32 x float> noundef %y) {
+; CHECK-LABEL: define {{[^@]+}}@i32_v32f32(i32 noundef %x, <32 x float> noundef %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %i32_v32f32.vararg, align 128
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 256, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %i32_v32f32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store i32 %x, ptr %0, align 4
+; CHECK-NEXT:    %1 = getelementptr inbounds %i32_v32f32.vararg, ptr %vararg_buffer, i32 0, i32 2
+; CHECK-NEXT:    store <32 x float> %y, ptr %1, align 128
+; CHECK-NEXT:    call void @vararg(ptr %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 256, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(i32 noundef %x, <32 x float> noundef %y) #2
+  ret void
+}
+
+define void @v32f32_i32(<32 x float> noundef %x, i32 noundef %y) {
+; CHECK-LABEL: define {{[^@]+}}@v32f32_i32(<32 x float> noundef %x, i32 noundef %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %v32f32_i32.vararg, align 128
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 132, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %v32f32_i32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store <32 x float> %x, ptr %0, align 128
+; CHECK-NEXT:    %1 = getelementptr inbounds %v32f32_i32.vararg, ptr %vararg_buffer, i32 0, i32 1
+; CHECK-NEXT:    store i32 %y, ptr %1, align 4
+; CHECK-NEXT:    call void @vararg(ptr %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 132, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(<32 x float> noundef %x, i32 noundef %y) #2
+  ret void
+}
+
+attributes #0 = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) }
+attributes #1 = { nocallback nofree nosync nounwind willreturn }
+attributes #2 = { mustprogress }
+
+!llvm.module.flags = !{!0, !1, !2, !3}
+
+!0 = !{i32 1, !"NumRegisterParameters", i32 0}
+!1 = !{i32 1, !"wchar_size", i32 4}
+!2 = !{i32 8, !"PIC Level", i32 2}
+!3 = !{i32 7, !"frame-pointer", i32 2}
+!5 = !{!6, !6, i64 0}
+!6 = !{!"any pointer", !7, i64 0}
+!7 = !{!"omnipotent char", !8, i64 0}
+!8 = !{!"Simple C/C++ TBAA"}
diff --git a/llvm/test/CodeGen/X86/expand-variadic-call-i386-linux.ll b/llvm/test/CodeGen/X86/expand-variadic-call-i386-linux.ll
new file mode 100644
index 00000000000000..3e06bef4ef5679
--- /dev/null
+++ b/llvm/test/CodeGen/X86/expand-variadic-call-i386-linux.ll
@@ -0,0 +1,449 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: -p --function-signature
+; RUN: opt -S --passes=expand-variadics --expand-variadics-override=lowering < %s | FileCheck %s
+target datalayout = "e-m:e-p:32:32-p270:32:32-p271:32:32-p272:64:64-i128:128-f64:32:64-f80:32-n8:16:32-S128"
+target triple = "i386-unknown-linux-gnu"
+
+; Check that the variadic arguments are lowered to the locations this target expects.
+
+; These struct types describe the lowered call frames:
+; CHECK: %single_i32.vararg = type <{ i32 }>
+; CHECK: %single_double.vararg = type <{ double }>
+; CHECK: %single_v4f32.vararg = type <{ <4 x float> }>
+; CHECK: %single_v8f32.vararg = type <{ <8 x float> }>
+; CHECK: %single_v16f32.vararg = type <{ <16 x float> }>
+; CHECK: %single_v32f32.vararg = type <{ <32 x float> }>
+; CHECK: %i32_double.vararg = type <{ i32, double }>
+; CHECK: %double_i32.vararg = type <{ double, i32 }>
+; CHECK: %i32_v4f32.vararg = type <{ i32, [12 x i8], <4 x float> }>
+; CHECK: %v4f32_i32.vararg = type <{ <4 x float>, i32 }>
+; CHECK: %i32_v8f32.vararg = type <{ i32, [28 x i8], <8 x float> }>
+; CHECK: %v8f32_i32.vararg = type <{ <8 x float>, i32 }>
+; CHECK: %i32_v16f32.vararg = type <{ i32, [60 x i8], <16 x float> }>
+; CHECK: %v16f32_i32.vararg = type <{ <16 x float>, i32 }>
+; CHECK: %i32_v32f32.vararg = type <{ i32, [124 x i8], <32 x float> }>
+; CHECK: %v32f32_i32.vararg = type <{ <32 x float>, i32 }>
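+;
+; The frame layouts match the i386-darwin variant above; only the datalayout
+; (f80:32 here) and the module flags differ.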
+
+%struct.libcS = type { i8, i16, i32, i32, float, double }
+
+define void @copy(ptr noundef %va) {
+; CHECK-LABEL: define {{[^@]+}}@copy(ptr noundef %va) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %va.addr = alloca ptr, align 4
+; CHECK-NEXT:    %cp = alloca ptr, align 4
+; CHECK-NEXT:    store ptr %va, ptr %va.addr, align 4, !tbaa !4
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %cp) #2
+; CHECK-NEXT:    call void @llvm.memcpy.inline.p0.p0.i32(ptr %cp, ptr %va.addr, i32 4, i1 false)
+; CHECK-NEXT:    %0 = load ptr, ptr %cp, align 4, !tbaa !4
+; CHECK-NEXT:    call void @valist(ptr noundef %0) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %cp) #2
+; CHECK-NEXT:    ret void
+;
+entry:
+  %va.addr = alloca ptr, align 4
+  %cp = alloca ptr, align 4
+  store ptr %va, ptr %va.addr, align 4, !tbaa !5
+  call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %cp) #2
+  call void @llvm.va_copy.p0(ptr nonnull %cp, ptr nonnull %va.addr)
+  %0 = load ptr, ptr %cp, align 4, !tbaa !5
+  call void @valist(ptr noundef %0) #2
+  call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %cp) #2
+  ret void
+}
+
+declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #0
+
+declare void @llvm.va_copy.p0(ptr, ptr) #1
+
+declare void @valist(ptr noundef)
+
+declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #0
+
+define void @start_once(...) {
+; CHECK-LABEL: define {{[^@]+}}@start_once(ptr noalias %varargs) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %s = alloca ptr, align 4
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %s) #2
+; CHECK-NEXT:    store ptr %varargs, ptr %s, align 4
+; CHECK-NEXT:    %0 = load ptr, ptr %s, align 4, !tbaa !4
+; CHECK-NEXT:    call void @valist(ptr noundef %0) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %s) #2
+; CHECK-NEXT:    ret void
+;
+entry:
+  %s = alloca ptr, align 4
+  call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %s) #2
+  call void @llvm.va_start.p0(ptr nonnull %s)
+  %0 = load ptr, ptr %s, align 4, !tbaa !5
+  call void @valist(ptr noundef %0) #2
+  call void @llvm.va_end.p0(ptr %s)
+  call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %s) #2
+  ret void
+}
+
+declare void @llvm.va_start.p0(ptr) #1
+
+declare void @llvm.va_end.p0(ptr) #1
+
+define void @start_twice(...) {
+; CHECK-LABEL: define {{[^@]+}}@start_twice(ptr noalias %varargs) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %s0 = alloca ptr, align 4
+; CHECK-NEXT:    %s1 = alloca ptr, align 4
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %s0) #2
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %s1) #2
+; CHECK-NEXT:    store ptr %varargs, ptr %s0, align 4
+; CHECK-NEXT:    %0 = load ptr, ptr %s0, align 4, !tbaa !4
+; CHECK-NEXT:    call void @valist(ptr noundef %0) #2
+; CHECK-NEXT:    store ptr %varargs, ptr %s1, align 4
+; CHECK-NEXT:    %1 = load ptr, ptr %s1, align 4, !tbaa !4
+; CHECK-NEXT:    call void @valist(ptr noundef %1) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %s1) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %s0) #2
+; CHECK-NEXT:    ret void
+;
+entry:
+  %s0 = alloca ptr, align 4
+  %s1 = alloca ptr, align 4
+  call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %s0) #2
+  call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %s1) #2
+  call void @llvm.va_start.p0(ptr nonnull %s0)
+  %0 = load ptr, ptr %s0, align 4, !tbaa !5
+  call void @valist(ptr noundef %0) #2
+  call void @llvm.va_end.p0(ptr %s0)
+  call void @llvm.va_start.p0(ptr nonnull %s1)
+  %1 = load ptr, ptr %s1, align 4, !tbaa !5
+  call void @valist(ptr noundef %1) #2
+  call void @llvm.va_end.p0(ptr %s1)
+  call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %s1) #2
+  call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %s0) #2
+  ret void
+}
+
+define void @single_i32(i32 noundef %x) {
+; CHECK-LABEL: define {{[^@]+}}@single_i32(i32 noundef %x) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %single_i32.vararg, align 16
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %single_i32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store i32 %x, ptr %0, align 4
+; CHECK-NEXT:    call void @vararg(ptr %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 4, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(i32 noundef %x) #2
+  ret void
+}
+
+declare void @vararg(...)
+
+define void @single_double(double noundef %x) {
+; CHECK-LABEL: define {{[^@]+}}@single_double(double noundef %x) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %single_double.vararg, align 16
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 8, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %single_double.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store double %x, ptr %0, align 4
+; CHECK-NEXT:    call void @vararg(ptr %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(double noundef %x) #2
+  ret void
+}
+
+define void @single_v4f32(<4 x float> noundef %x) {
+; CHECK-LABEL: define {{[^@]+}}@single_v4f32(<4 x float> noundef %x) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %single_v4f32.vararg, align 16
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %single_v4f32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store <4 x float> %x, ptr %0, align 16
+; CHECK-NEXT:    call void @vararg(ptr %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(<4 x float> noundef %x) #2
+  ret void
+}
+
+define void @single_v8f32(<8 x float> noundef %x) {
+; CHECK-LABEL: define {{[^@]+}}@single_v8f32(<8 x float> noundef %x) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %single_v8f32.vararg, align 32
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %single_v8f32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store <8 x float> %x, ptr %0, align 32
+; CHECK-NEXT:    call void @vararg(ptr %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(<8 x float> noundef %x) #2
+  ret void
+}
+
+define void @single_v16f32(<16 x float> noundef %x) {
+; CHECK-LABEL: define {{[^@]+}}@single_v16f32(<16 x float> noundef %x) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %single_v16f32.vararg, align 64
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 64, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %single_v16f32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store <16 x float> %x, ptr %0, align 64
+; CHECK-NEXT:    call void @vararg(ptr %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 64, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(<16 x float> noundef %x) #2
+  ret void
+}
+
+define void @single_v32f32(<32 x float> noundef %x) {
+; CHECK-LABEL: define {{[^@]+}}@single_v32f32(<32 x float> noundef %x) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %single_v32f32.vararg, align 128
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 128, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %single_v32f32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store <32 x float> %x, ptr %0, align 128
+; CHECK-NEXT:    call void @vararg(ptr %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 128, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(<32 x float> noundef %x) #2
+  ret void
+}
+
+define void @i32_double(i32 noundef %x, double noundef %y) {
+; CHECK-LABEL: define {{[^@]+}}@i32_double(i32 noundef %x, double noundef %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %i32_double.vararg, align 16
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 12, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %i32_double.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store i32 %x, ptr %0, align 4
+; CHECK-NEXT:    %1 = getelementptr inbounds %i32_double.vararg, ptr %vararg_buffer, i32 0, i32 1
+; CHECK-NEXT:    store double %y, ptr %1, align 4
+; CHECK-NEXT:    call void @vararg(ptr %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 12, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(i32 noundef %x, double noundef %y) #2
+  ret void
+}
+
+define void @double_i32(double noundef %x, i32 noundef %y) {
+; CHECK-LABEL: define {{[^@]+}}@double_i32(double noundef %x, i32 noundef %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %double_i32.vararg, align 16
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 12, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %double_i32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store double %x, ptr %0, align 4
+; CHECK-NEXT:    %1 = getelementptr inbounds %double_i32.vararg, ptr %vararg_buffer, i32 0, i32 1
+; CHECK-NEXT:    store i32 %y, ptr %1, align 4
+; CHECK-NEXT:    call void @vararg(ptr %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 12, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(double noundef %x, i32 noundef %y) #2
+  ret void
+}
+
+define void @i32_libcS(i32 noundef %x, ptr noundef byval(%struct.libcS) align 4 %y) {
+; CHECK-LABEL: define {{[^@]+}}@i32_libcS(i32 noundef %x, ptr noundef byval(%struct.libcS) align 4 %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %i32_libcS.vararg, align 16
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 28, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %i32_libcS.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store i32 %x, ptr %0, align 4
+; CHECK-NEXT:    %1 = getelementptr inbounds %i32_libcS.vararg, ptr %vararg_buffer, i32 0, i32 1
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr %1, ptr %y, i64 24, i1 false)
+; CHECK-NEXT:    call void @vararg(ptr %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 28, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(i32 noundef %x, ptr noundef nonnull byval(%struct.libcS) align 4 %y) #2
+  ret void
+}
+
+define void @libcS_i32(ptr noundef byval(%struct.libcS) align 4 %x, i32 noundef %y) {
+; CHECK-LABEL: define {{[^@]+}}@libcS_i32(ptr noundef byval(%struct.libcS) align 4 %x, i32 noundef %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %libcS_i32.vararg, align 16
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 28, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %libcS_i32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr %0, ptr %x, i64 24, i1 false)
+; CHECK-NEXT:    %1 = getelementptr inbounds %libcS_i32.vararg, ptr %vararg_buffer, i32 0, i32 1
+; CHECK-NEXT:    store i32 %y, ptr %1, align 4
+; CHECK-NEXT:    call void @vararg(ptr %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 28, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(ptr noundef nonnull byval(%struct.libcS) align 4 %x, i32 noundef %y) #2
+  ret void
+}
+
+define void @i32_v4f32(i32 noundef %x, <4 x float> noundef %y) {
+; CHECK-LABEL: define {{[^@]+}}@i32_v4f32(i32 noundef %x, <4 x float> noundef %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %i32_v4f32.vararg, align 16
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %i32_v4f32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store i32 %x, ptr %0, align 4
+; CHECK-NEXT:    %1 = getelementptr inbounds %i32_v4f32.vararg, ptr %vararg_buffer, i32 0, i32 2
+; CHECK-NEXT:    store <4 x float> %y, ptr %1, align 16
+; CHECK-NEXT:    call void @vararg(ptr %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(i32 noundef %x, <4 x float> noundef %y) #2
+  ret void
+}
+
+define void @v4f32_i32(<4 x float> noundef %x, i32 noundef %y) {
+; CHECK-LABEL: define {{[^@]+}}@v4f32_i32(<4 x float> noundef %x, i32 noundef %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %v4f32_i32.vararg, align 16
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 20, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %v4f32_i32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store <4 x float> %x, ptr %0, align 16
+; CHECK-NEXT:    %1 = getelementptr inbounds %v4f32_i32.vararg, ptr %vararg_buffer, i32 0, i32 1
+; CHECK-NEXT:    store i32 %y, ptr %1, align 4
+; CHECK-NEXT:    call void @vararg(ptr %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 20, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(<4 x float> noundef %x, i32 noundef %y) #2
+  ret void
+}
+
+define void @i32_v8f32(i32 noundef %x, <8 x float> noundef %y) {
+; CHECK-LABEL: define {{[^@]+}}@i32_v8f32(i32 noundef %x, <8 x float> noundef %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %i32_v8f32.vararg, align 32
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 64, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %i32_v8f32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store i32 %x, ptr %0, align 4
+; CHECK-NEXT:    %1 = getelementptr inbounds %i32_v8f32.vararg, ptr %vararg_buffer, i32 0, i32 2
+; CHECK-NEXT:    store <8 x float> %y, ptr %1, align 32
+; CHECK-NEXT:    call void @vararg(ptr %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 64, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(i32 noundef %x, <8 x float> noundef %y) #2
+  ret void
+}
+
+define void @v8f32_i32(<8 x float> noundef %x, i32 noundef %y) {
+; CHECK-LABEL: define {{[^@]+}}@v8f32_i32(<8 x float> noundef %x, i32 noundef %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %v8f32_i32.vararg, align 32
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 36, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %v8f32_i32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store <8 x float> %x, ptr %0, align 32
+; CHECK-NEXT:    %1 = getelementptr inbounds %v8f32_i32.vararg, ptr %vararg_buffer, i32 0, i32 1
+; CHECK-NEXT:    store i32 %y, ptr %1, align 4
+; CHECK-NEXT:    call void @vararg(ptr %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 36, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(<8 x float> noundef %x, i32 noundef %y) #2
+  ret void
+}
+
+define void @i32_v16f32(i32 noundef %x, <16 x float> noundef %y) {
+; CHECK-LABEL: define {{[^@]+}}@i32_v16f32(i32 noundef %x, <16 x float> noundef %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %i32_v16f32.vararg, align 64
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 128, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %i32_v16f32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store i32 %x, ptr %0, align 4
+; CHECK-NEXT:    %1 = getelementptr inbounds %i32_v16f32.vararg, ptr %vararg_buffer, i32 0, i32 2
+; CHECK-NEXT:    store <16 x float> %y, ptr %1, align 64
+; CHECK-NEXT:    call void @vararg(ptr %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 128, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(i32 noundef %x, <16 x float> noundef %y) #2
+  ret void
+}
+
+define void @v16f32_i32(<16 x float> noundef %x, i32 noundef %y) {
+; CHECK-LABEL: define {{[^@]+}}@v16f32_i32(<16 x float> noundef %x, i32 noundef %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %v16f32_i32.vararg, align 64
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 68, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %v16f32_i32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store <16 x float> %x, ptr %0, align 64
+; CHECK-NEXT:    %1 = getelementptr inbounds %v16f32_i32.vararg, ptr %vararg_buffer, i32 0, i32 1
+; CHECK-NEXT:    store i32 %y, ptr %1, align 4
+; CHECK-NEXT:    call void @vararg(ptr %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 68, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(<16 x float> noundef %x, i32 noundef %y) #2
+  ret void
+}
+
+define void @i32_v32f32(i32 noundef %x, <32 x float> noundef %y) {
+; CHECK-LABEL: define {{[^@]+}}@i32_v32f32(i32 noundef %x, <32 x float> noundef %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %i32_v32f32.vararg, align 128
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 256, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %i32_v32f32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store i32 %x, ptr %0, align 4
+; CHECK-NEXT:    %1 = getelementptr inbounds %i32_v32f32.vararg, ptr %vararg_buffer, i32 0, i32 2
+; CHECK-NEXT:    store <32 x float> %y, ptr %1, align 128
+; CHECK-NEXT:    call void @vararg(ptr %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 256, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(i32 noundef %x, <32 x float> noundef %y) #2
+  ret void
+}
+
+define void @v32f32_i32(<32 x float> noundef %x, i32 noundef %y) {
+; CHECK-LABEL: define {{[^@]+}}@v32f32_i32(<32 x float> noundef %x, i32 noundef %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %v32f32_i32.vararg, align 128
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 132, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %v32f32_i32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store <32 x float> %x, ptr %0, align 128
+; CHECK-NEXT:    %1 = getelementptr inbounds %v32f32_i32.vararg, ptr %vararg_buffer, i32 0, i32 1
+; CHECK-NEXT:    store i32 %y, ptr %1, align 4
+; CHECK-NEXT:    call void @vararg(ptr %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 132, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(<32 x float> noundef %x, i32 noundef %y) #2
+  ret void
+}
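+
+; On i386-linux even the widest vectors are passed directly in the frame;
+; the sizes above are plain sizeof arithmetic, e.g. the v32f32_i32 frame is
+; 132 bytes (128 for the <32 x float> plus 4 for the trailing i32) and its
+; alloca takes the vector's natural 128-byte alignment.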
+
+attributes #0 = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) }
+attributes #1 = { nocallback nofree nosync nounwind willreturn }
+attributes #2 = { mustprogress }
+
+!llvm.module.flags = !{!0, !1, !2, !3}
+
+!0 = !{i32 1, !"NumRegisterParameters", i32 0}
+!1 = !{i32 1, !"wchar_size", i32 4}
+!2 = !{i32 8, !"PIC Level", i32 2}
+!3 = !{i32 7, !"PIE Level", i32 2}
+!5 = !{!6, !6, i64 0}
+!6 = !{!"any pointer", !7, i64 0}
+!7 = !{!"omnipotent char", !8, i64 0}
+!8 = !{!"Simple C/C++ TBAA"}
diff --git a/llvm/test/CodeGen/X86/expand-variadic-call-i686-msvc.ll b/llvm/test/CodeGen/X86/expand-variadic-call-i686-msvc.ll
new file mode 100644
index 00000000000000..564fc783a92656
--- /dev/null
+++ b/llvm/test/CodeGen/X86/expand-variadic-call-i686-msvc.ll
@@ -0,0 +1,467 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: -p --function-signature
+; RUN: opt -S --passes=expand-variadics --expand-variadics-override=lowering < %s | FileCheck %s
+target datalayout = "e-m:x-p:32:32-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32-a:0:32-S32"
+target triple = "i686-unknown-windows-msvc19.33.0"
+
+; Check the variadic arguments are lowered to the locations this target expects
+
+; The struct types below describe the call frame layouts
+; CHECK: %single_i32.vararg = type <{ i32 }>
+; CHECK: %single_double.vararg = type <{ double }>
+; CHECK: %single_v4f32.vararg = type <{ <4 x float> }>
+; CHECK: %single_v8f32.vararg = type <{ <8 x float> }>
+; CHECK: %single_v16f32.vararg = type <{ <16 x float> }>
+; CHECK: %single_v32f32.vararg = type <{ ptr }>
+; CHECK: %i32_double.vararg = type <{ i32, [4 x i8], double }>
+; CHECK: %double_i32.vararg = type <{ double, i32 }>
+; CHECK: %i32_v4f32.vararg = type <{ i32, [12 x i8], <4 x float> }>
+; CHECK: %v4f32_i32.vararg = type <{ <4 x float>, i32 }>
+; CHECK: %i32_v8f32.vararg = type <{ i32, [28 x i8], <8 x float> }>
+; CHECK: %v8f32_i32.vararg = type <{ <8 x float>, i32 }>
+; CHECK: %i32_v16f32.vararg = type <{ i32, [60 x i8], <16 x float> }>
+; CHECK: %v16f32_i32.vararg = type <{ <16 x float>, i32 }>
+; CHECK: %i32_v32f32.vararg = type <{ i32, ptr }>
+; CHECK: %v32f32_i32.vararg = type <{ ptr, i32 }>
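+
+; The padding members are alignment arithmetic: the leading i32 occupies
+; offset 0, so a double (8-byte aligned) is preceded by [4 x i8], a
+; <4 x float> (16-byte aligned) by [12 x i8], an <8 x float> by [28 x i8] and
+; a <16 x float> by [60 x i8]. A <32 x float> is not placed in the frame at
+; all on this target; it is passed indirectly, so its slot is just a ptr.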
+
+%struct.libcS = type { i8, i16, i32, i32, float, double }
+
+define void @copy(ptr noundef %va) {
+; CHECK-LABEL: define {{[^@]+}}@copy(ptr noundef %va) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %va.addr = alloca ptr, align 4
+; CHECK-NEXT:    %cp = alloca ptr, align 4
+; CHECK-NEXT:    store ptr %va, ptr %va.addr, align 4, !tbaa !3
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %cp) #2
+; CHECK-NEXT:    call void @llvm.memcpy.inline.p0.p0.i32(ptr %cp, ptr %va.addr, i32 4, i1 false)
+; CHECK-NEXT:    %0 = load ptr, ptr %cp, align 4, !tbaa !3
+; CHECK-NEXT:    call void @valist(ptr noundef %0) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %cp) #2
+; CHECK-NEXT:    ret void
+;
+entry:
+  %va.addr = alloca ptr, align 4
+  %cp = alloca ptr, align 4
+  store ptr %va, ptr %va.addr, align 4, !tbaa !4
+  call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %cp) #2
+  call void @llvm.va_copy.p0(ptr nonnull %cp, ptr nonnull %va.addr)
+  %0 = load ptr, ptr %cp, align 4, !tbaa !4
+  call void @valist(ptr noundef %0) #2
+  call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %cp) #2
+  ret void
+}
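+
+; va_list on this target is a single pointer, so the va_copy above lowers to
+; the 4-byte memcpy.inline seen in the CHECK lines; no other bookkeeping is
+; needed.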
+
+declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #0
+
+declare void @llvm.va_copy.p0(ptr, ptr) #1
+
+declare void @valist(ptr noundef)
+
+declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #0
+
+define void @start_once(...) {
+; CHECK-LABEL: define {{[^@]+}}@start_once(ptr noalias %varargs) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %s = alloca ptr, align 4
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %s) #2
+; CHECK-NEXT:    store ptr %varargs, ptr %s, align 4
+; CHECK-NEXT:    %0 = load ptr, ptr %s, align 4, !tbaa !3
+; CHECK-NEXT:    call void @valist(ptr noundef %0) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %s) #2
+; CHECK-NEXT:    ret void
+;
+entry:
+  %s = alloca ptr, align 4
+  call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %s) #2
+  call void @llvm.va_start.p0(ptr nonnull %s)
+  %0 = load ptr, ptr %s, align 4, !tbaa !4
+  call void @valist(ptr noundef %0) #2
+  call void @llvm.va_end.p0(ptr %s)
+  call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %s) #2
+  ret void
+}
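+
+; The variadic definition gains a trailing `ptr noalias %varargs` parameter.
+; va_start then degenerates to storing that pointer into the va_list slot,
+; and the paired va_end disappears entirely.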
+
+declare void @llvm.va_start.p0(ptr) #1
+
+declare void @llvm.va_end.p0(ptr) #1
+
+define void @start_twice(...) {
+; CHECK-LABEL: define {{[^@]+}}@start_twice(ptr noalias %varargs) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %s0 = alloca ptr, align 4
+; CHECK-NEXT:    %s1 = alloca ptr, align 4
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %s0) #2
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %s1) #2
+; CHECK-NEXT:    store ptr %varargs, ptr %s0, align 4
+; CHECK-NEXT:    %0 = load ptr, ptr %s0, align 4, !tbaa !3
+; CHECK-NEXT:    call void @valist(ptr noundef %0) #2
+; CHECK-NEXT:    store ptr %varargs, ptr %s1, align 4
+; CHECK-NEXT:    %1 = load ptr, ptr %s1, align 4, !tbaa !3
+; CHECK-NEXT:    call void @valist(ptr noundef %1) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %s1) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %s0) #2
+; CHECK-NEXT:    ret void
+;
+entry:
+  %s0 = alloca ptr, align 4
+  %s1 = alloca ptr, align 4
+  call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %s0) #2
+  call void @llvm.lifetime.start.p0(i64 4, ptr nonnull %s1) #2
+  call void @llvm.va_start.p0(ptr nonnull %s0)
+  %0 = load ptr, ptr %s0, align 4, !tbaa !4
+  call void @valist(ptr noundef %0) #2
+  call void @llvm.va_end.p0(ptr %s0)
+  call void @llvm.va_start.p0(ptr nonnull %s1)
+  %1 = load ptr, ptr %s1, align 4, !tbaa !4
+  call void @valist(ptr noundef %1) #2
+  call void @llvm.va_end.p0(ptr %s1)
+  call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %s1) #2
+  call void @llvm.lifetime.end.p0(i64 4, ptr nonnull %s0) #2
+  ret void
+}
+
+define void @single_i32(i32 noundef %x) {
+; CHECK-LABEL: define {{[^@]+}}@single_i32(i32 noundef %x) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %single_i32.vararg, align 4
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %single_i32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store i32 %x, ptr %0, align 4
+; CHECK-NEXT:    call void @vararg(ptr %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 4, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(i32 noundef %x) #2
+  ret void
+}
+
+declare void @vararg(...)
+
+define void @single_double(double noundef %x) {
+; CHECK-LABEL: define {{[^@]+}}@single_double(double noundef %x) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %single_double.vararg, align 8
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 8, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %single_double.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store double %x, ptr %0, align 8
+; CHECK-NEXT:    call void @vararg(ptr %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(double noundef %x) #2
+  ret void
+}
+
+define void @single_v4f32(<4 x float> inreg noundef %x) {
+; CHECK-LABEL: define {{[^@]+}}@single_v4f32(<4 x float> inreg noundef %x) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %single_v4f32.vararg, align 16
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %single_v4f32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store <4 x float> %x, ptr %0, align 16
+; CHECK-NEXT:    call void @vararg(ptr %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(<4 x float> inreg noundef %x) #2
+  ret void
+}
+
+define void @single_v8f32(<8 x float> inreg noundef %x) {
+; CHECK-LABEL: define {{[^@]+}}@single_v8f32(<8 x float> inreg noundef %x) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %single_v8f32.vararg, align 32
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %single_v8f32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store <8 x float> %x, ptr %0, align 32
+; CHECK-NEXT:    call void @vararg(ptr %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(<8 x float> inreg noundef %x) #2
+  ret void
+}
+
+define void @single_v16f32(<16 x float> inreg noundef %x) {
+; CHECK-LABEL: define {{[^@]+}}@single_v16f32(<16 x float> inreg noundef %x) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %single_v16f32.vararg, align 64
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 64, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %single_v16f32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store <16 x float> %x, ptr %0, align 64
+; CHECK-NEXT:    call void @vararg(ptr %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 64, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(<16 x float> inreg noundef %x) #2
+  ret void
+}
+
+define void @single_v32f32(ptr nocapture noundef readonly %0) {
+; CHECK-LABEL: define {{[^@]+}}@single_v32f32(ptr nocapture noundef readonly %0) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %indirect-arg-temp = alloca <32 x float>, align 128
+; CHECK-NEXT:    %vararg_buffer = alloca %single_v32f32.vararg, align 4
+; CHECK-NEXT:    %x = load <32 x float>, ptr %0, align 128, !tbaa !7
+; CHECK-NEXT:    store <32 x float> %x, ptr %indirect-arg-temp, align 128, !tbaa !7
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr %vararg_buffer)
+; CHECK-NEXT:    %1 = getelementptr inbounds %single_v32f32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store ptr %indirect-arg-temp, ptr %1, align 4
+; CHECK-NEXT:    call void @vararg(ptr %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 4, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  %indirect-arg-temp = alloca <32 x float>, align 128
+  %x = load <32 x float>, ptr %0, align 128, !tbaa !8
+  store <32 x float> %x, ptr %indirect-arg-temp, align 128, !tbaa !8
+  call void (...) @vararg(ptr noundef nonnull %indirect-arg-temp) #2
+  ret void
+}
+
+define void @i32_double(i32 noundef %x, double noundef %y) {
+; CHECK-LABEL: define {{[^@]+}}@i32_double(i32 noundef %x, double noundef %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %i32_double.vararg, align 8
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %i32_double.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store i32 %x, ptr %0, align 4
+; CHECK-NEXT:    %1 = getelementptr inbounds %i32_double.vararg, ptr %vararg_buffer, i32 0, i32 2
+; CHECK-NEXT:    store double %y, ptr %1, align 8
+; CHECK-NEXT:    call void @vararg(ptr %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(i32 noundef %x, double noundef %y) #2
+  ret void
+}
+
+define void @double_i32(double noundef %x, i32 noundef %y) {
+; CHECK-LABEL: define {{[^@]+}}@double_i32(double noundef %x, i32 noundef %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %double_i32.vararg, align 8
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 12, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %double_i32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store double %x, ptr %0, align 8
+; CHECK-NEXT:    %1 = getelementptr inbounds %double_i32.vararg, ptr %vararg_buffer, i32 0, i32 1
+; CHECK-NEXT:    store i32 %y, ptr %1, align 4
+; CHECK-NEXT:    call void @vararg(ptr %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 12, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(double noundef %x, i32 noundef %y) #2
+  ret void
+}
+
+define void @i32_libcS(i32 noundef %x, ptr nocapture noundef readonly byval(%struct.libcS) align 4 %0) {
+; CHECK-LABEL: define {{[^@]+}}@i32_libcS(i32 noundef %x, ptr nocapture noundef readonly byval(%struct.libcS) align 4 %0) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %i32_libcS.vararg, align 8
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr %vararg_buffer)
+; CHECK-NEXT:    %1 = getelementptr inbounds %i32_libcS.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store i32 %x, ptr %1, align 4
+; CHECK-NEXT:    %2 = getelementptr inbounds %i32_libcS.vararg, ptr %vararg_buffer, i32 0, i32 2
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr %2, ptr %0, i64 24, i1 false)
+; CHECK-NEXT:    call void @vararg(ptr %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(i32 noundef %x, ptr noundef nonnull byval(%struct.libcS) align 4 %0) #2
+  ret void
+}
+
+define void @libcS_i32(ptr nocapture noundef readonly byval(%struct.libcS) align 4 %0, i32 noundef %y) {
+; CHECK-LABEL: define {{[^@]+}}@libcS_i32(ptr nocapture noundef readonly byval(%struct.libcS) align 4 %0, i32 noundef %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %libcS_i32.vararg, align 8
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 28, ptr %vararg_buffer)
+; CHECK-NEXT:    %1 = getelementptr inbounds %libcS_i32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr %1, ptr %0, i64 24, i1 false)
+; CHECK-NEXT:    %2 = getelementptr inbounds %libcS_i32.vararg, ptr %vararg_buffer, i32 0, i32 1
+; CHECK-NEXT:    store i32 %y, ptr %2, align 4
+; CHECK-NEXT:    call void @vararg(ptr %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 28, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(ptr noundef nonnull byval(%struct.libcS) align 4 %0, i32 noundef %y) #2
+  ret void
+}
+
+define void @i32_v4f32(i32 noundef %x, <4 x float> inreg noundef %y) {
+; CHECK-LABEL: define {{[^@]+}}@i32_v4f32(i32 noundef %x, <4 x float> inreg noundef %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %i32_v4f32.vararg, align 16
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %i32_v4f32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store i32 %x, ptr %0, align 4
+; CHECK-NEXT:    %1 = getelementptr inbounds %i32_v4f32.vararg, ptr %vararg_buffer, i32 0, i32 2
+; CHECK-NEXT:    store <4 x float> %y, ptr %1, align 16
+; CHECK-NEXT:    call void @vararg(ptr %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(i32 noundef %x, <4 x float> inreg noundef %y) #2
+  ret void
+}
+
+define void @v4f32_i32(<4 x float> inreg noundef %x, i32 noundef %y) {
+; CHECK-LABEL: define {{[^@]+}}@v4f32_i32(<4 x float> inreg noundef %x, i32 noundef %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %v4f32_i32.vararg, align 16
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 20, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %v4f32_i32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store <4 x float> %x, ptr %0, align 16
+; CHECK-NEXT:    %1 = getelementptr inbounds %v4f32_i32.vararg, ptr %vararg_buffer, i32 0, i32 1
+; CHECK-NEXT:    store i32 %y, ptr %1, align 4
+; CHECK-NEXT:    call void @vararg(ptr %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 20, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(<4 x float> inreg noundef %x, i32 noundef %y) #2
+  ret void
+}
+
+define void @i32_v8f32(i32 noundef %x, <8 x float> inreg noundef %y) {
+; CHECK-LABEL: define {{[^@]+}}@i32_v8f32(i32 noundef %x, <8 x float> inreg noundef %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %i32_v8f32.vararg, align 32
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 64, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %i32_v8f32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store i32 %x, ptr %0, align 4
+; CHECK-NEXT:    %1 = getelementptr inbounds %i32_v8f32.vararg, ptr %vararg_buffer, i32 0, i32 2
+; CHECK-NEXT:    store <8 x float> %y, ptr %1, align 32
+; CHECK-NEXT:    call void @vararg(ptr %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 64, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(i32 noundef %x, <8 x float> inreg noundef %y) #2
+  ret void
+}
+
+define void @v8f32_i32(<8 x float> inreg noundef %x, i32 noundef %y) {
+; CHECK-LABEL: define {{[^@]+}}@v8f32_i32(<8 x float> inreg noundef %x, i32 noundef %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %v8f32_i32.vararg, align 32
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 36, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %v8f32_i32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store <8 x float> %x, ptr %0, align 32
+; CHECK-NEXT:    %1 = getelementptr inbounds %v8f32_i32.vararg, ptr %vararg_buffer, i32 0, i32 1
+; CHECK-NEXT:    store i32 %y, ptr %1, align 4
+; CHECK-NEXT:    call void @vararg(ptr %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 36, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(<8 x float> inreg noundef %x, i32 noundef %y) #2
+  ret void
+}
+
+define void @i32_v16f32(i32 noundef %x, <16 x float> inreg noundef %y) {
+; CHECK-LABEL: define {{[^@]+}}@i32_v16f32(i32 noundef %x, <16 x float> inreg noundef %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %i32_v16f32.vararg, align 64
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 128, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %i32_v16f32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store i32 %x, ptr %0, align 4
+; CHECK-NEXT:    %1 = getelementptr inbounds %i32_v16f32.vararg, ptr %vararg_buffer, i32 0, i32 2
+; CHECK-NEXT:    store <16 x float> %y, ptr %1, align 64
+; CHECK-NEXT:    call void @vararg(ptr %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 128, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(i32 noundef %x, <16 x float> inreg noundef %y) #2
+  ret void
+}
+
+define void @v16f32_i32(<16 x float> inreg noundef %x, i32 noundef %y) {
+; CHECK-LABEL: define {{[^@]+}}@v16f32_i32(<16 x float> inreg noundef %x, i32 noundef %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %v16f32_i32.vararg, align 64
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 68, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %v16f32_i32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store <16 x float> %x, ptr %0, align 64
+; CHECK-NEXT:    %1 = getelementptr inbounds %v16f32_i32.vararg, ptr %vararg_buffer, i32 0, i32 1
+; CHECK-NEXT:    store i32 %y, ptr %1, align 4
+; CHECK-NEXT:    call void @vararg(ptr %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 68, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(<16 x float> inreg noundef %x, i32 noundef %y) #2
+  ret void
+}
+
+define void @i32_v32f32(i32 noundef %x, ptr nocapture noundef readonly %0) {
+; CHECK-LABEL: define {{[^@]+}}@i32_v32f32(i32 noundef %x, ptr nocapture noundef readonly %0) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %indirect-arg-temp = alloca <32 x float>, align 128
+; CHECK-NEXT:    %vararg_buffer = alloca %i32_v32f32.vararg, align 4
+; CHECK-NEXT:    %y = load <32 x float>, ptr %0, align 128, !tbaa !7
+; CHECK-NEXT:    store <32 x float> %y, ptr %indirect-arg-temp, align 128, !tbaa !7
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 8, ptr %vararg_buffer)
+; CHECK-NEXT:    %1 = getelementptr inbounds %i32_v32f32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store i32 %x, ptr %1, align 4
+; CHECK-NEXT:    %2 = getelementptr inbounds %i32_v32f32.vararg, ptr %vararg_buffer, i32 0, i32 1
+; CHECK-NEXT:    store ptr %indirect-arg-temp, ptr %2, align 4
+; CHECK-NEXT:    call void @vararg(ptr %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  %indirect-arg-temp = alloca <32 x float>, align 128
+  %y = load <32 x float>, ptr %0, align 128, !tbaa !8
+  store <32 x float> %y, ptr %indirect-arg-temp, align 128, !tbaa !8
+  call void (...) @vararg(i32 noundef %x, ptr noundef nonnull %indirect-arg-temp) #2
+  ret void
+}
+
+define void @v32f32_i32(ptr nocapture noundef readonly %0, i32 noundef %y) {
+; CHECK-LABEL: define {{[^@]+}}@v32f32_i32(ptr nocapture noundef readonly %0, i32 noundef %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %indirect-arg-temp = alloca <32 x float>, align 128
+; CHECK-NEXT:    %vararg_buffer = alloca %v32f32_i32.vararg, align 4
+; CHECK-NEXT:    %x = load <32 x float>, ptr %0, align 128, !tbaa !7
+; CHECK-NEXT:    store <32 x float> %x, ptr %indirect-arg-temp, align 128, !tbaa !7
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 8, ptr %vararg_buffer)
+; CHECK-NEXT:    %1 = getelementptr inbounds %v32f32_i32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store ptr %indirect-arg-temp, ptr %1, align 4
+; CHECK-NEXT:    %2 = getelementptr inbounds %v32f32_i32.vararg, ptr %vararg_buffer, i32 0, i32 1
+; CHECK-NEXT:    store i32 %y, ptr %2, align 4
+; CHECK-NEXT:    call void @vararg(ptr %vararg_buffer) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  %indirect-arg-temp = alloca <32 x float>, align 128
+  %x = load <32 x float>, ptr %0, align 128, !tbaa !8
+  store <32 x float> %x, ptr %indirect-arg-temp, align 128, !tbaa !8
+  call void (...) @vararg(ptr noundef nonnull %indirect-arg-temp, i32 noundef %y) #2
+  ret void
+}
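+
+; For the indirect <32 x float> cases the caller spills the vector to
+; %indirect-arg-temp and the frame holds only the pointer, so the (ptr, i32)
+; frame above needs just 8 bytes despite the 128-byte payload.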
+
+attributes #0 = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) }
+attributes #1 = { nocallback nofree nosync nounwind willreturn }
+attributes #2 = { mustprogress }
+
+!llvm.module.flags = !{!0, !1, !2}
+
+!0 = !{i32 1, !"NumRegisterParameters", i32 0}
+!1 = !{i32 1, !"wchar_size", i32 2}
+!2 = !{i32 1, !"MaxTLSAlign", i32 65536}
+!4 = !{!5, !5, i64 0}
+!5 = !{!"any pointer", !6, i64 0}
+!6 = !{!"omnipotent char", !7, i64 0}
+!7 = !{!"Simple C/C++ TBAA"}
+!8 = !{!6, !6, i64 0}
diff --git a/llvm/test/CodeGen/X86/expand-variadic-call-x64-darwin.ll b/llvm/test/CodeGen/X86/expand-variadic-call-x64-darwin.ll
new file mode 100644
index 00000000000000..b1437edbe9a5a0
--- /dev/null
+++ b/llvm/test/CodeGen/X86/expand-variadic-call-x64-darwin.ll
@@ -0,0 +1,688 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: -p --function-signature
+; RUN: opt -S --passes=expand-variadics --expand-variadics-override=lowering < %s | FileCheck %s
+target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.4.0"
+
+; Check the variadic arguments are lowered to the locations this target expects
+
+; The struct types below describe the call frame layouts
+; CHECK: %single_i32.vararg = type <{ i32 }>
+; CHECK: %single_double.vararg = type <{ double }>
+; CHECK: %single_v4f32.vararg = type <{ <4 x float> }>
+; CHECK: %single_v8f32.vararg = type <{ <8 x float> }>
+; CHECK: %single_v16f32.vararg = type <{ <16 x float> }>
+; CHECK: %single_v32f32.vararg = type <{ <32 x float> }>
+; CHECK: %i32_double.vararg = type <{ i32, [4 x i8], double }>
+; CHECK: %double_i32.vararg = type <{ double, i32 }>
+; CHECK: %i32_v4f32.vararg = type <{ i32, [4 x i8], <4 x float> }>
+; CHECK: %v4f32_i32.vararg = type <{ <4 x float>, i32 }>
+; CHECK: %i32_v8f32.vararg = type <{ i32, [4 x i8], <8 x float> }>
+; CHECK: %v8f32_i32.vararg = type <{ <8 x float>, i32 }>
+; CHECK: %i32_v16f32.vararg = type <{ i32, [4 x i8], <16 x float> }>
+; CHECK: %v16f32_i32.vararg = type <{ <16 x float>, i32 }>
+; CHECK: %i32_v32f32.vararg = type <{ i32, [4 x i8], <32 x float> }>
+; CHECK: %v32f32_i32.vararg = type <{ <32 x float>, i32 }>
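+
+; Unlike i686-msvc, this target keeps even a <32 x float> directly in the
+; frame, and slots are packed at 8-byte boundaries: a single [4 x i8] pad
+; follows the leading i32 no matter how wide the vector after it is.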
+
+%struct.__va_list_tag = type { i32, i32, ptr, ptr }
+%struct.libcS = type { i8, i16, i32, i64, float, double }
+
+define void @copy(ptr noundef %va) {
+; CHECK-LABEL: define {{[^@]+}}@copy(ptr noundef %va) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %cp = alloca [1 x %struct.__va_list_tag], align 16
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr nonnull %cp) #2
+; CHECK-NEXT:    call void @llvm.memcpy.inline.p0.p0.i32(ptr %cp, ptr %va, i32 24, i1 false)
+; CHECK-NEXT:    call void @valist(ptr noundef nonnull %cp) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr nonnull %cp) #2
+; CHECK-NEXT:    ret void
+;
+entry:
+  %cp = alloca [1 x %struct.__va_list_tag], align 16
+  call void @llvm.lifetime.start.p0(i64 24, ptr nonnull %cp) #2
+  call void @llvm.va_copy.p0(ptr nonnull %cp, ptr %va)
+  call void @valist(ptr noundef nonnull %cp) #2
+  call void @llvm.lifetime.end.p0(i64 24, ptr nonnull %cp) #2
+  ret void
+}
+
+declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #0
+
+declare void @llvm.va_copy.p0(ptr, ptr) #1
+
+declare void @valist(ptr noundef)
+
+declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #0
+
+define void @start_once(...) {
+; CHECK-LABEL: define {{[^@]+}}@start_once(ptr noalias %varargs) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %s = alloca [1 x %struct.__va_list_tag], align 16
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr nonnull %s) #2
+; CHECK-NEXT:    call void @llvm.memcpy.inline.p0.p0.i32(ptr %s, ptr %varargs, i32 24, i1 false)
+; CHECK-NEXT:    call void @valist(ptr noundef nonnull %s) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr nonnull %s) #2
+; CHECK-NEXT:    ret void
+;
+entry:
+  %s = alloca [1 x %struct.__va_list_tag], align 16
+  call void @llvm.lifetime.start.p0(i64 24, ptr nonnull %s) #2
+  call void @llvm.va_start.p0(ptr nonnull %s)
+  call void @valist(ptr noundef nonnull %s) #2
+  call void @llvm.va_end.p0(ptr %s)
+  call void @llvm.lifetime.end.p0(i64 24, ptr nonnull %s) #2
+  ret void
+}
+
+declare void @llvm.va_start.p0(ptr) #1
+
+declare void @llvm.va_end.p0(ptr) #1
+
+define void @start_twice(...) {
+; CHECK-LABEL: define {{[^@]+}}@start_twice(ptr noalias %varargs) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %s0 = alloca [1 x %struct.__va_list_tag], align 16
+; CHECK-NEXT:    %s1 = alloca [1 x %struct.__va_list_tag], align 16
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr nonnull %s0) #2
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr nonnull %s1) #2
+; CHECK-NEXT:    call void @llvm.memcpy.inline.p0.p0.i32(ptr %s0, ptr %varargs, i32 24, i1 false)
+; CHECK-NEXT:    call void @valist(ptr noundef nonnull %s0) #2
+; CHECK-NEXT:    call void @llvm.memcpy.inline.p0.p0.i32(ptr %s1, ptr %varargs, i32 24, i1 false)
+; CHECK-NEXT:    call void @valist(ptr noundef nonnull %s1) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr nonnull %s1) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr nonnull %s0) #2
+; CHECK-NEXT:    ret void
+;
+entry:
+  %s0 = alloca [1 x %struct.__va_list_tag], align 16
+  %s1 = alloca [1 x %struct.__va_list_tag], align 16
+  call void @llvm.lifetime.start.p0(i64 24, ptr nonnull %s0) #2
+  call void @llvm.lifetime.start.p0(i64 24, ptr nonnull %s1) #2
+  call void @llvm.va_start.p0(ptr nonnull %s0)
+  call void @valist(ptr noundef nonnull %s0) #2
+  call void @llvm.va_end.p0(ptr %s0)
+  call void @llvm.va_start.p0(ptr nonnull %s1)
+  call void @valist(ptr noundef nonnull %s1) #2
+  call void @llvm.va_end.p0(ptr %s1)
+  call void @llvm.lifetime.end.p0(i64 24, ptr nonnull %s1) #2
+  call void @llvm.lifetime.end.p0(i64 24, ptr nonnull %s0) #2
+  ret void
+}
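+
+; Here the incoming %varargs is a full 24-byte __va_list_tag, so each
+; va_start becomes a 24-byte memcpy.inline from %varargs into the local
+; va_list instead of a single pointer store.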
+
+define void @single_i32(i32 noundef %x) {
+; CHECK-LABEL: define {{[^@]+}}@single_i32(i32 noundef %x) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %single_i32.vararg, align 16
+; CHECK-NEXT:    %va_list = alloca [1 x { i32, i32, ptr, ptr }], align 8
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %single_i32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store i32 %x, ptr %0, align 4
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr %va_list)
+; CHECK-NEXT:    %gp_offset = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 0
+; CHECK-NEXT:    store i32 48, ptr %gp_offset, align 4
+; CHECK-NEXT:    %fp_offset = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 1
+; CHECK-NEXT:    store i32 176, ptr %fp_offset, align 4
+; CHECK-NEXT:    %overfow_arg_area = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 2
+; CHECK-NEXT:    store ptr %vararg_buffer, ptr %overfow_arg_area, align 8
+; CHECK-NEXT:    %reg_save_area = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 3
+; CHECK-NEXT:    store ptr null, ptr %reg_save_area, align 8
+; CHECK-NEXT:    call void @vararg(ptr %va_list) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr %va_list)
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 4, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(i32 noundef %x) #2
+  ret void
+}
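+
+; The va_list initialization encodes the SysV x86-64 register save area as
+; already exhausted: gp_offset is 48 (6 GPRs * 8 bytes) and fp_offset is 176
+; (48 + 8 XMM registers * 16 bytes), with a null reg_save_area. Every va_arg
+; therefore falls through to overflow_arg_area, which points at
+; %vararg_buffer.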
+
+declare void @vararg(...)
+
+define void @single_double(double noundef %x) {
+; CHECK-LABEL: define {{[^@]+}}@single_double(double noundef %x) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %single_double.vararg, align 16
+; CHECK-NEXT:    %va_list = alloca [1 x { i32, i32, ptr, ptr }], align 8
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 8, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %single_double.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store double %x, ptr %0, align 8
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr %va_list)
+; CHECK-NEXT:    %gp_offset = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 0
+; CHECK-NEXT:    store i32 48, ptr %gp_offset, align 4
+; CHECK-NEXT:    %fp_offset = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 1
+; CHECK-NEXT:    store i32 176, ptr %fp_offset, align 4
+; CHECK-NEXT:    %overfow_arg_area = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 2
+; CHECK-NEXT:    store ptr %vararg_buffer, ptr %overfow_arg_area, align 8
+; CHECK-NEXT:    %reg_save_area = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 3
+; CHECK-NEXT:    store ptr null, ptr %reg_save_area, align 8
+; CHECK-NEXT:    call void @vararg(ptr %va_list) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr %va_list)
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(double noundef %x) #2
+  ret void
+}
+
+define void @single_v4f32(<4 x float> noundef %x) {
+; CHECK-LABEL: define {{[^@]+}}@single_v4f32(<4 x float> noundef %x) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %single_v4f32.vararg, align 16
+; CHECK-NEXT:    %va_list = alloca [1 x { i32, i32, ptr, ptr }], align 8
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %single_v4f32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store <4 x float> %x, ptr %0, align 16
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr %va_list)
+; CHECK-NEXT:    %gp_offset = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 0
+; CHECK-NEXT:    store i32 48, ptr %gp_offset, align 4
+; CHECK-NEXT:    %fp_offset = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 1
+; CHECK-NEXT:    store i32 176, ptr %fp_offset, align 4
+; CHECK-NEXT:    %overfow_arg_area = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 2
+; CHECK-NEXT:    store ptr %vararg_buffer, ptr %overfow_arg_area, align 8
+; CHECK-NEXT:    %reg_save_area = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 3
+; CHECK-NEXT:    store ptr null, ptr %reg_save_area, align 8
+; CHECK-NEXT:    call void @vararg(ptr %va_list) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr %va_list)
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(<4 x float> noundef %x) #2
+  ret void
+}
+
+define void @single_v8f32(ptr nocapture noundef readonly byval(<8 x float>) align 16 %0) {
+; CHECK-LABEL: define {{[^@]+}}@single_v8f32(ptr nocapture noundef readonly byval(<8 x float>) align 16 %0) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %indirect-arg-temp = alloca <8 x float>, align 16
+; CHECK-NEXT:    %vararg_buffer = alloca %single_v8f32.vararg, align 16
+; CHECK-NEXT:    %va_list = alloca [1 x { i32, i32, ptr, ptr }], align 8
+; CHECK-NEXT:    %x = load <8 x float>, ptr %0, align 16, !tbaa !3
+; CHECK-NEXT:    store <8 x float> %x, ptr %indirect-arg-temp, align 16, !tbaa !3
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr %vararg_buffer)
+; CHECK-NEXT:    %1 = getelementptr inbounds %single_v8f32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr %1, ptr %indirect-arg-temp, i64 32, i1 false)
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr %va_list)
+; CHECK-NEXT:    %gp_offset = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 0
+; CHECK-NEXT:    store i32 48, ptr %gp_offset, align 4
+; CHECK-NEXT:    %fp_offset = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 1
+; CHECK-NEXT:    store i32 176, ptr %fp_offset, align 4
+; CHECK-NEXT:    %overfow_arg_area = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 2
+; CHECK-NEXT:    store ptr %vararg_buffer, ptr %overfow_arg_area, align 8
+; CHECK-NEXT:    %reg_save_area = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 3
+; CHECK-NEXT:    store ptr null, ptr %reg_save_area, align 8
+; CHECK-NEXT:    call void @vararg(ptr %va_list) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr %va_list)
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  %indirect-arg-temp = alloca <8 x float>, align 16
+  %x = load <8 x float>, ptr %0, align 16, !tbaa !4
+  store <8 x float> %x, ptr %indirect-arg-temp, align 16, !tbaa !4
+  tail call void (...) @vararg(ptr noundef nonnull byval(<8 x float>) align 16 %indirect-arg-temp) #2
+  ret void
+}
+
+define void @single_v16f32(ptr nocapture noundef readonly byval(<16 x float>) align 16 %0) {
+; CHECK-LABEL: define {{[^@]+}}@single_v16f32(ptr nocapture noundef readonly byval(<16 x float>) align 16 %0) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %indirect-arg-temp = alloca <16 x float>, align 16
+; CHECK-NEXT:    %vararg_buffer = alloca %single_v16f32.vararg, align 16
+; CHECK-NEXT:    %va_list = alloca [1 x { i32, i32, ptr, ptr }], align 8
+; CHECK-NEXT:    %x = load <16 x float>, ptr %0, align 16, !tbaa !3
+; CHECK-NEXT:    store <16 x float> %x, ptr %indirect-arg-temp, align 16, !tbaa !3
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 64, ptr %vararg_buffer)
+; CHECK-NEXT:    %1 = getelementptr inbounds %single_v16f32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr %1, ptr %indirect-arg-temp, i64 64, i1 false)
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr %va_list)
+; CHECK-NEXT:    %gp_offset = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 0
+; CHECK-NEXT:    store i32 48, ptr %gp_offset, align 4
+; CHECK-NEXT:    %fp_offset = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 1
+; CHECK-NEXT:    store i32 176, ptr %fp_offset, align 4
+; CHECK-NEXT:    %overfow_arg_area = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 2
+; CHECK-NEXT:    store ptr %vararg_buffer, ptr %overfow_arg_area, align 8
+; CHECK-NEXT:    %reg_save_area = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 3
+; CHECK-NEXT:    store ptr null, ptr %reg_save_area, align 8
+; CHECK-NEXT:    call void @vararg(ptr %va_list) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr %va_list)
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 64, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  %indirect-arg-temp = alloca <16 x float>, align 16
+  %x = load <16 x float>, ptr %0, align 16, !tbaa !4
+  store <16 x float> %x, ptr %indirect-arg-temp, align 16, !tbaa !4
+  tail call void (...) @vararg(ptr noundef nonnull byval(<16 x float>) align 16 %indirect-arg-temp) #2
+  ret void
+}
+
+define void @single_v32f32(ptr nocapture noundef readonly byval(<32 x float>) align 16 %0) {
+; CHECK-LABEL: define {{[^@]+}}@single_v32f32(ptr nocapture noundef readonly byval(<32 x float>) align 16 %0) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %indirect-arg-temp = alloca <32 x float>, align 16
+; CHECK-NEXT:    %vararg_buffer = alloca %single_v32f32.vararg, align 16
+; CHECK-NEXT:    %va_list = alloca [1 x { i32, i32, ptr, ptr }], align 8
+; CHECK-NEXT:    %x = load <32 x float>, ptr %0, align 16, !tbaa !3
+; CHECK-NEXT:    store <32 x float> %x, ptr %indirect-arg-temp, align 16, !tbaa !3
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 128, ptr %vararg_buffer)
+; CHECK-NEXT:    %1 = getelementptr inbounds %single_v32f32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr %1, ptr %indirect-arg-temp, i64 128, i1 false)
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr %va_list)
+; CHECK-NEXT:    %gp_offset = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 0
+; CHECK-NEXT:    store i32 48, ptr %gp_offset, align 4
+; CHECK-NEXT:    %fp_offset = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 1
+; CHECK-NEXT:    store i32 176, ptr %fp_offset, align 4
+; CHECK-NEXT:    %overfow_arg_area = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 2
+; CHECK-NEXT:    store ptr %vararg_buffer, ptr %overfow_arg_area, align 8
+; CHECK-NEXT:    %reg_save_area = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 3
+; CHECK-NEXT:    store ptr null, ptr %reg_save_area, align 8
+; CHECK-NEXT:    call void @vararg(ptr %va_list) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr %va_list)
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 128, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  %indirect-arg-temp = alloca <32 x float>, align 16
+  %x = load <32 x float>, ptr %0, align 16, !tbaa !4
+  store <32 x float> %x, ptr %indirect-arg-temp, align 16, !tbaa !4
+  tail call void (...) @vararg(ptr noundef nonnull byval(<32 x float>) align 16 %indirect-arg-temp) #2
+  ret void
+}
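+
+; byval vector arguments are forwarded into the frame with a plain
+; llvm.memcpy of the vector's size (128 bytes here) after the caller has
+; materialized the value in %indirect-arg-temp.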
+
+define void @i32_double(i32 noundef %x, double noundef %y) {
+; CHECK-LABEL: define {{[^@]+}}@i32_double(i32 noundef %x, double noundef %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %i32_double.vararg, align 16
+; CHECK-NEXT:    %va_list = alloca [1 x { i32, i32, ptr, ptr }], align 8
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %i32_double.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store i32 %x, ptr %0, align 4
+; CHECK-NEXT:    %1 = getelementptr inbounds %i32_double.vararg, ptr %vararg_buffer, i32 0, i32 2
+; CHECK-NEXT:    store double %y, ptr %1, align 8
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr %va_list)
+; CHECK-NEXT:    %gp_offset = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 0
+; CHECK-NEXT:    store i32 48, ptr %gp_offset, align 4
+; CHECK-NEXT:    %fp_offset = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 1
+; CHECK-NEXT:    store i32 176, ptr %fp_offset, align 4
+; CHECK-NEXT:    %overfow_arg_area = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 2
+; CHECK-NEXT:    store ptr %vararg_buffer, ptr %overfow_arg_area, align 8
+; CHECK-NEXT:    %reg_save_area = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 3
+; CHECK-NEXT:    store ptr null, ptr %reg_save_area, align 8
+; CHECK-NEXT:    call void @vararg(ptr %va_list) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr %va_list)
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(i32 noundef %x, double noundef %y) #2
+  ret void
+}
+
+define void @double_i32(double noundef %x, i32 noundef %y) {
+; CHECK-LABEL: define {{[^@]+}}@double_i32(double noundef %x, i32 noundef %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %double_i32.vararg, align 16
+; CHECK-NEXT:    %va_list = alloca [1 x { i32, i32, ptr, ptr }], align 8
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 12, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %double_i32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store double %x, ptr %0, align 8
+; CHECK-NEXT:    %1 = getelementptr inbounds %double_i32.vararg, ptr %vararg_buffer, i32 0, i32 1
+; CHECK-NEXT:    store i32 %y, ptr %1, align 4
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr %va_list)
+; CHECK-NEXT:    %gp_offset = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 0
+; CHECK-NEXT:    store i32 48, ptr %gp_offset, align 4
+; CHECK-NEXT:    %fp_offset = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 1
+; CHECK-NEXT:    store i32 176, ptr %fp_offset, align 4
+; CHECK-NEXT:    %overfow_arg_area = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 2
+; CHECK-NEXT:    store ptr %vararg_buffer, ptr %overfow_arg_area, align 8
+; CHECK-NEXT:    %reg_save_area = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 3
+; CHECK-NEXT:    store ptr null, ptr %reg_save_area, align 8
+; CHECK-NEXT:    call void @vararg(ptr %va_list) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr %va_list)
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 12, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(double noundef %x, i32 noundef %y) #2
+  ret void
+}
+
+define void @i32_libcS(i32 noundef %x, ptr noundef byval(%struct.libcS) align 8 %y) {
+; CHECK-LABEL: define {{[^@]+}}@i32_libcS(i32 noundef %x, ptr noundef byval(%struct.libcS) align 8 %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %i32_libcS.vararg, align 16
+; CHECK-NEXT:    %va_list = alloca [1 x { i32, i32, ptr, ptr }], align 8
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 40, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %i32_libcS.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store i32 %x, ptr %0, align 4
+; CHECK-NEXT:    %1 = getelementptr inbounds %i32_libcS.vararg, ptr %vararg_buffer, i32 0, i32 2
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr %1, ptr %y, i64 32, i1 false)
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr %va_list)
+; CHECK-NEXT:    %gp_offset = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 0
+; CHECK-NEXT:    store i32 48, ptr %gp_offset, align 4
+; CHECK-NEXT:    %fp_offset = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 1
+; CHECK-NEXT:    store i32 176, ptr %fp_offset, align 4
+; CHECK-NEXT:    %overfow_arg_area = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 2
+; CHECK-NEXT:    store ptr %vararg_buffer, ptr %overfow_arg_area, align 8
+; CHECK-NEXT:    %reg_save_area = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 3
+; CHECK-NEXT:    store ptr null, ptr %reg_save_area, align 8
+; CHECK-NEXT:    call void @vararg(ptr %va_list) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr %va_list)
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 40, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(i32 noundef %x, ptr noundef nonnull byval(%struct.libcS) align 8 %y) #2
+  ret void
+}
+
+define void @libcS_i32(ptr noundef byval(%struct.libcS) align 8 %x, i32 noundef %y) {
+; CHECK-LABEL: define {{[^@]+}}@libcS_i32(ptr noundef byval(%struct.libcS) align 8 %x, i32 noundef %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %libcS_i32.vararg, align 16
+; CHECK-NEXT:    %va_list = alloca [1 x { i32, i32, ptr, ptr }], align 8
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 36, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %libcS_i32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr %0, ptr %x, i64 32, i1 false)
+; CHECK-NEXT:    %1 = getelementptr inbounds %libcS_i32.vararg, ptr %vararg_buffer, i32 0, i32 1
+; CHECK-NEXT:    store i32 %y, ptr %1, align 4
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr %va_list)
+; CHECK-NEXT:    %gp_offset = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 0
+; CHECK-NEXT:    store i32 48, ptr %gp_offset, align 4
+; CHECK-NEXT:    %fp_offset = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 1
+; CHECK-NEXT:    store i32 176, ptr %fp_offset, align 4
+; CHECK-NEXT:    %overfow_arg_area = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 2
+; CHECK-NEXT:    store ptr %vararg_buffer, ptr %overfow_arg_area, align 8
+; CHECK-NEXT:    %reg_save_area = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 3
+; CHECK-NEXT:    store ptr null, ptr %reg_save_area, align 8
+; CHECK-NEXT:    call void @vararg(ptr %va_list) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr %va_list)
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 36, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(ptr noundef nonnull byval(%struct.libcS) align 8 %x, i32 noundef %y) #2
+  ret void
+}
+
+define void @i32_v4f32(i32 noundef %x, <4 x float> noundef %y) {
+; CHECK-LABEL: define {{[^@]+}}@i32_v4f32(i32 noundef %x, <4 x float> noundef %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %i32_v4f32.vararg, align 16
+; CHECK-NEXT:    %va_list = alloca [1 x { i32, i32, ptr, ptr }], align 8
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %i32_v4f32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store i32 %x, ptr %0, align 4
+; CHECK-NEXT:    %1 = getelementptr inbounds %i32_v4f32.vararg, ptr %vararg_buffer, i32 0, i32 2
+; CHECK-NEXT:    store <4 x float> %y, ptr %1, align 16
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr %va_list)
+; CHECK-NEXT:    %gp_offset = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 0
+; CHECK-NEXT:    store i32 48, ptr %gp_offset, align 4
+; CHECK-NEXT:    %fp_offset = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 1
+; CHECK-NEXT:    store i32 176, ptr %fp_offset, align 4
+; CHECK-NEXT:    %overfow_arg_area = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 2
+; CHECK-NEXT:    store ptr %vararg_buffer, ptr %overfow_arg_area, align 8
+; CHECK-NEXT:    %reg_save_area = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 3
+; CHECK-NEXT:    store ptr null, ptr %reg_save_area, align 8
+; CHECK-NEXT:    call void @vararg(ptr %va_list) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr %va_list)
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(i32 noundef %x, <4 x float> noundef %y) #2
+  ret void
+}
+
+define void @v4f32_i32(<4 x float> noundef %x, i32 noundef %y) {
+; CHECK-LABEL: define {{[^@]+}}@v4f32_i32(<4 x float> noundef %x, i32 noundef %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %v4f32_i32.vararg, align 16
+; CHECK-NEXT:    %va_list = alloca [1 x { i32, i32, ptr, ptr }], align 8
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 20, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %v4f32_i32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store <4 x float> %x, ptr %0, align 16
+; CHECK-NEXT:    %1 = getelementptr inbounds %v4f32_i32.vararg, ptr %vararg_buffer, i32 0, i32 1
+; CHECK-NEXT:    store i32 %y, ptr %1, align 4
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr %va_list)
+; CHECK-NEXT:    %gp_offset = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 0
+; CHECK-NEXT:    store i32 48, ptr %gp_offset, align 4
+; CHECK-NEXT:    %fp_offset = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 1
+; CHECK-NEXT:    store i32 176, ptr %fp_offset, align 4
+; CHECK-NEXT:    %overfow_arg_area = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 2
+; CHECK-NEXT:    store ptr %vararg_buffer, ptr %overfow_arg_area, align 8
+; CHECK-NEXT:    %reg_save_area = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 3
+; CHECK-NEXT:    store ptr null, ptr %reg_save_area, align 8
+; CHECK-NEXT:    call void @vararg(ptr %va_list) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr %va_list)
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 20, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(<4 x float> noundef %x, i32 noundef %y) #2
+  ret void
+}
+
+define void @i32_v8f32(i32 noundef %x, ptr nocapture noundef readonly byval(<8 x float>) align 16 %0) {
+; CHECK-LABEL: define {{[^@]+}}@i32_v8f32(i32 noundef %x, ptr nocapture noundef readonly byval(<8 x float>) align 16 %0) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %indirect-arg-temp = alloca <8 x float>, align 16
+; CHECK-NEXT:    %vararg_buffer = alloca %i32_v8f32.vararg, align 16
+; CHECK-NEXT:    %va_list = alloca [1 x { i32, i32, ptr, ptr }], align 8
+; CHECK-NEXT:    %y = load <8 x float>, ptr %0, align 16, !tbaa !3
+; CHECK-NEXT:    store <8 x float> %y, ptr %indirect-arg-temp, align 16, !tbaa !3
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 40, ptr %vararg_buffer)
+; CHECK-NEXT:    %1 = getelementptr inbounds %i32_v8f32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store i32 %x, ptr %1, align 4
+; CHECK-NEXT:    %2 = getelementptr inbounds %i32_v8f32.vararg, ptr %vararg_buffer, i32 0, i32 2
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr %2, ptr %indirect-arg-temp, i64 32, i1 false)
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr %va_list)
+; CHECK-NEXT:    %gp_offset = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 0
+; CHECK-NEXT:    store i32 48, ptr %gp_offset, align 4
+; CHECK-NEXT:    %fp_offset = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 1
+; CHECK-NEXT:    store i32 176, ptr %fp_offset, align 4
+; CHECK-NEXT:    %overfow_arg_area = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 2
+; CHECK-NEXT:    store ptr %vararg_buffer, ptr %overfow_arg_area, align 8
+; CHECK-NEXT:    %reg_save_area = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 3
+; CHECK-NEXT:    store ptr null, ptr %reg_save_area, align 8
+; CHECK-NEXT:    call void @vararg(ptr %va_list) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr %va_list)
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 40, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  %indirect-arg-temp = alloca <8 x float>, align 16
+  %y = load <8 x float>, ptr %0, align 16, !tbaa !4
+  store <8 x float> %y, ptr %indirect-arg-temp, align 16, !tbaa !4
+  tail call void (...) @vararg(i32 noundef %x, ptr noundef nonnull byval(<8 x float>) align 16 %indirect-arg-temp) #2
+  ret void
+}
+
+define void @v8f32_i32(ptr nocapture noundef readonly byval(<8 x float>) align 16 %0, i32 noundef %y) {
+; CHECK-LABEL: define {{[^@]+}}@v8f32_i32(ptr nocapture noundef readonly byval(<8 x float>) align 16 %0, i32 noundef %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %indirect-arg-temp = alloca <8 x float>, align 16
+; CHECK-NEXT:    %vararg_buffer = alloca %v8f32_i32.vararg, align 16
+; CHECK-NEXT:    %va_list = alloca [1 x { i32, i32, ptr, ptr }], align 8
+; CHECK-NEXT:    %x = load <8 x float>, ptr %0, align 16, !tbaa !3
+; CHECK-NEXT:    store <8 x float> %x, ptr %indirect-arg-temp, align 16, !tbaa !3
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 36, ptr %vararg_buffer)
+; CHECK-NEXT:    %1 = getelementptr inbounds %v8f32_i32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr %1, ptr %indirect-arg-temp, i64 32, i1 false)
+; CHECK-NEXT:    %2 = getelementptr inbounds %v8f32_i32.vararg, ptr %vararg_buffer, i32 0, i32 1
+; CHECK-NEXT:    store i32 %y, ptr %2, align 4
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr %va_list)
+; CHECK-NEXT:    %gp_offset = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 0
+; CHECK-NEXT:    store i32 48, ptr %gp_offset, align 4
+; CHECK-NEXT:    %fp_offset = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 1
+; CHECK-NEXT:    store i32 176, ptr %fp_offset, align 4
+; CHECK-NEXT:    %overfow_arg_area = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 2
+; CHECK-NEXT:    store ptr %vararg_buffer, ptr %overfow_arg_area, align 8
+; CHECK-NEXT:    %reg_save_area = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 3
+; CHECK-NEXT:    store ptr null, ptr %reg_save_area, align 8
+; CHECK-NEXT:    call void @vararg(ptr %va_list) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr %va_list)
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 36, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  %indirect-arg-temp = alloca <8 x float>, align 16
+  %x = load <8 x float>, ptr %0, align 16, !tbaa !4
+  store <8 x float> %x, ptr %indirect-arg-temp, align 16, !tbaa !4
+  tail call void (...) @vararg(ptr noundef nonnull byval(<8 x float>) align 16 %indirect-arg-temp, i32 noundef %y) #2
+  ret void
+}
+
+define void @i32_v16f32(i32 noundef %x, ptr nocapture noundef readonly byval(<16 x float>) align 16 %0) {
+; CHECK-LABEL: define {{[^@]+}}@i32_v16f32(i32 noundef %x, ptr nocapture noundef readonly byval(<16 x float>) align 16 %0) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %indirect-arg-temp = alloca <16 x float>, align 16
+; CHECK-NEXT:    %vararg_buffer = alloca %i32_v16f32.vararg, align 16
+; CHECK-NEXT:    %va_list = alloca [1 x { i32, i32, ptr, ptr }], align 8
+; CHECK-NEXT:    %y = load <16 x float>, ptr %0, align 16, !tbaa !3
+; CHECK-NEXT:    store <16 x float> %y, ptr %indirect-arg-temp, align 16, !tbaa !3
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 72, ptr %vararg_buffer)
+; CHECK-NEXT:    %1 = getelementptr inbounds %i32_v16f32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store i32 %x, ptr %1, align 4
+; CHECK-NEXT:    %2 = getelementptr inbounds %i32_v16f32.vararg, ptr %vararg_buffer, i32 0, i32 2
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr %2, ptr %indirect-arg-temp, i64 64, i1 false)
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr %va_list)
+; CHECK-NEXT:    %gp_offset = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 0
+; CHECK-NEXT:    store i32 48, ptr %gp_offset, align 4
+; CHECK-NEXT:    %fp_offset = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 1
+; CHECK-NEXT:    store i32 176, ptr %fp_offset, align 4
+; CHECK-NEXT:    %overflow_arg_area = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 2
+; CHECK-NEXT:    store ptr %vararg_buffer, ptr %overflow_arg_area, align 8
+; CHECK-NEXT:    %reg_save_area = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 3
+; CHECK-NEXT:    store ptr null, ptr %reg_save_area, align 8
+; CHECK-NEXT:    call void @vararg(ptr %va_list) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr %va_list)
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 72, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  %indirect-arg-temp = alloca <16 x float>, align 16
+  %y = load <16 x float>, ptr %0, align 16, !tbaa !4
+  store <16 x float> %y, ptr %indirect-arg-temp, align 16, !tbaa !4
+  tail call void (...) @vararg(i32 noundef %x, ptr noundef nonnull byval(<16 x float>) align 16 %indirect-arg-temp) #2
+  ret void
+}
+
+define void @v16f32_i32(ptr nocapture noundef readonly byval(<16 x float>) align 16 %0, i32 noundef %y) {
+; CHECK-LABEL: define {{[^@]+}}@v16f32_i32(ptr nocapture noundef readonly byval(<16 x float>) align 16 %0, i32 noundef %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %indirect-arg-temp = alloca <16 x float>, align 16
+; CHECK-NEXT:    %vararg_buffer = alloca %v16f32_i32.vararg, align 16
+; CHECK-NEXT:    %va_list = alloca [1 x { i32, i32, ptr, ptr }], align 8
+; CHECK-NEXT:    %x = load <16 x float>, ptr %0, align 16, !tbaa !3
+; CHECK-NEXT:    store <16 x float> %x, ptr %indirect-arg-temp, align 16, !tbaa !3
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 68, ptr %vararg_buffer)
+; CHECK-NEXT:    %1 = getelementptr inbounds %v16f32_i32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr %1, ptr %indirect-arg-temp, i64 64, i1 false)
+; CHECK-NEXT:    %2 = getelementptr inbounds %v16f32_i32.vararg, ptr %vararg_buffer, i32 0, i32 1
+; CHECK-NEXT:    store i32 %y, ptr %2, align 4
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr %va_list)
+; CHECK-NEXT:    %gp_offset = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 0
+; CHECK-NEXT:    store i32 48, ptr %gp_offset, align 4
+; CHECK-NEXT:    %fp_offset = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 1
+; CHECK-NEXT:    store i32 176, ptr %fp_offset, align 4
+; CHECK-NEXT:    %overflow_arg_area = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 2
+; CHECK-NEXT:    store ptr %vararg_buffer, ptr %overflow_arg_area, align 8
+; CHECK-NEXT:    %reg_save_area = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 3
+; CHECK-NEXT:    store ptr null, ptr %reg_save_area, align 8
+; CHECK-NEXT:    call void @vararg(ptr %va_list) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr %va_list)
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 68, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  %indirect-arg-temp = alloca <16 x float>, align 16
+  %x = load <16 x float>, ptr %0, align 16, !tbaa !4
+  store <16 x float> %x, ptr %indirect-arg-temp, align 16, !tbaa !4
+  tail call void (...) @vararg(ptr noundef nonnull byval(<16 x float>) align 16 %indirect-arg-temp, i32 noundef %y) #2
+  ret void
+}
+
+define void @i32_v32f32(i32 noundef %x, ptr nocapture noundef readonly byval(<32 x float>) align 16 %0) {
+; CHECK-LABEL: define {{[^@]+}}@i32_v32f32(i32 noundef %x, ptr nocapture noundef readonly byval(<32 x float>) align 16 %0) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %indirect-arg-temp = alloca <32 x float>, align 16
+; CHECK-NEXT:    %vararg_buffer = alloca %i32_v32f32.vararg, align 16
+; CHECK-NEXT:    %va_list = alloca [1 x { i32, i32, ptr, ptr }], align 8
+; CHECK-NEXT:    %y = load <32 x float>, ptr %0, align 16, !tbaa !3
+; CHECK-NEXT:    store <32 x float> %y, ptr %indirect-arg-temp, align 16, !tbaa !3
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 136, ptr %vararg_buffer)
+; CHECK-NEXT:    %1 = getelementptr inbounds %i32_v32f32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store i32 %x, ptr %1, align 4
+; CHECK-NEXT:    %2 = getelementptr inbounds %i32_v32f32.vararg, ptr %vararg_buffer, i32 0, i32 2
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr %2, ptr %indirect-arg-temp, i64 128, i1 false)
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr %va_list)
+; CHECK-NEXT:    %gp_offset = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 0
+; CHECK-NEXT:    store i32 48, ptr %gp_offset, align 4
+; CHECK-NEXT:    %fp_offset = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 1
+; CHECK-NEXT:    store i32 176, ptr %fp_offset, align 4
+; CHECK-NEXT:    %overflow_arg_area = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 2
+; CHECK-NEXT:    store ptr %vararg_buffer, ptr %overflow_arg_area, align 8
+; CHECK-NEXT:    %reg_save_area = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 3
+; CHECK-NEXT:    store ptr null, ptr %reg_save_area, align 8
+; CHECK-NEXT:    call void @vararg(ptr %va_list) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr %va_list)
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 136, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  %indirect-arg-temp = alloca <32 x float>, align 16
+  %y = load <32 x float>, ptr %0, align 16, !tbaa !4
+  store <32 x float> %y, ptr %indirect-arg-temp, align 16, !tbaa !4
+  tail call void (...) @vararg(i32 noundef %x, ptr noundef nonnull byval(<32 x float>) align 16 %indirect-arg-temp) #2
+  ret void
+}
+
+define void @v32f32_i32(ptr nocapture noundef readonly byval(<32 x float>) align 16 %0, i32 noundef %y) {
+; CHECK-LABEL: define {{[^@]+}}@v32f32_i32(ptr nocapture noundef readonly byval(<32 x float>) align 16 %0, i32 noundef %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %indirect-arg-temp = alloca <32 x float>, align 16
+; CHECK-NEXT:    %vararg_buffer = alloca %v32f32_i32.vararg, align 16
+; CHECK-NEXT:    %va_list = alloca [1 x { i32, i32, ptr, ptr }], align 8
+; CHECK-NEXT:    %x = load <32 x float>, ptr %0, align 16, !tbaa !3
+; CHECK-NEXT:    store <32 x float> %x, ptr %indirect-arg-temp, align 16, !tbaa !3
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 132, ptr %vararg_buffer)
+; CHECK-NEXT:    %1 = getelementptr inbounds %v32f32_i32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr %1, ptr %indirect-arg-temp, i64 128, i1 false)
+; CHECK-NEXT:    %2 = getelementptr inbounds %v32f32_i32.vararg, ptr %vararg_buffer, i32 0, i32 1
+; CHECK-NEXT:    store i32 %y, ptr %2, align 4
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr %va_list)
+; CHECK-NEXT:    %gp_offset = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 0
+; CHECK-NEXT:    store i32 48, ptr %gp_offset, align 4
+; CHECK-NEXT:    %fp_offset = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 1
+; CHECK-NEXT:    store i32 176, ptr %fp_offset, align 4
+; CHECK-NEXT:    %overflow_arg_area = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 2
+; CHECK-NEXT:    store ptr %vararg_buffer, ptr %overflow_arg_area, align 8
+; CHECK-NEXT:    %reg_save_area = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 3
+; CHECK-NEXT:    store ptr null, ptr %reg_save_area, align 8
+; CHECK-NEXT:    call void @vararg(ptr %va_list) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr %va_list)
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 132, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  %indirect-arg-temp = alloca <32 x float>, align 16
+  %x = load <32 x float>, ptr %0, align 16, !tbaa !4
+  store <32 x float> %x, ptr %indirect-arg-temp, align 16, !tbaa !4
+  tail call void (...) @vararg(ptr noundef nonnull byval(<32 x float>) align 16 %indirect-arg-temp, i32 noundef %y) #2
+  ret void
+}
+
+attributes #0 = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) }
+attributes #1 = { nocallback nofree nosync nounwind willreturn }
+attributes #2 = { mustprogress }
+
+!llvm.module.flags = !{!0, !1, !2}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 8, !"PIC Level", i32 2}
+!2 = !{i32 7, !"frame-pointer", i32 2}
+!4 = !{!5, !5, i64 0}
+!5 = !{!"omnipotent char", !6, i64 0}
+!6 = !{!"Simple C/C++ TBAA"}
diff --git a/llvm/test/CodeGen/X86/expand-variadic-call-x64-linux.ll b/llvm/test/CodeGen/X86/expand-variadic-call-x64-linux.ll
new file mode 100644
index 00000000000000..c87f5844ecf80c
--- /dev/null
+++ b/llvm/test/CodeGen/X86/expand-variadic-call-x64-linux.ll
@@ -0,0 +1,688 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: -p --function-signature
+; RUN: opt -S --passes=expand-variadics --expand-variadics-override=lowering < %s | FileCheck %s
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Check that the variadic arguments are lowered to the locations this target
+; expects.
+
+; The struct types below describe the per-call-site frame layouts (see the
+; padding example after the list).
+; CHECK: %single_i32.vararg = type <{ i32 }>
+; CHECK: %single_double.vararg = type <{ double }>
+; CHECK: %single_v4f32.vararg = type <{ <4 x float> }>
+; CHECK: %single_v8f32.vararg = type <{ <8 x float> }>
+; CHECK: %single_v16f32.vararg = type <{ <16 x float> }>
+; CHECK: %single_v32f32.vararg = type <{ <32 x float> }>
+; CHECK: %i32_double.vararg = type <{ i32, [4 x i8], double }>
+; CHECK: %double_i32.vararg = type <{ double, i32 }>
+; CHECK: %i32_v4f32.vararg = type <{ i32, [4 x i8], <4 x float> }>
+; CHECK: %v4f32_i32.vararg = type <{ <4 x float>, i32 }>
+; CHECK: %i32_v8f32.vararg = type <{ i32, [4 x i8], <8 x float> }>
+; CHECK: %v8f32_i32.vararg = type <{ <8 x float>, i32 }>
+; CHECK: %i32_v16f32.vararg = type <{ i32, [4 x i8], <16 x float> }>
+; CHECK: %v16f32_i32.vararg = type <{ <16 x float>, i32 }>
+; CHECK: %i32_v32f32.vararg = type <{ i32, [4 x i8], <32 x float> }>
+; CHECK: %v32f32_i32.vararg = type <{ <32 x float>, i32 }>
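+
+; For example, %i32_double.vararg places a [4 x i8] pad after the i32 so the
+; double lands at its natural 8-byte offset within the packed frame, while
+; %double_i32.vararg needs no padding since the trailing i32 is already
+; aligned at offset 8.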
+
+%struct.__va_list_tag = type { i32, i32, ptr, ptr }
+%struct.libcS = type { i8, i16, i32, i64, float, double }
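+
+; For reference, a C sketch of the SysV x86-64 va_list this test targets (the
+; field names mirror the IR; the constants are the register-area maxima):
+;   struct __va_list_tag {
+;     unsigned gp_offset;      // stored as 48  == 6 GP registers * 8 bytes
+;     unsigned fp_offset;      // stored as 176 == 48 + 8 SSE registers * 16
+;     void *overflow_arg_area; // points at the packed *.vararg frame
+;     void *reg_save_area;     // stored as null; no registers are saved
+;   };
+; With both offsets at their maxima, va_arg never consults reg_save_area and
+; reads every argument from overflow_arg_area.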
+
+define void @copy(ptr noundef %va) {
+; CHECK-LABEL: define {{[^@]+}}@copy(ptr noundef %va) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %cp = alloca [1 x %struct.__va_list_tag], align 16
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr nonnull %cp) #2
+; CHECK-NEXT:    call void @llvm.memcpy.inline.p0.p0.i32(ptr %cp, ptr %va, i32 24, i1 false)
+; CHECK-NEXT:    call void @valist(ptr noundef nonnull %cp) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr nonnull %cp) #2
+; CHECK-NEXT:    ret void
+;
+entry:
+  %cp = alloca [1 x %struct.__va_list_tag], align 16
+  call void @llvm.lifetime.start.p0(i64 24, ptr nonnull %cp) #2
+  call void @llvm.va_copy.p0(ptr nonnull %cp, ptr %va)
+  call void @valist(ptr noundef nonnull %cp) #2
+  call void @llvm.lifetime.end.p0(i64 24, ptr nonnull %cp) #2
+  ret void
+}
+
+declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #0
+
+declare void @llvm.va_copy.p0(ptr, ptr) #1
+
+declare void @valist(ptr noundef)
+
+declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #0
+
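+; Inside a variadic definition, the pass rewrites "..." to a trailing va_list
+; pointer parameter: va_start becomes a copy of the incoming list (the
+; memcpy.inline in the checks below) and va_end is dropped. A rough C-level
+; sketch of the rewritten function (not the pass's literal output):
+;   void start_once(va_list *varargs) {
+;     va_list s;
+;     va_copy(s, *varargs); // was va_start(s)
+;     valist(&s);
+;     // va_end(s) removed
+;   }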
+define void @start_once(...) {
+; CHECK-LABEL: define {{[^@]+}}@start_once(ptr noalias %varargs) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %s = alloca [1 x %struct.__va_list_tag], align 16
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr nonnull %s) #2
+; CHECK-NEXT:    call void @llvm.memcpy.inline.p0.p0.i32(ptr %s, ptr %varargs, i32 24, i1 false)
+; CHECK-NEXT:    call void @valist(ptr noundef nonnull %s) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr nonnull %s) #2
+; CHECK-NEXT:    ret void
+;
+entry:
+  %s = alloca [1 x %struct.__va_list_tag], align 16
+  call void @llvm.lifetime.start.p0(i64 24, ptr nonnull %s) #2
+  call void @llvm.va_start.p0(ptr nonnull %s)
+  call void @valist(ptr noundef nonnull %s) #2
+  call void @llvm.va_end.p0(ptr %s)
+  call void @llvm.lifetime.end.p0(i64 24, ptr nonnull %s) #2
+  ret void
+}
+
+declare void @llvm.va_start.p0(ptr) #1
+
+declare void @llvm.va_end.p0(ptr) #1
+
+define void @start_twice(...) {
+; CHECK-LABEL: define {{[^@]+}}@start_twice(ptr noalias %varargs) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %s0 = alloca [1 x %struct.__va_list_tag], align 16
+; CHECK-NEXT:    %s1 = alloca [1 x %struct.__va_list_tag], align 16
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr nonnull %s0) #2
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr nonnull %s1) #2
+; CHECK-NEXT:    call void @llvm.memcpy.inline.p0.p0.i32(ptr %s0, ptr %varargs, i32 24, i1 false)
+; CHECK-NEXT:    call void @valist(ptr noundef nonnull %s0) #2
+; CHECK-NEXT:    call void @llvm.memcpy.inline.p0.p0.i32(ptr %s1, ptr %varargs, i32 24, i1 false)
+; CHECK-NEXT:    call void @valist(ptr noundef nonnull %s1) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr nonnull %s1) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr nonnull %s0) #2
+; CHECK-NEXT:    ret void
+;
+entry:
+  %s0 = alloca [1 x %struct.__va_list_tag], align 16
+  %s1 = alloca [1 x %struct.__va_list_tag], align 16
+  call void @llvm.lifetime.start.p0(i64 24, ptr nonnull %s0) #2
+  call void @llvm.lifetime.start.p0(i64 24, ptr nonnull %s1) #2
+  call void @llvm.va_start.p0(ptr nonnull %s0)
+  call void @valist(ptr noundef nonnull %s0) #2
+  call void @llvm.va_end.p0(ptr %s0)
+  call void @llvm.va_start.p0(ptr nonnull %s1)
+  call void @valist(ptr noundef nonnull %s1) #2
+  call void @llvm.va_end.p0(ptr %s1)
+  call void @llvm.lifetime.end.p0(i64 24, ptr nonnull %s1) #2
+  call void @llvm.lifetime.end.p0(i64 24, ptr nonnull %s0) #2
+  ret void
+}
+
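+; Calls to a variadic callee are rewritten in place: the fixed arguments are
+; stored into a packed per-call-site frame, a va_list is initialized to point
+; at that frame, and the callee is invoked through its va_list form. A rough
+; C-level sketch, with illustrative names (not the pass's literal output):
+;   vararg(x);  =>  struct frame { int x; } f = { x };
+;                   va_list ap = /* offsets maxed out, overflow area = &f */;
+;                   vararg(&ap); // callee now takes a single va_list pointer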
+define void @single_i32(i32 noundef %x) {
+; CHECK-LABEL: define {{[^@]+}}@single_i32(i32 noundef %x) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %single_i32.vararg, align 16
+; CHECK-NEXT:    %va_list = alloca [1 x { i32, i32, ptr, ptr }], align 8
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 4, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %single_i32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store i32 %x, ptr %0, align 4
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr %va_list)
+; CHECK-NEXT:    %gp_offset = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 0
+; CHECK-NEXT:    store i32 48, ptr %gp_offset, align 4
+; CHECK-NEXT:    %fp_offset = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 1
+; CHECK-NEXT:    store i32 176, ptr %fp_offset, align 4
+; CHECK-NEXT:    %overflow_arg_area = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 2
+; CHECK-NEXT:    store ptr %vararg_buffer, ptr %overflow_arg_area, align 8
+; CHECK-NEXT:    %reg_save_area = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 3
+; CHECK-NEXT:    store ptr null, ptr %reg_save_area, align 8
+; CHECK-NEXT:    call void @vararg(ptr %va_list) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr %va_list)
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 4, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(i32 noundef %x) #2
+  ret void
+}
+
+declare void @vararg(...)
+
+define void @single_double(double noundef %x) {
+; CHECK-LABEL: define {{[^@]+}}@single_double(double noundef %x) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %single_double.vararg, align 16
+; CHECK-NEXT:    %va_list = alloca [1 x { i32, i32, ptr, ptr }], align 8
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 8, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %single_double.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store double %x, ptr %0, align 8
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr %va_list)
+; CHECK-NEXT:    %gp_offset = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 0
+; CHECK-NEXT:    store i32 48, ptr %gp_offset, align 4
+; CHECK-NEXT:    %fp_offset = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 1
+; CHECK-NEXT:    store i32 176, ptr %fp_offset, align 4
+; CHECK-NEXT:    %overflow_arg_area = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 2
+; CHECK-NEXT:    store ptr %vararg_buffer, ptr %overflow_arg_area, align 8
+; CHECK-NEXT:    %reg_save_area = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 3
+; CHECK-NEXT:    store ptr null, ptr %reg_save_area, align 8
+; CHECK-NEXT:    call void @vararg(ptr %va_list) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr %va_list)
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 8, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(double noundef %x) #2
+  ret void
+}
+
+define void @single_v4f32(<4 x float> noundef %x) {
+; CHECK-LABEL: define {{[^@]+}}@single_v4f32(<4 x float> noundef %x) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %single_v4f32.vararg, align 16
+; CHECK-NEXT:    %va_list = alloca [1 x { i32, i32, ptr, ptr }], align 8
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %single_v4f32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store <4 x float> %x, ptr %0, align 16
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr %va_list)
+; CHECK-NEXT:    %gp_offset = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 0
+; CHECK-NEXT:    store i32 48, ptr %gp_offset, align 4
+; CHECK-NEXT:    %fp_offset = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 1
+; CHECK-NEXT:    store i32 176, ptr %fp_offset, align 4
+; CHECK-NEXT:    %overflow_arg_area = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 2
+; CHECK-NEXT:    store ptr %vararg_buffer, ptr %overflow_arg_area, align 8
+; CHECK-NEXT:    %reg_save_area = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 3
+; CHECK-NEXT:    store ptr null, ptr %reg_save_area, align 8
+; CHECK-NEXT:    call void @vararg(ptr %va_list) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr %va_list)
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(<4 x float> noundef %x) #2
+  ret void
+}
+
+define void @single_v8f32(ptr nocapture noundef readonly byval(<8 x float>) align 32 %0) {
+; CHECK-LABEL: define {{[^@]+}}@single_v8f32(ptr nocapture noundef readonly byval(<8 x float>) align 32 %0) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %indirect-arg-temp = alloca <8 x float>, align 32
+; CHECK-NEXT:    %vararg_buffer = alloca %single_v8f32.vararg, align 16
+; CHECK-NEXT:    %va_list = alloca [1 x { i32, i32, ptr, ptr }], align 8
+; CHECK-NEXT:    %x = load <8 x float>, ptr %0, align 32, !tbaa !3
+; CHECK-NEXT:    store <8 x float> %x, ptr %indirect-arg-temp, align 32, !tbaa !3
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 32, ptr %vararg_buffer)
+; CHECK-NEXT:    %1 = getelementptr inbounds %single_v8f32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr %1, ptr %indirect-arg-temp, i64 32, i1 false)
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr %va_list)
+; CHECK-NEXT:    %gp_offset = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 0
+; CHECK-NEXT:    store i32 48, ptr %gp_offset, align 4
+; CHECK-NEXT:    %fp_offset = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 1
+; CHECK-NEXT:    store i32 176, ptr %fp_offset, align 4
+; CHECK-NEXT:    %overflow_arg_area = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 2
+; CHECK-NEXT:    store ptr %vararg_buffer, ptr %overflow_arg_area, align 8
+; CHECK-NEXT:    %reg_save_area = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 3
+; CHECK-NEXT:    store ptr null, ptr %reg_save_area, align 8
+; CHECK-NEXT:    call void @vararg(ptr %va_list) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr %va_list)
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 32, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  %indirect-arg-temp = alloca <8 x float>, align 32
+  %x = load <8 x float>, ptr %0, align 32, !tbaa !4
+  store <8 x float> %x, ptr %indirect-arg-temp, align 32, !tbaa !4
+  tail call void (...) @vararg(ptr noundef nonnull byval(<8 x float>) align 32 %indirect-arg-temp) #2
+  ret void
+}
+
+define void @single_v16f32(ptr nocapture noundef readonly byval(<16 x float>) align 64 %0) {
+; CHECK-LABEL: define {{[^@]+}}@single_v16f32(ptr nocapture noundef readonly byval(<16 x float>) align 64 %0) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %indirect-arg-temp = alloca <16 x float>, align 64
+; CHECK-NEXT:    %vararg_buffer = alloca %single_v16f32.vararg, align 16
+; CHECK-NEXT:    %va_list = alloca [1 x { i32, i32, ptr, ptr }], align 8
+; CHECK-NEXT:    %x = load <16 x float>, ptr %0, align 64, !tbaa !3
+; CHECK-NEXT:    store <16 x float> %x, ptr %indirect-arg-temp, align 64, !tbaa !3
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 64, ptr %vararg_buffer)
+; CHECK-NEXT:    %1 = getelementptr inbounds %single_v16f32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr %1, ptr %indirect-arg-temp, i64 64, i1 false)
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr %va_list)
+; CHECK-NEXT:    %gp_offset = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 0
+; CHECK-NEXT:    store i32 48, ptr %gp_offset, align 4
+; CHECK-NEXT:    %fp_offset = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 1
+; CHECK-NEXT:    store i32 176, ptr %fp_offset, align 4
+; CHECK-NEXT:    %overflow_arg_area = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 2
+; CHECK-NEXT:    store ptr %vararg_buffer, ptr %overflow_arg_area, align 8
+; CHECK-NEXT:    %reg_save_area = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 3
+; CHECK-NEXT:    store ptr null, ptr %reg_save_area, align 8
+; CHECK-NEXT:    call void @vararg(ptr %va_list) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr %va_list)
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 64, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  %indirect-arg-temp = alloca <16 x float>, align 64
+  %x = load <16 x float>, ptr %0, align 64, !tbaa !4
+  store <16 x float> %x, ptr %indirect-arg-temp, align 64, !tbaa !4
+  tail call void (...) @vararg(ptr noundef nonnull byval(<16 x float>) align 64 %indirect-arg-temp) #2
+  ret void
+}
+
+define void @single_v32f32(ptr nocapture noundef readonly byval(<32 x float>) align 128 %0) {
+; CHECK-LABEL: define {{[^@]+}}@single_v32f32(ptr nocapture noundef readonly byval(<32 x float>) align 128 %0) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %indirect-arg-temp = alloca <32 x float>, align 128
+; CHECK-NEXT:    %vararg_buffer = alloca %single_v32f32.vararg, align 16
+; CHECK-NEXT:    %va_list = alloca [1 x { i32, i32, ptr, ptr }], align 8
+; CHECK-NEXT:    %x = load <32 x float>, ptr %0, align 128, !tbaa !3
+; CHECK-NEXT:    store <32 x float> %x, ptr %indirect-arg-temp, align 128, !tbaa !3
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 128, ptr %vararg_buffer)
+; CHECK-NEXT:    %1 = getelementptr inbounds %single_v32f32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr %1, ptr %indirect-arg-temp, i64 128, i1 false)
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr %va_list)
+; CHECK-NEXT:    %gp_offset = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 0
+; CHECK-NEXT:    store i32 48, ptr %gp_offset, align 4
+; CHECK-NEXT:    %fp_offset = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 1
+; CHECK-NEXT:    store i32 176, ptr %fp_offset, align 4
+; CHECK-NEXT:    %overflow_arg_area = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 2
+; CHECK-NEXT:    store ptr %vararg_buffer, ptr %overflow_arg_area, align 8
+; CHECK-NEXT:    %reg_save_area = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 3
+; CHECK-NEXT:    store ptr null, ptr %reg_save_area, align 8
+; CHECK-NEXT:    call void @vararg(ptr %va_list) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr %va_list)
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 128, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  %indirect-arg-temp = alloca <32 x float>, align 128
+  %x = load <32 x float>, ptr %0, align 128, !tbaa !4
+  store <32 x float> %x, ptr %indirect-arg-temp, align 128, !tbaa !4
+  tail call void (...) @vararg(ptr noundef nonnull byval(<32 x float>) align 128 %indirect-arg-temp) #2
+  ret void
+}
+
+define void @i32_double(i32 noundef %x, double noundef %y) {
+; CHECK-LABEL: define {{[^@]+}}@i32_double(i32 noundef %x, double noundef %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %i32_double.vararg, align 16
+; CHECK-NEXT:    %va_list = alloca [1 x { i32, i32, ptr, ptr }], align 8
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 16, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %i32_double.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store i32 %x, ptr %0, align 4
+; CHECK-NEXT:    %1 = getelementptr inbounds %i32_double.vararg, ptr %vararg_buffer, i32 0, i32 2
+; CHECK-NEXT:    store double %y, ptr %1, align 8
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr %va_list)
+; CHECK-NEXT:    %gp_offset = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 0
+; CHECK-NEXT:    store i32 48, ptr %gp_offset, align 4
+; CHECK-NEXT:    %fp_offset = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 1
+; CHECK-NEXT:    store i32 176, ptr %fp_offset, align 4
+; CHECK-NEXT:    %overflow_arg_area = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 2
+; CHECK-NEXT:    store ptr %vararg_buffer, ptr %overflow_arg_area, align 8
+; CHECK-NEXT:    %reg_save_area = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 3
+; CHECK-NEXT:    store ptr null, ptr %reg_save_area, align 8
+; CHECK-NEXT:    call void @vararg(ptr %va_list) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr %va_list)
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 16, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(i32 noundef %x, double noundef %y) #2
+  ret void
+}
+
+define void @double_i32(double noundef %x, i32 noundef %y) {
+; CHECK-LABEL: define {{[^@]+}}@double_i32(double noundef %x, i32 noundef %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %double_i32.vararg, align 16
+; CHECK-NEXT:    %va_list = alloca [1 x { i32, i32, ptr, ptr }], align 8
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 12, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %double_i32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store double %x, ptr %0, align 8
+; CHECK-NEXT:    %1 = getelementptr inbounds %double_i32.vararg, ptr %vararg_buffer, i32 0, i32 1
+; CHECK-NEXT:    store i32 %y, ptr %1, align 4
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr %va_list)
+; CHECK-NEXT:    %gp_offset = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 0
+; CHECK-NEXT:    store i32 48, ptr %gp_offset, align 4
+; CHECK-NEXT:    %fp_offset = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 1
+; CHECK-NEXT:    store i32 176, ptr %fp_offset, align 4
+; CHECK-NEXT:    %overflow_arg_area = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 2
+; CHECK-NEXT:    store ptr %vararg_buffer, ptr %overflow_arg_area, align 8
+; CHECK-NEXT:    %reg_save_area = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 3
+; CHECK-NEXT:    store ptr null, ptr %reg_save_area, align 8
+; CHECK-NEXT:    call void @vararg(ptr %va_list) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr %va_list)
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 12, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(double noundef %x, i32 noundef %y) #2
+  ret void
+}
+
+define void @i32_libcS(i32 noundef %x, ptr noundef byval(%struct.libcS) align 8 %y) {
+; CHECK-LABEL: define {{[^@]+}}@i32_libcS(i32 noundef %x, ptr noundef byval(%struct.libcS) align 8 %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %i32_libcS.vararg, align 16
+; CHECK-NEXT:    %va_list = alloca [1 x { i32, i32, ptr, ptr }], align 8
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 40, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %i32_libcS.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store i32 %x, ptr %0, align 4
+; CHECK-NEXT:    %1 = getelementptr inbounds %i32_libcS.vararg, ptr %vararg_buffer, i32 0, i32 2
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr %1, ptr %y, i64 32, i1 false)
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr %va_list)
+; CHECK-NEXT:    %gp_offset = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 0
+; CHECK-NEXT:    store i32 48, ptr %gp_offset, align 4
+; CHECK-NEXT:    %fp_offset = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 1
+; CHECK-NEXT:    store i32 176, ptr %fp_offset, align 4
+; CHECK-NEXT:    %overflow_arg_area = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 2
+; CHECK-NEXT:    store ptr %vararg_buffer, ptr %overflow_arg_area, align 8
+; CHECK-NEXT:    %reg_save_area = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 3
+; CHECK-NEXT:    store ptr null, ptr %reg_save_area, align 8
+; CHECK-NEXT:    call void @vararg(ptr %va_list) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr %va_list)
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 40, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(i32 noundef %x, ptr noundef nonnull byval(%struct.libcS) align 8 %y) #2
+  ret void
+}
+
+define void @libcS_i32(ptr noundef byval(%struct.libcS) align 8 %x, i32 noundef %y) {
+; CHECK-LABEL: define {{[^@]+}}@libcS_i32(ptr noundef byval(%struct.libcS) align 8 %x, i32 noundef %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %libcS_i32.vararg, align 16
+; CHECK-NEXT:    %va_list = alloca [1 x { i32, i32, ptr, ptr }], align 8
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 36, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %libcS_i32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr %0, ptr %x, i64 32, i1 false)
+; CHECK-NEXT:    %1 = getelementptr inbounds %libcS_i32.vararg, ptr %vararg_buffer, i32 0, i32 1
+; CHECK-NEXT:    store i32 %y, ptr %1, align 4
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr %va_list)
+; CHECK-NEXT:    %gp_offset = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 0
+; CHECK-NEXT:    store i32 48, ptr %gp_offset, align 4
+; CHECK-NEXT:    %fp_offset = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 1
+; CHECK-NEXT:    store i32 176, ptr %fp_offset, align 4
+; CHECK-NEXT:    %overflow_arg_area = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 2
+; CHECK-NEXT:    store ptr %vararg_buffer, ptr %overflow_arg_area, align 8
+; CHECK-NEXT:    %reg_save_area = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 3
+; CHECK-NEXT:    store ptr null, ptr %reg_save_area, align 8
+; CHECK-NEXT:    call void @vararg(ptr %va_list) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr %va_list)
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 36, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(ptr noundef nonnull byval(%struct.libcS) align 8 %x, i32 noundef %y) #2
+  ret void
+}
+
+define void @i32_v4f32(i32 noundef %x, <4 x float> noundef %y) {
+; CHECK-LABEL: define {{[^@]+}}@i32_v4f32(i32 noundef %x, <4 x float> noundef %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %i32_v4f32.vararg, align 16
+; CHECK-NEXT:    %va_list = alloca [1 x { i32, i32, ptr, ptr }], align 8
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %i32_v4f32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store i32 %x, ptr %0, align 4
+; CHECK-NEXT:    %1 = getelementptr inbounds %i32_v4f32.vararg, ptr %vararg_buffer, i32 0, i32 2
+; CHECK-NEXT:    store <4 x float> %y, ptr %1, align 16
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr %va_list)
+; CHECK-NEXT:    %gp_offset = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 0
+; CHECK-NEXT:    store i32 48, ptr %gp_offset, align 4
+; CHECK-NEXT:    %fp_offset = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 1
+; CHECK-NEXT:    store i32 176, ptr %fp_offset, align 4
+; CHECK-NEXT:    %overflow_arg_area = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 2
+; CHECK-NEXT:    store ptr %vararg_buffer, ptr %overflow_arg_area, align 8
+; CHECK-NEXT:    %reg_save_area = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 3
+; CHECK-NEXT:    store ptr null, ptr %reg_save_area, align 8
+; CHECK-NEXT:    call void @vararg(ptr %va_list) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr %va_list)
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(i32 noundef %x, <4 x float> noundef %y) #2
+  ret void
+}
+
+define void @v4f32_i32(<4 x float> noundef %x, i32 noundef %y) {
+; CHECK-LABEL: define {{[^@]+}}@v4f32_i32(<4 x float> noundef %x, i32 noundef %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %v4f32_i32.vararg, align 16
+; CHECK-NEXT:    %va_list = alloca [1 x { i32, i32, ptr, ptr }], align 8
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 20, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %v4f32_i32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store <4 x float> %x, ptr %0, align 16
+; CHECK-NEXT:    %1 = getelementptr inbounds %v4f32_i32.vararg, ptr %vararg_buffer, i32 0, i32 1
+; CHECK-NEXT:    store i32 %y, ptr %1, align 4
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr %va_list)
+; CHECK-NEXT:    %gp_offset = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 0
+; CHECK-NEXT:    store i32 48, ptr %gp_offset, align 4
+; CHECK-NEXT:    %fp_offset = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 1
+; CHECK-NEXT:    store i32 176, ptr %fp_offset, align 4
+; CHECK-NEXT:    %overflow_arg_area = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 2
+; CHECK-NEXT:    store ptr %vararg_buffer, ptr %overflow_arg_area, align 8
+; CHECK-NEXT:    %reg_save_area = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 3
+; CHECK-NEXT:    store ptr null, ptr %reg_save_area, align 8
+; CHECK-NEXT:    call void @vararg(ptr %va_list) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr %va_list)
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 20, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  tail call void (...) @vararg(<4 x float> noundef %x, i32 noundef %y) #2
+  ret void
+}
+
+define void @i32_v8f32(i32 noundef %x, ptr nocapture noundef readonly byval(<8 x float>) align 32 %0) {
+; CHECK-LABEL: define {{[^@]+}}@i32_v8f32(i32 noundef %x, ptr nocapture noundef readonly byval(<8 x float>) align 32 %0) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %indirect-arg-temp = alloca <8 x float>, align 32
+; CHECK-NEXT:    %vararg_buffer = alloca %i32_v8f32.vararg, align 16
+; CHECK-NEXT:    %va_list = alloca [1 x { i32, i32, ptr, ptr }], align 8
+; CHECK-NEXT:    %y = load <8 x float>, ptr %0, align 32, !tbaa !3
+; CHECK-NEXT:    store <8 x float> %y, ptr %indirect-arg-temp, align 32, !tbaa !3
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 40, ptr %vararg_buffer)
+; CHECK-NEXT:    %1 = getelementptr inbounds %i32_v8f32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store i32 %x, ptr %1, align 4
+; CHECK-NEXT:    %2 = getelementptr inbounds %i32_v8f32.vararg, ptr %vararg_buffer, i32 0, i32 2
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr %2, ptr %indirect-arg-temp, i64 32, i1 false)
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr %va_list)
+; CHECK-NEXT:    %gp_offset = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 0
+; CHECK-NEXT:    store i32 48, ptr %gp_offset, align 4
+; CHECK-NEXT:    %fp_offset = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 1
+; CHECK-NEXT:    store i32 176, ptr %fp_offset, align 4
+; CHECK-NEXT:    %overflow_arg_area = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 2
+; CHECK-NEXT:    store ptr %vararg_buffer, ptr %overflow_arg_area, align 8
+; CHECK-NEXT:    %reg_save_area = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 3
+; CHECK-NEXT:    store ptr null, ptr %reg_save_area, align 8
+; CHECK-NEXT:    call void @vararg(ptr %va_list) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr %va_list)
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 40, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  %indirect-arg-temp = alloca <8 x float>, align 32
+  %y = load <8 x float>, ptr %0, align 32, !tbaa !4
+  store <8 x float> %y, ptr %indirect-arg-temp, align 32, !tbaa !4
+  tail call void (...) @vararg(i32 noundef %x, ptr noundef nonnull byval(<8 x float>) align 32 %indirect-arg-temp) #2
+  ret void
+}
+
+define void @v8f32_i32(ptr nocapture noundef readonly byval(<8 x float>) align 32 %0, i32 noundef %y) {
+; CHECK-LABEL: define {{[^@]+}}@v8f32_i32(ptr nocapture noundef readonly byval(<8 x float>) align 32 %0, i32 noundef %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %indirect-arg-temp = alloca <8 x float>, align 32
+; CHECK-NEXT:    %vararg_buffer = alloca %v8f32_i32.vararg, align 16
+; CHECK-NEXT:    %va_list = alloca [1 x { i32, i32, ptr, ptr }], align 8
+; CHECK-NEXT:    %x = load <8 x float>, ptr %0, align 32, !tbaa !3
+; CHECK-NEXT:    store <8 x float> %x, ptr %indirect-arg-temp, align 32, !tbaa !3
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 36, ptr %vararg_buffer)
+; CHECK-NEXT:    %1 = getelementptr inbounds %v8f32_i32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr %1, ptr %indirect-arg-temp, i64 32, i1 false)
+; CHECK-NEXT:    %2 = getelementptr inbounds %v8f32_i32.vararg, ptr %vararg_buffer, i32 0, i32 1
+; CHECK-NEXT:    store i32 %y, ptr %2, align 4
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr %va_list)
+; CHECK-NEXT:    %gp_offset = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 0
+; CHECK-NEXT:    store i32 48, ptr %gp_offset, align 4
+; CHECK-NEXT:    %fp_offset = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 1
+; CHECK-NEXT:    store i32 176, ptr %fp_offset, align 4
+; CHECK-NEXT:    %overflow_arg_area = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 2
+; CHECK-NEXT:    store ptr %vararg_buffer, ptr %overflow_arg_area, align 8
+; CHECK-NEXT:    %reg_save_area = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 3
+; CHECK-NEXT:    store ptr null, ptr %reg_save_area, align 8
+; CHECK-NEXT:    call void @vararg(ptr %va_list) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr %va_list)
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 36, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  %indirect-arg-temp = alloca <8 x float>, align 32
+  %x = load <8 x float>, ptr %0, align 32, !tbaa !4
+  store <8 x float> %x, ptr %indirect-arg-temp, align 32, !tbaa !4
+  tail call void (...) @vararg(ptr noundef nonnull byval(<8 x float>) align 32 %indirect-arg-temp, i32 noundef %y) #2
+  ret void
+}
+
+define void @i32_v16f32(i32 noundef %x, ptr nocapture noundef readonly byval(<16 x float>) align 64 %0) {
+; CHECK-LABEL: define {{[^@]+}}@i32_v16f32(i32 noundef %x, ptr nocapture noundef readonly byval(<16 x float>) align 64 %0) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %indirect-arg-temp = alloca <16 x float>, align 64
+; CHECK-NEXT:    %vararg_buffer = alloca %i32_v16f32.vararg, align 16
+; CHECK-NEXT:    %va_list = alloca [1 x { i32, i32, ptr, ptr }], align 8
+; CHECK-NEXT:    %y = load <16 x float>, ptr %0, align 64, !tbaa !3
+; CHECK-NEXT:    store <16 x float> %y, ptr %indirect-arg-temp, align 64, !tbaa !3
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 72, ptr %vararg_buffer)
+; CHECK-NEXT:    %1 = getelementptr inbounds %i32_v16f32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store i32 %x, ptr %1, align 4
+; CHECK-NEXT:    %2 = getelementptr inbounds %i32_v16f32.vararg, ptr %vararg_buffer, i32 0, i32 2
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr %2, ptr %indirect-arg-temp, i64 64, i1 false)
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr %va_list)
+; CHECK-NEXT:    %gp_offset = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 0
+; CHECK-NEXT:    store i32 48, ptr %gp_offset, align 4
+; CHECK-NEXT:    %fp_offset = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 1
+; CHECK-NEXT:    store i32 176, ptr %fp_offset, align 4
+; CHECK-NEXT:    %overflow_arg_area = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 2
+; CHECK-NEXT:    store ptr %vararg_buffer, ptr %overflow_arg_area, align 8
+; CHECK-NEXT:    %reg_save_area = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 3
+; CHECK-NEXT:    store ptr null, ptr %reg_save_area, align 8
+; CHECK-NEXT:    call void @vararg(ptr %va_list) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr %va_list)
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 72, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  %indirect-arg-temp = alloca <16 x float>, align 64
+  %y = load <16 x float>, ptr %0, align 64, !tbaa !4
+  store <16 x float> %y, ptr %indirect-arg-temp, align 64, !tbaa !4
+  tail call void (...) @vararg(i32 noundef %x, ptr noundef nonnull byval(<16 x float>) align 64 %indirect-arg-temp) #2
+  ret void
+}
+
+define void @v16f32_i32(ptr nocapture noundef readonly byval(<16 x float>) align 64 %0, i32 noundef %y) {
+; CHECK-LABEL: define {{[^@]+}}@v16f32_i32(ptr nocapture noundef readonly byval(<16 x float>) align 64 %0, i32 noundef %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %indirect-arg-temp = alloca <16 x float>, align 64
+; CHECK-NEXT:    %vararg_buffer = alloca %v16f32_i32.vararg, align 16
+; CHECK-NEXT:    %va_list = alloca [1 x { i32, i32, ptr, ptr }], align 8
+; CHECK-NEXT:    %x = load <16 x float>, ptr %0, align 64, !tbaa !3
+; CHECK-NEXT:    store <16 x float> %x, ptr %indirect-arg-temp, align 64, !tbaa !3
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 68, ptr %vararg_buffer)
+; CHECK-NEXT:    %1 = getelementptr inbounds %v16f32_i32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr %1, ptr %indirect-arg-temp, i64 64, i1 false)
+; CHECK-NEXT:    %2 = getelementptr inbounds %v16f32_i32.vararg, ptr %vararg_buffer, i32 0, i32 1
+; CHECK-NEXT:    store i32 %y, ptr %2, align 4
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr %va_list)
+; CHECK-NEXT:    %gp_offset = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 0
+; CHECK-NEXT:    store i32 48, ptr %gp_offset, align 4
+; CHECK-NEXT:    %fp_offset = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 1
+; CHECK-NEXT:    store i32 176, ptr %fp_offset, align 4
+; CHECK-NEXT:    %overflow_arg_area = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 2
+; CHECK-NEXT:    store ptr %vararg_buffer, ptr %overflow_arg_area, align 8
+; CHECK-NEXT:    %reg_save_area = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 3
+; CHECK-NEXT:    store ptr null, ptr %reg_save_area, align 8
+; CHECK-NEXT:    call void @vararg(ptr %va_list) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr %va_list)
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 68, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  %indirect-arg-temp = alloca <16 x float>, align 64
+  %x = load <16 x float>, ptr %0, align 64, !tbaa !4
+  store <16 x float> %x, ptr %indirect-arg-temp, align 64, !tbaa !4
+  tail call void (...) @vararg(ptr noundef nonnull byval(<16 x float>) align 64 %indirect-arg-temp, i32 noundef %y) #2
+  ret void
+}
+
+define void @i32_v32f32(i32 noundef %x, ptr nocapture noundef readonly byval(<32 x float>) align 128 %0) {
+; CHECK-LABEL: define {{[^@]+}}@i32_v32f32(i32 noundef %x, ptr nocapture noundef readonly byval(<32 x float>) align 128 %0) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %indirect-arg-temp = alloca <32 x float>, align 128
+; CHECK-NEXT:    %vararg_buffer = alloca %i32_v32f32.vararg, align 16
+; CHECK-NEXT:    %va_list = alloca [1 x { i32, i32, ptr, ptr }], align 8
+; CHECK-NEXT:    %y = load <32 x float>, ptr %0, align 128, !tbaa !3
+; CHECK-NEXT:    store <32 x float> %y, ptr %indirect-arg-temp, align 128, !tbaa !3
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 136, ptr %vararg_buffer)
+; CHECK-NEXT:    %1 = getelementptr inbounds %i32_v32f32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store i32 %x, ptr %1, align 4
+; CHECK-NEXT:    %2 = getelementptr inbounds %i32_v32f32.vararg, ptr %vararg_buffer, i32 0, i32 2
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr %2, ptr %indirect-arg-temp, i64 128, i1 false)
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr %va_list)
+; CHECK-NEXT:    %gp_offset = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 0
+; CHECK-NEXT:    store i32 48, ptr %gp_offset, align 4
+; CHECK-NEXT:    %fp_offset = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 1
+; CHECK-NEXT:    store i32 176, ptr %fp_offset, align 4
+; CHECK-NEXT:    %overflow_arg_area = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 2
+; CHECK-NEXT:    store ptr %vararg_buffer, ptr %overflow_arg_area, align 8
+; CHECK-NEXT:    %reg_save_area = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 3
+; CHECK-NEXT:    store ptr null, ptr %reg_save_area, align 8
+; CHECK-NEXT:    call void @vararg(ptr %va_list) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr %va_list)
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 136, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  %indirect-arg-temp = alloca <32 x float>, align 128
+  %y = load <32 x float>, ptr %0, align 128, !tbaa !4
+  store <32 x float> %y, ptr %indirect-arg-temp, align 128, !tbaa !4
+  tail call void (...) @vararg(i32 noundef %x, ptr noundef nonnull byval(<32 x float>) align 128 %indirect-arg-temp) #2
+  ret void
+}
+
+define void @v32f32_i32(ptr nocapture noundef readonly byval(<32 x float>) align 128 %0, i32 noundef %y) {
+; CHECK-LABEL: define {{[^@]+}}@v32f32_i32(ptr nocapture noundef readonly byval(<32 x float>) align 128 %0, i32 noundef %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %indirect-arg-temp = alloca <32 x float>, align 128
+; CHECK-NEXT:    %vararg_buffer = alloca %v32f32_i32.vararg, align 16
+; CHECK-NEXT:    %va_list = alloca [1 x { i32, i32, ptr, ptr }], align 8
+; CHECK-NEXT:    %x = load <32 x float>, ptr %0, align 128, !tbaa !3
+; CHECK-NEXT:    store <32 x float> %x, ptr %indirect-arg-temp, align 128, !tbaa !3
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 132, ptr %vararg_buffer)
+; CHECK-NEXT:    %1 = getelementptr inbounds %v32f32_i32.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    call void @llvm.memcpy.p0.p0.i64(ptr %1, ptr %indirect-arg-temp, i64 128, i1 false)
+; CHECK-NEXT:    %2 = getelementptr inbounds %v32f32_i32.vararg, ptr %vararg_buffer, i32 0, i32 1
+; CHECK-NEXT:    store i32 %y, ptr %2, align 4
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 24, ptr %va_list)
+; CHECK-NEXT:    %gp_offset = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 0
+; CHECK-NEXT:    store i32 48, ptr %gp_offset, align 4
+; CHECK-NEXT:    %fp_offset = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 1
+; CHECK-NEXT:    store i32 176, ptr %fp_offset, align 4
+; CHECK-NEXT:    %overflow_arg_area = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 2
+; CHECK-NEXT:    store ptr %vararg_buffer, ptr %overflow_arg_area, align 8
+; CHECK-NEXT:    %reg_save_area = getelementptr inbounds [1 x { i32, i32, ptr, ptr }], ptr %va_list, i64 0, i32 0, i32 3
+; CHECK-NEXT:    store ptr null, ptr %reg_save_area, align 8
+; CHECK-NEXT:    call void @vararg(ptr %va_list) #2
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 24, ptr %va_list)
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 132, ptr %vararg_buffer)
+; CHECK-NEXT:    ret void
+;
+entry:
+  %indirect-arg-temp = alloca <32 x float>, align 128
+  %x = load <32 x float>, ptr %0, align 128, !tbaa !4
+  store <32 x float> %x, ptr %indirect-arg-temp, align 128, !tbaa !4
+  tail call void (...) @vararg(ptr noundef nonnull byval(<32 x float>) align 128 %indirect-arg-temp, i32 noundef %y) #2
+  ret void
+}
+
+attributes #0 = { nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) }
+attributes #1 = { nocallback nofree nosync nounwind willreturn }
+attributes #2 = { mustprogress }
+
+!llvm.module.flags = !{!0, !1, !2}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 8, !"PIC Level", i32 2}
+!2 = !{i32 7, !"PIE Level", i32 2}
+!4 = !{!5, !5, i64 0}
+!5 = !{!"omnipotent char", !6, i64 0}
+!6 = !{!"Simple C/C++ TBAA"}
diff --git a/llvm/test/Other/new-pm-defaults.ll b/llvm/test/Other/new-pm-defaults.ll
index 51fb93daa4dfa6..54c593c98ab669 100644
--- a/llvm/test/Other/new-pm-defaults.ll
+++ b/llvm/test/Other/new-pm-defaults.ll
@@ -125,6 +125,7 @@
 ; CHECK-O-NEXT: Running analysis: OuterAnalysisManagerProxy
 ; CHECK-EP-PEEPHOLE-NEXT: Running pass: NoOpFunctionPass
 ; CHECK-O-NEXT: Running pass: SimplifyCFGPass
+; CHECK-O-NEXT: Running pass: ExpandVariadicsPass
 ; CHECK-O-NEXT: Running pass: AlwaysInlinerPass
 ; CHECK-O-NEXT: Running analysis: ProfileSummaryAnalysis
 ; CHECK-O-NEXT: Running pass: ModuleInlinerWrapperPass
diff --git a/llvm/test/Other/new-pm-thinlto-postlink-defaults.ll b/llvm/test/Other/new-pm-thinlto-postlink-defaults.ll
index 064362eabbf839..fa9c0cc7708e60 100644
--- a/llvm/test/Other/new-pm-thinlto-postlink-defaults.ll
+++ b/llvm/test/Other/new-pm-thinlto-postlink-defaults.ll
@@ -61,6 +61,7 @@
 ; CHECK-O-NEXT: Running analysis: TypeBasedAA
 ; CHECK-O-NEXT: Running analysis: OuterAnalysisManagerProxy
 ; CHECK-O-NEXT: Running pass: SimplifyCFGPass
+; CHECK-O-NEXT: Running pass: ExpandVariadicsPass
 ; CHECK-O-NEXT: Running pass: AlwaysInlinerPass
 ; CHECK-PRELINK-O-NEXT: Running analysis: ProfileSummaryAnalysis
 ; CHECK-O-NEXT: Running pass: ModuleInlinerWrapperPass
diff --git a/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll b/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll
index 19a44867e434ac..8bd372f4c06910 100644
--- a/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll
+++ b/llvm/test/Other/new-pm-thinlto-postlink-pgo-defaults.ll
@@ -50,6 +50,7 @@
 ; CHECK-O-NEXT: Running analysis: LoopAnalysis on foo
 ; CHECK-O-NEXT: Running analysis: PostDominatorTreeAnalysis on foo
 ; CHECK-O-NEXT: Running pass: SimplifyCFGPass
+; CHECK-O-NEXT: Running pass: ExpandVariadicsPass
 ; CHECK-O-NEXT: Running pass: AlwaysInlinerPass
 ; CHECK-O-NEXT: Running pass: ModuleInlinerWrapperPass
 ; CHECK-O-NEXT: Running analysis: InlineAdvisorAnalysis
diff --git a/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll b/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll
index ac80a31d8fd4bc..dd99340e60c0d3 100644
--- a/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll
+++ b/llvm/test/Other/new-pm-thinlto-postlink-samplepgo-defaults.ll
@@ -58,7 +58,7 @@
 ; CHECK-O-NEXT: Running analysis: LoopAnalysis on foo
 ; CHECK-O-NEXT: Running analysis: PostDominatorTreeAnalysis on foo
 ; CHECK-O-NEXT: Running pass: SimplifyCFGPass on foo
-
+; CHECK-O-NEXT: Running pass: ExpandVariadicsPass
 ; CHECK-O-NEXT: Running pass: AlwaysInlinerPass
 ; CHECK-O-NEXT: Running pass: ModuleInlinerWrapperPass
 ; CHECK-O-NEXT: Running analysis: InlineAdvisorAnalysis
diff --git a/llvm/test/Other/new-pm-thinlto-prelink-defaults.ll b/llvm/test/Other/new-pm-thinlto-prelink-defaults.ll
index 6486639e07b49c..d2a53cea68c37b 100644
--- a/llvm/test/Other/new-pm-thinlto-prelink-defaults.ll
+++ b/llvm/test/Other/new-pm-thinlto-prelink-defaults.ll
@@ -92,6 +92,7 @@
 ; CHECK-O-NEXT: Running analysis: TypeBasedAA
 ; CHECK-O-NEXT: Running analysis: OuterAnalysisManagerProxy
 ; CHECK-O-NEXT: Running pass: SimplifyCFGPass
+; CHECK-O-NEXT: Running pass: ExpandVariadicsPass
 ; CHECK-O-NEXT: Running pass: AlwaysInlinerPass
 ; CHECK-O-NEXT: Running analysis: ProfileSummaryAnalysis
 ; CHECK-O-NEXT: Running pass: ModuleInlinerWrapperPass
diff --git a/llvm/test/Other/new-pm-thinlto-prelink-pgo-defaults.ll b/llvm/test/Other/new-pm-thinlto-prelink-pgo-defaults.ll
index 09f9f0f48baddb..cc110b01fb2445 100644
--- a/llvm/test/Other/new-pm-thinlto-prelink-pgo-defaults.ll
+++ b/llvm/test/Other/new-pm-thinlto-prelink-pgo-defaults.ll
@@ -83,6 +83,7 @@
 ; CHECK-O-NEXT: Running pass: PGOIndirectCallPromotion on
 ; CHECK-O-NEXT: Running analysis: InnerAnalysisManagerProxy
 ; CHECK-O-NEXT: Running analysis: OptimizationRemarkEmitterAnalysis on foo
+; CHECK-O-NEXT: Running pass: ExpandVariadicsPass
 ; CHECK-O-NEXT: Running pass: AlwaysInlinerPass
 ; CHECK-O-NEXT: Running pass: ModuleInlinerWrapperPass
 ; CHECK-O-NEXT: Running analysis: InlineAdvisorAnalysis
diff --git a/llvm/test/Other/new-pm-thinlto-prelink-samplepgo-defaults.ll b/llvm/test/Other/new-pm-thinlto-prelink-samplepgo-defaults.ll
index 47bdbfd2d357d4..d2c547e04085e3 100644
--- a/llvm/test/Other/new-pm-thinlto-prelink-samplepgo-defaults.ll
+++ b/llvm/test/Other/new-pm-thinlto-prelink-samplepgo-defaults.ll
@@ -63,6 +63,7 @@
 ; CHECK-O-NEXT: Running analysis: LoopAnalysis on foo
 ; CHECK-O-NEXT: Running analysis: PostDominatorTreeAnalysis on foo
 ; CHECK-O-NEXT: Running pass: SimplifyCFGPass on foo
+; CHECK-O-NEXT: Running pass: ExpandVariadicsPass
 ; CHECK-O-NEXT: Running pass: AlwaysInlinerPass
 ; CHECK-O-NEXT: Running pass: ModuleInlinerWrapperPass
 ; CHECK-O-NEXT: Running analysis: InlineAdvisorAnalysis
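The pipeline-test updates above all pin ExpandVariadicsPass immediately
before AlwaysInlinerPass in the default pipelines. To reproduce the ordering
these tests check, an invocation along the following lines should work (the
exact flags in the RUN lines differ slightly; input.ll is a placeholder):

  opt -passes='default<O2>' -debug-pass-manager -disable-output input.ll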
diff --git a/llvm/test/Transforms/ExpandVariadics/expand-va-intrinsic-outliner.ll b/llvm/test/Transforms/ExpandVariadics/expand-va-intrinsic-outliner.ll
new file mode 100644
index 00000000000000..59d8ceed14f64d
--- /dev/null
+++ b/llvm/test/Transforms/ExpandVariadics/expand-va-intrinsic-outliner.ll
@@ -0,0 +1,86 @@
+; RUN: opt -mtriple=i386-unknown-linux-gnu -S --passes=expand-variadics --expand-variadics-override=optimize < %s | FileCheck %s -check-prefix=X86
+; RUN: opt -mtriple=x86_64-unknown-linux-gnu -S --passes=expand-variadics --expand-variadics-override=optimize < %s | FileCheck %s -check-prefix=X64
+
+
+declare void @llvm.va_start(ptr)
+declare void @llvm.va_end(ptr)
+
+declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture)
+declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture)
+
+declare void @sink_valist(ptr)
+declare void @sink_i32(i32)
+
+%struct.__va_list_tag = type { i32, i32, ptr, ptr }
+
+;; Simple function is split into two functions
+; X86-LABEL: define internal void @x86_non_inlinable.valist(
+; X86:       entry:
+; X86:       %va = alloca ptr, align 4
+; X86:       call void @sink_i32(i32 0)
+; X86:       store ptr %varargs, ptr %va, align 4
+; X86:       %0 = load ptr, ptr %va, align 4
+; X86:       call void @sink_valist(ptr noundef %0)
+; X86:       ret void
+; X86:     }
+; X86-LABEL: define void @x86_non_inlinable(
+; X86:       entry:
+; X86:       %va_list = alloca ptr, align 4
+; X86:       call void @llvm.va_start.p0(ptr %va_list)
+; X86:       tail call void @x86_non_inlinable.valist(ptr %va_list)
+; X86:       ret void
+; X86:     }
+define void @x86_non_inlinable(...)  {
+entry:
+  %va = alloca ptr, align 4
+  call void @sink_i32(i32 0)
+  call void @llvm.va_start.p0(ptr nonnull %va)
+  %0 = load ptr, ptr %va, align 4
+  call void @sink_valist(ptr noundef %0)
+  ret void
+}
+
+; TODO: This needs checks too
+define void @x86_caller(i32 %x) {
+  call void (...) @x86_non_inlinable(i32 %x)
+  ret void
+}
+
+
+;; As above, but for x64 - the aggregate va_list type means the pointer load
+;; is replaced by a memcpy of the struct.
+; X64-LABEL: define internal void @x64_non_inlinable.valist(
+; X64:       entry:
+; X64:       %va = alloca [1 x %struct.__va_list_tag], align 16
+; X64:       call void @sink_i32(i32 0)
+; X64:       call void @llvm.memcpy.inline.p0.p0.i32(ptr %va, ptr %varargs, i32 24, i1 false)
+; X64:       call void @sink_valist(ptr noundef %va)
+; X64:       ret void
+; X64:     }
+; X64-LABEL: define void @x64_non_inlinable(
+; X64:       entry:
+; X64:       %va_list = alloca [1 x { i32, i32, ptr, ptr }], align 8
+; X64:       call void @llvm.va_start.p0(ptr %va_list)
+; X64:       tail call void @x64_non_inlinable.valist(ptr %va_list)
+; X64:       ret void
+; X64:     }
+define void @x64_non_inlinable(...)  {
+entry:
+  %va = alloca [1 x %struct.__va_list_tag], align 16
+  call void @sink_i32(i32 0)
+  call void @llvm.va_start.p0(ptr nonnull %va)
+  call void @sink_valist(ptr noundef %va)
+  ret void
+}
+
+
+; TODO: add checks - with no known callers, this function should be left unchanged
+define void @no_known_callers(...)  {
+entry:
+  %va = alloca ptr, align 4
+  call void @sink_i32(i32 0)
+  call void @llvm.va_start.p0(ptr nonnull %va)
+  %0 = load ptr, ptr %va, align 4
+  call void @sink_valist(ptr noundef %0)
+  ret void
+}
+
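To make the transform above concrete, here is a minimal source-level sketch
of the split, in C++ purely for illustration - the pass operates on IR, and
every name below is invented. The original symbol keeps its ABI and becomes
a thin wrapper that materialises a va_list and forwards to an internal clone
holding the body, the "<name>.valist" function in the checks:

  #include <cstdarg>

  // Internal clone holding the original body; plays the role of the
  // generated "<name>.valist" function.
  static int sum2_valist(int first, va_list ap) {
    int second = va_arg(ap, int); // walk the trailing arguments as before
    return first + second;
  }

  // ABI-compatible wrapper keeping the original symbol; plays the role of
  // the rewritten variadic definition.
  int sum2(int first, ...) {
    va_list ap;
    va_start(ap, first);
    int result = sum2_valist(first, ap);
    va_end(ap);
    return result;
  }

Known direct callers such as @x86_caller can then be redirected at the
clone, which is the part still marked TODO above.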
diff --git a/llvm/test/Transforms/ExpandVariadics/expand-va-intrinsic-split-linkage.ll b/llvm/test/Transforms/ExpandVariadics/expand-va-intrinsic-split-linkage.ll
new file mode 100644
index 00000000000000..9e96bfaccd4c39
--- /dev/null
+++ b/llvm/test/Transforms/ExpandVariadics/expand-va-intrinsic-split-linkage.ll
@@ -0,0 +1,225 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: -p --function-signature
+; RUN: opt -mtriple=i386-unknown-linux-gnu -S --passes=expand-variadics --expand-variadics-override=optimize < %s | FileCheck %s --check-prefixes=OPT
+; RUN: opt -mtriple=i386-unknown-linux-gnu -S --passes=expand-variadics --expand-variadics-override=lowering < %s | FileCheck %s --check-prefixes=ABI
+
+; Split variadic functions into two functions:
+; - one equivalent to the original, with the same symbol, linkage etc.
+; - one implementing the contents of the original but taking a va_list
+; The IR here is applicable to any target that uses a plain ptr for its va_list.
+;
+; Defines a function with each linkage (in the order of the LLVM documentation).
+; Where the split applies, the same transform is made to each.
+; Whether it applies depends on whether the ABI is being changed - e.g. a weak
+; function is not normally useful to split, as its contents cannot be reliably
+; called from elsewhere. If the ABI is being rewritten, the function is
+; converted regardless. Call sites are tested elsewhere.
+
+; update_test_checks.py does not emit checks for declares, so none appear below.
+
+declare void @sink_valist(ptr)
+declare void @llvm.va_start(ptr)
+declare void @llvm.va_end(ptr)
+
+declare void @decl_simple(...)
+define void @defn_simple(...) {
+; OPT-LABEL: define {{[^@]+}}@defn_simple(...) {
+; OPT-NEXT:  entry:
+; OPT-NEXT:    %va_list = alloca ptr, align 4
+; OPT-NEXT:    call void @llvm.va_start.p0(ptr %va_list)
+; OPT-NEXT:    tail call void @defn_simple.valist(ptr %va_list)
+; OPT-NEXT:    ret void
+;
+; ABI-LABEL: define {{[^@]+}}@defn_simple(ptr noalias %varargs) {
+; ABI-NEXT:    %va = alloca ptr, align 4
+; ABI-NEXT:    store ptr %varargs, ptr %va, align 4
+; ABI-NEXT:    call void @sink_valist(ptr %va)
+; ABI-NEXT:    ret void
+;
+  %va = alloca ptr, align 4
+  call void @llvm.va_start(ptr %va)
+  call void @sink_valist(ptr %va)
+  call void @llvm.va_end(ptr %va)
+  ret void
+}
+
+; no declare for private
+define private void @defn_private_simple(...) {
+; OPT-LABEL: define {{[^@]+}}@defn_private_simple(...) {
+; OPT-NEXT:  entry:
+; OPT-NEXT:    %va_list = alloca ptr, align 4
+; OPT-NEXT:    call void @llvm.va_start.p0(ptr %va_list)
+; OPT-NEXT:    tail call void @defn_private_simple.valist(ptr %va_list)
+; OPT-NEXT:    ret void
+;
+; ABI-LABEL: define {{[^@]+}}@defn_private_simple(ptr noalias %varargs) {
+; ABI-NEXT:    %va = alloca ptr, align 4
+; ABI-NEXT:    store ptr %varargs, ptr %va, align 4
+; ABI-NEXT:    call void @sink_valist(ptr %va)
+; ABI-NEXT:    ret void
+;
+  %va = alloca ptr, align 4
+  call void @llvm.va_start(ptr %va)
+  call void @sink_valist(ptr %va)
+  call void @llvm.va_end(ptr %va)
+  ret void
+}
+
+; no declare for internal
+define internal void @defn_internal_simple(...) {
+; OPT-LABEL: define {{[^@]+}}@defn_internal_simple(...) {
+; OPT-NEXT:  entry:
+; OPT-NEXT:    %va_list = alloca ptr, align 4
+; OPT-NEXT:    call void @llvm.va_start.p0(ptr %va_list)
+; OPT-NEXT:    tail call void @defn_internal_simple.valist(ptr %va_list)
+; OPT-NEXT:    ret void
+;
+; ABI-LABEL: define {{[^@]+}}@defn_internal_simple(ptr noalias %varargs) {
+; ABI-NEXT:    %va = alloca ptr, align 4
+; ABI-NEXT:    store ptr %varargs, ptr %va, align 4
+; ABI-NEXT:    call void @sink_valist(ptr %va)
+; ABI-NEXT:    ret void
+;
+  %va = alloca ptr, align 4
+  call void @llvm.va_start(ptr %va)
+  call void @sink_valist(ptr %va)
+  call void @llvm.va_end(ptr %va)
+  ret void
+}
+
+; no declare for available_externally
+define available_externally void @available_externally_simple(...) {
+; OPT-LABEL: define {{[^@]+}}@available_externally_simple(...) {
+; OPT-NEXT:  entry:
+; OPT-NEXT:    %va_list = alloca ptr, align 4
+; OPT-NEXT:    call void @llvm.va_start.p0(ptr %va_list)
+; OPT-NEXT:    tail call void @available_externally_simple.valist(ptr %va_list)
+; OPT-NEXT:    ret void
+;
+; ABI-LABEL: define {{[^@]+}}@available_externally_simple(ptr noalias %varargs) {
+; ABI-NEXT:    %va = alloca ptr, align 4
+; ABI-NEXT:    store ptr %varargs, ptr %va, align 4
+; ABI-NEXT:    call void @sink_valist(ptr %va)
+; ABI-NEXT:    ret void
+;
+  %va = alloca ptr, align 4
+  call void @llvm.va_start(ptr %va)
+  call void @sink_valist(ptr %va)
+  call void @llvm.va_end(ptr %va)
+  ret void
+}
+
+; no declare for linkonce
+define linkonce void @defn_linkonce_simple(...) {
+; OPT-LABEL: define {{[^@]+}}@defn_linkonce_simple(...) {
+; OPT-NEXT:    %va = alloca ptr, align 4
+; OPT-NEXT:    call void @llvm.va_start.p0(ptr %va)
+; OPT-NEXT:    call void @sink_valist(ptr %va)
+; OPT-NEXT:    ret void
+;
+; ABI-LABEL: define {{[^@]+}}@defn_linkonce_simple(ptr noalias %varargs) {
+; ABI-NEXT:    %va = alloca ptr, align 4
+; ABI-NEXT:    store ptr %varargs, ptr %va, align 4
+; ABI-NEXT:    call void @sink_valist(ptr %va)
+; ABI-NEXT:    ret void
+;
+  %va = alloca ptr, align 4
+  call void @llvm.va_start(ptr %va)
+  call void @sink_valist(ptr %va)
+  call void @llvm.va_end(ptr %va)
+  ret void
+}
+
+; no declare for weak
+define weak void @defn_weak_simple(...) {
+; OPT-LABEL: define {{[^@]+}}@defn_weak_simple(...) {
+; OPT-NEXT:    %va = alloca ptr, align 4
+; OPT-NEXT:    call void @llvm.va_start.p0(ptr %va)
+; OPT-NEXT:    call void @sink_valist(ptr %va)
+; OPT-NEXT:    ret void
+;
+; ABI-LABEL: define {{[^@]+}}@defn_weak_simple(ptr noalias %varargs) {
+; ABI-NEXT:    %va = alloca ptr, align 4
+; ABI-NEXT:    store ptr %varargs, ptr %va, align 4
+; ABI-NEXT:    call void @sink_valist(ptr %va)
+; ABI-NEXT:    ret void
+;
+  %va = alloca ptr, align 4
+  call void @llvm.va_start(ptr %va)
+  call void @sink_valist(ptr %va)
+  call void @llvm.va_end(ptr %va)
+  ret void
+}
+
+; common is not applicable to functions
+; appending is not applicable to functions
+
+declare extern_weak void @decl_extern_weak_simple(...)
+; no define for extern_weak
+
+; no declare for linkonce_odr
+define linkonce_odr void @defn_linkonce_odr_simple(...) {
+; OPT-LABEL: define {{[^@]+}}@defn_linkonce_odr_simple(...) {
+; OPT-NEXT:  entry:
+; OPT-NEXT:    %va_list = alloca ptr, align 4
+; OPT-NEXT:    call void @llvm.va_start.p0(ptr %va_list)
+; OPT-NEXT:    tail call void @defn_linkonce_odr_simple.valist(ptr %va_list)
+; OPT-NEXT:    ret void
+;
+; ABI-LABEL: define {{[^@]+}}@defn_linkonce_odr_simple(ptr noalias %varargs) {
+; ABI-NEXT:    %va = alloca ptr, align 4
+; ABI-NEXT:    store ptr %varargs, ptr %va, align 4
+; ABI-NEXT:    call void @sink_valist(ptr %va)
+; ABI-NEXT:    ret void
+;
+  %va = alloca ptr, align 4
+  call void @llvm.va_start(ptr %va)
+  call void @sink_valist(ptr %va)
+  call void @llvm.va_end(ptr %va)
+  ret void
+}
+
+; no declare for weak_odr
+define weak_odr void @defn_weak_odr_simple(...) {
+; OPT-LABEL: define {{[^@]+}}@defn_weak_odr_simple(...) {
+; OPT-NEXT:  entry:
+; OPT-NEXT:    %va_list = alloca ptr, align 4
+; OPT-NEXT:    call void @llvm.va_start.p0(ptr %va_list)
+; OPT-NEXT:    tail call void @defn_weak_odr_simple.valist(ptr %va_list)
+; OPT-NEXT:    ret void
+;
+; ABI-LABEL: define {{[^@]+}}@defn_weak_odr_simple(ptr noalias %varargs) {
+; ABI-NEXT:    %va = alloca ptr, align 4
+; ABI-NEXT:    store ptr %varargs, ptr %va, align 4
+; ABI-NEXT:    call void @sink_valist(ptr %va)
+; ABI-NEXT:    ret void
+;
+  %va = alloca ptr, align 4
+  call void @llvm.va_start(ptr %va)
+  call void @sink_valist(ptr %va)
+  call void @llvm.va_end(ptr %va)
+  ret void
+}
+
+declare external void @decl_external_simple(...)
+define external void @defn_external_simple(...) {
+; OPT-LABEL: define {{[^@]+}}@defn_external_simple(...) {
+; OPT-NEXT:  entry:
+; OPT-NEXT:    %va_list = alloca ptr, align 4
+; OPT-NEXT:    call void @llvm.va_start.p0(ptr %va_list)
+; OPT-NEXT:    tail call void @defn_external_simple.valist(ptr %va_list)
+; OPT-NEXT:    ret void
+;
+; ABI-LABEL: define {{[^@]+}}@defn_external_simple(ptr noalias %varargs) {
+; ABI-NEXT:    %va = alloca ptr, align 4
+; ABI-NEXT:    store ptr %varargs, ptr %va, align 4
+; ABI-NEXT:    call void @sink_valist(ptr %va)
+; ABI-NEXT:    ret void
+;
+  %va = alloca ptr, align 4
+  call void @llvm.va_start(ptr %va)
+  call void @sink_valist(ptr %va)
+  call void @llvm.va_end(ptr %va)
+  ret void
+}
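Both check prefixes in this file run on the same input; only the override
mode differs. A standalone invocation mirroring the RUN lines (the input
file name is a placeholder):

  opt -mtriple=i386-unknown-linux-gnu -S --passes=expand-variadics \
      --expand-variadics-override=lowering input.ll -o out.ll

Under optimize the original (...) signature survives as a wrapper; under
lowering every definition, the weak and linkonce ones included, is rewritten
to take the ptr noalias %varargs parameter directly.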
diff --git a/llvm/test/Transforms/ExpandVariadics/expand-va-intrinsic-split-simple.ll b/llvm/test/Transforms/ExpandVariadics/expand-va-intrinsic-split-simple.ll
new file mode 100644
index 00000000000000..2866aade4813bb
--- /dev/null
+++ b/llvm/test/Transforms/ExpandVariadics/expand-va-intrinsic-split-simple.ll
@@ -0,0 +1,121 @@
+; RUN: opt -mtriple=i386-unknown-linux-gnu -S --passes=expand-variadics --expand-variadics-override=optimize < %s | FileCheck %s
+
+; i386 uses a plain pointer for its va_list.
+; amdgpu should produce the same codegen; nvptx uses slightly different alignment for va_arg.
+
+; The examples are variadic functions that return the first or the second of an int and a double.
+; Each is split into an internal equivalent that takes a va_list and an ABI-preserving wrapper.
+
+define i32 @variadic_int_double_get_firstz(...) {
+entry:
+  %va = alloca ptr, align 4
+  call void @llvm.va_start.p0(ptr nonnull %va)
+  %argp.cur = load ptr, ptr %va, align 4
+  %argp.next = getelementptr inbounds i8, ptr %argp.cur, i32 4
+  store ptr %argp.next, ptr %va, align 4
+  %0 = load i32, ptr %argp.cur, align 4
+  call void @llvm.va_end.p0(ptr %va)
+  ret i32 %0
+}
+
+; CHECK-LABEL: define internal i32 @variadic_int_double_get_firstz.valist(ptr noalias %varargs) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:   %va = alloca ptr, align 4
+; CHECK-NEXT:   store ptr %varargs, ptr %va, align 4
+; CHECK-NEXT:   %argp.cur = load ptr, ptr %va, align 4
+; CHECK-NEXT:   %argp.next = getelementptr inbounds i8, ptr %argp.cur, i32 4
+; CHECK-NEXT:   store ptr %argp.next, ptr %va, align 4
+; CHECK-NEXT:   %0 = load i32, ptr %argp.cur, align 4
+; CHECK-NEXT:   ret i32 %0
+; CHECK-NEXT:  }
+
+; CHECK-LABEL: define i32 @variadic_int_double_get_firstz(...) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %va_list = alloca ptr, align 4
+; CHECK-NEXT:    call void @llvm.va_start.p0(ptr %va_list)
+; CHECK-NEXT:    %0 = tail call i32 @variadic_int_double_get_firstz.valist(ptr %va_list)
+; CHECK-NEXT:    ret i32 %0
+; CHECK-NEXT:  }
+
+define double @variadic_int_double_get_secondz(...) {
+entry:
+  %va = alloca ptr, align 4
+  call void @llvm.va_start.p0(ptr nonnull %va)
+  %argp.cur = load ptr, ptr %va, align 4
+  %argp.next = getelementptr inbounds i8, ptr %argp.cur, i32 4
+  %argp.next2 = getelementptr inbounds i8, ptr %argp.cur, i32 12
+  store ptr %argp.next2, ptr %va, align 4
+  %0 = load double, ptr %argp.next, align 4
+  call void @llvm.va_end.p0(ptr %va)
+  ret double %0
+}
+
+; CHECK-LABEL: define internal double @variadic_int_double_get_secondz.valist(ptr noalias %varargs) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %va = alloca ptr, align 4
+; CHECK-NEXT:    store ptr %varargs, ptr %va, align 4
+; CHECK-NEXT:    %argp.cur = load ptr, ptr %va, align 4
+; CHECK-NEXT:    %argp.next = getelementptr inbounds i8, ptr %argp.cur, i32 4
+; CHECK-NEXT:    %argp.next2 = getelementptr inbounds i8, ptr %argp.cur, i32 12
+; CHECK-NEXT:    store ptr %argp.next2, ptr %va, align 4
+; CHECK-NEXT:    %0 = load double, ptr %argp.next, align 4
+; CHECK-NEXT:    ret double %0
+; CHECK-NEXT:  }
+
+; CHECK-LABEL: define double @variadic_int_double_get_secondz(...) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %va_list = alloca ptr, align 4
+; CHECK-NEXT:    call void @llvm.va_start.p0(ptr %va_list)
+; CHECK-NEXT:    %0 = tail call double @variadic_int_double_get_secondz.valist(ptr %va_list)
+; CHECK-NEXT:    ret double %0
+; CHECK-NEXT:  }
+
+
+; CHECK-LABEL: @variadic_can_get_firstIidEEbT_T0_(i32 %x, double %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %variadic_can_get_firstIidEEbT_T0_.vararg, align 16
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 12, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %variadic_can_get_firstIidEEbT_T0_.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store i32 %x, ptr %0, align 4
+; CHECK-NEXT:    %1 = getelementptr inbounds %variadic_can_get_firstIidEEbT_T0_.vararg, ptr %vararg_buffer, i32 0, i32 1
+; CHECK-NEXT:    store double %y, ptr %1, align 4
+; CHECK-NEXT:    %call = call i32 @variadic_int_double_get_firstz.valist(ptr %vararg_buffer)
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 12, ptr %vararg_buffer)
+; CHECK-NEXT:    %cmp.i = icmp eq i32 %call, %x
+; CHECK-NEXT:    ret i1 %cmp.i
+; CHECK-NEXT:  }
+
+define zeroext i1 @variadic_can_get_firstIidEEbT_T0_(i32 %x, double %y) {
+entry:
+  %call = call i32 (...) @variadic_int_double_get_firstz(i32 %x, double %y)
+  %cmp.i = icmp eq i32 %call, %x
+  ret i1 %cmp.i
+}
+
+; CHECK-LABEL: @variadic_can_get_secondIidEEbT_T0_(i32 %x, double %y) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    %vararg_buffer = alloca %variadic_can_get_secondIidEEbT_T0_.vararg, align 16
+; CHECK-NEXT:    call void @llvm.lifetime.start.p0(i64 12, ptr %vararg_buffer)
+; CHECK-NEXT:    %0 = getelementptr inbounds %variadic_can_get_secondIidEEbT_T0_.vararg, ptr %vararg_buffer, i32 0, i32 0
+; CHECK-NEXT:    store i32 %x, ptr %0, align 4
+; CHECK-NEXT:    %1 = getelementptr inbounds %variadic_can_get_secondIidEEbT_T0_.vararg, ptr %vararg_buffer, i32 0, i32 1
+; CHECK-NEXT:    store double %y, ptr %1, align 4
+; CHECK-NEXT:    %call = call double @variadic_int_double_get_secondz.valist(ptr %vararg_buffer)
+; CHECK-NEXT:    call void @llvm.lifetime.end.p0(i64 12, ptr %vararg_buffer)
+; CHECK-NEXT:    %cmp.i = fcmp oeq double %call, %y
+; CHECK-NEXT:    ret i1 %cmp.i
+; CHECK-NEXT:  }
+
+define zeroext i1 @variadic_can_get_secondIidEEbT_T0_(i32 %x, double %y) {
+entry:
+  %call = call double (...) @variadic_int_double_get_secondz(i32 %x, double %y)
+  %cmp.i = fcmp oeq double %call, %y
+  ret i1 %cmp.i
+}
+
+; Declaration unchanged
+; CHECK: declare void @variadic_without_callers(...)
+declare void @variadic_without_callers(...)
+
+declare void @llvm.va_start.p0(ptr)
+declare void @llvm.va_end.p0(ptr)
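A rough source-level picture of the call-site rewrite checked above, again
in C++ for illustration only (the names are invented; the real rewrite
happens on IR): the caller allocates a stack struct laid out like the
callee's vararg area, stores the would-be variadic arguments into it, and
passes its address to the va_list-taking clone.

  extern "C" int get_first_valist(void *varargs); // stands in for a .valist clone

  // Mirrors %vararg_buffer: on i386, {int, double} occupies 12 bytes with
  // the double at offset 4, matching the 12-byte lifetime markers above.
  struct VarargBuffer {
    int x;
    double y;
  };

  bool can_get_first(int x, double y) {
    VarargBuffer buf{x, y};         // plays the role of the GEP + store pairs
    return get_first_valist(&buf) == x;
  }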
diff --git a/llvm/utils/gn/secondary/llvm/lib/Transforms/IPO/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Transforms/IPO/BUILD.gn
index 0d134c7bdffb70..bcf2ea7510568d 100644
--- a/llvm/utils/gn/secondary/llvm/lib/Transforms/IPO/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/lib/Transforms/IPO/BUILD.gn
@@ -33,6 +33,7 @@ static_library("IPO") {
     "DeadArgumentElimination.cpp",
     "ElimAvailExtern.cpp",
     "EmbedBitcodePass.cpp",
+    "ExpandVariadics.cpp",
     "ExtractGV.cpp",
     "ForceFunctionAttrs.cpp",
     "FunctionAttrs.cpp",

>From 0245f901d1abd3a0e5b859921585d250868e8e90 Mon Sep 17 00:00:00 2001
From: Jon Chesterfield <jonathanchesterfield at gmail.com>
Date: Wed, 17 Apr 2024 12:15:15 +0100
Subject: [PATCH 2/3] Address some of Matt's review comments

---
 clang/lib/CodeGen/Targets/AMDGPU.cpp          |  6 ++--
 .../llvm/Transforms/IPO/ExpandVariadics.h     |  8 ++---
 llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def |  2 +-
 .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp |  2 +-
 llvm/lib/Target/NVPTX/NVPTXPassRegistry.def   |  1 +
 llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp  |  2 +-
 llvm/lib/Transforms/IPO/ExpandVariadics.cpp   | 33 +++++++++----------
 7 files changed, 28 insertions(+), 26 deletions(-)

diff --git a/clang/lib/CodeGen/Targets/AMDGPU.cpp b/clang/lib/CodeGen/Targets/AMDGPU.cpp
index e274741a6ee652..98bbfae5f3dfc6 100644
--- a/clang/lib/CodeGen/Targets/AMDGPU.cpp
+++ b/clang/lib/CodeGen/Targets/AMDGPU.cpp
@@ -117,8 +117,10 @@ Address AMDGPUABIInfo::EmitVAArg(CodeGenFunction &CGF, Address VAListAddr,
                                  QualType Ty) const {
   const bool IsIndirect = false;
   const bool AllowHigherAlign = true;
-  // Would rather not naturally align values
-  // Splitting {char, short} into two separate arguments makes that difficult.
+  // Would rather not naturally align values, e.g. a double passed at 4-byte
+  // alignment. Splitting {char, short} into two separate arguments makes that
+  // difficult.
+  // TODO: See if forcing exactly four-byte alignment on everything, struct or
+  // scalar, lowers correctly regardless of structs being split across arguments.
   return emitVoidPtrVAArg(CGF, VAListAddr, Ty, IsIndirect,
                           getContext().getTypeInfoInChars(Ty),
                           CharUnits::fromQuantity(1), AllowHigherAlign);
diff --git a/llvm/include/llvm/Transforms/IPO/ExpandVariadics.h b/llvm/include/llvm/Transforms/IPO/ExpandVariadics.h
index 67fa746813ea0e..64704fa7fc95de 100644
--- a/llvm/include/llvm/Transforms/IPO/ExpandVariadics.h
+++ b/llvm/include/llvm/Transforms/IPO/ExpandVariadics.h
@@ -17,10 +17,10 @@ class ModulePass;
 class OptimizationLevel;
 
 enum class ExpandVariadicsMode {
-  unspecified,
-  disable,
-  optimize,
-  lowering,
+  Unspecified, // Use the implementation defaults
+  Disable,     // Disable the pass entirely
+  Optimize,    // Optimise without changing ABI
+  Lowering,    // Change variadic calling convention
 };
 
 class ExpandVariadicsPass : public PassInfoMixin<ExpandVariadicsPass> {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
index 70d634edc35e11..227ff06b16186a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
@@ -24,7 +24,7 @@ MODULE_PASS("amdgpu-lower-ctor-dtor", AMDGPUCtorDtorLoweringPass())
 MODULE_PASS("amdgpu-lower-module-lds", AMDGPULowerModuleLDSPass(*this))
 MODULE_PASS("amdgpu-printf-runtime-binding", AMDGPUPrintfRuntimeBindingPass())
 MODULE_PASS("amdgpu-unify-metadata", AMDGPUUnifyMetadataPass())
-MODULE_PASS("expand-variadics", ExpandVariadicsPass(ExpandVariadicsMode::lowering))
+MODULE_PASS("expand-variadics", ExpandVariadicsPass(ExpandVariadicsMode::Lowering))
 #undef MODULE_PASS
 
 #ifndef FUNCTION_PASS
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 99ae8a17134a78..1d7d232ad969f8 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -968,7 +968,7 @@ void AMDGPUPassConfig::addIRPasses() {
   if (isPassEnabled(EnableImageIntrinsicOptimizer))
     addPass(createAMDGPUImageIntrinsicOptimizerPass(&TM));
 
-  addPass(createExpandVariadicsPass(ExpandVariadicsMode::lowering));
+  addPass(createExpandVariadicsPass(ExpandVariadicsMode::Lowering));
 
   // Function calls are not supported, so make sure we inline everything.
   addPass(createAMDGPUAlwaysInlinePass());
diff --git a/llvm/lib/Target/NVPTX/NVPTXPassRegistry.def b/llvm/lib/Target/NVPTX/NVPTXPassRegistry.def
index 6ff15ab6f13c44..f7456b13413dc7 100644
--- a/llvm/lib/Target/NVPTX/NVPTXPassRegistry.def
+++ b/llvm/lib/Target/NVPTX/NVPTXPassRegistry.def
@@ -18,6 +18,7 @@
 #endif
 MODULE_PASS("generic-to-nvvm", GenericToNVVMPass())
 MODULE_PASS("nvptx-lower-ctor-dtor", NVPTXCtorDtorLoweringPass())
+MODULE_PASS("expand-variadics", ExpandVariadicsPass(ExpandVariadicsMode::Lowering))
 #undef MODULE_PASS
 
 #ifndef FUNCTION_ANALYSIS
diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
index e5db0114592877..0a68027a74d43e 100644
--- a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
@@ -325,7 +325,7 @@ void NVPTXPassConfig::addIRPasses() {
   }));
 
   // Should run before anything (else!) that adjusts calling conventions
-  addPass(createExpandVariadicsPass(ExpandVariadicsMode::lowering));
+  addPass(createExpandVariadicsPass(ExpandVariadicsMode::Lowering));
 
   // NVVMReflectPass is added in addEarlyAsPossiblePasses, so hopefully running
   // it here does nothing.  But since we need it for correctness when lowering
diff --git a/llvm/lib/Transforms/IPO/ExpandVariadics.cpp b/llvm/lib/Transforms/IPO/ExpandVariadics.cpp
index e85f91e4060c49..814bdb9a2d21b0 100644
--- a/llvm/lib/Transforms/IPO/ExpandVariadics.cpp
+++ b/llvm/lib/Transforms/IPO/ExpandVariadics.cpp
@@ -53,26 +53,27 @@
 #include "llvm/Support/CommandLine.h"
 #include "llvm/TargetParser/Triple.h"
 
-#include <cstdio>
-
 #define DEBUG_TYPE "expand-variadics"
 
 using namespace llvm;
 
 cl::opt<ExpandVariadicsMode> ExpandVariadicsModeOption(
     DEBUG_TYPE "-override", cl::desc("Override the behaviour of " DEBUG_TYPE),
-    cl::init(ExpandVariadicsMode::unspecified),
-    cl::values(clEnumValN(ExpandVariadicsMode::unspecified, "unspecified",
+    cl::init(ExpandVariadicsMode::Unspecified),
+    cl::values(clEnumValN(ExpandVariadicsMode::Unspecified, "unspecified",
                           "Use the implementation defaults"),
-               clEnumValN(ExpandVariadicsMode::disable, "disable",
+               clEnumValN(ExpandVariadicsMode::Disable, "disable",
                           "Disable the pass entirely"),
-               clEnumValN(ExpandVariadicsMode::optimize, "optimize",
+               clEnumValN(ExpandVariadicsMode::Optimize, "optimize",
                           "Optimise without changing ABI"),
-               clEnumValN(ExpandVariadicsMode::lowering, "lowering",
+               clEnumValN(ExpandVariadicsMode::Lowering, "lowering",
                           "Change variadic calling convention")));
 
 namespace {
 
+// At present Intrinsic:: has no interface to test whether a declaration is
+// already present in the module without creating one. Inserting a declaration,
+// checking it for uses, and then deleting it again would be a poor way to
+// phrase the query.
 // Module implements getFunction() which returns nullptr on missing declaration
 // and getOrInsertFunction which creates one when absent. Intrinsics.h
 // implements getDeclaration which creates one when missing. This should be
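For what it's worth, the side-effect-free query the comment asks for can be
phrased through Module by name. A sketch of that shape, assumed here rather
than taken verbatim from this patch:

  #include "llvm/IR/Intrinsics.h"
  #include "llvm/IR/Module.h"

  // Returns the existing declaration, or nullptr, without creating one.
  // Intrinsic::getName in this form is only valid for non-overloaded
  // intrinsics.
  static llvm::Function *getPreexistingDeclaration(llvm::Module &M,
                                                   llvm::Intrinsic::ID ID) {
    return M.getFunction(llvm::Intrinsic::getName(ID));
  }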
@@ -319,7 +320,7 @@ class ExpandVariadics : public ModulePass {
   static ExpandVariadicsMode
   withCommandLineOverride(ExpandVariadicsMode LLVMRequested) {
     ExpandVariadicsMode UserRequested = ExpandVariadicsModeOption;
-    return (UserRequested == ExpandVariadicsMode::unspecified) ? LLVMRequested
+    return (UserRequested == ExpandVariadicsMode::Unspecified) ? LLVMRequested
                                                                : UserRequested;
   }
 
@@ -346,7 +347,7 @@ class ExpandVariadics : public ModulePass {
   // Entry point
   bool runOnModule(Module &M) override;
 
-  bool rewriteABI() { return Mode == ExpandVariadicsMode::lowering; }
+  bool rewriteABI() { return Mode == ExpandVariadicsMode::Lowering; }
 
   void memcpyVAListPointers(const DataLayout &DL, IRBuilder<> &Builder,
                             Value *Dst, Value *Src) {
@@ -550,7 +551,7 @@ class ExpandVariadics : public ModulePass {
 
 bool ExpandVariadics::runOnModule(Module &M) {
   bool Changed = false;
-  if (Mode == ExpandVariadicsMode::disable)
+  if (Mode == ExpandVariadicsMode::Disable)
     return Changed;
 
   llvm::Triple Triple(M.getTargetTriple());
@@ -562,7 +563,7 @@ bool ExpandVariadics::runOnModule(Module &M) {
 
   ABI = VariadicABIInfo::create(Triple);
   if (!ABI) {
-    if (Mode == ExpandVariadicsMode::lowering) {
+    if (Mode == ExpandVariadicsMode::Lowering) {
       report_fatal_error(
           "Requested variadic lowering is unimplemented on this target");
     }
@@ -608,7 +609,7 @@ bool ExpandVariadics::runOnModule(Module &M) {
   // and va_copy are removed. All that remains is for the lowering pass to find
   // indirect calls and rewrite those as well.
 
-  if (Mode == ExpandVariadicsMode::lowering) {
+  if (Mode == ExpandVariadicsMode::Lowering) {
     for (Function &F : llvm::make_early_inc_range(M)) {
       if (F.isDeclaration())
         continue;
@@ -638,8 +639,6 @@ bool ExpandVariadics::runOnFunction(Module &M, IRBuilder<> &Builder,
                                     Function *F) {
   bool Changed = false;
 
-  // fprintf(stderr, "Called runOn: %s\n", F->getName().str().c_str());
-
   // This check might be too coarse - there are probably cases where
   // splitting a function is bad but it's usable without splitting
   if (!expansionApplicableToFunction(M, F))
@@ -674,7 +673,7 @@ bool ExpandVariadics::runOnFunction(Module &M, IRBuilder<> &Builder,
         Value *calledOperand = CB->getCalledOperand();
         if (F == calledOperand) {
           report_fatal_error(
-              "ExpandVA abi requires eliminating call uses first\n");
+              "ExpandVA abi requires eliminating call uses first");
         }
       }
 
@@ -1049,8 +1048,8 @@ PreservedAnalyses ExpandVariadicsPass::run(Module &M, ModuleAnalysisManager &) {
 
 ExpandVariadicsPass::ExpandVariadicsPass(OptimizationLevel Level)
     : ExpandVariadicsPass(Level == OptimizationLevel::O0
-                              ? ExpandVariadicsMode::disable
-                              : ExpandVariadicsMode::optimize) {}
+                              ? ExpandVariadicsMode::Disable
+                              : ExpandVariadicsMode::Optimize) {}
 
 ExpandVariadicsPass::ExpandVariadicsPass(ExpandVariadicsMode Mode)
     : ConstructedMode(Mode) {}
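One usability note on the renamed enumerators: since the override is a
cl::opt, the mode a target constructs the pass with (Lowering for AMDGPU and
NVPTX above) can still be flipped from the command line, e.g.:

  opt -S --passes=expand-variadics --expand-variadics-override=disable input.ll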

>From c495d4f71aa4f5d6bd3ec0f1cd9c2195d977ee68 Mon Sep 17 00:00:00 2001
From: Jon Chesterfield <jonathanchesterfield at gmail.com>
Date: Thu, 18 Apr 2024 22:21:05 +0100
Subject: [PATCH 3/3] Apply jhuber6's symbol/dependency patch

---
 llvm/include/llvm/Transforms/IPO/ExpandVariadics.h | 3 ---
 llvm/lib/Passes/PassBuilderPipelines.cpp           | 5 ++++-
 llvm/lib/Passes/PassRegistry.def                   | 2 +-
 llvm/lib/Transforms/IPO/ExpandVariadics.cpp        | 5 -----
 4 files changed, 5 insertions(+), 10 deletions(-)

diff --git a/llvm/include/llvm/Transforms/IPO/ExpandVariadics.h b/llvm/include/llvm/Transforms/IPO/ExpandVariadics.h
index 64704fa7fc95de..92df9647d8d8f4 100644
--- a/llvm/include/llvm/Transforms/IPO/ExpandVariadics.h
+++ b/llvm/include/llvm/Transforms/IPO/ExpandVariadics.h
@@ -30,9 +30,6 @@ class ExpandVariadicsPass : public PassInfoMixin<ExpandVariadicsPass> {
   // Operates under passed mode unless overridden on commandline
   ExpandVariadicsPass(ExpandVariadicsMode ConstructedMode);
 
-  // Chooses disable or optimize based on optimization level
-  ExpandVariadicsPass(OptimizationLevel Level);
-
   PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
 };
 
diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp
index b1a306e15fa3c0..99788bca278c8a 100644
--- a/llvm/lib/Passes/PassBuilderPipelines.cpp
+++ b/llvm/lib/Passes/PassBuilderPipelines.cpp
@@ -1174,8 +1174,11 @@ PassBuilder::buildModuleSimplificationPipeline(OptimizationLevel Level,
     MPM.addPass(PGOForceFunctionAttrsPass(PGOOpt->ColdOptType));
 
   // ExpandVariadics interacts well with the function inliner.
-  MPM.addPass(ExpandVariadicsPass(Level));
+  MPM.addPass(ExpandVariadicsPass(Level == OptimizationLevel::O0
+                                  ? ExpandVariadicsMode::Disable
+                                  : ExpandVariadicsMode::Optimize));
 
   MPM.addPass(AlwaysInlinerPass(/*InsertLifetimeIntrinsics=*/true));
 
   if (EnableModuleInliner)
diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def
index d7d29383f9846d..07999b3463c7dc 100644
--- a/llvm/lib/Passes/PassRegistry.def
+++ b/llvm/lib/Passes/PassRegistry.def
@@ -59,7 +59,7 @@ MODULE_PASS("dot-callgraph", CallGraphDOTPrinterPass())
 MODULE_PASS("dxil-upgrade", DXILUpgradePass())
 MODULE_PASS("elim-avail-extern", EliminateAvailableExternallyPass())
 MODULE_PASS("extract-blocks", BlockExtractorPass({}, false))
-MODULE_PASS("expand-variadics", ExpandVariadicsPass(OptimizationLevel::O0))
+MODULE_PASS("expand-variadics", ExpandVariadicsPass(ExpandVariadicsMode::Disable))
 MODULE_PASS("forceattrs", ForceFunctionAttrsPass())
 MODULE_PASS("function-import", FunctionImportPass())
 MODULE_PASS("globalopt", GlobalOptPass())
diff --git a/llvm/lib/Transforms/IPO/ExpandVariadics.cpp b/llvm/lib/Transforms/IPO/ExpandVariadics.cpp
index 814bdb9a2d21b0..f188d583f468bd 100644
--- a/llvm/lib/Transforms/IPO/ExpandVariadics.cpp
+++ b/llvm/lib/Transforms/IPO/ExpandVariadics.cpp
@@ -1046,10 +1046,5 @@ PreservedAnalyses ExpandVariadicsPass::run(Module &M, ModuleAnalysisManager &) {
              : PreservedAnalyses::all();
 }
 
-ExpandVariadicsPass::ExpandVariadicsPass(OptimizationLevel Level)
-    : ExpandVariadicsPass(Level == OptimizationLevel::O0
-                              ? ExpandVariadicsMode::Disable
-                              : ExpandVariadicsMode::Optimize) {}
-
 ExpandVariadicsPass::ExpandVariadicsPass(ExpandVariadicsMode Mode)
     : ConstructedMode(Mode) {}


