[llvm] 6ee497a - [X86][Regcall] Add an option to respect regcall ABI v.4 in win64&win32

Wed Aug 2 22:58:48 PDT 2023

Author: Bing1 Yu
Date: 2023-08-03T13:58:33+08:00
New Revision: 6ee497aa0b48ad892447f29a90b4e61241949295

URL: https://github.com/llvm/llvm-project/commit/6ee497aa0b48ad892447f29a90b4e61241949295
DIFF: https://github.com/llvm/llvm-project/commit/6ee497aa0b48ad892447f29a90b4e61241949295.diff

LOG: [X86][Regcall] Add an option to respect regcall ABI v.4 in win64&win32

Reviewed By: pengfei

Differential Revision: https://reviews.llvm.org/D155863

Added: 
    clang/test/CodeGen/check-regcall4-moduleflag.c
    clang/test/CodeGen/regcall4.c
    clang/test/CodeGenCXX/regcall4.cpp
    llvm/test/CodeGen/X86/sse-regcall4.ll

Modified: 
    clang/include/clang/Basic/LangOptions.def
    clang/include/clang/Driver/Options.td
    clang/lib/AST/ItaniumMangle.cpp
    clang/lib/AST/Mangle.cpp
    clang/lib/AST/MicrosoftMangle.cpp
    clang/lib/CodeGen/CodeGenModule.cpp
    clang/lib/Driver/ToolChains/Clang.cpp
    clang/test/Driver/cl-cc-flags.c
    llvm/lib/Target/X86/X86CallingConv.td

Removed: 
    


################################################################################
diff  --git a/clang/include/clang/Basic/LangOptions.def b/clang/include/clang/Basic/LangOptions.def
index 007b3737f83e62..b6bb5e969e130c 100644

--- a/clang/include/clang/Basic/LangOptions.def
+++ b/clang/include/clang/Basic/LangOptions.def
@@ -429,6 +429,8 @@ LANGOPT(PaddingOnUnsignedFixedPoint, 1, 0,
 
 LANGOPT(RegisterStaticDestructors, 1, 1, "Register C++ static destructors")
 
+LANGOPT(RegCall4, 1, 0, "Set __regcall4 as a default calling convention to respect __regcall ABI v.4")
+
 LANGOPT(MatrixTypes, 1, 0, "Enable or disable the builtin matrix type")
 
 ENUM_LANGOPT(StrictFlexArraysLevel, StrictFlexArraysLevelKind, 2,

diff  --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index 9e25a5e0b58a58..296fa1fcc38a02 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -4505,6 +4505,9 @@ def no_offload_add_rpath: Flag<["--"], "no-offload-add-rpath">, Flags<[NoArgumen
   Alias<frtlib_add_rpath>;
 def r : Flag<["-"], "r">, Flags<[LinkerInput,NoArgumentUnused]>,
         Group<Link_Group>;
+def regcall4 : Flag<["-"], "regcall4">, Group<m_Group>, Flags<[CC1Option]>,
+  HelpText<"Set __regcall4 as a default calling convention to respect __regcall ABI v.4">,
+  MarshallingInfoFlag<LangOpts<"RegCall4">>;
 def save_temps_EQ : Joined<["-", "--"], "save-temps=">, Flags<[CC1Option, FlangOption, FC1Option, NoXarchOption]>,
   HelpText<"Save intermediate compilation results.">;
 def save_temps : Flag<["-", "--"], "save-temps">, Flags<[FlangOption, FC1Option, NoXarchOption]>,
@@ -7292,6 +7295,8 @@ def _SLASH_Gv : CLFlag<"Gv">,
   HelpText<"Set __vectorcall as a default calling convention">;
 def _SLASH_Gregcall : CLFlag<"Gregcall">,
   HelpText<"Set __regcall as a default calling convention">;
+def _SLASH_Gregcall4 : CLFlag<"Gregcall4">,
+  HelpText<"Set __regcall4 as a default calling convention to respect __regcall ABI v.4">;
 
 // GNU Driver aliases
 

diff  --git a/clang/lib/AST/ItaniumMangle.cpp b/clang/lib/AST/ItaniumMangle.cpp
index 16f0d90451f7ad..153f6dc2e9cf12 100644
--- a/clang/lib/AST/ItaniumMangle.cpp
+++ b/clang/lib/AST/ItaniumMangle.cpp
@@ -1688,8 +1688,12 @@ void CXXNameMangler::mangleRegCallName(const IdentifierInfo *II) {
   // <source-name> ::= <positive length number> __regcall3__ <identifier>
   // <number> ::= [n] <non-negative decimal integer>
   // <identifier> ::= <unqualified source code identifier>
-  Out << II->getLength() + sizeof("__regcall3__") - 1 << "__regcall3__"
-      << II->getName();
+  if (getASTContext().getLangOpts().RegCall4)
+    Out << II->getLength() + sizeof("__regcall4__") - 1 << "__regcall4__"
+        << II->getName();
+  else
+    Out << II->getLength() + sizeof("__regcall3__") - 1 << "__regcall3__"
+        << II->getName();
 }
 
 void CXXNameMangler::mangleDeviceStubName(const IdentifierInfo *II) {

diff  --git a/clang/lib/AST/Mangle.cpp b/clang/lib/AST/Mangle.cpp
index 31cdad4c8fdd4e..53af9fc4d51897 100644
--- a/clang/lib/AST/Mangle.cpp
+++ b/clang/lib/AST/Mangle.cpp
@@ -198,8 +198,12 @@ void MangleContext::mangleName(GlobalDecl GD, raw_ostream &Out) {
     Out << '_';
   else if (CC == CCM_Fast)
     Out << '@';
-  else if (CC == CCM_RegCall)
-    Out << "__regcall3__";
+  else if (CC == CCM_RegCall) {
+    if (getASTContext().getLangOpts().RegCall4)
+      Out << "__regcall4__";
+    else
+      Out << "__regcall3__";
+  }
 
   if (!MCXX)
     Out << D->getIdentifier()->getName();

diff  --git a/clang/lib/AST/MicrosoftMangle.cpp b/clang/lib/AST/MicrosoftMangle.cpp
index 3306d90dc85664..91af18d6119796 100644
--- a/clang/lib/AST/MicrosoftMangle.cpp
+++ b/clang/lib/AST/MicrosoftMangle.cpp
@@ -2853,6 +2853,7 @@ void MicrosoftCXXNameMangler::mangleCallingConvention(CallingConv CC) {
   //                      ::= T # __attribute__((__swiftasynccall__))
   //                            // Clang-only
   //                      ::= w # __regcall
+  //                      ::= x # __regcall4
   // The 'export' calling conventions are from a bygone era
   // (*cough*Win16*cough*) when functions were declared for export with
   // that keyword. (It didn't actually export them, it just made them so
@@ -2873,7 +2874,12 @@ void MicrosoftCXXNameMangler::mangleCallingConvention(CallingConv CC) {
     case CC_Swift: Out << 'S'; break;
     case CC_SwiftAsync: Out << 'W'; break;
     case CC_PreserveMost: Out << 'U'; break;
-    case CC_X86RegCall: Out << 'w'; break;
+    case CC_X86RegCall:
+      if (getASTContext().getLangOpts().RegCall4)
+        Out << "x";
+      else
+        Out << "w";
+      break;
   }
 }
 void MicrosoftCXXNameMangler::mangleCallingConvention(const FunctionType *T) {

diff  --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp
index 8d2abc69c330e7..e7cbc748b7a38b 100644
--- a/clang/lib/CodeGen/CodeGenModule.cpp
+++ b/clang/lib/CodeGen/CodeGenModule.cpp
@@ -1196,6 +1196,8 @@ void CodeGenModule::Release() {
     getModule().setOverrideStackAlignment(getCodeGenOpts().StackAlignment);
   if (getCodeGenOpts().SkipRaxSetup)
     getModule().addModuleFlag(llvm::Module::Override, "SkipRaxSetup", 1);
+  if (getLangOpts().RegCall4)
+    getModule().addModuleFlag(llvm::Module::Override, "RegCallv4", 1);
 
   if (getContext().getTargetInfo().getMaxTLSAlign())
     getModule().addModuleFlag(llvm::Module::Error, "MaxTLSAlign",
@@ -1707,7 +1709,10 @@ static std::string getMangledNameImpl(CodeGenModule &CGM, GlobalDecl GD,
 
     if (FD &&
         FD->getType()->castAs<FunctionType>()->getCallConv() == CC_X86RegCall) {
-      Out << "__regcall3__" << II->getName();
+      if (CGM.getLangOpts().RegCall4)
+        Out << "__regcall4__" << II->getName();
+      else
+        Out << "__regcall3__" << II->getName();
     } else if (FD && FD->hasAttr<CUDAGlobalAttr>() &&
                GD.getKernelReferenceKind() == KernelReferenceKind::Stub) {
       Out << "__device_stub__" << II->getName();

diff  --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index eae3643bd4bf59..c5155735000490 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -7936,6 +7936,9 @@ void Clang::AddClangCLArgs(const ArgList &Args, types::ID InputType,
       CmdArgs.push_back("-fms-memptr-rep=virtual");
   }
 
+  if (Args.hasArg(options::OPT_regcall4))
+    CmdArgs.push_back("-regcall4");
+
   // Parse the default calling convention options.
   if (Arg *CCArg =
           Args.getLastArg(options::OPT__SLASH_Gd, options::OPT__SLASH_Gr,
@@ -7972,6 +7975,9 @@ void Clang::AddClangCLArgs(const ArgList &Args, types::ID InputType,
       CmdArgs.push_back(DCCFlag);
   }
 
+  if (Args.hasArg(options::OPT__SLASH_Gregcall4))
+    CmdArgs.push_back("-regcall4");
+
   Args.AddLastArg(CmdArgs, options::OPT_vtordisp_mode_EQ);
 
   if (!Args.hasArg(options::OPT_fdiagnostics_format_EQ)) {

diff  --git a/clang/test/CodeGen/check-regcall4-moduleflag.c b/clang/test/CodeGen/check-regcall4-moduleflag.c
new file mode 100644
index 00000000000000..0b968e3d19d827
--- /dev/null
+++ b/clang/test/CodeGen/check-regcall4-moduleflag.c
@@ -0,0 +1,7 @@
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown -emit-llvm %s -o - | FileCheck %s -check-prefix=NO-REGCALL4
+// RUN: %clang_cc1 -triple x86_64-unknown-unknown -regcall4 -emit-llvm %s -o - | FileCheck %s -check-prefix=REGCALL4
+
+void f(void) {}
+
+// REGCALL4: !"RegCallv4", i32 1}
+// NO-REGCALL4-NOT: "RegCallv4"

diff  --git a/clang/test/CodeGen/regcall4.c b/clang/test/CodeGen/regcall4.c
new file mode 100644
index 00000000000000..5fbe77fbc7d769
--- /dev/null
+++ b/clang/test/CodeGen/regcall4.c
@@ -0,0 +1,100 @@
+// RUN: %clang_cc1 -regcall4 -emit-llvm %s -o - -ffreestanding -triple=i386-pc-win32       | FileCheck %s --check-prefixes=X86,Win32
+// RUN: %clang_cc1 -regcall4 -emit-llvm %s -o - -ffreestanding -triple=x86_64-pc-win32     | FileCheck %s --check-prefixes=X64,Win64
+// RUN: %clang_cc1 -regcall4 -emit-llvm %s -o - -ffreestanding -triple=i386-pc-linux-gnu   | FileCheck %s --check-prefixes=X86,Lin32
+// RUN: %clang_cc1 -regcall4 -emit-llvm %s -o - -ffreestanding -triple=x86_64-pc-linux-gnu | FileCheck %s --check-prefixes=X64,Lin64
+
+#include <xmmintrin.h>
+
+void __regcall v1(int a, int b) {}
+// X86: define dso_local x86_regcallcc void @__regcall4__v1(i32 inreg noundef %a, i32 inreg noundef %b)
+// X64: define dso_local x86_regcallcc void @__regcall4__v1(i32 noundef %a, i32 noundef %b)
+
+void __attribute__((regcall)) v1b(int a, int b) {}
+// X86: define dso_local x86_regcallcc void @__regcall4__v1b(i32 inreg noundef %a, i32 inreg noundef %b)
+// X64: define dso_local x86_regcallcc void @__regcall4__v1b(i32 noundef %a, i32 noundef %b)
+
+void __regcall v2(char a, char b) {}
+// X86: define dso_local x86_regcallcc void @__regcall4__v2(i8 inreg noundef signext %a, i8 inreg noundef signext %b)
+// Win64: define dso_local x86_regcallcc void @__regcall4__v2(i8 noundef %a, i8 noundef %b)
+// Lin64: define dso_local x86_regcallcc void @__regcall4__v2(i8 noundef signext %a, i8 noundef signext %b)
+
+struct Small { int x; };
+void __regcall v3(int a, struct Small b, int c) {}
+// Win32: define dso_local x86_regcallcc void @__regcall4__v3(i32 inreg noundef %a, i32 %b.0, i32 inreg noundef %c)
+// Lin32: define dso_local x86_regcallcc void @__regcall4__v3(i32 inreg noundef %a, i32 inreg %0, i32 %b.0, i32 inreg noundef %c)
+// X64: define dso_local x86_regcallcc void @__regcall4__v3(i32 noundef %a, i32 %b.coerce, i32 noundef %c)
+
+struct Large { int a[5]; };
+void __regcall v4(int a, struct Large b, int c) {}
+// Win32: define dso_local x86_regcallcc void @__regcall4__v4(i32 inreg noundef %a, ptr noundef byval(%struct.Large) align 4 %b, i32 inreg noundef %c)
+// Lin32: define dso_local x86_regcallcc void @__regcall4__v4(i32 inreg noundef %a, ptr noundef byval(%struct.Large) align 4 %b, i32 noundef %c)
+// Win64: define dso_local x86_regcallcc void @__regcall4__v4(i32 noundef %a, ptr noundef %b, i32 noundef %c)
+// Lin64: define dso_local x86_regcallcc void @__regcall4__v4(i32 noundef %a, [5 x i32] %b.coerce, i32 noundef %c)
+
+void __regcall v5(long long a, int b, int c) {}
+// X86: define dso_local x86_regcallcc void @__regcall4__v5(i64 noundef %a, i32 inreg noundef %b, i32 inreg noundef %c)
+// X64: define dso_local x86_regcallcc void @__regcall4__v5(i64 noundef %a, i32 noundef %b, i32 noundef %c)
+
+struct HFA2 { double x, y; };
+struct HFA4 { double w, x, y, z; };
+struct HFA5 { double v, w, x, y, z; };
+
+void __regcall hfa1(int a, struct HFA4 b, int c) {}
+// X86: define dso_local x86_regcallcc void @__regcall4__hfa1(i32 inreg noundef %a, double %b.0, double %b.1, double %b.2, double %b.3, i32 inreg noundef %c)
+// X64: define dso_local x86_regcallcc void @__regcall4__hfa1(i32 noundef %a, double %{{.*}}, double %{{.*}}, double %{{.*}}, double %{{.*}}, i32 noundef %c)
+
+// HFAs that would require more than six total SSE registers are passed
+// indirectly. Additional vector arguments can consume the rest of the SSE
+// registers.
+void __regcall hfa2(struct HFA4 a, struct HFA4 b, double c) {}
+// X86: define dso_local x86_regcallcc void @__regcall4__hfa2(double %a.0, double %a.1, double %a.2, double %a.3, double %b.0, double %b.1, double %b.2, double %b.3, ptr inreg noundef %0)
+// X64: define dso_local x86_regcallcc void @__regcall4__hfa2(double %{{.*}}, double %{{.*}}, double %{{.*}}, double %{{.*}}, double %{{.*}}, double %{{.*}}, double %{{.*}}, double %{{.*}}, double noundef %c)
+
+// Ensure that we pass builtin types directly while counting them against the
+// SSE register usage.
+void __regcall hfa3(double a, double b, double c, double d, double e, struct HFA2 f) {}
+// X86: define dso_local x86_regcallcc void @__regcall4__hfa3(double noundef %a, double noundef %b, double noundef %c, double noundef %d, double noundef %e, double %f.0, double %f.1)
+// X64: define dso_local x86_regcallcc void @__regcall4__hfa3(double noundef %a, double noundef %b, double noundef %c, double noundef %d, double noundef %e, double %{{.*}}, double %{{.*}})
+
+// Aggregates with more than four elements are not HFAs and are passed byval.
+// Because they are not classified as homogeneous, they don't get special
+// handling to ensure alignment.
+void __regcall hfa4(struct HFA5 a) {}
+// X32: define dso_local x86_regcallcc void @__regcall4__hfa4(ptr noundef byval(%struct.HFA5) align 4 %{{.*}})
+// Win64: define dso_local x86_regcallcc void @__regcall4__hfa4(ptr noundef %a)
+// Lin64: define dso_local x86_regcallcc void @__regcall4__hfa4(double %a.coerce0, double %a.coerce1, double %a.coerce2, double %a.coerce3, double %a.coerce4)
+
+// Return HFAs of 4 or fewer elements in registers.
+static struct HFA2 g_hfa2;
+struct HFA2 __regcall hfa5(void) { return g_hfa2; }
+// X86: define dso_local x86_regcallcc %struct.HFA2 @__regcall4__hfa5()
+// X64: define dso_local x86_regcallcc %struct.HFA2 @__regcall4__hfa5()
+
+typedef float __attribute__((vector_size(16))) v4f32;
+struct HVA2 { v4f32 x, y; };
+struct HVA4 { v4f32 w, x, y, z; };
+
+void __regcall hva1(int a, struct HVA4 b, int c) {}
+// X86: define dso_local x86_regcallcc void @__regcall4__hva1(i32 inreg noundef %a, <4 x float> %b.0, <4 x float> %b.1, <4 x float> %b.2, <4 x float> %b.3, i32 inreg noundef %c)
+// X64: define dso_local x86_regcallcc void @__regcall4__hva1(i32 noundef %a, <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}, i32 noundef %c)
+
+void __regcall hva2(struct HVA4 a, struct HVA4 b, v4f32 c) {}
+// X86: define dso_local x86_regcallcc void @__regcall4__hva2(<4 x float> %a.0, <4 x float> %a.1, <4 x float> %a.2, <4 x float> %a.3, <4 x float> %b.0, <4 x float> %b.1, <4 x float> %b.2, <4 x float> %b.3, ptr inreg noundef %0)
+// X64: define dso_local x86_regcallcc void @__regcall4__hva2(<4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> %{{.*}}, <4 x float> noundef %c)
+
+void __regcall hva3(v4f32 a, v4f32 b, v4f32 c, v4f32 d, v4f32 e, struct HVA2 f) {}
+// X86: define dso_local x86_regcallcc void @__regcall4__hva3(<4 x float> noundef %a, <4 x float> noundef %b, <4 x float> noundef %c, <4 x float> noundef %d, <4 x float> noundef %e, <4 x float> %f.0, <4 x float> %f.1)
+// X64: define dso_local x86_regcallcc void @__regcall4__hva3(<4 x float> noundef %a, <4 x float> noundef %b, <4 x float> noundef %c, <4 x float> noundef %d, <4 x float> noundef %e, <4 x float> %{{.*}}, <4 x float> %{{.*}})
+
+typedef float __attribute__((ext_vector_type(3))) v3f32;
+struct OddSizeHVA { v3f32 x, y; };
+
+void __regcall odd_size_hva(struct OddSizeHVA a) {}
+// X86: define dso_local x86_regcallcc void @__regcall4__odd_size_hva(<3 x float> %a.0, <3 x float> %a.1)
+// X64: define dso_local x86_regcallcc void @__regcall4__odd_size_hva(<3 x float> %{{.*}}, <3 x float> %{{.*}})
+
+struct HFA6 { __m128 f[4]; };
+struct HFA6 __regcall ret_reg_reused(struct HFA6 a, struct HFA6 b, struct HFA6 c, struct HFA6 d){ struct HFA6 h; return h;}
+// X86: define dso_local x86_regcallcc %struct.HFA6 @__regcall4__ret_reg_reused(<4 x float> %a.0, <4 x float> %a.1, <4 x float> %a.2, <4 x float> %a.3, <4 x float> %b.0, <4 x float> %b.1, <4 x float> %b.2, <4 x float> %b.3, ptr inreg noundef %c, ptr inreg noundef %d)
+// Win64: define dso_local x86_regcallcc %struct.HFA6 @__regcall4__ret_reg_reused(<4 x float> %a.0, <4 x float> %a.1, <4 x float> %a.2, <4 x float> %a.3, <4 x float> %b.0, <4 x float> %b.1, <4 x float> %b.2, <4 x float> %b.3, <4 x float> %c.0, <4 x float> %c.1, <4 x float> %c.2, <4 x float> %c.3, <4 x float> %d.0, <4 x float> %d.1, <4 x float> %d.2, <4 x float> %d.3)
+// Lin64: define dso_local x86_regcallcc %struct.HFA6 @__regcall4__ret_reg_reused([4 x <4 x float>] %a.coerce, [4 x <4 x float>] %b.coerce, [4 x <4 x float>] %c.coerce, [4 x <4 x float>] %d.coerce)

diff  --git a/clang/test/CodeGenCXX/regcall4.cpp b/clang/test/CodeGenCXX/regcall4.cpp
new file mode 100644
index 00000000000000..7c35db36e1053d
--- /dev/null
+++ b/clang/test/CodeGenCXX/regcall4.cpp
@@ -0,0 +1,120 @@
+// RUN: %clang_cc1 -regcall4 -triple x86_64-linux-gnu -emit-llvm -std=c++11     %s -o - | FileCheck -allow-deprecated-dag-overlap -check-prefix=CHECK-LIN -check-prefix=CHECK-LIN64 %s
+// RUN: %clang_cc1 -regcall4 -triple i386-linux-gnu -emit-llvm -std=c++11     %s -o -   | FileCheck -allow-deprecated-dag-overlap -check-prefix=CHECK-LIN -check-prefix=CHECK-LIN32 %s
+// RUN: %clang_cc1 -regcall4 -triple x86_64-windows-msvc -emit-llvm -std=c++11  %s -o - -DWIN_TEST | FileCheck -allow-deprecated-dag-overlap -check-prefix=CHECK-WIN64 %s
+// RUN: %clang_cc1 -regcall4 -triple i386-windows-msvc -emit-llvm -std=c++11  %s -o - -DWIN_TEST   | FileCheck -allow-deprecated-dag-overlap -check-prefix=CHECK-WIN32 %s
+
+int __regcall foo(int i);
+
+int main()
+{
+  int p = 0, _data;
+  auto lambda = [&](int parameter) -> int {
+    _data = foo(parameter);
+    return _data;
+  };
+  return lambda(p);
+}
+// CHECK-LIN: call x86_regcallcc {{.+}} @_Z15__regcall4__foo
+// CHECK-WIN64: call x86_regcallcc {{.+}} @"?foo@@YxHH@Z"
+// CHECK-WIN32: call x86_regcallcc {{.+}} @"?foo@@YxHH@Z"
+
+int __regcall foo (int i){
+  return i;
+}
+// CHECK-LIN: define{{.*}} x86_regcallcc noundef {{.+}}@_Z15__regcall4__foo
+// CHECK-WIN64: define dso_local x86_regcallcc noundef {{.+}}@"?foo@@YxHH@Z"
+// CHECK-WIN32: define dso_local x86_regcallcc noundef {{.+}}@"?foo@@YxHH@Z"
+
+// used to give a body to test_class functions
+static int x = 0;
+class test_class {
+  int a;
+public:
+#ifndef WIN_TEST
+  __regcall
+#endif
+    test_class(){++x;}
+  // CHECK-LIN-DAG: define linkonce_odr x86_regcallcc void @_ZN10test_classC1Ev
+  // CHECK-LIN-DAG: define linkonce_odr x86_regcallcc void @_ZN10test_classC2Ev
+  // Windows ignores calling convention on constructor/destructors.
+  // CHECK-WIN64-DAG: define linkonce_odr dso_local noundef ptr @"??0test_class@@QEAA@XZ"
+  // CHECK-WIN32-DAG: define linkonce_odr dso_local x86_thiscallcc noundef ptr @"??0test_class@@QAE@XZ"
+
+#ifndef WIN_TEST
+  __regcall
+#endif
+  ~test_class(){--x;}
+  // CHECK-LIN-DAG: define linkonce_odr x86_regcallcc void @_ZN10test_classD2Ev
+  // CHECK-LIN-DAG: define linkonce_odr x86_regcallcc void @_ZN10test_classD1Ev
+  // Windows ignores calling convention on constructor/destructors.
+  // CHECK-WIN64-DAG: define linkonce_odr dso_local void @"??1test_class@@QEAA@XZ"
+  // CHECK-WIN32-DAG: define linkonce_odr dso_local x86_thiscallcc void @"??1test_class@@QAE@XZ"
+
+  test_class& __regcall operator+=(const test_class&){
+    return *this;
+  }
+  // CHECK-LIN-DAG: define linkonce_odr x86_regcallcc noundef nonnull align 4 dereferenceable(4) ptr @_ZN10test_classpLERKS_
+  // CHECK-WIN64-DAG: define linkonce_odr dso_local x86_regcallcc noundef nonnull align 4 dereferenceable(4) ptr @"??Ytest_class@@QEAxAEAV0@AEBV0@@Z"
+  // CHECK-WIN32-DAG: define linkonce_odr dso_local x86_regcallcc noundef nonnull align 4 dereferenceable(4) ptr @"??Ytest_class@@QAxAAV0@ABV0@@Z"
+  void __regcall do_thing(){}
+  // CHECK-LIN-DAG: define linkonce_odr x86_regcallcc void @_ZN10test_class20__regcall4__do_thingEv
+  // CHECK-WIN64-DAG: define linkonce_odr dso_local x86_regcallcc void @"?do_thing@test_class@@QEAxXXZ"
+  // CHECK-WIN32-DAG: define linkonce_odr dso_local x86_regcallcc void @"?do_thing@test_class@@QAxXXZ"
+
+  template<typename T>
+  void __regcall tempFunc(T i){}
+  // CHECK-LIN-DAG: define linkonce_odr x86_regcallcc void @_ZN10test_class20__regcall4__tempFuncIiEEvT_
+  // CHECK-WIN64-DAG: define linkonce_odr dso_local x86_regcallcc void @"??$freeTempFunc@H@@YxXH@Z"
+  // CHECK-WIN32-DAG: define linkonce_odr dso_local x86_regcallcc void @"??$freeTempFunc@H@@YxXH@Z"
+};
+
+bool __regcall operator ==(const test_class&, const test_class&){ --x; return false;}
+// CHECK-LIN-DAG: define{{.*}} x86_regcallcc noundef zeroext i1 @_ZeqRK10test_classS1_
+// CHECK-WIN64-DAG: define dso_local x86_regcallcc noundef zeroext i1 @"??8@Yx_NAEBVtest_class@@0@Z"
+// CHECK-WIN32-DAG: define dso_local x86_regcallcc noundef zeroext i1 @"??8@Yx_NABVtest_class@@0@Z"
+
+test_class __regcall operator""_test_class (unsigned long long) { ++x; return test_class{};}
+// CHECK-LIN64-DAG: define{{.*}} x86_regcallcc void @_Zli11_test_classy(ptr noalias sret(%class.test_class) align 4 %agg.result, i64 noundef %0)
+// CHECK-LIN32-DAG: define{{.*}} x86_regcallcc void @_Zli11_test_classy(ptr inreg noalias sret(%class.test_class) align 4 %agg.result, i64 noundef %0)
+// CHECK-WIN64-DAG: ??__K_test_class@@Yx?AVtest_class@@_K@Z"
+// CHECK-WIN32-DAG: ??__K_test_class@@Yx?AVtest_class@@_K@Z"
+
+template<typename T>
+void __regcall freeTempFunc(T i){}
+// CHECK-LIN-DAG: define linkonce_odr x86_regcallcc void @_Z24__regcall4__freeTempFuncIiEvT_
+// CHECK-WIN64-DAG: define linkonce_odr dso_local x86_regcallcc void @"??$freeTempFunc@H@@YxXH@Z"
+// CHECK-WIN32-DAG: define linkonce_odr dso_local x86_regcallcc void @"??$freeTempFunc@H@@YxXH@Z"
+
+// class to force generation of functions
+void force_gen() {
+  test_class t;
+  test_class t2 = 12_test_class;
+  t += t2;
+  auto t3 = 100_test_class;
+  t3.tempFunc(1);
+  freeTempFunc(1);
+  t3.do_thing();
+}
+
+long double _Complex __regcall foo(long double _Complex f) {
+  return f;
+}
+// CHECK-LIN64-DAG: define{{.*}} x86_regcallcc void @_Z15__regcall4__fooCe(ptr noalias sret({ x86_fp80, x86_fp80 }) align 16 %agg.result, ptr noundef byval({ x86_fp80, x86_fp80 }) align 16 %f)
+// CHECK-LIN32-DAG: define{{.*}} x86_regcallcc void @_Z15__regcall4__fooCe(ptr inreg noalias sret({ x86_fp80, x86_fp80 }) align 4 %agg.result, ptr noundef byval({ x86_fp80, x86_fp80 }) align 4 %f)
+// CHECK-WIN64-DAG: define dso_local x86_regcallcc noundef { double, double } @"?foo@@YxU?$_Complex@O@__clang@@U12@@Z"(double noundef %f.0, double noundef %f.1)
+// CHECK-WIN32-DAG: define dso_local x86_regcallcc noundef { double, double } @"?foo@@YxU?$_Complex@O@__clang@@U12@@Z"(double noundef %f.0, double noundef %f.1)
+
+// The following caused us to dereference uninitialized memory. The long name
+// seems necessary, as does the return types.
+float _Complex __regcall callee(float _Complex f);
+// CHECK-LIN64-DAG: declare x86_regcallcc noundef <2 x float> @_Z18__regcall4__calleeCf(<2 x float> noundef)
+// CHECK-LIN32-DAG: declare x86_regcallcc noundef { float, float } @_Z18__regcall4__calleeCf(float noundef, float noundef)
+// CHECK-WIN64-DAG: declare dso_local x86_regcallcc noundef { float, float } @"?callee@@YxU?$_Complex@M@__clang@@U12@@Z"(float noundef, float noundef)
+// CHECK-WIN32-DAG: declare dso_local x86_regcallcc noundef { float, float } @"?callee@@YxU?$_Complex@M@__clang@@U12@@Z"(float noundef, float noundef)
+
+__regcall int
+some_really_long_name_that_manages_to_hit_the_right_spot_of_mem(int a) {
+  float _Complex x[2];
+  x[0] = callee(x[0]);
+  return a;
+}

diff  --git a/clang/test/Driver/cl-cc-flags.c b/clang/test/Driver/cl-cc-flags.c
index 6fa0b6bd8e92f6..eacaee2c276978 100644
--- a/clang/test/Driver/cl-cc-flags.c
+++ b/clang/test/Driver/cl-cc-flags.c
@@ -16,6 +16,10 @@
 // RUN: %clang_cl --target=i686-windows-msvc /Gregcall -### -- %s 2>&1 | FileCheck --check-prefix=REGCALL %s
 // REGCALL: -fdefault-calling-conv=regcall
 
+// RUN: %clang_cl --target=i686-windows-msvc /Gregcall /Gregcall4 -### -- %s 2>&1 | FileCheck --check-prefix=REGCALL4 %s
+// REGCALL4: -fdefault-calling-conv=regcall
+// REGCALL4: -regcall4
+
 // Last one should win:
 
 // RUN: %clang_cl --target=i686-windows-msvc /Gd /Gv -### -- %s 2>&1 | FileCheck --check-prefix=LASTWINS_VECTOR %s

diff  --git a/llvm/lib/Target/X86/X86CallingConv.td b/llvm/lib/Target/X86/X86CallingConv.td
index 06cebdc2159438..3ce59dc4aa61bd 100644
--- a/llvm/lib/Target/X86/X86CallingConv.td
+++ b/llvm/lib/Target/X86/X86CallingConv.td
@@ -23,6 +23,11 @@ class CCIfNotSubtarget<string F, CCAction A>
                        "(State.getMachineFunction().getSubtarget()).", F),
            A>;
 
+/// CCIfRegCallv4 - Match if RegCall ABIv4 is respected.
+class CCIfRegCallv4<CCAction A>
+    : CCIf<"State.getMachineFunction().getFunction().getParent()->getModuleFlag(\"RegCallv4\")!=nullptr",
+           A>;
+
 /// CCIfIsVarArgOnWin - Match if isVarArg on Windows 32bits.
 class CCIfIsVarArgOnWin<CCAction A>
     : CCIf<"State.isVarArg() && "
@@ -55,6 +60,20 @@ def RC_X86_32_RegCall : RC_X86_RegCall {
   let ZMM = [ZMM0, ZMM1, ZMM2, ZMM3, ZMM4, ZMM5, ZMM6, ZMM7];
 }
 
+// RegCall register classes for 32 bits when regcall ABI v.4 is respected.
+// Change in __regcall ABI v.4: don't use EAX, as a spare register is
+// needed to code virtual call thunks.
+def RC_X86_32_RegCallv4_Win : RC_X86_RegCall {
+  let GPR_8 = [CL, DL, DIL, SIL];
+  let GPR_16 = [CX, DX, DI, SI];
+  let GPR_32 = [ECX, EDX, EDI, ESI];
+  let GPR_64 = [RAX]; ///< Not actually used, but AssignToReg can't handle []
+                      ///< \todo Fix AssignToReg to enable empty lists
+  let XMM = [XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7];
+  let YMM = [YMM0, YMM1, YMM2, YMM3, YMM4, YMM5, YMM6, YMM7];
+  let ZMM = [ZMM0, ZMM1, ZMM2, ZMM3, ZMM4, ZMM5, ZMM6, ZMM7];
+}
+
 class RC_X86_64_RegCall : RC_X86_RegCall {
   let XMM = [XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7,
              XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15];
@@ -71,6 +90,18 @@ def RC_X86_64_RegCall_Win : RC_X86_64_RegCall {
   let GPR_64 = [RAX, RCX, RDX, RDI, RSI, R8, R9, R10, R11, R12, R14, R15];
 }
 
+// On Windows 64 we don't want to use R13 - it is reserved for
+// largely aligned stack.
+// Change in __regcall ABI v.4: additionally don't use R10, as a
+// spare register is needed to code virtual call thunks.
+//
+def RC_X86_64_RegCallv4_Win : RC_X86_64_RegCall {
+  let GPR_8 = [AL, CL, DL, DIL, SIL, R8B, R9B, R11B, R12B, R14B, R15B];
+  let GPR_16 = [AX, CX, DX, DI, SI, R8W, R9W, R11W, R12W, R14W, R15W];
+  let GPR_32 = [EAX, ECX, EDX, EDI, ESI, R8D, R9D, R11D, R12D, R14D, R15D];
+  let GPR_64 = [RAX, RCX, RDX, RDI, RSI, R8, R9, R11, R12, R14, R15];
+}
+
 def RC_X86_64_RegCall_SysV : RC_X86_64_RegCall {
   let GPR_8 = [AL, CL, DL, DIL, SIL, R8B, R9B, R12B, R13B, R14B, R15B];
   let GPR_16 = [AX, CX, DX, DI, SI, R8W, R9W, R12W, R13W, R14W, R15W];
@@ -433,8 +464,12 @@ def RetCC_X86_64_AnyReg : CallingConv<[
 
 defm X86_32_RegCall :
 	 X86_RegCall_base<RC_X86_32_RegCall>;
+defm X86_32_RegCallv4_Win :
+	 X86_RegCall_base<RC_X86_32_RegCallv4_Win>; 
 defm X86_Win64_RegCall :
      X86_RegCall_base<RC_X86_64_RegCall_Win>;
+defm X86_Win64_RegCallv4 :
+     X86_RegCall_base<RC_X86_64_RegCallv4_Win>;
 defm X86_SysV64_RegCall :
      X86_RegCall_base<RC_X86_64_RegCall_SysV>;
 
@@ -447,6 +482,8 @@ def RetCC_X86_32 : CallingConv<[
   // If HiPE, use RetCC_X86_32_HiPE.
   CCIfCC<"CallingConv::HiPE", CCDelegateTo<RetCC_X86_32_HiPE>>,
   CCIfCC<"CallingConv::X86_VectorCall", CCDelegateTo<RetCC_X86_32_VectorCall>>,
+  CCIfCC<"CallingConv::X86_RegCall",
+    CCIfSubtarget<"isTargetWin32()", CCIfRegCallv4<CCDelegateTo<RetCC_X86_32_RegCallv4_Win>>>>,
   CCIfCC<"CallingConv::X86_RegCall", CCDelegateTo<RetCC_X86_32_RegCall>>,
 
   // Otherwise, use RetCC_X86_32_C.
@@ -473,6 +510,9 @@ def RetCC_X86_64 : CallingConv<[
   // Handle Vectorcall CC
   CCIfCC<"CallingConv::X86_VectorCall", CCDelegateTo<RetCC_X86_64_Vectorcall>>,
 
+  CCIfCC<"CallingConv::X86_RegCall",
+    CCIfSubtarget<"isTargetWin64()", CCIfRegCallv4<CCDelegateTo<RetCC_X86_Win64_RegCallv4>>>>,
+
   CCIfCC<"CallingConv::X86_RegCall",
           CCIfSubtarget<"isTargetWin64()",
                         CCDelegateTo<RetCC_X86_Win64_RegCall>>>,
@@ -1052,6 +1092,8 @@ def CC_X86_32 : CallingConv<[
   CCIfCC<"CallingConv::Tail", CCDelegateTo<CC_X86_32_FastCC>>,
   CCIfCC<"CallingConv::GHC", CCDelegateTo<CC_X86_32_GHC>>,
   CCIfCC<"CallingConv::HiPE", CCDelegateTo<CC_X86_32_HiPE>>,
+  CCIfCC<"CallingConv::X86_RegCall",
+    CCIfSubtarget<"isTargetWin32()", CCIfRegCallv4<CCDelegateTo<CC_X86_32_RegCallv4_Win>>>>,
   CCIfCC<"CallingConv::X86_RegCall", CCDelegateTo<CC_X86_32_RegCall>>,
 
   // Otherwise, drop to normal X86-32 CC
@@ -1067,6 +1109,8 @@ def CC_X86_64 : CallingConv<[
   CCIfCC<"CallingConv::Win64", CCDelegateTo<CC_X86_Win64_C>>,
   CCIfCC<"CallingConv::X86_64_SysV", CCDelegateTo<CC_X86_64_C>>,
   CCIfCC<"CallingConv::X86_VectorCall", CCDelegateTo<CC_X86_Win64_VectorCall>>,
+  CCIfCC<"CallingConv::X86_RegCall",
+    CCIfSubtarget<"isTargetWin64()", CCIfRegCallv4<CCDelegateTo<CC_X86_Win64_RegCallv4>>>>,
   CCIfCC<"CallingConv::X86_RegCall",
     CCIfSubtarget<"isTargetWin64()", CCDelegateTo<CC_X86_Win64_RegCall>>>,
   CCIfCC<"CallingConv::X86_RegCall", CCDelegateTo<CC_X86_SysV64_RegCall>>,

diff  --git a/llvm/test/CodeGen/X86/sse-regcall4.ll b/llvm/test/CodeGen/X86/sse-regcall4.ll
new file mode 100644
index 00000000000000..80eaf0f9000668
--- /dev/null
+++ b/llvm/test/CodeGen/X86/sse-regcall4.ll
@@ -0,0 +1,467 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i386-pc-win32 -mattr=+sse | FileCheck --check-prefix=WIN32 %s
+; RUN: llc < %s -mtriple=x86_64-win32 -mattr=+sse | FileCheck --check-prefix=WIN64 %s
+; RUN: llc < %s -mtriple=x86_64-linux-gnu -mattr=+sse | FileCheck --check-prefix=LINUXOSX %s
+
+; Test regcall when receiving/returning i1: the callee adds 1 to its i1 argument and returns the sum.
+define x86_regcallcc i1 @test_argReti1(i1 %a)  {
+; WIN32-LABEL: test_argReti1:
+; WIN32:       # %bb.0:
+; WIN32-NEXT:    incb %cl
+; WIN32-NEXT:    # kill: def $cl killed $cl killed $ecx
+; WIN32-NEXT:    retl
+;
+; WIN64-LABEL: test_argReti1:
+; WIN64:       # %bb.0:
+; WIN64-NEXT:    incb %al
+; WIN64-NEXT:    # kill: def $al killed $al killed $eax
+; WIN64-NEXT:    retq
+;
+; LINUXOSX-LABEL: test_argReti1:
+; LINUXOSX:       # %bb.0:
+; LINUXOSX-NEXT:    incb %al
+; LINUXOSX-NEXT:    # kill: def $al killed $al killed $eax
+; LINUXOSX-NEXT:    retq
+  %add = add i1 %a, 1
+  ret i1 %add
+}
+
+; Test regcall when passing/retrieving i1: adds 1 to the argument, calls test_argReti1 with it, then adds 1 to the returned value.
+define x86_regcallcc i1 @test_CallargReti1(i1 %a)  {
+; WIN32-LABEL: test_CallargReti1:
+; WIN32:       # %bb.0:
+; WIN32-NEXT:    incb %cl
+; WIN32-NEXT:    movzbl %cl, %ecx
+; WIN32-NEXT:    calll _test_argReti1
+; WIN32-NEXT:    incb %cl
+; WIN32-NEXT:    retl
+;
+; WIN64-LABEL: test_CallargReti1:
+; WIN64:       # %bb.0:
+; WIN64-NEXT:    pushq %rax
+; WIN64-NEXT:    .seh_stackalloc 8
+; WIN64-NEXT:    .seh_endprologue
+; WIN64-NEXT:    incb %al
+; WIN64-NEXT:    movzbl %al, %eax
+; WIN64-NEXT:    callq test_argReti1
+; WIN64-NEXT:    incb %al
+; WIN64-NEXT:    popq %rcx
+; WIN64-NEXT:    retq
+; WIN64-NEXT:    .seh_endproc
+;
+; LINUXOSX-LABEL: test_CallargReti1:
+; LINUXOSX:       # %bb.0:
+; LINUXOSX-NEXT:    pushq %rax
+; LINUXOSX-NEXT:    .cfi_def_cfa_offset 16
+; LINUXOSX-NEXT:    incb %al
+; LINUXOSX-NEXT:    movzbl %al, %eax
+; LINUXOSX-NEXT:    callq *test_argReti1@GOTPCREL(%rip)
+; LINUXOSX-NEXT:    incb %al
+; LINUXOSX-NEXT:    popq %rcx
+; LINUXOSX-NEXT:    .cfi_def_cfa_offset 8
+; LINUXOSX-NEXT:    retq
+  %b = add i1 %a, 1
+  %c = call x86_regcallcc i1 @test_argReti1(i1 %b)
+  %d = add i1 %c, 1
+  ret i1 %d
+}
+
+; Test calling conventions - <16 x float> input parameters and callee-saved XMM registers.
+define x86_regcallcc <16 x float> @testf32_inp(<16 x float> %a, <16 x float> %b, <16 x float> %c) nounwind {
+; WIN32-LABEL: testf32_inp:
+; WIN32:       # %bb.0:
+; WIN32-NEXT:    pushl %ebp
+; WIN32-NEXT:    movl %esp, %ebp
+; WIN32-NEXT:    andl $-16, %esp
+; WIN32-NEXT:    subl $32, %esp
+; WIN32-NEXT:    movaps %xmm7, (%esp) # 16-byte Spill
+; WIN32-NEXT:    movaps %xmm6, %xmm7
+; WIN32-NEXT:    movaps %xmm5, %xmm6
+; WIN32-NEXT:    movaps %xmm3, %xmm5
+; WIN32-NEXT:    movaps %xmm2, %xmm3
+; WIN32-NEXT:    movaps %xmm1, %xmm2
+; WIN32-NEXT:    movaps %xmm0, %xmm1
+; WIN32-NEXT:    addps %xmm4, %xmm0
+; WIN32-NEXT:    mulps %xmm4, %xmm1
+; WIN32-NEXT:    subps %xmm1, %xmm0
+; WIN32-NEXT:    movups 8(%ebp), %xmm1
+; WIN32-NEXT:    addps %xmm1, %xmm0
+; WIN32-NEXT:    movaps %xmm2, %xmm4
+; WIN32-NEXT:    addps %xmm6, %xmm4
+; WIN32-NEXT:    mulps %xmm6, %xmm2
+; WIN32-NEXT:    subps %xmm2, %xmm4
+; WIN32-NEXT:    movups 24(%ebp), %xmm1
+; WIN32-NEXT:    addps %xmm1, %xmm4
+; WIN32-NEXT:    movaps %xmm3, %xmm2
+; WIN32-NEXT:    addps %xmm7, %xmm2
+; WIN32-NEXT:    mulps %xmm7, %xmm3
+; WIN32-NEXT:    subps %xmm3, %xmm2
+; WIN32-NEXT:    movups 40(%ebp), %xmm1
+; WIN32-NEXT:    addps %xmm1, %xmm2
+; WIN32-NEXT:    movaps %xmm5, %xmm3
+; WIN32-NEXT:    movaps (%esp), %xmm1 # 16-byte Reload
+; WIN32-NEXT:    addps %xmm1, %xmm3
+; WIN32-NEXT:    mulps %xmm1, %xmm5
+; WIN32-NEXT:    subps %xmm5, %xmm3
+; WIN32-NEXT:    movups 56(%ebp), %xmm1
+; WIN32-NEXT:    addps %xmm1, %xmm3
+; WIN32-NEXT:    movaps %xmm4, %xmm1
+; WIN32-NEXT:    movl %ebp, %esp
+; WIN32-NEXT:    popl %ebp
+; WIN32-NEXT:    retl
+;
+; WIN64-LABEL: testf32_inp:
+; WIN64:       # %bb.0:
+; WIN64-NEXT:    subq $72, %rsp
+; WIN64-NEXT:    movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; WIN64-NEXT:    movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; WIN64-NEXT:    movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; WIN64-NEXT:    movaps %xmm12, (%rsp) # 16-byte Spill
+; WIN64-NEXT:    movaps %xmm0, %xmm12
+; WIN64-NEXT:    addps %xmm4, %xmm12
+; WIN64-NEXT:    movaps %xmm1, %xmm13
+; WIN64-NEXT:    addps %xmm5, %xmm13
+; WIN64-NEXT:    movaps %xmm2, %xmm14
+; WIN64-NEXT:    addps %xmm6, %xmm14
+; WIN64-NEXT:    movaps %xmm3, %xmm15
+; WIN64-NEXT:    addps %xmm7, %xmm15
+; WIN64-NEXT:    mulps %xmm4, %xmm0
+; WIN64-NEXT:    subps %xmm0, %xmm12
+; WIN64-NEXT:    mulps %xmm5, %xmm1
+; WIN64-NEXT:    subps %xmm1, %xmm13
+; WIN64-NEXT:    mulps %xmm6, %xmm2
+; WIN64-NEXT:    subps %xmm2, %xmm14
+; WIN64-NEXT:    mulps %xmm7, %xmm3
+; WIN64-NEXT:    subps %xmm3, %xmm15
+; WIN64-NEXT:    addps %xmm8, %xmm12
+; WIN64-NEXT:    addps %xmm9, %xmm13
+; WIN64-NEXT:    addps %xmm10, %xmm14
+; WIN64-NEXT:    addps %xmm11, %xmm15
+; WIN64-NEXT:    movaps %xmm12, %xmm0
+; WIN64-NEXT:    movaps %xmm13, %xmm1
+; WIN64-NEXT:    movaps %xmm14, %xmm2
+; WIN64-NEXT:    movaps %xmm15, %xmm3
+; WIN64-NEXT:    movaps (%rsp), %xmm12 # 16-byte Reload
+; WIN64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
+; WIN64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
+; WIN64-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
+; WIN64-NEXT:    addq $72, %rsp
+; WIN64-NEXT:    retq
+;
+; LINUXOSX-LABEL: testf32_inp:
+; LINUXOSX:       # %bb.0:
+; LINUXOSX-NEXT:    movaps %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; LINUXOSX-NEXT:    movaps %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; LINUXOSX-NEXT:    movaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; LINUXOSX-NEXT:    movaps %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; LINUXOSX-NEXT:    movaps %xmm0, %xmm12
+; LINUXOSX-NEXT:    addps %xmm4, %xmm12
+; LINUXOSX-NEXT:    movaps %xmm1, %xmm13
+; LINUXOSX-NEXT:    addps %xmm5, %xmm13
+; LINUXOSX-NEXT:    movaps %xmm2, %xmm14
+; LINUXOSX-NEXT:    addps %xmm6, %xmm14
+; LINUXOSX-NEXT:    movaps %xmm3, %xmm15
+; LINUXOSX-NEXT:    addps %xmm7, %xmm15
+; LINUXOSX-NEXT:    mulps %xmm4, %xmm0
+; LINUXOSX-NEXT:    subps %xmm0, %xmm12
+; LINUXOSX-NEXT:    mulps %xmm5, %xmm1
+; LINUXOSX-NEXT:    subps %xmm1, %xmm13
+; LINUXOSX-NEXT:    mulps %xmm6, %xmm2
+; LINUXOSX-NEXT:    subps %xmm2, %xmm14
+; LINUXOSX-NEXT:    mulps %xmm7, %xmm3
+; LINUXOSX-NEXT:    subps %xmm3, %xmm15
+; LINUXOSX-NEXT:    addps %xmm8, %xmm12
+; LINUXOSX-NEXT:    addps %xmm9, %xmm13
+; LINUXOSX-NEXT:    addps %xmm10, %xmm14
+; LINUXOSX-NEXT:    addps %xmm11, %xmm15
+; LINUXOSX-NEXT:    movaps %xmm12, %xmm0
+; LINUXOSX-NEXT:    movaps %xmm13, %xmm1
+; LINUXOSX-NEXT:    movaps %xmm14, %xmm2
+; LINUXOSX-NEXT:    movaps %xmm15, %xmm3
+; LINUXOSX-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
+; LINUXOSX-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload
+; LINUXOSX-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
+; LINUXOSX-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
+; LINUXOSX-NEXT:    retq
+  %x1 = fadd <16 x float> %a, %b
+  %x2 = fmul <16 x float> %a, %b
+  %x3 = fsub <16 x float> %x1, %x2
+  %x4 = fadd <16 x float> %x3, %c
+  ret <16 x float> %x4
+}
+
+; Test calling conventions - twelve i32 input parameters (some arrive on the stack) and callee-saved GPRs.
+define x86_regcallcc i32 @testi32_inp(i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6,
+; WIN32-LABEL: testi32_inp:
+; WIN32:       # %bb.0:
+; WIN32-NEXT:    pushl %ebp
+; WIN32-NEXT:    pushl %ebx
+; WIN32-NEXT:    subl $8, %esp
+; WIN32-NEXT:    movl %esi, %eax
+; WIN32-NEXT:    movl %edi, %esi
+; WIN32-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; WIN32-NEXT:    movl %ecx, %edi
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; WIN32-NEXT:    leal (%esi,%eax), %ecx
+; WIN32-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; WIN32-NEXT:    movl %esi, %ecx
+; WIN32-NEXT:    subl %eax, %ecx
+; WIN32-NEXT:    movl %edi, %eax
+; WIN32-NEXT:    subl %edx, %eax
+; WIN32-NEXT:    subl {{[0-9]+}}(%esp), %ebx
+; WIN32-NEXT:    imull %eax, %ebx
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; WIN32-NEXT:    movl %esi, %eax
+; WIN32-NEXT:    subl {{[0-9]+}}(%esp), %eax
+; WIN32-NEXT:    imull %ecx, %eax
+; WIN32-NEXT:    addl %ebx, %eax
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; WIN32-NEXT:    movl %ebp, %ebx
+; WIN32-NEXT:    subl {{[0-9]+}}(%esp), %ebx
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; WIN32-NEXT:    movl %edx, %ecx
+; WIN32-NEXT:    subl {{[0-9]+}}(%esp), %ecx
+; WIN32-NEXT:    imull %ebx, %ecx
+; WIN32-NEXT:    addl %eax, %ecx
+; WIN32-NEXT:    addl (%esp), %edi # 4-byte Folded Reload
+; WIN32-NEXT:    addl {{[0-9]+}}(%esp), %ebp
+; WIN32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; WIN32-NEXT:    addl {{[0-9]+}}(%esp), %eax
+; WIN32-NEXT:    imull %eax, %edi
+; WIN32-NEXT:    addl {{[0-9]+}}(%esp), %esi
+; WIN32-NEXT:    imull {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; WIN32-NEXT:    addl %esi, %edi
+; WIN32-NEXT:    addl {{[0-9]+}}(%esp), %edx
+; WIN32-NEXT:    imull %ebp, %edx
+; WIN32-NEXT:    addl %edx, %edi
+; WIN32-NEXT:    addl %ecx, %edi
+; WIN32-NEXT:    movl %edi, %ecx
+; WIN32-NEXT:    addl $8, %esp
+; WIN32-NEXT:    popl %ebx
+; WIN32-NEXT:    popl %ebp
+; WIN32-NEXT:    retl
+;
+; WIN64-LABEL: testi32_inp:
+; WIN64:       # %bb.0:
+; WIN64-NEXT:    pushq %rbx
+; WIN64-NEXT:    # kill: def $edx killed $edx def $rdx
+; WIN64-NEXT:    # kill: def $esi killed $esi def $rsi
+; WIN64-NEXT:    # kill: def $r14d killed $r14d def $r14
+; WIN64-NEXT:    # kill: def $r12d killed $r12d def $r12
+; WIN64-NEXT:    # kill: def $r11d killed $r11d def $r11
+; WIN64-NEXT:    # kill: def $r9d killed $r9d def $r9
+; WIN64-NEXT:    # kill: def $r8d killed $r8d def $r8
+; WIN64-NEXT:    # kill: def $edi killed $edi def $rdi
+; WIN64-NEXT:    leal (%rdx,%rdi), %ebx
+; WIN64-NEXT:    # kill: def $edx killed $edx killed $rdx
+; WIN64-NEXT:    subl %edi, %edx
+; WIN64-NEXT:    leal (%rsi,%r8), %edi
+; WIN64-NEXT:    # kill: def $esi killed $esi killed $rsi
+; WIN64-NEXT:    subl %r8d, %esi
+; WIN64-NEXT:    leal (%r9,%r11), %r8d
+; WIN64-NEXT:    # kill: def $r9d killed $r9d killed $r9
+; WIN64-NEXT:    subl %r11d, %r9d
+; WIN64-NEXT:    movl %eax, %r11d
+; WIN64-NEXT:    subl %ecx, %r11d
+; WIN64-NEXT:    imull %r11d, %r9d
+; WIN64-NEXT:    leal (%r12,%r14), %r11d
+; WIN64-NEXT:    # kill: def $r12d killed $r12d killed $r12
+; WIN64-NEXT:    subl %r14d, %r12d
+; WIN64-NEXT:    imull %edx, %r12d
+; WIN64-NEXT:    movl {{[0-9]+}}(%rsp), %edx
+; WIN64-NEXT:    addl %r9d, %r12d
+; WIN64-NEXT:    movl %r15d, %r9d
+; WIN64-NEXT:    subl %edx, %r9d
+; WIN64-NEXT:    imull %esi, %r9d
+; WIN64-NEXT:    addl %r12d, %r9d
+; WIN64-NEXT:    addl %ecx, %eax
+; WIN64-NEXT:    imull %r8d, %eax
+; WIN64-NEXT:    imull %ebx, %r11d
+; WIN64-NEXT:    addl %r11d, %eax
+; WIN64-NEXT:    addl %r15d, %edx
+; WIN64-NEXT:    imull %edi, %edx
+; WIN64-NEXT:    addl %edx, %eax
+; WIN64-NEXT:    addl %r9d, %eax
+; WIN64-NEXT:    popq %rbx
+; WIN64-NEXT:    retq
+;
+; LINUXOSX-LABEL: testi32_inp:
+; LINUXOSX:       # %bb.0:
+; LINUXOSX-NEXT:    # kill: def $edx killed $edx def $rdx
+; LINUXOSX-NEXT:    # kill: def $esi killed $esi def $rsi
+; LINUXOSX-NEXT:    # kill: def $r14d killed $r14d def $r14
+; LINUXOSX-NEXT:    # kill: def $r13d killed $r13d def $r13
+; LINUXOSX-NEXT:    # kill: def $r12d killed $r12d def $r12
+; LINUXOSX-NEXT:    # kill: def $r9d killed $r9d def $r9
+; LINUXOSX-NEXT:    # kill: def $r8d killed $r8d def $r8
+; LINUXOSX-NEXT:    # kill: def $edi killed $edi def $rdi
+; LINUXOSX-NEXT:    leal (%rdx,%rdi), %r10d
+; LINUXOSX-NEXT:    # kill: def $edx killed $edx killed $rdx
+; LINUXOSX-NEXT:    subl %edi, %edx
+; LINUXOSX-NEXT:    leal (%rsi,%r8), %edi
+; LINUXOSX-NEXT:    # kill: def $esi killed $esi killed $rsi
+; LINUXOSX-NEXT:    subl %r8d, %esi
+; LINUXOSX-NEXT:    leal (%r9,%r12), %r8d
+; LINUXOSX-NEXT:    # kill: def $r9d killed $r9d killed $r9
+; LINUXOSX-NEXT:    subl %r12d, %r9d
+; LINUXOSX-NEXT:    movl %eax, %r11d
+; LINUXOSX-NEXT:    subl %ecx, %r11d
+; LINUXOSX-NEXT:    imull %r11d, %r9d
+; LINUXOSX-NEXT:    leal (%r13,%r14), %r11d
+; LINUXOSX-NEXT:    movl %r13d, %r12d
+; LINUXOSX-NEXT:    subl %r14d, %r12d
+; LINUXOSX-NEXT:    imull %edx, %r12d
+; LINUXOSX-NEXT:    movl {{[0-9]+}}(%rsp), %edx
+; LINUXOSX-NEXT:    addl %r9d, %r12d
+; LINUXOSX-NEXT:    movl %r15d, %r9d
+; LINUXOSX-NEXT:    subl %edx, %r9d
+; LINUXOSX-NEXT:    imull %esi, %r9d
+; LINUXOSX-NEXT:    addl %r12d, %r9d
+; LINUXOSX-NEXT:    addl %ecx, %eax
+; LINUXOSX-NEXT:    imull %r8d, %eax
+; LINUXOSX-NEXT:    imull %r10d, %r11d
+; LINUXOSX-NEXT:    addl %r11d, %eax
+; LINUXOSX-NEXT:    addl %r15d, %edx
+; LINUXOSX-NEXT:    imull %edi, %edx
+; LINUXOSX-NEXT:    addl %edx, %eax
+; LINUXOSX-NEXT:    addl %r9d, %eax
+; LINUXOSX-NEXT:    retq
+                                      i32 %b1, i32 %b2, i32 %b3, i32 %b4, i32 %b5, i32 %b6) nounwind {
+  %x1 = sub i32 %a1, %a2
+  %x2 = sub i32 %a3, %a4
+  %x3 = sub i32 %a5, %a6
+  %y1 = sub i32 %b1, %b2
+  %y2 = sub i32 %b3, %b4
+  %y3 = sub i32 %b5, %b6
+  %v1 = add i32 %a1, %a2
+  %v2 = add i32 %a3, %a4
+  %v3 = add i32 %a5, %a6
+  %w1 = add i32 %b1, %b2
+  %w2 = add i32 %b3, %b4
+  %w3 = add i32 %b5, %b6
+  %s1 = mul i32 %x1, %y1
+  %s2 = mul i32 %x2, %y2
+  %s3 = mul i32 %x3, %y3
+  %t1 = mul i32 %v1, %w1
+  %t2 = mul i32 %v2, %w2
+  %t3 = mul i32 %v3, %w3
+  %m1 = add i32 %s1, %s2
+  %m2 = add i32 %m1, %s3
+  %n1 = add i32 %t1, %t2
+  %n2 = add i32 %n1, %t3
+  %r1 = add i32 %m2, %n2
+  ret i32 %r1
+}
+
+; Test that <32 x float> parameters which overflow the register capacity are passed through the stack.
+define x86_regcallcc <32 x float> @testf32_stack(<32 x float> %a, <32 x float> %b, <32 x float> %c) nounwind {
+; WIN32-LABEL: testf32_stack:
+; WIN32:       # %bb.0:
+; WIN32-NEXT:    pushl %ebp
+; WIN32-NEXT:    movl %esp, %ebp
+; WIN32-NEXT:    andl $-16, %esp
+; WIN32-NEXT:    subl $48, %esp
+; WIN32-NEXT:    movaps %xmm7, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
+; WIN32-NEXT:    movaps %xmm6, (%esp) # 16-byte Spill
+; WIN32-NEXT:    movaps %xmm5, %xmm6
+; WIN32-NEXT:    movaps %xmm4, %xmm5
+; WIN32-NEXT:    movaps %xmm3, %xmm4
+; WIN32-NEXT:    movaps %xmm2, %xmm3
+; WIN32-NEXT:    movaps %xmm1, %xmm2
+; WIN32-NEXT:    movaps %xmm0, %xmm1
+; WIN32-NEXT:    movups 120(%ebp), %xmm7
+; WIN32-NEXT:    movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
+; WIN32-NEXT:    addps %xmm7, %xmm0
+; WIN32-NEXT:    movups 248(%ebp), %xmm7
+; WIN32-NEXT:    addps %xmm7, %xmm0
+; WIN32-NEXT:    movaps %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
+; WIN32-NEXT:    movups 104(%ebp), %xmm7
+; WIN32-NEXT:    movaps (%esp), %xmm0 # 16-byte Reload
+; WIN32-NEXT:    addps %xmm7, %xmm0
+; WIN32-NEXT:    movups 232(%ebp), %xmm7
+; WIN32-NEXT:    addps %xmm7, %xmm0
+; WIN32-NEXT:    movaps %xmm0, (%esp) # 16-byte Spill
+; WIN32-NEXT:    movups 88(%ebp), %xmm7
+; WIN32-NEXT:    addps %xmm7, %xmm6
+; WIN32-NEXT:    movups 216(%ebp), %xmm7
+; WIN32-NEXT:    addps %xmm7, %xmm6
+; WIN32-NEXT:    movups 72(%ebp), %xmm7
+; WIN32-NEXT:    addps %xmm7, %xmm5
+; WIN32-NEXT:    movups 200(%ebp), %xmm7
+; WIN32-NEXT:    addps %xmm7, %xmm5
+; WIN32-NEXT:    movups 56(%ebp), %xmm7
+; WIN32-NEXT:    addps %xmm7, %xmm4
+; WIN32-NEXT:    movups 184(%ebp), %xmm7
+; WIN32-NEXT:    addps %xmm7, %xmm4
+; WIN32-NEXT:    movups 40(%ebp), %xmm7
+; WIN32-NEXT:    addps %xmm7, %xmm3
+; WIN32-NEXT:    movups 168(%ebp), %xmm7
+; WIN32-NEXT:    addps %xmm7, %xmm3
+; WIN32-NEXT:    movups 24(%ebp), %xmm7
+; WIN32-NEXT:    addps %xmm7, %xmm2
+; WIN32-NEXT:    movups 152(%ebp), %xmm7
+; WIN32-NEXT:    addps %xmm7, %xmm2
+; WIN32-NEXT:    movups 8(%ebp), %xmm7
+; WIN32-NEXT:    addps %xmm7, %xmm1
+; WIN32-NEXT:    movups 136(%ebp), %xmm7
+; WIN32-NEXT:    addps %xmm7, %xmm1
+; WIN32-NEXT:    movaps %xmm1, %xmm0
+; WIN32-NEXT:    movaps %xmm2, %xmm1
+; WIN32-NEXT:    movaps %xmm3, %xmm2
+; WIN32-NEXT:    movaps %xmm4, %xmm3
+; WIN32-NEXT:    movaps %xmm5, %xmm4
+; WIN32-NEXT:    movaps %xmm6, %xmm5
+; WIN32-NEXT:    movaps (%esp), %xmm6 # 16-byte Reload
+; WIN32-NEXT:    movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm7 # 16-byte Reload
+; WIN32-NEXT:    movl %ebp, %esp
+; WIN32-NEXT:    popl %ebp
+; WIN32-NEXT:    retl
+;
+; WIN64-LABEL: testf32_stack:
+; WIN64:       # %bb.0:
+; WIN64-NEXT:    pushq %rax
+; WIN64-NEXT:    addps %xmm15, %xmm7
+; WIN64-NEXT:    addps %xmm14, %xmm6
+; WIN64-NEXT:    addps %xmm13, %xmm5
+; WIN64-NEXT:    addps %xmm12, %xmm4
+; WIN64-NEXT:    addps %xmm11, %xmm3
+; WIN64-NEXT:    addps %xmm10, %xmm2
+; WIN64-NEXT:    addps %xmm9, %xmm1
+; WIN64-NEXT:    addps %xmm8, %xmm0
+; WIN64-NEXT:    addps {{[0-9]+}}(%rsp), %xmm0
+; WIN64-NEXT:    addps {{[0-9]+}}(%rsp), %xmm1
+; WIN64-NEXT:    addps {{[0-9]+}}(%rsp), %xmm2
+; WIN64-NEXT:    addps {{[0-9]+}}(%rsp), %xmm3
+; WIN64-NEXT:    addps {{[0-9]+}}(%rsp), %xmm4
+; WIN64-NEXT:    addps {{[0-9]+}}(%rsp), %xmm5
+; WIN64-NEXT:    addps {{[0-9]+}}(%rsp), %xmm6
+; WIN64-NEXT:    addps {{[0-9]+}}(%rsp), %xmm7
+; WIN64-NEXT:    popq %rax
+; WIN64-NEXT:    retq
+;
+; LINUXOSX-LABEL: testf32_stack:
+; LINUXOSX:       # %bb.0:
+; LINUXOSX-NEXT:    addps %xmm15, %xmm7
+; LINUXOSX-NEXT:    addps %xmm14, %xmm6
+; LINUXOSX-NEXT:    addps %xmm13, %xmm5
+; LINUXOSX-NEXT:    addps %xmm12, %xmm4
+; LINUXOSX-NEXT:    addps %xmm11, %xmm3
+; LINUXOSX-NEXT:    addps %xmm10, %xmm2
+; LINUXOSX-NEXT:    addps %xmm9, %xmm1
+; LINUXOSX-NEXT:    addps %xmm8, %xmm0
+; LINUXOSX-NEXT:    addps {{[0-9]+}}(%rsp), %xmm0
+; LINUXOSX-NEXT:    addps {{[0-9]+}}(%rsp), %xmm1
+; LINUXOSX-NEXT:    addps {{[0-9]+}}(%rsp), %xmm2
+; LINUXOSX-NEXT:    addps {{[0-9]+}}(%rsp), %xmm3
+; LINUXOSX-NEXT:    addps {{[0-9]+}}(%rsp), %xmm4
+; LINUXOSX-NEXT:    addps {{[0-9]+}}(%rsp), %xmm5
+; LINUXOSX-NEXT:    addps {{[0-9]+}}(%rsp), %xmm6
+; LINUXOSX-NEXT:    addps {{[0-9]+}}(%rsp), %xmm7
+; LINUXOSX-NEXT:    retq
+  %x1 = fadd <32 x float> %a, %b
+  %x2 = fadd <32 x float> %x1, %c
+  ret <32 x float> %x2
+}
+
+!llvm.module.flags = !{!0} ; the module carries a single flag, !0
+!0 = !{i32 4, !"RegCallv4", i32 1} ; "RegCallv4" = 1; presumably what the CCIfRegCallv4 predicate in X86CallingConv.td tests - confirm