[llvm] [Codegen] Spill/Restore FP/BP under option (PR #114791)

Tue Nov 5 23:36:31 PST 2024

https://github.com/mahesh-attarde updated https://github.com/llvm/llvm-project/pull/114791

>From 50144c4034f4fa023a1dd6514dfc93e2ce56f38f Mon Sep 17 00:00:00 2001
From: mattarde <mattarde at intel.com>
Date: Mon, 4 Nov 2024 05:11:17 -0800
Subject: [PATCH] Spill/Restore FP/BP under option

---
 llvm/lib/CodeGen/PrologEpilogInserter.cpp     |   8 +-
 .../X86/apx/push2-pop2-vector-register.ll     |  33 ++-
 llvm/test/CodeGen/X86/apx/push2-pop2.ll       | 175 ++++++++++++---
 llvm/test/CodeGen/X86/apx/pushp-popp.ll       |  20 +-
 llvm/test/CodeGen/X86/avx512-intel-ocl.ll     | 199 +++++++++++++++++-
 llvm/test/CodeGen/X86/clobber_base_ptr.ll     |   2 +-
 llvm/test/CodeGen/X86/clobber_frame_ptr.ll    |   2 +-
 llvm/test/CodeGen/X86/clobber_frame_ptr2.ll   |   3 +-
 .../test/CodeGen/X86/clobber_frame_ptr_x32.ll |   2 +-
 llvm/test/CodeGen/X86/i386-baseptr.ll         |   4 +-
 .../X86/inline-asm-function-call-pic.ll       |  30 ++-
 llvm/test/CodeGen/X86/x86-32-intrcc.ll        |   4 +-
 llvm/test/CodeGen/X86/x86-64-baseptr.ll       | 194 +++++++++++++++--
 13 files changed, 612 insertions(+), 64 deletions(-)

diff --git a/llvm/lib/CodeGen/PrologEpilogInserter.cpp b/llvm/lib/CodeGen/PrologEpilogInserter.cpp
index ee03eaa8ae527c..da9385ce4c96d4 100644
--- a/llvm/lib/CodeGen/PrologEpilogInserter.cpp
+++ b/llvm/lib/CodeGen/PrologEpilogInserter.cpp
@@ -77,6 +77,11 @@ using MBBVector = SmallVector<MachineBasicBlock *, 4>;
 STATISTIC(NumLeafFuncWithSpills, "Number of leaf functions with CSRs");
 STATISTIC(NumFuncSeen, "Number of functions seen in PEI");
 
+// Experimental feature: enables spilling and reloading of clobbered FP/BP.
+static cl::opt<bool>
+    EnableSpillFPBP("enable-spill-fpbp",
+                    cl::desc("Spill clobbered fp register to stack."),
+                    cl::init(false), cl::Hidden);
 
 namespace {
 
@@ -231,7 +236,8 @@ bool PEI::runOnMachineFunction(MachineFunction &MF) {
   // Spill frame pointer and/or base pointer registers if they are clobbered.
   // It is placed before call frame instruction elimination so it will not mess
   // with stack arguments.
-  TFI->spillFPBP(MF);
+  if (EnableSpillFPBP)
+    TFI->spillFPBP(MF);
 
   // Calculate the MaxCallFrameSize value for the function's frame
   // information. Also eliminates call frame pseudo instructions.
diff --git a/llvm/test/CodeGen/X86/apx/push2-pop2-vector-register.ll b/llvm/test/CodeGen/X86/apx/push2-pop2-vector-register.ll
index f20c4c1ae27867..b4b205bd292276 100644
--- a/llvm/test/CodeGen/X86/apx/push2-pop2-vector-register.ll
+++ b/llvm/test/CodeGen/X86/apx/push2-pop2-vector-register.ll
@@ -2,6 +2,7 @@
 ; Check PUSH2/POP2 is not used for vector registers
 ; RUN: llc < %s -mtriple=x86_64-pc-windows-msvc -mattr=+push2pop2 | FileCheck %s --check-prefix=CHECK
 ; RUN: llc < %s -mtriple=x86_64-pc-windows-msvc -mattr=+push2pop2 -frame-pointer=all | FileCheck %s --check-prefix=FRAME
+; RUN: llc < %s -mtriple=x86_64-pc-windows-msvc -mattr=+push2pop2 -frame-pointer=all  --enable-spill-fpbp=true | FileCheck %s --check-prefix=FRAME-SPILL
 
 define void @widget(float %arg) nounwind {
 ; CHECK-LABEL: widget:
@@ -43,17 +44,41 @@ define void @widget(float %arg) nounwind {
 ; FRAME-NEXT:    xorl %r8d, %r8d
 ; FRAME-NEXT:    callq *%rsi
 ; FRAME-NEXT:    movss %xmm6, 0
-; FRAME-NEXT:    pushq %rbp
-; FRAME-NEXT:    pushq %rax
 ; FRAME-NEXT:    #APP
 ; FRAME-NEXT:    #NO_APP
-; FRAME-NEXT:    popq %rax
-; FRAME-NEXT:    popq %rbp
 ; FRAME-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
 ; FRAME-NEXT:    addq $48, %rsp
 ; FRAME-NEXT:    pop2 %r15, %rsi
 ; FRAME-NEXT:    popq %rbp
 ; FRAME-NEXT:    retq
+;
+; FRAME-SPILL-LABEL: widget:
+; FRAME-SPILL:       # %bb.0: # %bb
+; FRAME-SPILL-NEXT:    pushq %rbp
+; FRAME-SPILL-NEXT:    push2 %rsi, %r15
+; FRAME-SPILL-NEXT:    subq $48, %rsp
+; FRAME-SPILL-NEXT:    leaq {{[0-9]+}}(%rsp), %rbp
+; FRAME-SPILL-NEXT:    movaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; FRAME-SPILL-NEXT:    movaps %xmm0, %xmm6
+; FRAME-SPILL-NEXT:    xorl %esi, %esi
+; FRAME-SPILL-NEXT:    xorl %ecx, %ecx
+; FRAME-SPILL-NEXT:    callq *%rsi
+; FRAME-SPILL-NEXT:    xorl %ecx, %ecx
+; FRAME-SPILL-NEXT:    xorl %edx, %edx
+; FRAME-SPILL-NEXT:    xorl %r8d, %r8d
+; FRAME-SPILL-NEXT:    callq *%rsi
+; FRAME-SPILL-NEXT:    movss %xmm6, 0
+; FRAME-SPILL-NEXT:    pushq %rbp
+; FRAME-SPILL-NEXT:    pushq %rax
+; FRAME-SPILL-NEXT:    #APP
+; FRAME-SPILL-NEXT:    #NO_APP
+; FRAME-SPILL-NEXT:    popq %rax
+; FRAME-SPILL-NEXT:    popq %rbp
+; FRAME-SPILL-NEXT:    movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
+; FRAME-SPILL-NEXT:    addq $48, %rsp
+; FRAME-SPILL-NEXT:    pop2 %r15, %rsi
+; FRAME-SPILL-NEXT:    popq %rbp
+; FRAME-SPILL-NEXT:    retq
 bb:
   %call = tail call float null(ptr null)
   %call1 = tail call i32 null(ptr null, i32 0, i32 0)
diff --git a/llvm/test/CodeGen/X86/apx/push2-pop2.ll b/llvm/test/CodeGen/X86/apx/push2-pop2.ll
index f5be484be2b1a6..d6bb1a24aa6b7b 100644
--- a/llvm/test/CodeGen/X86/apx/push2-pop2.ll
+++ b/llvm/test/CodeGen/X86/apx/push2-pop2.ll
@@ -2,6 +2,7 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+push2pop2 | FileCheck %s --check-prefix=CHECK
 ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+push2pop2,+ppx | FileCheck %s --check-prefix=PPX
 ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+push2pop2 -frame-pointer=all | FileCheck %s --check-prefix=FRAME
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+push2pop2 -frame-pointer=all --enable-spill-fpbp=true | FileCheck %s --check-prefix=FRAME-SPILL
 
 define void @csr1() nounwind {
 ; CHECK-LABEL: csr1:
@@ -24,14 +25,23 @@ define void @csr1() nounwind {
 ; FRAME:       # %bb.0: # %entry
 ; FRAME-NEXT:    pushq %rbp
 ; FRAME-NEXT:    movq %rsp, %rbp
-; FRAME-NEXT:    pushq %rbp
-; FRAME-NEXT:    pushq %rax
 ; FRAME-NEXT:    #APP
 ; FRAME-NEXT:    #NO_APP
-; FRAME-NEXT:    popq %rax
-; FRAME-NEXT:    popq %rbp
 ; FRAME-NEXT:    popq %rbp
 ; FRAME-NEXT:    retq
+;
+; FRAME-SPILL-LABEL: csr1:
+; FRAME-SPILL:       # %bb.0: # %entry
+; FRAME-SPILL-NEXT:    pushq %rbp
+; FRAME-SPILL-NEXT:    movq %rsp, %rbp
+; FRAME-SPILL-NEXT:    pushq %rbp
+; FRAME-SPILL-NEXT:    pushq %rax
+; FRAME-SPILL-NEXT:    #APP
+; FRAME-SPILL-NEXT:    #NO_APP
+; FRAME-SPILL-NEXT:    popq %rax
+; FRAME-SPILL-NEXT:    popq %rbp
+; FRAME-SPILL-NEXT:    popq %rbp
+; FRAME-SPILL-NEXT:    retq
 entry:
   tail call void asm sideeffect "", "~{rbp},~{dirflag},~{fpsr},~{flags}"()
   ret void
@@ -63,15 +73,26 @@ define void @csr2() nounwind {
 ; FRAME-NEXT:    pushq %rbp
 ; FRAME-NEXT:    movq %rsp, %rbp
 ; FRAME-NEXT:    pushq %r15
-; FRAME-NEXT:    pushq %rbp
-; FRAME-NEXT:    pushq %rax
 ; FRAME-NEXT:    #APP
 ; FRAME-NEXT:    #NO_APP
-; FRAME-NEXT:    popq %rax
-; FRAME-NEXT:    popq %rbp
 ; FRAME-NEXT:    popq %r15
 ; FRAME-NEXT:    popq %rbp
 ; FRAME-NEXT:    retq
+;
+; FRAME-SPILL-LABEL: csr2:
+; FRAME-SPILL:       # %bb.0: # %entry
+; FRAME-SPILL-NEXT:    pushq %rbp
+; FRAME-SPILL-NEXT:    movq %rsp, %rbp
+; FRAME-SPILL-NEXT:    pushq %r15
+; FRAME-SPILL-NEXT:    pushq %rbp
+; FRAME-SPILL-NEXT:    pushq %rax
+; FRAME-SPILL-NEXT:    #APP
+; FRAME-SPILL-NEXT:    #NO_APP
+; FRAME-SPILL-NEXT:    popq %rax
+; FRAME-SPILL-NEXT:    popq %rbp
+; FRAME-SPILL-NEXT:    popq %r15
+; FRAME-SPILL-NEXT:    popq %rbp
+; FRAME-SPILL-NEXT:    retq
 entry:
   tail call void asm sideeffect "", "~{rbp},~{r15},~{dirflag},~{fpsr},~{flags}"()
   ret void
@@ -103,15 +124,26 @@ define void @csr3() nounwind {
 ; FRAME-NEXT:    pushq %rbp
 ; FRAME-NEXT:    movq %rsp, %rbp
 ; FRAME-NEXT:    push2 %r14, %r15
-; FRAME-NEXT:    pushq %rbp
-; FRAME-NEXT:    pushq %rax
 ; FRAME-NEXT:    #APP
 ; FRAME-NEXT:    #NO_APP
-; FRAME-NEXT:    popq %rax
-; FRAME-NEXT:    popq %rbp
 ; FRAME-NEXT:    pop2 %r15, %r14
 ; FRAME-NEXT:    popq %rbp
 ; FRAME-NEXT:    retq
+;
+; FRAME-SPILL-LABEL: csr3:
+; FRAME-SPILL:       # %bb.0: # %entry
+; FRAME-SPILL-NEXT:    pushq %rbp
+; FRAME-SPILL-NEXT:    movq %rsp, %rbp
+; FRAME-SPILL-NEXT:    push2 %r14, %r15
+; FRAME-SPILL-NEXT:    pushq %rbp
+; FRAME-SPILL-NEXT:    pushq %rax
+; FRAME-SPILL-NEXT:    #APP
+; FRAME-SPILL-NEXT:    #NO_APP
+; FRAME-SPILL-NEXT:    popq %rax
+; FRAME-SPILL-NEXT:    popq %rbp
+; FRAME-SPILL-NEXT:    pop2 %r15, %r14
+; FRAME-SPILL-NEXT:    popq %rbp
+; FRAME-SPILL-NEXT:    retq
 entry:
   tail call void asm sideeffect "", "~{rbp},~{r15},~{r14},~{dirflag},~{fpsr},~{flags}"()
   ret void
@@ -148,16 +180,29 @@ define void @csr4() nounwind {
 ; FRAME-NEXT:    movq %rsp, %rbp
 ; FRAME-NEXT:    push2 %r14, %r15
 ; FRAME-NEXT:    pushq %r13
-; FRAME-NEXT:    pushq %rbp
-; FRAME-NEXT:    pushq %rax
 ; FRAME-NEXT:    #APP
 ; FRAME-NEXT:    #NO_APP
-; FRAME-NEXT:    popq %rax
-; FRAME-NEXT:    popq %rbp
 ; FRAME-NEXT:    popq %r13
 ; FRAME-NEXT:    pop2 %r15, %r14
 ; FRAME-NEXT:    popq %rbp
 ; FRAME-NEXT:    retq
+;
+; FRAME-SPILL-LABEL: csr4:
+; FRAME-SPILL:       # %bb.0: # %entry
+; FRAME-SPILL-NEXT:    pushq %rbp
+; FRAME-SPILL-NEXT:    movq %rsp, %rbp
+; FRAME-SPILL-NEXT:    push2 %r14, %r15
+; FRAME-SPILL-NEXT:    pushq %r13
+; FRAME-SPILL-NEXT:    pushq %rbp
+; FRAME-SPILL-NEXT:    pushq %rax
+; FRAME-SPILL-NEXT:    #APP
+; FRAME-SPILL-NEXT:    #NO_APP
+; FRAME-SPILL-NEXT:    popq %rax
+; FRAME-SPILL-NEXT:    popq %rbp
+; FRAME-SPILL-NEXT:    popq %r13
+; FRAME-SPILL-NEXT:    pop2 %r15, %r14
+; FRAME-SPILL-NEXT:    popq %rbp
+; FRAME-SPILL-NEXT:    retq
 entry:
   tail call void asm sideeffect "", "~{rbp},~{r15},~{r14},~{r13},~{dirflag},~{fpsr},~{flags}"()
   ret void
@@ -194,16 +239,29 @@ define void @csr5() nounwind {
 ; FRAME-NEXT:    movq %rsp, %rbp
 ; FRAME-NEXT:    push2 %r14, %r15
 ; FRAME-NEXT:    push2 %r12, %r13
-; FRAME-NEXT:    pushq %rbp
-; FRAME-NEXT:    pushq %rax
 ; FRAME-NEXT:    #APP
 ; FRAME-NEXT:    #NO_APP
-; FRAME-NEXT:    popq %rax
-; FRAME-NEXT:    popq %rbp
 ; FRAME-NEXT:    pop2 %r13, %r12
 ; FRAME-NEXT:    pop2 %r15, %r14
 ; FRAME-NEXT:    popq %rbp
 ; FRAME-NEXT:    retq
+;
+; FRAME-SPILL-LABEL: csr5:
+; FRAME-SPILL:       # %bb.0: # %entry
+; FRAME-SPILL-NEXT:    pushq %rbp
+; FRAME-SPILL-NEXT:    movq %rsp, %rbp
+; FRAME-SPILL-NEXT:    push2 %r14, %r15
+; FRAME-SPILL-NEXT:    push2 %r12, %r13
+; FRAME-SPILL-NEXT:    pushq %rbp
+; FRAME-SPILL-NEXT:    pushq %rax
+; FRAME-SPILL-NEXT:    #APP
+; FRAME-SPILL-NEXT:    #NO_APP
+; FRAME-SPILL-NEXT:    popq %rax
+; FRAME-SPILL-NEXT:    popq %rbp
+; FRAME-SPILL-NEXT:    pop2 %r13, %r12
+; FRAME-SPILL-NEXT:    pop2 %r15, %r14
+; FRAME-SPILL-NEXT:    popq %rbp
+; FRAME-SPILL-NEXT:    retq
 entry:
   tail call void asm sideeffect "", "~{rbp},~{r15},~{r14},~{r13},~{r12},~{dirflag},~{fpsr},~{flags}"()
   ret void
@@ -245,17 +303,32 @@ define void @csr6() nounwind {
 ; FRAME-NEXT:    push2 %r14, %r15
 ; FRAME-NEXT:    push2 %r12, %r13
 ; FRAME-NEXT:    pushq %rbx
-; FRAME-NEXT:    pushq %rbp
-; FRAME-NEXT:    pushq %rax
 ; FRAME-NEXT:    #APP
 ; FRAME-NEXT:    #NO_APP
-; FRAME-NEXT:    popq %rax
-; FRAME-NEXT:    popq %rbp
 ; FRAME-NEXT:    popq %rbx
 ; FRAME-NEXT:    pop2 %r13, %r12
 ; FRAME-NEXT:    pop2 %r15, %r14
 ; FRAME-NEXT:    popq %rbp
 ; FRAME-NEXT:    retq
+;
+; FRAME-SPILL-LABEL: csr6:
+; FRAME-SPILL:       # %bb.0: # %entry
+; FRAME-SPILL-NEXT:    pushq %rbp
+; FRAME-SPILL-NEXT:    movq %rsp, %rbp
+; FRAME-SPILL-NEXT:    push2 %r14, %r15
+; FRAME-SPILL-NEXT:    push2 %r12, %r13
+; FRAME-SPILL-NEXT:    pushq %rbx
+; FRAME-SPILL-NEXT:    pushq %rbp
+; FRAME-SPILL-NEXT:    pushq %rax
+; FRAME-SPILL-NEXT:    #APP
+; FRAME-SPILL-NEXT:    #NO_APP
+; FRAME-SPILL-NEXT:    popq %rax
+; FRAME-SPILL-NEXT:    popq %rbp
+; FRAME-SPILL-NEXT:    popq %rbx
+; FRAME-SPILL-NEXT:    pop2 %r13, %r12
+; FRAME-SPILL-NEXT:    pop2 %r15, %r14
+; FRAME-SPILL-NEXT:    popq %rbp
+; FRAME-SPILL-NEXT:    retq
 entry:
   tail call void asm sideeffect "", "~{rbp},~{r15},~{r14},~{r13},~{r12},~{rbx},~{dirflag},~{fpsr},~{flags}"()
   ret void
@@ -421,6 +494,60 @@ define void @lea_in_epilog(i1 %arg, ptr %arg1, ptr %arg2, i64 %arg3, i64 %arg4,
 ; FRAME-NEXT:    movq $0, (%rax)
 ; FRAME-NEXT:  .LBB6_5: # %bb14
 ; FRAME-NEXT:    retq
+;
+; FRAME-SPILL-LABEL: lea_in_epilog:
+; FRAME-SPILL:       # %bb.0: # %bb
+; FRAME-SPILL-NEXT:    testb $1, %dil
+; FRAME-SPILL-NEXT:    je .LBB6_5
+; FRAME-SPILL-NEXT:  # %bb.1: # %bb13
+; FRAME-SPILL-NEXT:    pushq %rbp
+; FRAME-SPILL-NEXT:    movq %rsp, %rbp
+; FRAME-SPILL-NEXT:    push2 %r14, %r15
+; FRAME-SPILL-NEXT:    push2 %r12, %r13
+; FRAME-SPILL-NEXT:    pushq %rbx
+; FRAME-SPILL-NEXT:    subq $24, %rsp
+; FRAME-SPILL-NEXT:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; FRAME-SPILL-NEXT:    addq 16(%rbp), %r9
+; FRAME-SPILL-NEXT:    movq 48(%rbp), %rbx
+; FRAME-SPILL-NEXT:    addq %r9, %rbx
+; FRAME-SPILL-NEXT:    movq 40(%rbp), %r12
+; FRAME-SPILL-NEXT:    addq %r9, %r12
+; FRAME-SPILL-NEXT:    movq 32(%rbp), %r15
+; FRAME-SPILL-NEXT:    addq %r9, %r15
+; FRAME-SPILL-NEXT:    xorl %r13d, %r13d
+; FRAME-SPILL-NEXT:    xorl %r14d, %r14d
+; FRAME-SPILL-NEXT:    movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
+; FRAME-SPILL-NEXT:    .p2align 4
+; FRAME-SPILL-NEXT:  .LBB6_2: # %bb15
+; FRAME-SPILL-NEXT:    # =>This Inner Loop Header: Depth=1
+; FRAME-SPILL-NEXT:    movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; FRAME-SPILL-NEXT:    incq %r14
+; FRAME-SPILL-NEXT:    movl $432, %edx # imm = 0x1B0
+; FRAME-SPILL-NEXT:    xorl %edi, %edi
+; FRAME-SPILL-NEXT:    movq %r12, %rsi
+; FRAME-SPILL-NEXT:    callq memcpy@PLT
+; FRAME-SPILL-NEXT:    movl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 4-byte Reload
+; FRAME-SPILL-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
+; FRAME-SPILL-NEXT:    movq 16(%rbp), %rax
+; FRAME-SPILL-NEXT:    addq %rax, %rbx
+; FRAME-SPILL-NEXT:    addq %rax, %r12
+; FRAME-SPILL-NEXT:    addq %rax, %r15
+; FRAME-SPILL-NEXT:    addq %rax, %r9
+; FRAME-SPILL-NEXT:    addq $8, %r13
+; FRAME-SPILL-NEXT:    testb $1, %dil
+; FRAME-SPILL-NEXT:    je .LBB6_2
+; FRAME-SPILL-NEXT:  # %bb.3: # %bb11
+; FRAME-SPILL-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; FRAME-SPILL-NEXT:    leaq {{[0-9]+}}(%rsp), %rsp
+; FRAME-SPILL-NEXT:    popq %rbx
+; FRAME-SPILL-NEXT:    pop2 %r13, %r12
+; FRAME-SPILL-NEXT:    pop2 %r15, %r14
+; FRAME-SPILL-NEXT:    popq %rbp
+; FRAME-SPILL-NEXT:    jne .LBB6_5
+; FRAME-SPILL-NEXT:  # %bb.4: # %bb12
+; FRAME-SPILL-NEXT:    movq $0, (%rax)
+; FRAME-SPILL-NEXT:  .LBB6_5: # %bb14
+; FRAME-SPILL-NEXT:    retq
 bb:
   br i1 %arg, label %bb13, label %bb14
 
diff --git a/llvm/test/CodeGen/X86/apx/pushp-popp.ll b/llvm/test/CodeGen/X86/apx/pushp-popp.ll
index 625e70b07198e8..4097c59c56437b 100644
--- a/llvm/test/CodeGen/X86/apx/pushp-popp.ll
+++ b/llvm/test/CodeGen/X86/apx/pushp-popp.ll
@@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ppx | FileCheck %s --check-prefix=CHECK
 ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ppx -frame-pointer=all | FileCheck %s --check-prefix=FRAME
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ppx -frame-pointer=all --enable-spill-fpbp=true | FileCheck %s --check-prefix=FRAME-SPILL
 
 define void @csr2() nounwind {
 ; CHECK-LABEL: csr2:
@@ -18,15 +19,26 @@ define void @csr2() nounwind {
 ; FRAME-NEXT:    pushp %rbp
 ; FRAME-NEXT:    movq %rsp, %rbp
 ; FRAME-NEXT:    pushp %r15
-; FRAME-NEXT:    pushp %rbp
-; FRAME-NEXT:    pushq %rax
 ; FRAME-NEXT:    #APP
 ; FRAME-NEXT:    #NO_APP
-; FRAME-NEXT:    popq %rax
-; FRAME-NEXT:    popp %rbp
 ; FRAME-NEXT:    popp %r15
 ; FRAME-NEXT:    popp %rbp
 ; FRAME-NEXT:    retq
+;
+; FRAME-SPILL-LABEL: csr2:
+; FRAME-SPILL:       # %bb.0: # %entry
+; FRAME-SPILL-NEXT:    pushp %rbp
+; FRAME-SPILL-NEXT:    movq %rsp, %rbp
+; FRAME-SPILL-NEXT:    pushp %r15
+; FRAME-SPILL-NEXT:    pushp %rbp
+; FRAME-SPILL-NEXT:    pushq %rax
+; FRAME-SPILL-NEXT:    #APP
+; FRAME-SPILL-NEXT:    #NO_APP
+; FRAME-SPILL-NEXT:    popq %rax
+; FRAME-SPILL-NEXT:    popp %rbp
+; FRAME-SPILL-NEXT:    popp %r15
+; FRAME-SPILL-NEXT:    popp %rbp
+; FRAME-SPILL-NEXT:    retq
 entry:
   tail call void asm sideeffect "", "~{rbp},~{r15},~{dirflag},~{fpsr},~{flags}"()
   ret void
diff --git a/llvm/test/CodeGen/X86/avx512-intel-ocl.ll b/llvm/test/CodeGen/X86/avx512-intel-ocl.ll
index 6c68279b8d04ae..3873b920d4de69 100644
--- a/llvm/test/CodeGen/X86/avx512-intel-ocl.ll
+++ b/llvm/test/CodeGen/X86/avx512-intel-ocl.ll
@@ -6,7 +6,9 @@
 ; RUN: llc < %s -mtriple=x86_64-win32 -mcpu=knl | FileCheck %s -check-prefixes=WIN64,WIN64-KNL
 ; RUN: llc < %s -mtriple=x86_64-win32 -mcpu=skx | FileCheck %s -check-prefixes=WIN64,WIN64-SKX
 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s -check-prefixes=X64,X64-KNL
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl --enable-spill-fpbp=true | FileCheck %s -check-prefixes=X64-SPILL,X64-SPILL-KNL
 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s -check-prefixes=X64,X64-SKX
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx --enable-spill-fpbp=true | FileCheck %s -check-prefixes=X64-SPILL,X64-SPILL-SKX
 
 declare <16 x float> @func_float16_ptr(<16 x float>, ptr)
 declare <16 x float> @func_float16(<16 x float>, <16 x float>)
@@ -70,17 +72,35 @@ define <16 x float> @testf16_inp(<16 x float> %a, <16 x float> %b) nounwind {
 ; X64-NEXT:    subq $128, %rsp
 ; X64-NEXT:    vaddps %zmm1, %zmm0, %zmm0
 ; X64-NEXT:    movq %rsp, %rdi
-; X64-NEXT:    pushq %rbp
-; X64-NEXT:    pushq %rax
 ; X64-NEXT:    callq _func_float16_ptr
-; X64-NEXT:    addq $8, %rsp
-; X64-NEXT:    popq %rbp
 ; X64-NEXT:    vaddps (%rsp), %zmm0, %zmm0
 ; X64-NEXT:    leaq -16(%rbp), %rsp
 ; X64-NEXT:    popq %r12
 ; X64-NEXT:    popq %r13
 ; X64-NEXT:    popq %rbp
 ; X64-NEXT:    retq
+;
+; X64-SPILL-LABEL: testf16_inp:
+; X64-SPILL:       ## %bb.0:
+; X64-SPILL-NEXT:    pushq %rbp
+; X64-SPILL-NEXT:    movq %rsp, %rbp
+; X64-SPILL-NEXT:    pushq %r13
+; X64-SPILL-NEXT:    pushq %r12
+; X64-SPILL-NEXT:    andq $-64, %rsp
+; X64-SPILL-NEXT:    subq $128, %rsp
+; X64-SPILL-NEXT:    vaddps %zmm1, %zmm0, %zmm0
+; X64-SPILL-NEXT:    movq %rsp, %rdi
+; X64-SPILL-NEXT:    pushq %rbp
+; X64-SPILL-NEXT:    pushq %rax
+; X64-SPILL-NEXT:    callq _func_float16_ptr
+; X64-SPILL-NEXT:    addq $8, %rsp
+; X64-SPILL-NEXT:    popq %rbp
+; X64-SPILL-NEXT:    vaddps (%rsp), %zmm0, %zmm0
+; X64-SPILL-NEXT:    leaq -16(%rbp), %rsp
+; X64-SPILL-NEXT:    popq %r12
+; X64-SPILL-NEXT:    popq %r13
+; X64-SPILL-NEXT:    popq %rbp
+; X64-SPILL-NEXT:    retq
   %y = alloca <16 x float>, align 64
   %x = fadd <16 x float> %a, %b
   %1 = call intel_ocl_bicc <16 x float> @func_float16_ptr(<16 x float> %x, ptr %y)
@@ -154,11 +174,7 @@ define <16 x float> @testf16_regs(<16 x float> %a, <16 x float> %b) nounwind {
 ; X64-NEXT:    vmovaps %zmm1, %zmm16
 ; X64-NEXT:    vaddps %zmm1, %zmm0, %zmm0
 ; X64-NEXT:    movq %rsp, %rdi
-; X64-NEXT:    pushq %rbp
-; X64-NEXT:    pushq %rax
 ; X64-NEXT:    callq _func_float16_ptr
-; X64-NEXT:    addq $8, %rsp
-; X64-NEXT:    popq %rbp
 ; X64-NEXT:    vaddps %zmm16, %zmm0, %zmm0
 ; X64-NEXT:    vaddps (%rsp), %zmm0, %zmm0
 ; X64-NEXT:    leaq -16(%rbp), %rsp
@@ -166,6 +182,30 @@ define <16 x float> @testf16_regs(<16 x float> %a, <16 x float> %b) nounwind {
 ; X64-NEXT:    popq %r13
 ; X64-NEXT:    popq %rbp
 ; X64-NEXT:    retq
+;
+; X64-SPILL-LABEL: testf16_regs:
+; X64-SPILL:       ## %bb.0:
+; X64-SPILL-NEXT:    pushq %rbp
+; X64-SPILL-NEXT:    movq %rsp, %rbp
+; X64-SPILL-NEXT:    pushq %r13
+; X64-SPILL-NEXT:    pushq %r12
+; X64-SPILL-NEXT:    andq $-64, %rsp
+; X64-SPILL-NEXT:    subq $128, %rsp
+; X64-SPILL-NEXT:    vmovaps %zmm1, %zmm16
+; X64-SPILL-NEXT:    vaddps %zmm1, %zmm0, %zmm0
+; X64-SPILL-NEXT:    movq %rsp, %rdi
+; X64-SPILL-NEXT:    pushq %rbp
+; X64-SPILL-NEXT:    pushq %rax
+; X64-SPILL-NEXT:    callq _func_float16_ptr
+; X64-SPILL-NEXT:    addq $8, %rsp
+; X64-SPILL-NEXT:    popq %rbp
+; X64-SPILL-NEXT:    vaddps %zmm16, %zmm0, %zmm0
+; X64-SPILL-NEXT:    vaddps (%rsp), %zmm0, %zmm0
+; X64-SPILL-NEXT:    leaq -16(%rbp), %rsp
+; X64-SPILL-NEXT:    popq %r12
+; X64-SPILL-NEXT:    popq %r13
+; X64-SPILL-NEXT:    popq %rbp
+; X64-SPILL-NEXT:    retq
   %y = alloca <16 x float>, align 64
   %x = fadd <16 x float> %a, %b
   %1 = call intel_ocl_bicc <16 x float> @func_float16_ptr(<16 x float> %x, ptr %y)
@@ -348,6 +388,55 @@ define intel_ocl_bicc <16 x float> @test_prolog_epilog(<16 x float> %a, <16 x fl
 ; X64-KNL-NEXT:    popq %rsi
 ; X64-KNL-NEXT:    retq
 ;
+; X64-SPILL-KNL-LABEL: test_prolog_epilog:
+; X64-SPILL-KNL:       ## %bb.0:
+; X64-SPILL-KNL-NEXT:    pushq %rsi
+; X64-SPILL-KNL-NEXT:    subq $1072, %rsp ## imm = 0x430
+; X64-SPILL-KNL-NEXT:    kmovw %k7, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
+; X64-SPILL-KNL-NEXT:    kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
+; X64-SPILL-KNL-NEXT:    kmovw %k5, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
+; X64-SPILL-KNL-NEXT:    kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
+; X64-SPILL-KNL-NEXT:    vmovups %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
+; X64-SPILL-KNL-NEXT:    vmovups %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
+; X64-SPILL-KNL-NEXT:    vmovups %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
+; X64-SPILL-KNL-NEXT:    vmovups %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
+; X64-SPILL-KNL-NEXT:    vmovups %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
+; X64-SPILL-KNL-NEXT:    vmovups %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
+; X64-SPILL-KNL-NEXT:    vmovups %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
+; X64-SPILL-KNL-NEXT:    vmovups %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
+; X64-SPILL-KNL-NEXT:    vmovups %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
+; X64-SPILL-KNL-NEXT:    vmovups %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
+; X64-SPILL-KNL-NEXT:    vmovups %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
+; X64-SPILL-KNL-NEXT:    vmovups %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
+; X64-SPILL-KNL-NEXT:    vmovups %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
+; X64-SPILL-KNL-NEXT:    vmovups %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
+; X64-SPILL-KNL-NEXT:    vmovups %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
+; X64-SPILL-KNL-NEXT:    vmovups %zmm16, (%rsp) ## 64-byte Spill
+; X64-SPILL-KNL-NEXT:    callq _func_float16
+; X64-SPILL-KNL-NEXT:    vmovups (%rsp), %zmm16 ## 64-byte Reload
+; X64-SPILL-KNL-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 ## 64-byte Reload
+; X64-SPILL-KNL-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 ## 64-byte Reload
+; X64-SPILL-KNL-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 ## 64-byte Reload
+; X64-SPILL-KNL-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 ## 64-byte Reload
+; X64-SPILL-KNL-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 ## 64-byte Reload
+; X64-SPILL-KNL-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 ## 64-byte Reload
+; X64-SPILL-KNL-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 ## 64-byte Reload
+; X64-SPILL-KNL-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 ## 64-byte Reload
+; X64-SPILL-KNL-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 ## 64-byte Reload
+; X64-SPILL-KNL-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 ## 64-byte Reload
+; X64-SPILL-KNL-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 ## 64-byte Reload
+; X64-SPILL-KNL-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 ## 64-byte Reload
+; X64-SPILL-KNL-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 ## 64-byte Reload
+; X64-SPILL-KNL-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 ## 64-byte Reload
+; X64-SPILL-KNL-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 ## 64-byte Reload
+; X64-SPILL-KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 ## 2-byte Reload
+; X64-SPILL-KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 ## 2-byte Reload
+; X64-SPILL-KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 ## 2-byte Reload
+; X64-SPILL-KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 2-byte Reload
+; X64-SPILL-KNL-NEXT:    addq $1072, %rsp ## imm = 0x430
+; X64-SPILL-KNL-NEXT:    popq %rsi
+; X64-SPILL-KNL-NEXT:    retq
+;
 ; X64-SKX-LABEL: test_prolog_epilog:
 ; X64-SKX:       ## %bb.0:
 ; X64-SKX-NEXT:    pushq %rsi
@@ -396,6 +485,55 @@ define intel_ocl_bicc <16 x float> @test_prolog_epilog(<16 x float> %a, <16 x fl
 ; X64-SKX-NEXT:    addq $1072, %rsp ## imm = 0x430
 ; X64-SKX-NEXT:    popq %rsi
 ; X64-SKX-NEXT:    retq
+;
+; X64-SPILL-SKX-LABEL: test_prolog_epilog:
+; X64-SPILL-SKX:       ## %bb.0:
+; X64-SPILL-SKX-NEXT:    pushq %rsi
+; X64-SPILL-SKX-NEXT:    subq $1072, %rsp ## imm = 0x430
+; X64-SPILL-SKX-NEXT:    kmovq %k7, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; X64-SPILL-SKX-NEXT:    kmovq %k6, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; X64-SPILL-SKX-NEXT:    kmovq %k5, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; X64-SPILL-SKX-NEXT:    kmovq %k4, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; X64-SPILL-SKX-NEXT:    vmovups %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
+; X64-SPILL-SKX-NEXT:    vmovups %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
+; X64-SPILL-SKX-NEXT:    vmovups %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
+; X64-SPILL-SKX-NEXT:    vmovups %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
+; X64-SPILL-SKX-NEXT:    vmovups %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
+; X64-SPILL-SKX-NEXT:    vmovups %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
+; X64-SPILL-SKX-NEXT:    vmovups %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
+; X64-SPILL-SKX-NEXT:    vmovups %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
+; X64-SPILL-SKX-NEXT:    vmovups %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
+; X64-SPILL-SKX-NEXT:    vmovups %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
+; X64-SPILL-SKX-NEXT:    vmovups %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
+; X64-SPILL-SKX-NEXT:    vmovups %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
+; X64-SPILL-SKX-NEXT:    vmovups %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
+; X64-SPILL-SKX-NEXT:    vmovups %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
+; X64-SPILL-SKX-NEXT:    vmovups %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
+; X64-SPILL-SKX-NEXT:    vmovups %zmm16, (%rsp) ## 64-byte Spill
+; X64-SPILL-SKX-NEXT:    callq _func_float16
+; X64-SPILL-SKX-NEXT:    vmovups (%rsp), %zmm16 ## 64-byte Reload
+; X64-SPILL-SKX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 ## 64-byte Reload
+; X64-SPILL-SKX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 ## 64-byte Reload
+; X64-SPILL-SKX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 ## 64-byte Reload
+; X64-SPILL-SKX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 ## 64-byte Reload
+; X64-SPILL-SKX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 ## 64-byte Reload
+; X64-SPILL-SKX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 ## 64-byte Reload
+; X64-SPILL-SKX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 ## 64-byte Reload
+; X64-SPILL-SKX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 ## 64-byte Reload
+; X64-SPILL-SKX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 ## 64-byte Reload
+; X64-SPILL-SKX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 ## 64-byte Reload
+; X64-SPILL-SKX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 ## 64-byte Reload
+; X64-SPILL-SKX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 ## 64-byte Reload
+; X64-SPILL-SKX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 ## 64-byte Reload
+; X64-SPILL-SKX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 ## 64-byte Reload
+; X64-SPILL-SKX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 ## 64-byte Reload
+; X64-SPILL-SKX-NEXT:    kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k4 ## 8-byte Reload
+; X64-SPILL-SKX-NEXT:    kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k5 ## 8-byte Reload
+; X64-SPILL-SKX-NEXT:    kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k6 ## 8-byte Reload
+; X64-SPILL-SKX-NEXT:    kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 8-byte Reload
+; X64-SPILL-SKX-NEXT:    addq $1072, %rsp ## imm = 0x430
+; X64-SPILL-SKX-NEXT:    popq %rsi
+; X64-SPILL-SKX-NEXT:    retq
    %c = call <16 x float> @func_float16(<16 x float> %a, <16 x float> %b)
    ret <16 x float> %c
 }
@@ -465,6 +603,24 @@ define <16 x float> @testf16_inp_mask(<16 x float> %a, i16 %mask)  {
 ; X64-KNL-NEXT:    popq %rbp
 ; X64-KNL-NEXT:    retq
 ;
+; X64-SPILL-KNL-LABEL: testf16_inp_mask:
+; X64-SPILL-KNL:       ## %bb.0:
+; X64-SPILL-KNL-NEXT:    pushq %rbp
+; X64-SPILL-KNL-NEXT:    .cfi_def_cfa_offset 16
+; X64-SPILL-KNL-NEXT:    pushq %r13
+; X64-SPILL-KNL-NEXT:    .cfi_def_cfa_offset 24
+; X64-SPILL-KNL-NEXT:    pushq %r12
+; X64-SPILL-KNL-NEXT:    .cfi_def_cfa_offset 32
+; X64-SPILL-KNL-NEXT:    .cfi_offset %r12, -32
+; X64-SPILL-KNL-NEXT:    .cfi_offset %r13, -24
+; X64-SPILL-KNL-NEXT:    .cfi_offset %rbp, -16
+; X64-SPILL-KNL-NEXT:    kmovw %edi, %k1
+; X64-SPILL-KNL-NEXT:    callq _func_float16_mask
+; X64-SPILL-KNL-NEXT:    popq %r12
+; X64-SPILL-KNL-NEXT:    popq %r13
+; X64-SPILL-KNL-NEXT:    popq %rbp
+; X64-SPILL-KNL-NEXT:    retq
+;
 ; X64-SKX-LABEL: testf16_inp_mask:
 ; X64-SKX:       ## %bb.0:
 ; X64-SKX-NEXT:    pushq %rbp
@@ -482,6 +638,24 @@ define <16 x float> @testf16_inp_mask(<16 x float> %a, i16 %mask)  {
 ; X64-SKX-NEXT:    popq %r13
 ; X64-SKX-NEXT:    popq %rbp
 ; X64-SKX-NEXT:    retq
+;
+; X64-SPILL-SKX-LABEL: testf16_inp_mask:
+; X64-SPILL-SKX:       ## %bb.0:
+; X64-SPILL-SKX-NEXT:    pushq %rbp
+; X64-SPILL-SKX-NEXT:    .cfi_def_cfa_offset 16
+; X64-SPILL-SKX-NEXT:    pushq %r13
+; X64-SPILL-SKX-NEXT:    .cfi_def_cfa_offset 24
+; X64-SPILL-SKX-NEXT:    pushq %r12
+; X64-SPILL-SKX-NEXT:    .cfi_def_cfa_offset 32
+; X64-SPILL-SKX-NEXT:    .cfi_offset %r12, -32
+; X64-SPILL-SKX-NEXT:    .cfi_offset %r13, -24
+; X64-SPILL-SKX-NEXT:    .cfi_offset %rbp, -16
+; X64-SPILL-SKX-NEXT:    kmovd %edi, %k1
+; X64-SPILL-SKX-NEXT:    callq _func_float16_mask
+; X64-SPILL-SKX-NEXT:    popq %r12
+; X64-SPILL-SKX-NEXT:    popq %r13
+; X64-SPILL-SKX-NEXT:    popq %rbp
+; X64-SPILL-SKX-NEXT:    retq
   %imask = bitcast i16 %mask to <16 x i1>
   %1 = call intel_ocl_bicc <16 x float> @func_float16_mask(<16 x float> %a, <16 x i1> %imask)
   ret <16 x float> %1
@@ -521,6 +695,15 @@ define intel_ocl_bicc <16 x float> @test_prolog_epilog_with_mask(<16 x float> %a
 ; X64-NEXT:    callq _func_float16_mask
 ; X64-NEXT:    popq %rax
 ; X64-NEXT:    retq
+;
+; X64-SPILL-LABEL: test_prolog_epilog_with_mask:
+; X64-SPILL:       ## %bb.0:
+; X64-SPILL-NEXT:    pushq %rax
+; X64-SPILL-NEXT:    vpcmpeqd %zmm2, %zmm1, %k0
+; X64-SPILL-NEXT:    kxorw %k1, %k0, %k1
+; X64-SPILL-NEXT:    callq _func_float16_mask
+; X64-SPILL-NEXT:    popq %rax
+; X64-SPILL-NEXT:    retq
    %cmp_res = icmp eq <16 x i32>%x1, %x2
    %mask1 = xor <16 x i1> %cmp_res, %mask
    %c = call intel_ocl_bicc <16 x float> @func_float16_mask(<16 x float> %a, <16 x i1>%mask1)
diff --git a/llvm/test/CodeGen/X86/clobber_base_ptr.ll b/llvm/test/CodeGen/X86/clobber_base_ptr.ll
index 2c39560f02d160..343487b609d907 100644
--- a/llvm/test/CodeGen/X86/clobber_base_ptr.ll
+++ b/llvm/test/CodeGen/X86/clobber_base_ptr.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
-; RUN: llc < %s | FileCheck %s
+; RUN: llc --enable-spill-fpbp=true < %s | FileCheck %s
 
 target datalayout = "e-m:x-p:32:32-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:32-n8:16:32-a:0:32-S32"
 target triple = "i386-pc-windows-gnu"
diff --git a/llvm/test/CodeGen/X86/clobber_frame_ptr.ll b/llvm/test/CodeGen/X86/clobber_frame_ptr.ll
index f6b38839d13cc2..6403fac5a52412 100644
--- a/llvm/test/CodeGen/X86/clobber_frame_ptr.ll
+++ b/llvm/test/CodeGen/X86/clobber_frame_ptr.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
-; RUN: llc -mtriple=x86_64-pc-linux -stackrealign -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=x86_64-pc-linux -stackrealign --enable-spill-fpbp=true -verify-machineinstrs < %s | FileCheck %s
 
 ; Calling convention ghccc uses ebp to pass parameter, so calling a function
 ; using ghccc clobbers ebp. We should save and restore ebp around such a call
diff --git a/llvm/test/CodeGen/X86/clobber_frame_ptr2.ll b/llvm/test/CodeGen/X86/clobber_frame_ptr2.ll
index 0551152a0718d7..51d02d1b9c1e23 100644
--- a/llvm/test/CodeGen/X86/clobber_frame_ptr2.ll
+++ b/llvm/test/CodeGen/X86/clobber_frame_ptr2.ll
@@ -1,4 +1,5 @@
-; RUN: not llc -mtriple=x86_64-pc-linux -stackrealign -verify-machineinstrs %s -o - 2>&1 | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: not llc -mtriple=x86_64-pc-linux -stackrealign --enable-spill-fpbp=true -verify-machineinstrs %s -o - 2>&1 | FileCheck %s
 
 declare cc 11 i64 @hipe2(i64, i64, i64, i64, i64, i64, i64)
 
diff --git a/llvm/test/CodeGen/X86/clobber_frame_ptr_x32.ll b/llvm/test/CodeGen/X86/clobber_frame_ptr_x32.ll
index 25c951d8b1a109..b4b39d91dc5c93 100644
--- a/llvm/test/CodeGen/X86/clobber_frame_ptr_x32.ll
+++ b/llvm/test/CodeGen/X86/clobber_frame_ptr_x32.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
-; RUN: llc < %s | FileCheck %s
+; RUN: llc --enable-spill-fpbp=true < %s | FileCheck %s
 
 target triple = "x86_64-linux-gnux32"
 
diff --git a/llvm/test/CodeGen/X86/i386-baseptr.ll b/llvm/test/CodeGen/X86/i386-baseptr.ll
index 777eb838b84cc7..a4a5a919560e5a 100644
--- a/llvm/test/CodeGen/X86/i386-baseptr.ll
+++ b/llvm/test/CodeGen/X86/i386-baseptr.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=i386-pc-linux -stackrealign -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -mtriple=i386-pc-none-elf -stackrealign -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=i386-pc-linux -stackrealign --enable-spill-fpbp=true -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=i386-pc-none-elf -stackrealign --enable-spill-fpbp=true -verify-machineinstrs < %s | FileCheck %s
 
 declare i32 @helper() nounwind
 define void @base() #0 {
diff --git a/llvm/test/CodeGen/X86/inline-asm-function-call-pic.ll b/llvm/test/CodeGen/X86/inline-asm-function-call-pic.ll
index d3ca872509ad5a..c1d611e6810694 100644
--- a/llvm/test/CodeGen/X86/inline-asm-function-call-pic.ll
+++ b/llvm/test/CodeGen/X86/inline-asm-function-call-pic.ll
@@ -1,4 +1,5 @@
-; RUN: llc -O2 --relocation-model=pic -mtriple=i386-unknown-linux-gnu < %s 2>&1 | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -O2 --relocation-model=pic -mtriple=i386-unknown-linux-gnu --enable-spill-fpbp=true < %s 2>&1 | FileCheck %s
 
 ; List the source code:
 ; // clang  -m32 -fasm-blocks -S t.c -O2  -fpic -emit-llvm
@@ -31,7 +32,14 @@
 
 define void @func() local_unnamed_addr #0 {
 ; CHECK-LABEL: func:
-; CHECK:         calll .L0$pb
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    pushl %ebp
+; CHECK-NEXT:    movl %esp, %ebp
+; CHECK-NEXT:    pushl %ebx
+; CHECK-NEXT:    pushl %edi
+; CHECK-NEXT:    pushl %esi
+; CHECK-NEXT:    subl $12, %esp
+; CHECK-NEXT:    calll .L0$pb
 ; CHECK-NEXT:  .L0$pb:
 ; CHECK-NEXT:    popl %ebx
 ; CHECK-NEXT:  .Ltmp0:
@@ -56,6 +64,12 @@ define void @func() local_unnamed_addr #0 {
 ; CHECK-NEXT:    #NO_APP
 ; CHECK-NEXT:    addl $12, %esp
 ; CHECK-NEXT:    popl %ebp
+; CHECK-NEXT:    addl $12, %esp
+; CHECK-NEXT:    popl %esi
+; CHECK-NEXT:    popl %edi
+; CHECK-NEXT:    popl %ebx
+; CHECK-NEXT:    popl %ebp
+; CHECK-NEXT:    retl
 entry:
   %call = tail call i32 @static_func()
 ;; We test call, CALL, and jmp.
@@ -66,6 +80,18 @@ entry:
 declare i32 @extern_func(...) #0
 
 define internal i32 @static_func() #0 {
+; CHECK-LABEL: static_func:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    calll .L1$pb
+; CHECK-NEXT:  .L1$pb:
+; CHECK-NEXT:    popl %eax
+; CHECK-NEXT:  .Ltmp1:
+; CHECK-NEXT:    addl $_GLOBAL_OFFSET_TABLE_+(.Ltmp1-.L1$pb), %eax
+; CHECK-NEXT:    movl GV@GOT(%eax), %ecx
+; CHECK-NEXT:    movl (%ecx), %eax
+; CHECK-NEXT:    leal 1(%eax), %edx
+; CHECK-NEXT:    movl %edx, (%ecx)
+; CHECK-NEXT:    retl
 entry:
   %0 = load i32, ptr @GV, align 4
   %inc = add nsw i32 %0, 1
diff --git a/llvm/test/CodeGen/X86/x86-32-intrcc.ll b/llvm/test/CodeGen/X86/x86-32-intrcc.ll
index a0f937e2c323b6..459b659aa71fed 100644
--- a/llvm/test/CodeGen/X86/x86-32-intrcc.ll
+++ b/llvm/test/CodeGen/X86/x86-32-intrcc.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --no_x86_scrub_sp --version 2
 ; RUN: llc -mtriple=i686-unknown-unknown < %s | FileCheck %s
-; RUN: llc -mtriple=i686-unknown-unknown -O0 < %s | FileCheck %s -check-prefix=CHECK0
+; RUN: llc -mtriple=i686-unknown-unknown -O0 --enable-spill-fpbp=true < %s | FileCheck %s -check-prefix=CHECK0
 
 %struct.interrupt_frame = type { i32, i32, i32, i32, i32 }
 
@@ -108,10 +108,8 @@ define x86_intrcc void @test_isr_clobbers(ptr byval(%struct.interrupt_frame) %fr
 ; CHECK-NEXT:    pushl %eax
 ; CHECK-NEXT:    andl $-16, %esp
 ; CHECK-NEXT:    cld
-; CHECK-NEXT:    pushl %ebp
 ; CHECK-NEXT:    #APP
 ; CHECK-NEXT:    #NO_APP
-; CHECK-NEXT:    popl %ebp
 ; CHECK-NEXT:    leal -12(%ebp), %esp
 ; CHECK-NEXT:    popl %eax
 ; CHECK-NEXT:    popl %ebx
diff --git a/llvm/test/CodeGen/X86/x86-64-baseptr.ll b/llvm/test/CodeGen/X86/x86-64-baseptr.ll
index 020004def6e7ad..cb3d9580db51e3 100644
--- a/llvm/test/CodeGen/X86/x86-64-baseptr.ll
+++ b/llvm/test/CodeGen/X86/x86-64-baseptr.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=x86_64-pc-linux -stackrealign -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=x86_64-pc-linux-gnux32 -stackrealign -verify-machineinstrs --enable-spill-fpbp=true < %s | FileCheck -check-prefix=X32ABI-SPILL %s
 ; RUN: llc -mtriple=x86_64-pc-linux-gnux32 -stackrealign -verify-machineinstrs < %s | FileCheck -check-prefix=X32ABI %s
 
 ; This should run with NaCl as well ( -mtriple=x86_64-pc-nacl ) but currently doesn't due to PR22655
@@ -40,6 +41,34 @@ define void @base() #0 {
 ; CHECK-NEXT:    .cfi_def_cfa %rsp, 8
 ; CHECK-NEXT:    retq
 ;
+; X32ABI-SPILL-LABEL: base:
+; X32ABI-SPILL:       # %bb.0: # %entry
+; X32ABI-SPILL-NEXT:    pushq %rbp
+; X32ABI-SPILL-NEXT:    .cfi_def_cfa_offset 16
+; X32ABI-SPILL-NEXT:    .cfi_offset %rbp, -16
+; X32ABI-SPILL-NEXT:    movl %esp, %ebp
+; X32ABI-SPILL-NEXT:    .cfi_def_cfa_register %rbp
+; X32ABI-SPILL-NEXT:    pushq %rbx
+; X32ABI-SPILL-NEXT:    andl $-32, %esp
+; X32ABI-SPILL-NEXT:    subl $32, %esp
+; X32ABI-SPILL-NEXT:    movl %esp, %ebx
+; X32ABI-SPILL-NEXT:    .cfi_offset %rbx, -24
+; X32ABI-SPILL-NEXT:    callq helper@PLT
+; X32ABI-SPILL-NEXT:    # kill: def $eax killed $eax def $rax
+; X32ABI-SPILL-NEXT:    leal 31(,%rax,4), %eax
+; X32ABI-SPILL-NEXT:    andl $-32, %eax
+; X32ABI-SPILL-NEXT:    movl %esp, %ecx
+; X32ABI-SPILL-NEXT:    movl %ecx, %edx
+; X32ABI-SPILL-NEXT:    subl %eax, %edx
+; X32ABI-SPILL-NEXT:    negl %eax
+; X32ABI-SPILL-NEXT:    movl %edx, %esp
+; X32ABI-SPILL-NEXT:    movl $0, (%ecx,%eax)
+; X32ABI-SPILL-NEXT:    leal -8(%ebp), %esp
+; X32ABI-SPILL-NEXT:    popq %rbx
+; X32ABI-SPILL-NEXT:    popq %rbp
+; X32ABI-SPILL-NEXT:    .cfi_def_cfa %rsp, 8
+; X32ABI-SPILL-NEXT:    retq
+;
 ; X32ABI-LABEL: base:
 ; X32ABI:       # %bb.0: # %entry
 ; X32ABI-NEXT:    pushq %rbp
@@ -115,6 +144,46 @@ define void @clobber_base() #0 {
 ; CHECK-NEXT:    .cfi_def_cfa %rsp, 8
 ; CHECK-NEXT:    retq
 ;
+; X32ABI-SPILL-LABEL: clobber_base:
+; X32ABI-SPILL:       # %bb.0: # %entry
+; X32ABI-SPILL-NEXT:    pushq %rbp
+; X32ABI-SPILL-NEXT:    .cfi_def_cfa_offset 16
+; X32ABI-SPILL-NEXT:    .cfi_offset %rbp, -16
+; X32ABI-SPILL-NEXT:    movl %esp, %ebp
+; X32ABI-SPILL-NEXT:    .cfi_def_cfa_register %rbp
+; X32ABI-SPILL-NEXT:    pushq %rbx
+; X32ABI-SPILL-NEXT:    andl $-128, %esp
+; X32ABI-SPILL-NEXT:    subl $128, %esp
+; X32ABI-SPILL-NEXT:    movl %esp, %ebx
+; X32ABI-SPILL-NEXT:    .cfi_offset %rbx, -24
+; X32ABI-SPILL-NEXT:    callq helper@PLT
+; X32ABI-SPILL-NEXT:    # kill: def $eax killed $eax def $rax
+; X32ABI-SPILL-NEXT:    leal 31(,%rax,4), %eax
+; X32ABI-SPILL-NEXT:    andl $-32, %eax
+; X32ABI-SPILL-NEXT:    movl %esp, %ecx
+; X32ABI-SPILL-NEXT:    movl %ecx, %edx
+; X32ABI-SPILL-NEXT:    subl %eax, %edx
+; X32ABI-SPILL-NEXT:    negl %eax
+; X32ABI-SPILL-NEXT:    movl %edx, %esp
+; X32ABI-SPILL-NEXT:    pushq %rbx
+; X32ABI-SPILL-NEXT:    subl $24, %esp
+; X32ABI-SPILL-NEXT:    movl $405, %ebx # imm = 0x195
+; X32ABI-SPILL-NEXT:    #APP
+; X32ABI-SPILL-NEXT:    nop
+; X32ABI-SPILL-NEXT:    #NO_APP
+; X32ABI-SPILL-NEXT:    addl $24, %esp
+; X32ABI-SPILL-NEXT:    popq %rbx
+; X32ABI-SPILL-NEXT:    movl $8, %edx
+; X32ABI-SPILL-NEXT:    #APP
+; X32ABI-SPILL-NEXT:    movl %edx, (%ebx)
+; X32ABI-SPILL-NEXT:    #NO_APP
+; X32ABI-SPILL-NEXT:    movl $0, (%ecx,%eax)
+; X32ABI-SPILL-NEXT:    leal -8(%ebp), %esp
+; X32ABI-SPILL-NEXT:    popq %rbx
+; X32ABI-SPILL-NEXT:    popq %rbp
+; X32ABI-SPILL-NEXT:    .cfi_def_cfa %rsp, 8
+; X32ABI-SPILL-NEXT:    retq
+;
 ; X32ABI-LABEL: clobber_base:
 ; X32ABI:       # %bb.0: # %entry
 ; X32ABI-NEXT:    pushq %rbp
@@ -136,14 +205,10 @@ define void @clobber_base() #0 {
 ; X32ABI-NEXT:    subl %eax, %edx
 ; X32ABI-NEXT:    negl %eax
 ; X32ABI-NEXT:    movl %edx, %esp
-; X32ABI-NEXT:    pushq %rbx
-; X32ABI-NEXT:    subl $24, %esp
 ; X32ABI-NEXT:    movl $405, %ebx # imm = 0x195
 ; X32ABI-NEXT:    #APP
 ; X32ABI-NEXT:    nop
 ; X32ABI-NEXT:    #NO_APP
-; X32ABI-NEXT:    addl $24, %esp
-; X32ABI-NEXT:    popq %rbx
 ; X32ABI-NEXT:    movl $8, %edx
 ; X32ABI-NEXT:    #APP
 ; X32ABI-NEXT:    movl %edx, (%ebx)
@@ -234,6 +299,74 @@ define x86_regcallcc void @clobber_baseptr_argptr(i32 %param1, i32 %param2, i32
 ; CHECK-NEXT:    .cfi_def_cfa %rsp, 8
 ; CHECK-NEXT:    retq
 ;
+; X32ABI-SPILL-LABEL: clobber_baseptr_argptr:
+; X32ABI-SPILL:       # %bb.0: # %entry
+; X32ABI-SPILL-NEXT:    pushq %rbp
+; X32ABI-SPILL-NEXT:    .cfi_def_cfa_offset 16
+; X32ABI-SPILL-NEXT:    .cfi_offset %rbp, -16
+; X32ABI-SPILL-NEXT:    movl %esp, %ebp
+; X32ABI-SPILL-NEXT:    .cfi_def_cfa_register %rbp
+; X32ABI-SPILL-NEXT:    pushq %rbx
+; X32ABI-SPILL-NEXT:    andl $-128, %esp
+; X32ABI-SPILL-NEXT:    subl $256, %esp # imm = 0x100
+; X32ABI-SPILL-NEXT:    movaps %xmm15, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
+; X32ABI-SPILL-NEXT:    movaps %xmm14, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
+; X32ABI-SPILL-NEXT:    movaps %xmm13, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
+; X32ABI-SPILL-NEXT:    movaps %xmm12, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
+; X32ABI-SPILL-NEXT:    movaps %xmm11, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
+; X32ABI-SPILL-NEXT:    movaps %xmm10, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
+; X32ABI-SPILL-NEXT:    movaps %xmm9, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
+; X32ABI-SPILL-NEXT:    movaps %xmm8, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
+; X32ABI-SPILL-NEXT:    movl %esp, %ebx
+; X32ABI-SPILL-NEXT:    .cfi_offset %rbx, -24
+; X32ABI-SPILL-NEXT:    .cfi_offset %xmm8, -160
+; X32ABI-SPILL-NEXT:    .cfi_offset %xmm9, -144
+; X32ABI-SPILL-NEXT:    .cfi_offset %xmm10, -128
+; X32ABI-SPILL-NEXT:    .cfi_offset %xmm11, -112
+; X32ABI-SPILL-NEXT:    .cfi_offset %xmm12, -96
+; X32ABI-SPILL-NEXT:    .cfi_offset %xmm13, -80
+; X32ABI-SPILL-NEXT:    .cfi_offset %xmm14, -64
+; X32ABI-SPILL-NEXT:    .cfi_offset %xmm15, -48
+; X32ABI-SPILL-NEXT:    movl 16(%ebp), %r14d
+; X32ABI-SPILL-NEXT:    callq helper@PLT
+; X32ABI-SPILL-NEXT:    # kill: def $eax killed $eax def $rax
+; X32ABI-SPILL-NEXT:    leal 31(,%rax,4), %eax
+; X32ABI-SPILL-NEXT:    andl $-32, %eax
+; X32ABI-SPILL-NEXT:    movl %esp, %ecx
+; X32ABI-SPILL-NEXT:    movl %ecx, %edx
+; X32ABI-SPILL-NEXT:    subl %eax, %edx
+; X32ABI-SPILL-NEXT:    negl %eax
+; X32ABI-SPILL-NEXT:    movl %edx, %esp
+; X32ABI-SPILL-NEXT:    pushq %rbx
+; X32ABI-SPILL-NEXT:    subl $24, %esp
+; X32ABI-SPILL-NEXT:    movl $405, %ebx # imm = 0x195
+; X32ABI-SPILL-NEXT:    #APP
+; X32ABI-SPILL-NEXT:    nop
+; X32ABI-SPILL-NEXT:    #NO_APP
+; X32ABI-SPILL-NEXT:    #APP
+; X32ABI-SPILL-NEXT:    nop
+; X32ABI-SPILL-NEXT:    #NO_APP
+; X32ABI-SPILL-NEXT:    addl $24, %esp
+; X32ABI-SPILL-NEXT:    popq %rbx
+; X32ABI-SPILL-NEXT:    movl $8, %edx
+; X32ABI-SPILL-NEXT:    #APP
+; X32ABI-SPILL-NEXT:    movl %edx, (%ebx)
+; X32ABI-SPILL-NEXT:    #NO_APP
+; X32ABI-SPILL-NEXT:    movl %r14d, (%ecx,%eax)
+; X32ABI-SPILL-NEXT:    movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm8 # 16-byte Reload
+; X32ABI-SPILL-NEXT:    movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm9 # 16-byte Reload
+; X32ABI-SPILL-NEXT:    movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm10 # 16-byte Reload
+; X32ABI-SPILL-NEXT:    movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm11 # 16-byte Reload
+; X32ABI-SPILL-NEXT:    movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm12 # 16-byte Reload
+; X32ABI-SPILL-NEXT:    movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm13 # 16-byte Reload
+; X32ABI-SPILL-NEXT:    movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm14 # 16-byte Reload
+; X32ABI-SPILL-NEXT:    movaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm15 # 16-byte Reload
+; X32ABI-SPILL-NEXT:    leal -8(%ebp), %esp
+; X32ABI-SPILL-NEXT:    popq %rbx
+; X32ABI-SPILL-NEXT:    popq %rbp
+; X32ABI-SPILL-NEXT:    .cfi_def_cfa %rsp, 8
+; X32ABI-SPILL-NEXT:    retq
+;
 ; X32ABI-LABEL: clobber_baseptr_argptr:
 ; X32ABI:       # %bb.0: # %entry
 ; X32ABI-NEXT:    pushq %rbp
@@ -272,8 +405,6 @@ define x86_regcallcc void @clobber_baseptr_argptr(i32 %param1, i32 %param2, i32
 ; X32ABI-NEXT:    subl %eax, %edx
 ; X32ABI-NEXT:    negl %eax
 ; X32ABI-NEXT:    movl %edx, %esp
-; X32ABI-NEXT:    pushq %rbx
-; X32ABI-NEXT:    subl $24, %esp
 ; X32ABI-NEXT:    movl $405, %ebx # imm = 0x195
 ; X32ABI-NEXT:    #APP
 ; X32ABI-NEXT:    nop
@@ -281,8 +412,6 @@ define x86_regcallcc void @clobber_baseptr_argptr(i32 %param1, i32 %param2, i32
 ; X32ABI-NEXT:    #APP
 ; X32ABI-NEXT:    nop
 ; X32ABI-NEXT:    #NO_APP
-; X32ABI-NEXT:    addl $24, %esp
-; X32ABI-NEXT:    popq %rbx
 ; X32ABI-NEXT:    movl $8, %edx
 ; X32ABI-NEXT:    #APP
 ; X32ABI-NEXT:    movl %edx, (%ebx)
@@ -361,6 +490,51 @@ define void @vmw_host_printf(ptr %fmt, ...) nounwind {
 ; CHECK-NEXT:    leaq -8(%r10), %rsp
 ; CHECK-NEXT:    retq
 ;
+; X32ABI-SPILL-LABEL: vmw_host_printf:
+; X32ABI-SPILL:       # %bb.0: # %entry
+; X32ABI-SPILL-NEXT:    pushq %rbp
+; X32ABI-SPILL-NEXT:    movl %esp, %ebp
+; X32ABI-SPILL-NEXT:    pushq %rbx
+; X32ABI-SPILL-NEXT:    andl $-16, %esp
+; X32ABI-SPILL-NEXT:    subl $208, %esp
+; X32ABI-SPILL-NEXT:    movl %esp, %ebx
+; X32ABI-SPILL-NEXT:    movq %rsi, 24(%ebx)
+; X32ABI-SPILL-NEXT:    movq %rdx, 32(%ebx)
+; X32ABI-SPILL-NEXT:    movq %rcx, 40(%ebx)
+; X32ABI-SPILL-NEXT:    movq %r8, 48(%ebx)
+; X32ABI-SPILL-NEXT:    movq %r9, 56(%ebx)
+; X32ABI-SPILL-NEXT:    testb %al, %al
+; X32ABI-SPILL-NEXT:    je .LBB3_2
+; X32ABI-SPILL-NEXT:  # %bb.1: # %entry
+; X32ABI-SPILL-NEXT:    movaps %xmm0, 64(%ebx)
+; X32ABI-SPILL-NEXT:    movaps %xmm1, 80(%ebx)
+; X32ABI-SPILL-NEXT:    movaps %xmm2, 96(%ebx)
+; X32ABI-SPILL-NEXT:    movaps %xmm3, 112(%ebx)
+; X32ABI-SPILL-NEXT:    movaps %xmm4, 128(%ebx)
+; X32ABI-SPILL-NEXT:    movaps %xmm5, 144(%ebx)
+; X32ABI-SPILL-NEXT:    movaps %xmm6, 160(%ebx)
+; X32ABI-SPILL-NEXT:    movaps %xmm7, 176(%ebx)
+; X32ABI-SPILL-NEXT:  .LBB3_2: # %entry
+; X32ABI-SPILL-NEXT:    leal 16(%rbx), %eax
+; X32ABI-SPILL-NEXT:    movl %eax, (%eax)
+; X32ABI-SPILL-NEXT:    leal 16(%rbp), %eax
+; X32ABI-SPILL-NEXT:    movl %eax, (%eax)
+; X32ABI-SPILL-NEXT:    movl $48, (%eax)
+; X32ABI-SPILL-NEXT:    movl $8, (%eax)
+; X32ABI-SPILL-NEXT:    xorl %eax, %eax
+; X32ABI-SPILL-NEXT:    pushq %rbx
+; X32ABI-SPILL-NEXT:    subl $24, %esp
+; X32ABI-SPILL-NEXT:    xorl %ebx, %ebx
+; X32ABI-SPILL-NEXT:    xorl %ecx, %ecx
+; X32ABI-SPILL-NEXT:    #APP
+; X32ABI-SPILL-NEXT:    #NO_APP
+; X32ABI-SPILL-NEXT:    addl $24, %esp
+; X32ABI-SPILL-NEXT:    popq %rbx
+; X32ABI-SPILL-NEXT:    leal -8(%ebp), %esp
+; X32ABI-SPILL-NEXT:    popq %rbx
+; X32ABI-SPILL-NEXT:    popq %rbp
+; X32ABI-SPILL-NEXT:    retq
+;
 ; X32ABI-LABEL: vmw_host_printf:
 ; X32ABI:       # %bb.0: # %entry
 ; X32ABI-NEXT:    pushq %rbp
@@ -393,14 +567,10 @@ define void @vmw_host_printf(ptr %fmt, ...) nounwind {
 ; X32ABI-NEXT:    movl $48, (%eax)
 ; X32ABI-NEXT:    movl $8, (%eax)
 ; X32ABI-NEXT:    xorl %eax, %eax
-; X32ABI-NEXT:    pushq %rbx
-; X32ABI-NEXT:    subl $24, %esp
 ; X32ABI-NEXT:    xorl %ebx, %ebx
 ; X32ABI-NEXT:    xorl %ecx, %ecx
 ; X32ABI-NEXT:    #APP
 ; X32ABI-NEXT:    #NO_APP
-; X32ABI-NEXT:    addl $24, %esp
-; X32ABI-NEXT:    popq %rbx
 ; X32ABI-NEXT:    leal -8(%ebp), %esp
 ; X32ABI-NEXT:    popq %rbx
 ; X32ABI-NEXT:    popq %rbp