[llvm-branch-commits] [llvm] 9a9b649 - [WinEH] Allocate space in funclets stack to save XMM CSRs

Reid Kleckner via llvm-branch-commits <llvm-branch-commits@lists.llvm.org>
Tue Nov 5 12:53:54 PST 2019


Author: Pengfei Wang
Date: 2019-11-05T12:52:54-08:00
New Revision: 9a9b6492a66c3f83e58f5b4e451797b6baf7f3ea

URL: https://github.com/llvm/llvm-project/commit/9a9b6492a66c3f83e58f5b4e451797b6baf7f3ea
DIFF: https://github.com/llvm/llvm-project/commit/9a9b6492a66c3f83e58f5b4e451797b6baf7f3ea.diff

LOG: [WinEH] Allocate space in funclets stack to save XMM CSRs

Summary:
This is an alternate approach to D63396

Currently, funclets reuse the stack slots that the parent function uses
for saving callee-saved XMM registers. If the parent function modifies a
callee-saved XMM register before an exception is thrown, the catch
handler's prologue overwrites the value the parent originally saved there.
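
A minimal C++ repro sketch of that failure mode (hypothetical code, not
from the patch; it assumes the compiler keeps a value that is live
across the try in a callee-saved XMM register such as xmm6):

void may_throw(); // may raise a C++ exception

double f(double x) {
  // Suppose v lives in xmm6: f's prologue saves the caller's xmm6 into
  // a frame slot, then f overwrites xmm6 with the new value.
  double v = x * 2.0;
  try {
    may_throw();
  } catch (...) {
    // Before this patch, the catch funclet's prologue re-saved xmm6
    // into the *same* parent-frame slot, clobbering the caller's
    // value; f's epilogue then restored the wrong xmm6 to the caller.
  }
  return v;
}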

This patch allocates dedicated space in each funclet's stack frame for
saving callee-saved XMM registers and uses RSP instead of RBP to address
that memory.
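
For reference, a hedged sketch of the new funclet frame-size math
(simplified from the getWinEHFuncletFrameSize change in the diff below;
alignTo is inlined here so the snippet stands alone):

// A funclet now allocates FrameSizeMinusRBP + XMMSize - CSSize bytes:
// its usual non-CSR stack space plus a dedicated XMM save area.
static unsigned alignTo(unsigned Value, unsigned Align) {
  return (Value + Align - 1) / Align * Align;
}

unsigned funcletFrameSize(unsigned CSSize,     // pushed GPR CSRs, bytes
                          unsigned UsedSize,   // funclet's own stack use
                          unsigned NumXMMCSRs, // callee-saved XMMs spilled
                          unsigned StackAlign) {
  unsigned XMMSize = NumXMMCSRs * 16; // VR128 spill size is 16 bytes
  unsigned FrameSizeMinusRBP = alignTo(CSSize + UsedSize, StackAlign);
  return FrameSizeMinusRBP + XMMSize - CSSize;
}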

Signed-off-by: Pengfei Wang <pengfei.wang@intel.com>

Reviewers: rnk, RKSimon, craig.topper, annita.zhang, LuoYuanke, andrew.w.kaylor

Subscribers: hiraditya, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D66596

Signed-off-by: Pengfei Wang <pengfei.wang@intel.com>
llvm-svn: 370005
(cherry picked from commit 564fb58a32a808c34d809820d00e2f23c0307a71)

Added: 
    llvm/test/CodeGen/X86/win64-funclet-savexmm.ll

Modified: 
    llvm/lib/Target/X86/X86FrameLowering.cpp
    llvm/lib/Target/X86/X86FrameLowering.h
    llvm/lib/Target/X86/X86MachineFunctionInfo.h
    llvm/lib/Target/X86/X86RegisterInfo.cpp
    llvm/test/CodeGen/X86/avx512-intel-ocl.ll
    llvm/test/CodeGen/X86/catchpad-realign-savexmm.ll
    llvm/test/CodeGen/X86/x86-interrupt_cc.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/X86/X86FrameLowering.cpp b/llvm/lib/Target/X86/X86FrameLowering.cpp
index e310fe069117..854156b2bc8e 100644
--- a/llvm/lib/Target/X86/X86FrameLowering.cpp
+++ b/llvm/lib/Target/X86/X86FrameLowering.cpp
@@ -1396,9 +1396,13 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
       int FI;
       if (unsigned Reg = TII.isStoreToStackSlot(FrameInstr, FI)) {
         if (X86::FR64RegClass.contains(Reg)) {
+          int Offset;
           unsigned IgnoredFrameReg;
-          int Offset = getFrameIndexReference(MF, FI, IgnoredFrameReg);
-          Offset += SEHFrameOffset;
+          if (IsWin64Prologue && IsFunclet)
+            Offset = getWin64EHFrameIndexRef(MF, FI, IgnoredFrameReg);
+          else
+            Offset = getFrameIndexReference(MF, FI, IgnoredFrameReg) +
+                     SEHFrameOffset;
 
           HasWinCFI = true;
           assert(!NeedsWinFPO && "SEH_SaveXMM incompatible with FPO data");
@@ -1554,9 +1558,13 @@ X86FrameLowering::getPSPSlotOffsetFromSP(const MachineFunction &MF) const {
 
 unsigned
 X86FrameLowering::getWinEHFuncletFrameSize(const MachineFunction &MF) const {
+  const X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
   // This is the size of the pushed CSRs.
-  unsigned CSSize =
-      MF.getInfo<X86MachineFunctionInfo>()->getCalleeSavedFrameSize();
+  unsigned CSSize = X86FI->getCalleeSavedFrameSize();
+  // This is the size of callee saved XMMs.
+  const auto& WinEHXMMSlotInfo = X86FI->getWinEHXMMSlotInfo();
+  unsigned XMMSize = WinEHXMMSlotInfo.size() *
+                     TRI->getSpillSize(X86::VR128RegClass);
   // This is the amount of stack a funclet needs to allocate.
   unsigned UsedSize;
   EHPersonality Personality =
@@ -1576,7 +1584,7 @@ X86FrameLowering::getWinEHFuncletFrameSize(const MachineFunction &MF) const {
   unsigned FrameSizeMinusRBP = alignTo(CSSize + UsedSize, getStackAlignment());
   // Subtract out the size of the callee saved registers. This is how much stack
   // each funclet will allocate.
-  return FrameSizeMinusRBP - CSSize;
+  return FrameSizeMinusRBP + XMMSize - CSSize;
 }
 
 static bool isTailCallOpcode(unsigned Opc) {
@@ -1850,6 +1858,20 @@ int X86FrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
   return Offset + FPDelta;
 }
 
+int X86FrameLowering::getWin64EHFrameIndexRef(const MachineFunction &MF,
+                                              int FI, unsigned &FrameReg) const {
+  const MachineFrameInfo &MFI = MF.getFrameInfo();
+  const X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
+  const auto& WinEHXMMSlotInfo = X86FI->getWinEHXMMSlotInfo();
+  const auto it = WinEHXMMSlotInfo.find(FI);
+
+  if (it == WinEHXMMSlotInfo.end())
+    return getFrameIndexReference(MF, FI, FrameReg);
+
+  FrameReg = TRI->getStackRegister();
+  return alignTo(MFI.getMaxCallFrameSize(), getStackAlignment()) + it->second;
+}
+
 int X86FrameLowering::getFrameIndexReferenceSP(const MachineFunction &MF,
                                                int FI, unsigned &FrameReg,
                                                int Adjustment) const {
@@ -1948,6 +1970,8 @@ bool X86FrameLowering::assignCalleeSavedSpillSlots(
   X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
 
   unsigned CalleeSavedFrameSize = 0;
+  unsigned XMMCalleeSavedFrameSize = 0;
+  auto &WinEHXMMSlotInfo = X86FI->getWinEHXMMSlotInfo();
   int SpillSlotOffset = getOffsetOfLocalArea() + X86FI->getTCReturnAddrDelta();
 
   int64_t TailCallReturnAddrDelta = X86FI->getTCReturnAddrDelta();
@@ -2025,12 +2049,20 @@ bool X86FrameLowering::assignCalleeSavedSpillSlots(
     unsigned Size = TRI->getSpillSize(*RC);
     unsigned Align = TRI->getSpillAlignment(*RC);
     // ensure alignment
-    SpillSlotOffset -= std::abs(SpillSlotOffset) % Align;
+    assert(SpillSlotOffset < 0 && "SpillSlotOffset should always < 0 on X86");
+    SpillSlotOffset = -alignTo(-SpillSlotOffset, Align);
+
     // spill into slot
     SpillSlotOffset -= Size;
     int SlotIndex = MFI.CreateFixedSpillStackObject(Size, SpillSlotOffset);
     CSI[i - 1].setFrameIdx(SlotIndex);
     MFI.ensureMaxAlignment(Align);
+
+    // Save the start offset and size of XMM in stack frame for funclets.
+    if (X86::VR128RegClass.contains(Reg)) {
+      WinEHXMMSlotInfo[SlotIndex] = XMMCalleeSavedFrameSize;
+      XMMCalleeSavedFrameSize += Size;
+    }
   }
 
   return true;

diff  --git a/llvm/lib/Target/X86/X86FrameLowering.h b/llvm/lib/Target/X86/X86FrameLowering.h
index d32746e3a36e..c5218cc09b8a 100644
--- a/llvm/lib/Target/X86/X86FrameLowering.h
+++ b/llvm/lib/Target/X86/X86FrameLowering.h
@@ -99,6 +99,8 @@ class X86FrameLowering : public TargetFrameLowering {
   int getFrameIndexReference(const MachineFunction &MF, int FI,
                              unsigned &FrameReg) const override;
 
+  int getWin64EHFrameIndexRef(const MachineFunction &MF,
+                              int FI, unsigned &SPReg) const;
   int getFrameIndexReferenceSP(const MachineFunction &MF,
                                int FI, unsigned &SPReg, int Adjustment) const;
   int getFrameIndexReferencePreferSP(const MachineFunction &MF, int FI,

diff  --git a/llvm/lib/Target/X86/X86MachineFunctionInfo.h b/llvm/lib/Target/X86/X86MachineFunctionInfo.h
index d7e535598d81..5cb80a082b56 100644
--- a/llvm/lib/Target/X86/X86MachineFunctionInfo.h
+++ b/llvm/lib/Target/X86/X86MachineFunctionInfo.h
@@ -36,6 +36,10 @@ class X86MachineFunctionInfo : public MachineFunctionInfo {
   /// is stashed.
   signed char RestoreBasePointerOffset = 0;
 
+  /// WinEHXMMSlotInfo - Slot information of XMM registers in the stack frame
+  /// in bytes.
+  DenseMap<int, unsigned> WinEHXMMSlotInfo;
+
   /// CalleeSavedFrameSize - Size of the callee-saved register portion of the
   /// stack frame in bytes.
   unsigned CalleeSavedFrameSize = 0;
@@ -120,6 +124,10 @@ class X86MachineFunctionInfo : public MachineFunctionInfo {
   void setRestoreBasePointer(const MachineFunction *MF);
   int getRestoreBasePointerOffset() const {return RestoreBasePointerOffset; }
 
+  DenseMap<int, unsigned>& getWinEHXMMSlotInfo() { return WinEHXMMSlotInfo; }
+  const DenseMap<int, unsigned>& getWinEHXMMSlotInfo() const {
+    return WinEHXMMSlotInfo; }
+
   unsigned getCalleeSavedFrameSize() const { return CalleeSavedFrameSize; }
   void setCalleeSavedFrameSize(unsigned bytes) { CalleeSavedFrameSize = bytes; }
 

diff  --git a/llvm/lib/Target/X86/X86RegisterInfo.cpp b/llvm/lib/Target/X86/X86RegisterInfo.cpp
index 2e2f1f9e438a..c8966dfffa0c 100644
--- a/llvm/lib/Target/X86/X86RegisterInfo.cpp
+++ b/llvm/lib/Target/X86/X86RegisterInfo.cpp
@@ -692,12 +692,27 @@ static bool tryOptimizeLEAtoMOV(MachineBasicBlock::iterator II) {
   return true;
 }
 
+static bool isFuncletReturnInstr(MachineInstr &MI) {
+  switch (MI.getOpcode()) {
+  case X86::CATCHRET:
+  case X86::CLEANUPRET:
+    return true;
+  default:
+    return false;
+  }
+  llvm_unreachable("impossible");
+}
+
 void
 X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
                                      int SPAdj, unsigned FIOperandNum,
                                      RegScavenger *RS) const {
   MachineInstr &MI = *II;
-  MachineFunction &MF = *MI.getParent()->getParent();
+  MachineBasicBlock &MBB = *MI.getParent();
+  MachineFunction &MF = *MBB.getParent();
+  MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator();
+  bool IsEHFuncletEpilogue = MBBI == MBB.end() ? false
+                                               : isFuncletReturnInstr(*MBBI);
   const X86FrameLowering *TFI = getFrameLowering(MF);
   int FrameIndex = MI.getOperand(FIOperandNum).getIndex();
 
@@ -709,6 +724,8 @@ X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
            MF.getFrameInfo().isFixedObjectIndex(FrameIndex)) &&
            "Return instruction can only reference SP relative frame objects");
     FIOffset = TFI->getFrameIndexReferenceSP(MF, FrameIndex, BasePtr, 0);
+  } else if (TFI->Is64Bit && (MBB.isEHFuncletEntry() || IsEHFuncletEpilogue)) {
+    FIOffset = TFI->getWin64EHFrameIndexRef(MF, FrameIndex, BasePtr);
   } else {
     FIOffset = TFI->getFrameIndexReference(MF, FrameIndex, BasePtr);
   }

diff  --git a/llvm/test/CodeGen/X86/avx512-intel-ocl.ll b/llvm/test/CodeGen/X86/avx512-intel-ocl.ll
index defedd2a7f63..751d610c2ca7 100644
--- a/llvm/test/CodeGen/X86/avx512-intel-ocl.ll
+++ b/llvm/test/CodeGen/X86/avx512-intel-ocl.ll
@@ -94,12 +94,12 @@ define <16 x float> @testf16_regs(<16 x float> %a, <16 x float> %b) nounwind {
 ; X32-NEXT:    movl %esp, %ebp
 ; X32-NEXT:    andl $-64, %esp
 ; X32-NEXT:    subl $256, %esp ## imm = 0x100
-; X32-NEXT:    vmovaps %zmm1, {{[0-9]+}}(%esp) ## 64-byte Spill
+; X32-NEXT:    vmovaps %zmm1, {{[-0-9]+}}(%e{{[sb]}}p) ## 64-byte Spill
 ; X32-NEXT:    vaddps %zmm1, %zmm0, %zmm0
 ; X32-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    movl %eax, (%esp)
 ; X32-NEXT:    calll _func_float16_ptr
-; X32-NEXT:    vaddps {{[0-9]+}}(%esp), %zmm0, %zmm0 ## 64-byte Folded Reload
+; X32-NEXT:    vaddps {{[-0-9]+}}(%e{{[sb]}}p), %zmm0, %zmm0 ## 64-byte Folded Reload
 ; X32-NEXT:    vaddps {{[0-9]+}}(%esp), %zmm0, %zmm0
 ; X32-NEXT:    movl %ebp, %esp
 ; X32-NEXT:    popl %ebp
@@ -184,110 +184,110 @@ define intel_ocl_bicc <16 x float> @test_prolog_epilog(<16 x float> %a, <16 x fl
 ; WIN64-KNL-LABEL: test_prolog_epilog:
 ; WIN64-KNL:       # %bb.0:
 ; WIN64-KNL-NEXT:    pushq %rbp
-; WIN64-KNL-NEXT:    subq $1328, %rsp # imm = 0x530
+; WIN64-KNL-NEXT:    subq $1264, %rsp # imm = 0x4F0
 ; WIN64-KNL-NEXT:    leaq {{[0-9]+}}(%rsp), %rbp
-; WIN64-KNL-NEXT:    kmovw %k7, 1198(%rbp) # 2-byte Spill
-; WIN64-KNL-NEXT:    kmovw %k6, 1196(%rbp) # 2-byte Spill
-; WIN64-KNL-NEXT:    kmovw %k5, 1194(%rbp) # 2-byte Spill
-; WIN64-KNL-NEXT:    kmovw %k4, 1192(%rbp) # 2-byte Spill
-; WIN64-KNL-NEXT:    vmovaps %zmm21, 1104(%rbp) # 64-byte Spill
-; WIN64-KNL-NEXT:    vmovaps %zmm20, 992(%rbp) # 64-byte Spill
-; WIN64-KNL-NEXT:    vmovaps %zmm19, 896(%rbp) # 64-byte Spill
-; WIN64-KNL-NEXT:    vmovaps %zmm18, 832(%rbp) # 64-byte Spill
-; WIN64-KNL-NEXT:    vmovaps %zmm17, 768(%rbp) # 64-byte Spill
-; WIN64-KNL-NEXT:    vmovaps %zmm16, 704(%rbp) # 64-byte Spill
-; WIN64-KNL-NEXT:    vmovaps %zmm15, 640(%rbp) # 64-byte Spill
-; WIN64-KNL-NEXT:    vmovaps %zmm14, 576(%rbp) # 64-byte Spill
-; WIN64-KNL-NEXT:    vmovaps %zmm13, 512(%rbp) # 64-byte Spill
-; WIN64-KNL-NEXT:    vmovaps %zmm12, 448(%rbp) # 64-byte Spill
-; WIN64-KNL-NEXT:    vmovaps %zmm11, 384(%rbp) # 64-byte Spill
-; WIN64-KNL-NEXT:    vmovaps %zmm10, 320(%rbp) # 64-byte Spill
-; WIN64-KNL-NEXT:    vmovaps %zmm9, 256(%rbp) # 64-byte Spill
-; WIN64-KNL-NEXT:    vmovaps %zmm8, 192(%rbp) # 64-byte Spill
-; WIN64-KNL-NEXT:    vmovaps %zmm7, 128(%rbp) # 64-byte Spill
-; WIN64-KNL-NEXT:    vmovaps %zmm6, 64(%rbp) # 64-byte Spill
+; WIN64-KNL-NEXT:    kmovw %k7, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; WIN64-KNL-NEXT:    kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; WIN64-KNL-NEXT:    kmovw %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; WIN64-KNL-NEXT:    kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill
+; WIN64-KNL-NEXT:    vmovaps %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; WIN64-KNL-NEXT:    vmovaps %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; WIN64-KNL-NEXT:    vmovaps %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; WIN64-KNL-NEXT:    vmovaps %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; WIN64-KNL-NEXT:    vmovaps %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; WIN64-KNL-NEXT:    vmovaps %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; WIN64-KNL-NEXT:    vmovaps %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; WIN64-KNL-NEXT:    vmovaps %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; WIN64-KNL-NEXT:    vmovaps %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; WIN64-KNL-NEXT:    vmovaps %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; WIN64-KNL-NEXT:    vmovaps %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; WIN64-KNL-NEXT:    vmovaps %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; WIN64-KNL-NEXT:    vmovaps %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; WIN64-KNL-NEXT:    vmovaps %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; WIN64-KNL-NEXT:    vmovaps %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; WIN64-KNL-NEXT:    vmovaps %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; WIN64-KNL-NEXT:    andq $-64, %rsp
 ; WIN64-KNL-NEXT:    vmovaps %zmm1, {{[0-9]+}}(%rsp)
 ; WIN64-KNL-NEXT:    vmovaps %zmm0, {{[0-9]+}}(%rsp)
 ; WIN64-KNL-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
 ; WIN64-KNL-NEXT:    leaq {{[0-9]+}}(%rsp), %rdx
 ; WIN64-KNL-NEXT:    callq func_float16
-; WIN64-KNL-NEXT:    vmovaps 64(%rbp), %zmm6 # 64-byte Reload
-; WIN64-KNL-NEXT:    vmovaps 128(%rbp), %zmm7 # 64-byte Reload
-; WIN64-KNL-NEXT:    vmovaps 192(%rbp), %zmm8 # 64-byte Reload
-; WIN64-KNL-NEXT:    vmovaps 256(%rbp), %zmm9 # 64-byte Reload
-; WIN64-KNL-NEXT:    vmovaps 320(%rbp), %zmm10 # 64-byte Reload
-; WIN64-KNL-NEXT:    vmovaps 384(%rbp), %zmm11 # 64-byte Reload
-; WIN64-KNL-NEXT:    vmovaps 448(%rbp), %zmm12 # 64-byte Reload
-; WIN64-KNL-NEXT:    vmovaps 512(%rbp), %zmm13 # 64-byte Reload
-; WIN64-KNL-NEXT:    vmovaps 576(%rbp), %zmm14 # 64-byte Reload
-; WIN64-KNL-NEXT:    vmovaps 640(%rbp), %zmm15 # 64-byte Reload
-; WIN64-KNL-NEXT:    vmovaps 704(%rbp), %zmm16 # 64-byte Reload
-; WIN64-KNL-NEXT:    vmovaps 768(%rbp), %zmm17 # 64-byte Reload
-; WIN64-KNL-NEXT:    vmovaps 832(%rbp), %zmm18 # 64-byte Reload
-; WIN64-KNL-NEXT:    vmovaps 896(%rbp), %zmm19 # 64-byte Reload
-; WIN64-KNL-NEXT:    vmovaps 992(%rbp), %zmm20 # 64-byte Reload
-; WIN64-KNL-NEXT:    vmovaps 1104(%rbp), %zmm21 # 64-byte Reload
-; WIN64-KNL-NEXT:    kmovw 1192(%rbp), %k4 # 2-byte Reload
-; WIN64-KNL-NEXT:    kmovw 1194(%rbp), %k5 # 2-byte Reload
-; WIN64-KNL-NEXT:    kmovw 1196(%rbp), %k6 # 2-byte Reload
-; WIN64-KNL-NEXT:    kmovw 1198(%rbp), %k7 # 2-byte Reload
-; WIN64-KNL-NEXT:    leaq 1200(%rbp), %rsp
+; WIN64-KNL-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; WIN64-KNL-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; WIN64-KNL-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; WIN64-KNL-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; WIN64-KNL-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; WIN64-KNL-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; WIN64-KNL-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; WIN64-KNL-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; WIN64-KNL-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; WIN64-KNL-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; WIN64-KNL-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
+; WIN64-KNL-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
+; WIN64-KNL-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; WIN64-KNL-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
+; WIN64-KNL-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; WIN64-KNL-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
+; WIN64-KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 2-byte Reload
+; WIN64-KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 2-byte Reload
+; WIN64-KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 2-byte Reload
+; WIN64-KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 2-byte Reload
+; WIN64-KNL-NEXT:    leaq 1136(%rbp), %rsp
 ; WIN64-KNL-NEXT:    popq %rbp
 ; WIN64-KNL-NEXT:    retq
 ;
 ; WIN64-SKX-LABEL: test_prolog_epilog:
 ; WIN64-SKX:       # %bb.0:
 ; WIN64-SKX-NEXT:    pushq %rbp
-; WIN64-SKX-NEXT:    subq $1328, %rsp # imm = 0x530
+; WIN64-SKX-NEXT:    subq $1264, %rsp # imm = 0x4F0
 ; WIN64-SKX-NEXT:    leaq {{[0-9]+}}(%rsp), %rbp
-; WIN64-SKX-NEXT:    kmovq %k7, 1192(%rbp) # 8-byte Spill
-; WIN64-SKX-NEXT:    kmovq %k6, 1184(%rbp) # 8-byte Spill
-; WIN64-SKX-NEXT:    kmovq %k5, 1176(%rbp) # 8-byte Spill
-; WIN64-SKX-NEXT:    kmovq %k4, 1168(%rbp) # 8-byte Spill
-; WIN64-SKX-NEXT:    vmovaps %zmm21, 1056(%rbp) # 64-byte Spill
-; WIN64-SKX-NEXT:    vmovaps %zmm20, 960(%rbp) # 64-byte Spill
-; WIN64-SKX-NEXT:    vmovaps %zmm19, 896(%rbp) # 64-byte Spill
-; WIN64-SKX-NEXT:    vmovaps %zmm18, 832(%rbp) # 64-byte Spill
-; WIN64-SKX-NEXT:    vmovaps %zmm17, 768(%rbp) # 64-byte Spill
-; WIN64-SKX-NEXT:    vmovaps %zmm16, 704(%rbp) # 64-byte Spill
-; WIN64-SKX-NEXT:    vmovaps %zmm15, 640(%rbp) # 64-byte Spill
-; WIN64-SKX-NEXT:    vmovaps %zmm14, 576(%rbp) # 64-byte Spill
-; WIN64-SKX-NEXT:    vmovaps %zmm13, 512(%rbp) # 64-byte Spill
-; WIN64-SKX-NEXT:    vmovaps %zmm12, 448(%rbp) # 64-byte Spill
-; WIN64-SKX-NEXT:    vmovaps %zmm11, 384(%rbp) # 64-byte Spill
-; WIN64-SKX-NEXT:    vmovaps %zmm10, 320(%rbp) # 64-byte Spill
-; WIN64-SKX-NEXT:    vmovaps %zmm9, 256(%rbp) # 64-byte Spill
-; WIN64-SKX-NEXT:    vmovaps %zmm8, 192(%rbp) # 64-byte Spill
-; WIN64-SKX-NEXT:    vmovaps %zmm7, 128(%rbp) # 64-byte Spill
-; WIN64-SKX-NEXT:    vmovaps %zmm6, 64(%rbp) # 64-byte Spill
+; WIN64-SKX-NEXT:    kmovq %k7, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; WIN64-SKX-NEXT:    kmovq %k6, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; WIN64-SKX-NEXT:    kmovq %k5, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; WIN64-SKX-NEXT:    kmovq %k4, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; WIN64-SKX-NEXT:    vmovaps %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; WIN64-SKX-NEXT:    vmovaps %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; WIN64-SKX-NEXT:    vmovaps %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; WIN64-SKX-NEXT:    vmovaps %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; WIN64-SKX-NEXT:    vmovaps %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; WIN64-SKX-NEXT:    vmovaps %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; WIN64-SKX-NEXT:    vmovaps %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; WIN64-SKX-NEXT:    vmovaps %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; WIN64-SKX-NEXT:    vmovaps %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; WIN64-SKX-NEXT:    vmovaps %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; WIN64-SKX-NEXT:    vmovaps %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; WIN64-SKX-NEXT:    vmovaps %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; WIN64-SKX-NEXT:    vmovaps %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; WIN64-SKX-NEXT:    vmovaps %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; WIN64-SKX-NEXT:    vmovaps %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
+; WIN64-SKX-NEXT:    vmovaps %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; WIN64-SKX-NEXT:    andq $-64, %rsp
 ; WIN64-SKX-NEXT:    vmovaps %zmm1, {{[0-9]+}}(%rsp)
 ; WIN64-SKX-NEXT:    vmovaps %zmm0, {{[0-9]+}}(%rsp)
 ; WIN64-SKX-NEXT:    leaq {{[0-9]+}}(%rsp), %rcx
 ; WIN64-SKX-NEXT:    leaq {{[0-9]+}}(%rsp), %rdx
 ; WIN64-SKX-NEXT:    callq func_float16
-; WIN64-SKX-NEXT:    vmovaps 64(%rbp), %zmm6 # 64-byte Reload
-; WIN64-SKX-NEXT:    vmovaps 128(%rbp), %zmm7 # 64-byte Reload
-; WIN64-SKX-NEXT:    vmovaps 192(%rbp), %zmm8 # 64-byte Reload
-; WIN64-SKX-NEXT:    vmovaps 256(%rbp), %zmm9 # 64-byte Reload
-; WIN64-SKX-NEXT:    vmovaps 320(%rbp), %zmm10 # 64-byte Reload
-; WIN64-SKX-NEXT:    vmovaps 384(%rbp), %zmm11 # 64-byte Reload
-; WIN64-SKX-NEXT:    vmovaps 448(%rbp), %zmm12 # 64-byte Reload
-; WIN64-SKX-NEXT:    vmovaps 512(%rbp), %zmm13 # 64-byte Reload
-; WIN64-SKX-NEXT:    vmovaps 576(%rbp), %zmm14 # 64-byte Reload
-; WIN64-SKX-NEXT:    vmovaps 640(%rbp), %zmm15 # 64-byte Reload
-; WIN64-SKX-NEXT:    vmovaps 704(%rbp), %zmm16 # 64-byte Reload
-; WIN64-SKX-NEXT:    vmovaps 768(%rbp), %zmm17 # 64-byte Reload
-; WIN64-SKX-NEXT:    vmovaps 832(%rbp), %zmm18 # 64-byte Reload
-; WIN64-SKX-NEXT:    vmovaps 896(%rbp), %zmm19 # 64-byte Reload
-; WIN64-SKX-NEXT:    vmovaps 960(%rbp), %zmm20 # 64-byte Reload
-; WIN64-SKX-NEXT:    vmovaps 1056(%rbp), %zmm21 # 64-byte Reload
-; WIN64-SKX-NEXT:    kmovq 1168(%rbp), %k4 # 8-byte Reload
-; WIN64-SKX-NEXT:    kmovq 1176(%rbp), %k5 # 8-byte Reload
-; WIN64-SKX-NEXT:    kmovq 1184(%rbp), %k6 # 8-byte Reload
-; WIN64-SKX-NEXT:    kmovq 1192(%rbp), %k7 # 8-byte Reload
-; WIN64-SKX-NEXT:    leaq 1200(%rbp), %rsp
+; WIN64-SKX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
+; WIN64-SKX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm7 # 64-byte Reload
+; WIN64-SKX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
+; WIN64-SKX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
+; WIN64-SKX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
+; WIN64-SKX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload
+; WIN64-SKX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm12 # 64-byte Reload
+; WIN64-SKX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm13 # 64-byte Reload
+; WIN64-SKX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload
+; WIN64-SKX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload
+; WIN64-SKX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload
+; WIN64-SKX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 # 64-byte Reload
+; WIN64-SKX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload
+; WIN64-SKX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 # 64-byte Reload
+; WIN64-SKX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 # 64-byte Reload
+; WIN64-SKX-NEXT:    vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload
+; WIN64-SKX-NEXT:    kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k4 # 8-byte Reload
+; WIN64-SKX-NEXT:    kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k5 # 8-byte Reload
+; WIN64-SKX-NEXT:    kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k6 # 8-byte Reload
+; WIN64-SKX-NEXT:    kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 8-byte Reload
+; WIN64-SKX-NEXT:    leaq 1136(%rbp), %rsp
 ; WIN64-SKX-NEXT:    popq %rbp
 ; WIN64-SKX-NEXT:    retq
 ;
@@ -296,47 +296,47 @@ define intel_ocl_bicc <16 x float> @test_prolog_epilog(<16 x float> %a, <16 x fl
 ; X64-KNL-NEXT:    pushq %rsi
 ; X64-KNL-NEXT:    pushq %rdi
 ; X64-KNL-NEXT:    subq $1064, %rsp ## imm = 0x428
-; X64-KNL-NEXT:    kmovw %k7, {{[0-9]+}}(%rsp) ## 2-byte Spill
-; X64-KNL-NEXT:    kmovw %k6, {{[0-9]+}}(%rsp) ## 2-byte Spill
-; X64-KNL-NEXT:    kmovw %k5, {{[0-9]+}}(%rsp) ## 2-byte Spill
-; X64-KNL-NEXT:    kmovw %k4, {{[0-9]+}}(%rsp) ## 2-byte Spill
-; X64-KNL-NEXT:    vmovups %zmm31, {{[0-9]+}}(%rsp) ## 64-byte Spill
-; X64-KNL-NEXT:    vmovups %zmm30, {{[0-9]+}}(%rsp) ## 64-byte Spill
-; X64-KNL-NEXT:    vmovups %zmm29, {{[0-9]+}}(%rsp) ## 64-byte Spill
-; X64-KNL-NEXT:    vmovups %zmm28, {{[0-9]+}}(%rsp) ## 64-byte Spill
-; X64-KNL-NEXT:    vmovups %zmm27, {{[0-9]+}}(%rsp) ## 64-byte Spill
-; X64-KNL-NEXT:    vmovups %zmm26, {{[0-9]+}}(%rsp) ## 64-byte Spill
-; X64-KNL-NEXT:    vmovups %zmm25, {{[0-9]+}}(%rsp) ## 64-byte Spill
-; X64-KNL-NEXT:    vmovups %zmm24, {{[0-9]+}}(%rsp) ## 64-byte Spill
-; X64-KNL-NEXT:    vmovups %zmm23, {{[0-9]+}}(%rsp) ## 64-byte Spill
-; X64-KNL-NEXT:    vmovups %zmm22, {{[0-9]+}}(%rsp) ## 64-byte Spill
-; X64-KNL-NEXT:    vmovups %zmm21, {{[0-9]+}}(%rsp) ## 64-byte Spill
-; X64-KNL-NEXT:    vmovups %zmm20, {{[0-9]+}}(%rsp) ## 64-byte Spill
-; X64-KNL-NEXT:    vmovups %zmm19, {{[0-9]+}}(%rsp) ## 64-byte Spill
-; X64-KNL-NEXT:    vmovups %zmm18, {{[0-9]+}}(%rsp) ## 64-byte Spill
-; X64-KNL-NEXT:    vmovups %zmm17, {{[0-9]+}}(%rsp) ## 64-byte Spill
+; X64-KNL-NEXT:    kmovw %k7, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
+; X64-KNL-NEXT:    kmovw %k6, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
+; X64-KNL-NEXT:    kmovw %k5, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
+; X64-KNL-NEXT:    kmovw %k4, {{[-0-9]+}}(%r{{[sb]}}p) ## 2-byte Spill
+; X64-KNL-NEXT:    vmovups %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
+; X64-KNL-NEXT:    vmovups %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
+; X64-KNL-NEXT:    vmovups %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
+; X64-KNL-NEXT:    vmovups %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
+; X64-KNL-NEXT:    vmovups %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
+; X64-KNL-NEXT:    vmovups %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
+; X64-KNL-NEXT:    vmovups %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
+; X64-KNL-NEXT:    vmovups %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
+; X64-KNL-NEXT:    vmovups %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
+; X64-KNL-NEXT:    vmovups %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
+; X64-KNL-NEXT:    vmovups %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
+; X64-KNL-NEXT:    vmovups %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
+; X64-KNL-NEXT:    vmovups %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
+; X64-KNL-NEXT:    vmovups %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
+; X64-KNL-NEXT:    vmovups %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
 ; X64-KNL-NEXT:    vmovups %zmm16, (%rsp) ## 64-byte Spill
 ; X64-KNL-NEXT:    callq _func_float16
 ; X64-KNL-NEXT:    vmovups (%rsp), %zmm16 ## 64-byte Reload
-; X64-KNL-NEXT:    vmovups {{[0-9]+}}(%rsp), %zmm17 ## 64-byte Reload
-; X64-KNL-NEXT:    vmovups {{[0-9]+}}(%rsp), %zmm18 ## 64-byte Reload
-; X64-KNL-NEXT:    vmovups {{[0-9]+}}(%rsp), %zmm19 ## 64-byte Reload
-; X64-KNL-NEXT:    vmovups {{[0-9]+}}(%rsp), %zmm20 ## 64-byte Reload
-; X64-KNL-NEXT:    vmovups {{[0-9]+}}(%rsp), %zmm21 ## 64-byte Reload
-; X64-KNL-NEXT:    vmovups {{[0-9]+}}(%rsp), %zmm22 ## 64-byte Reload
-; X64-KNL-NEXT:    vmovups {{[0-9]+}}(%rsp), %zmm23 ## 64-byte Reload
-; X64-KNL-NEXT:    vmovups {{[0-9]+}}(%rsp), %zmm24 ## 64-byte Reload
-; X64-KNL-NEXT:    vmovups {{[0-9]+}}(%rsp), %zmm25 ## 64-byte Reload
-; X64-KNL-NEXT:    vmovups {{[0-9]+}}(%rsp), %zmm26 ## 64-byte Reload
-; X64-KNL-NEXT:    vmovups {{[0-9]+}}(%rsp), %zmm27 ## 64-byte Reload
-; X64-KNL-NEXT:    vmovups {{[0-9]+}}(%rsp), %zmm28 ## 64-byte Reload
-; X64-KNL-NEXT:    vmovups {{[0-9]+}}(%rsp), %zmm29 ## 64-byte Reload
-; X64-KNL-NEXT:    vmovups {{[0-9]+}}(%rsp), %zmm30 ## 64-byte Reload
-; X64-KNL-NEXT:    vmovups {{[0-9]+}}(%rsp), %zmm31 ## 64-byte Reload
-; X64-KNL-NEXT:    kmovw {{[0-9]+}}(%rsp), %k4 ## 2-byte Reload
-; X64-KNL-NEXT:    kmovw {{[0-9]+}}(%rsp), %k5 ## 2-byte Reload
-; X64-KNL-NEXT:    kmovw {{[0-9]+}}(%rsp), %k6 ## 2-byte Reload
-; X64-KNL-NEXT:    kmovw {{[0-9]+}}(%rsp), %k7 ## 2-byte Reload
+; X64-KNL-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 ## 64-byte Reload
+; X64-KNL-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 ## 64-byte Reload
+; X64-KNL-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 ## 64-byte Reload
+; X64-KNL-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 ## 64-byte Reload
+; X64-KNL-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 ## 64-byte Reload
+; X64-KNL-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 ## 64-byte Reload
+; X64-KNL-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 ## 64-byte Reload
+; X64-KNL-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 ## 64-byte Reload
+; X64-KNL-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 ## 64-byte Reload
+; X64-KNL-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 ## 64-byte Reload
+; X64-KNL-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 ## 64-byte Reload
+; X64-KNL-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 ## 64-byte Reload
+; X64-KNL-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 ## 64-byte Reload
+; X64-KNL-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 ## 64-byte Reload
+; X64-KNL-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 ## 64-byte Reload
+; X64-KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k4 ## 2-byte Reload
+; X64-KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k5 ## 2-byte Reload
+; X64-KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k6 ## 2-byte Reload
+; X64-KNL-NEXT:    kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 2-byte Reload
 ; X64-KNL-NEXT:    addq $1064, %rsp ## imm = 0x428
 ; X64-KNL-NEXT:    popq %rdi
 ; X64-KNL-NEXT:    popq %rsi
@@ -346,49 +346,49 @@ define intel_ocl_bicc <16 x float> @test_prolog_epilog(<16 x float> %a, <16 x fl
 ; X64-SKX:       ## %bb.0:
 ; X64-SKX-NEXT:    pushq %rsi
 ; X64-SKX-NEXT:    pushq %rdi
-; X64-SKX-NEXT:    subq $1192, %rsp ## imm = 0x4A8
-; X64-SKX-NEXT:    kmovq %k7, {{[0-9]+}}(%rsp) ## 8-byte Spill
-; X64-SKX-NEXT:    kmovq %k6, {{[0-9]+}}(%rsp) ## 8-byte Spill
-; X64-SKX-NEXT:    kmovq %k5, {{[0-9]+}}(%rsp) ## 8-byte Spill
-; X64-SKX-NEXT:    kmovq %k4, {{[0-9]+}}(%rsp) ## 8-byte Spill
-; X64-SKX-NEXT:    vmovups %zmm31, {{[0-9]+}}(%rsp) ## 64-byte Spill
-; X64-SKX-NEXT:    vmovups %zmm30, {{[0-9]+}}(%rsp) ## 64-byte Spill
-; X64-SKX-NEXT:    vmovups %zmm29, {{[0-9]+}}(%rsp) ## 64-byte Spill
-; X64-SKX-NEXT:    vmovups %zmm28, {{[0-9]+}}(%rsp) ## 64-byte Spill
-; X64-SKX-NEXT:    vmovups %zmm27, {{[0-9]+}}(%rsp) ## 64-byte Spill
-; X64-SKX-NEXT:    vmovups %zmm26, {{[0-9]+}}(%rsp) ## 64-byte Spill
-; X64-SKX-NEXT:    vmovups %zmm25, {{[0-9]+}}(%rsp) ## 64-byte Spill
-; X64-SKX-NEXT:    vmovups %zmm24, {{[0-9]+}}(%rsp) ## 64-byte Spill
-; X64-SKX-NEXT:    vmovups %zmm23, {{[0-9]+}}(%rsp) ## 64-byte Spill
-; X64-SKX-NEXT:    vmovups %zmm22, {{[0-9]+}}(%rsp) ## 64-byte Spill
-; X64-SKX-NEXT:    vmovups %zmm21, {{[0-9]+}}(%rsp) ## 64-byte Spill
-; X64-SKX-NEXT:    vmovups %zmm20, {{[0-9]+}}(%rsp) ## 64-byte Spill
-; X64-SKX-NEXT:    vmovups %zmm19, {{[0-9]+}}(%rsp) ## 64-byte Spill
-; X64-SKX-NEXT:    vmovups %zmm18, {{[0-9]+}}(%rsp) ## 64-byte Spill
-; X64-SKX-NEXT:    vmovups %zmm17, {{[0-9]+}}(%rsp) ## 64-byte Spill
+; X64-SKX-NEXT:    subq $1064, %rsp ## imm = 0x428
+; X64-SKX-NEXT:    kmovq %k7, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; X64-SKX-NEXT:    kmovq %k6, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; X64-SKX-NEXT:    kmovq %k5, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; X64-SKX-NEXT:    kmovq %k4, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
+; X64-SKX-NEXT:    vmovups %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
+; X64-SKX-NEXT:    vmovups %zmm30, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
+; X64-SKX-NEXT:    vmovups %zmm29, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
+; X64-SKX-NEXT:    vmovups %zmm28, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
+; X64-SKX-NEXT:    vmovups %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
+; X64-SKX-NEXT:    vmovups %zmm26, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
+; X64-SKX-NEXT:    vmovups %zmm25, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
+; X64-SKX-NEXT:    vmovups %zmm24, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
+; X64-SKX-NEXT:    vmovups %zmm23, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
+; X64-SKX-NEXT:    vmovups %zmm22, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
+; X64-SKX-NEXT:    vmovups %zmm21, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
+; X64-SKX-NEXT:    vmovups %zmm20, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
+; X64-SKX-NEXT:    vmovups %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
+; X64-SKX-NEXT:    vmovups %zmm18, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
+; X64-SKX-NEXT:    vmovups %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) ## 64-byte Spill
 ; X64-SKX-NEXT:    vmovups %zmm16, (%rsp) ## 64-byte Spill
 ; X64-SKX-NEXT:    callq _func_float16
 ; X64-SKX-NEXT:    vmovups (%rsp), %zmm16 ## 64-byte Reload
-; X64-SKX-NEXT:    vmovups {{[0-9]+}}(%rsp), %zmm17 ## 64-byte Reload
-; X64-SKX-NEXT:    vmovups {{[0-9]+}}(%rsp), %zmm18 ## 64-byte Reload
-; X64-SKX-NEXT:    vmovups {{[0-9]+}}(%rsp), %zmm19 ## 64-byte Reload
-; X64-SKX-NEXT:    vmovups {{[0-9]+}}(%rsp), %zmm20 ## 64-byte Reload
-; X64-SKX-NEXT:    vmovups {{[0-9]+}}(%rsp), %zmm21 ## 64-byte Reload
-; X64-SKX-NEXT:    vmovups {{[0-9]+}}(%rsp), %zmm22 ## 64-byte Reload
-; X64-SKX-NEXT:    vmovups {{[0-9]+}}(%rsp), %zmm23 ## 64-byte Reload
-; X64-SKX-NEXT:    vmovups {{[0-9]+}}(%rsp), %zmm24 ## 64-byte Reload
-; X64-SKX-NEXT:    vmovups {{[0-9]+}}(%rsp), %zmm25 ## 64-byte Reload
-; X64-SKX-NEXT:    vmovups {{[0-9]+}}(%rsp), %zmm26 ## 64-byte Reload
-; X64-SKX-NEXT:    vmovups {{[0-9]+}}(%rsp), %zmm27 ## 64-byte Reload
-; X64-SKX-NEXT:    vmovups {{[0-9]+}}(%rsp), %zmm28 ## 64-byte Reload
-; X64-SKX-NEXT:    vmovups {{[0-9]+}}(%rsp), %zmm29 ## 64-byte Reload
-; X64-SKX-NEXT:    vmovups {{[0-9]+}}(%rsp), %zmm30 ## 64-byte Reload
-; X64-SKX-NEXT:    vmovups {{[0-9]+}}(%rsp), %zmm31 ## 64-byte Reload
-; X64-SKX-NEXT:    kmovq {{[0-9]+}}(%rsp), %k4 ## 8-byte Reload
-; X64-SKX-NEXT:    kmovq {{[0-9]+}}(%rsp), %k5 ## 8-byte Reload
-; X64-SKX-NEXT:    kmovq {{[0-9]+}}(%rsp), %k6 ## 8-byte Reload
-; X64-SKX-NEXT:    kmovq {{[0-9]+}}(%rsp), %k7 ## 8-byte Reload
-; X64-SKX-NEXT:    addq $1192, %rsp ## imm = 0x4A8
+; X64-SKX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm17 ## 64-byte Reload
+; X64-SKX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 ## 64-byte Reload
+; X64-SKX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm19 ## 64-byte Reload
+; X64-SKX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm20 ## 64-byte Reload
+; X64-SKX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 ## 64-byte Reload
+; X64-SKX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm22 ## 64-byte Reload
+; X64-SKX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm23 ## 64-byte Reload
+; X64-SKX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm24 ## 64-byte Reload
+; X64-SKX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm25 ## 64-byte Reload
+; X64-SKX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm26 ## 64-byte Reload
+; X64-SKX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 ## 64-byte Reload
+; X64-SKX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm28 ## 64-byte Reload
+; X64-SKX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm29 ## 64-byte Reload
+; X64-SKX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm30 ## 64-byte Reload
+; X64-SKX-NEXT:    vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm31 ## 64-byte Reload
+; X64-SKX-NEXT:    kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k4 ## 8-byte Reload
+; X64-SKX-NEXT:    kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k5 ## 8-byte Reload
+; X64-SKX-NEXT:    kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k6 ## 8-byte Reload
+; X64-SKX-NEXT:    kmovq {{[-0-9]+}}(%r{{[sb]}}p), %k7 ## 8-byte Reload
+; X64-SKX-NEXT:    addq $1064, %rsp ## imm = 0x428
 ; X64-SKX-NEXT:    popq %rdi
 ; X64-SKX-NEXT:    popq %rsi
 ; X64-SKX-NEXT:    retq

diff  --git a/llvm/test/CodeGen/X86/catchpad-realign-savexmm.ll b/llvm/test/CodeGen/X86/catchpad-realign-savexmm.ll
index 1160101792ff..e8bccdabdcd4 100644
--- a/llvm/test/CodeGen/X86/catchpad-realign-savexmm.ll
+++ b/llvm/test/CodeGen/X86/catchpad-realign-savexmm.ll
@@ -51,3 +51,18 @@ catch:
 ; CHECK: popq    %rbp
 ; CHECK: retq
 ; CHECK: .seh_handlerdata
+; CHECK: # %catch
+; CHECK: movq    %rdx, 16(%rsp)
+; CHECK: pushq   %rbp
+; CHECK: .seh_pushreg 5
+; CHECK: subq    $48, %rsp
+; CHECK: .seh_stackalloc 48
+; CHECK: leaq    64(%rdx), %rbp
+; CHECK: movapd  %xmm6, 32(%rsp)
+; CHECK: .seh_savexmm 6, 32
+; CHECK: .seh_endprologue
+; CHECK: movapd  32(%rsp), %xmm6
+; CHECK: leaq    .LBB0_1(%rip), %rax
+; CHECK: addq    $48, %rsp
+; CHECK: popq    %rbp
+; CHECK: retq # CATCHRET

diff  --git a/llvm/test/CodeGen/X86/win64-funclet-savexmm.ll b/llvm/test/CodeGen/X86/win64-funclet-savexmm.ll
new file mode 100644
index 000000000000..62ddebb9a5a0
--- /dev/null
+++ b/llvm/test/CodeGen/X86/win64-funclet-savexmm.ll
@@ -0,0 +1,115 @@
+; RUN: llc -mtriple=x86_64-pc-windows-msvc -mattr=+avx < %s | FileCheck %s
+
+; void bar(int a, int b, int c, int d, int e);
+; void baz(int x);
+; 
+; void foo(int a, int b, int c, int d, int e)
+; {
+;   __asm("nop" ::: "bx", "cx", "xmm5", "xmm6", "ymm7");
+;   try {
+;     bar(a, b, c, d, e);
+;   }
+;   catch (...) {
+;     baz(a);
+;     if (a)
+;       __asm("nop" ::: "xmm8");
+;   }
+; }
+
+%rtti.TypeDescriptor2 = type { i8**, i8*, [3 x i8] }
+
+$"??_R0H at 8" = comdat any
+
+@"??_7type_info@@6B@" = external constant i8*
+@"??_R0H at 8" = linkonce_odr global %rtti.TypeDescriptor2 { i8** @"??_7type_info@@6B@", i8* null, [3 x i8] c".H\00" }, comdat
+
+declare dso_local i32 @__CxxFrameHandler3(...)
+declare dso_local void @"?bar@@YAXHHHHH at Z"(i32, i32, i32, i32, i32)
+declare dso_local void @"?baz@@YAXH at Z"(i32)
+
+define dso_local void @"?foo@@YAXHHHHH at Z"(i32 %a, i32 %b, i32 %c, i32 %d, i32 %e) personality i8* bitcast (i32 (...)* @__CxxFrameHandler3 to i8*) {
+entry:
+  %e.addr = alloca i32, align 4
+  %d.addr = alloca i32, align 4
+  %c.addr = alloca i32, align 4
+  %b.addr = alloca i32, align 4
+  %a.addr = alloca i32, align 4
+  store i32 %e, i32* %e.addr, align 4
+  store i32 %d, i32* %d.addr, align 4
+  store i32 %c, i32* %c.addr, align 4
+  store i32 %b, i32* %b.addr, align 4
+  store i32 %a, i32* %a.addr, align 4
+  call void asm sideeffect "nop", "~{bx},~{cx},~{xmm5},~{xmm6},~{ymm7}"()
+  %0 = load i32, i32* %e.addr, align 4
+  %1 = load i32, i32* %d.addr, align 4
+  %2 = load i32, i32* %c.addr, align 4
+  %3 = load i32, i32* %b.addr, align 4
+  %4 = load i32, i32* %a.addr, align 4
+  invoke void @"?bar@@YAXHHHHH@Z"(i32 %4, i32 %3, i32 %2, i32 %1, i32 %0)
+          to label %invoke.cont unwind label %catch.dispatch
+
+catch.dispatch:                                   ; preds = %entry
+  %5 = catchswitch within none [label %catch] unwind to caller
+
+catch:                                            ; preds = %catch.dispatch
+  %6 = catchpad within %5 [i8* null, i32 64, i8* null]
+  %7 = load i32, i32* %a.addr, align 4
+  call void @"?baz@@YAXH@Z"(i32 %7) [ "funclet"(token %6) ]
+  %8 = load i32, i32* %a.addr, align 4
+  %tobool = icmp ne i32 %8, 0
+  br i1 %tobool, label %if.then, label %if.end
+
+if.then:                                          ; preds = %catch
+  call void asm sideeffect "nop", "~{xmm8}"() [ "funclet"(token %6) ]
+  br label %if.end
+
+invoke.cont:                                      ; preds = %entry
+  br label %try.cont
+
+if.end:                                           ; preds = %if.then, %catch
+  catchret from %6 to label %catchret.dest
+
+catchret.dest:                                    ; preds = %if.end
+  br label %try.cont
+
+try.cont:                                         ; preds = %catchret.dest, %invoke.cont
+  ret void
+}
+
+; CHECK: # %catch
+; CHECK: movq    %rdx, 16(%rsp)
+; CHECK: pushq   %rbp
+; CHECK: .seh_pushreg 5
+; CHECK: pushq   %rbx
+; CHECK: .seh_pushreg 3
+; CHECK: subq    $88, %rsp
+; CHECK: .seh_stackalloc 88
+; CHECK: leaq    112(%rdx), %rbp
+; CHECK: vmovaps %xmm8, 48(%rsp)
+; CHECK: .seh_savexmm 8, 48
+; CHECK: vmovaps %xmm7, 64(%rsp)
+; CHECK: .seh_savexmm 7, 64
+; CHECK: vmovaps %xmm6, 80(%rsp)
+; CHECK: .seh_savexmm 6, 80
+; CHECK: .seh_endprologue
+; CHECK: movl   -{{[0-9]+}}(%rbp), %ecx
+; CHECK: vmovaps 80(%rsp), %xmm6
+; CHECK: vmovaps 64(%rsp), %xmm7
+; CHECK: vmovaps 48(%rsp), %xmm8
+; CHECK: leaq    .LBB0_1(%rip), %rax
+; CHECK: addq    $88, %rsp
+; CHECK: popq    %rbx
+; CHECK: popq    %rbp
+; CHECK: retq # CATCHRET
+
+; CHECK-LABEL: "$handlerMap$0$?foo@@YAXHHHHH@Z":
+; CHECK-NEXT: .long   64                      # Adjectives
+; CHECK-NEXT: .long   0                       # Type
+; CHECK-NEXT: .long   0                       # CatchObjOffset
+; CHECK-NEXT: .long   "?catch$2@?0??foo@@YAXHHHHH at Z@4HA"@IMGREL # Handler
+; Sum of:
+;   16 RDX store offset
+;   16 two pushes
+;   88 stack alloc
+; CHECK-NEXT: .long   120                     # ParentFrameOffset
+

diff  --git a/llvm/test/CodeGen/X86/x86-interrupt_cc.ll b/llvm/test/CodeGen/X86/x86-interrupt_cc.ll
index 09f82b46c216..2043816f3a07 100644
--- a/llvm/test/CodeGen/X86/x86-interrupt_cc.ll
+++ b/llvm/test/CodeGen/X86/x86-interrupt_cc.ll
@@ -294,7 +294,7 @@ define x86_intrcc void @foo(i8* %frame) {
 ; CHECK64-SKX-NEXT:    kmovq %k0, {{[0-9]+}}(%rsp) ## 8-byte Spill
 ; CHECK64-SKX-NEXT:    ## encoding: [0xc4,0xe1,0xf8,0x91,0x84,0x24,0x30,0x08,0x00,0x00]
 ; CHECK64-SKX-NEXT:    vmovups %zmm31, {{[0-9]+}}(%rsp) ## 64-byte Spill
-; CHECK64-SKX-NEXT:    ## encoding: [0x62,0x61,0x7c,0x48,0x11,0xbc,0x24,0xe0,0x07,0x00,0x00]
+; CHECK64-SKX-NEXT:    ## encoding: [0x62,0x61,0x7c,0x48,0x11,0x7c,0x24,0x1f]
 ; CHECK64-SKX-NEXT:    vmovups %zmm30, {{[0-9]+}}(%rsp) ## 64-byte Spill
 ; CHECK64-SKX-NEXT:    ## encoding: [0x62,0x61,0x7c,0x48,0x11,0x74,0x24,0x1e]
 ; CHECK64-SKX-NEXT:    vmovups %zmm29, {{[0-9]+}}(%rsp) ## 64-byte Spill
@@ -398,7 +398,7 @@ define x86_intrcc void @foo(i8* %frame) {
 ; CHECK64-SKX-NEXT:    .cfi_offset %xmm28, -448
 ; CHECK64-SKX-NEXT:    .cfi_offset %xmm29, -384
 ; CHECK64-SKX-NEXT:    .cfi_offset %xmm30, -320
-; CHECK64-SKX-NEXT:    .cfi_offset %xmm31, -224
+; CHECK64-SKX-NEXT:    .cfi_offset %xmm31, -256
 ; CHECK64-SKX-NEXT:    .cfi_offset %k0, -144
 ; CHECK64-SKX-NEXT:    .cfi_offset %k1, -136
 ; CHECK64-SKX-NEXT:    .cfi_offset %k2, -128
@@ -474,7 +474,7 @@ define x86_intrcc void @foo(i8* %frame) {
 ; CHECK64-SKX-NEXT:    vmovups {{[0-9]+}}(%rsp), %zmm30 ## 64-byte Reload
 ; CHECK64-SKX-NEXT:    ## encoding: [0x62,0x61,0x7c,0x48,0x10,0x74,0x24,0x1e]
 ; CHECK64-SKX-NEXT:    vmovups {{[0-9]+}}(%rsp), %zmm31 ## 64-byte Reload
-; CHECK64-SKX-NEXT:    ## encoding: [0x62,0x61,0x7c,0x48,0x10,0xbc,0x24,0xe0,0x07,0x00,0x00]
+; CHECK64-SKX-NEXT:    ## encoding: [0x62,0x61,0x7c,0x48,0x10,0x7c,0x24,0x1f]
 ; CHECK64-SKX-NEXT:    kmovq {{[0-9]+}}(%rsp), %k0 ## 8-byte Reload
 ; CHECK64-SKX-NEXT:    ## encoding: [0xc4,0xe1,0xf8,0x90,0x84,0x24,0x30,0x08,0x00,0x00]
 ; CHECK64-SKX-NEXT:    kmovq {{[0-9]+}}(%rsp), %k1 ## 8-byte Reload
@@ -635,7 +635,7 @@ define x86_intrcc void @foo(i8* %frame) {
 ; CHECK32-SKX-NEXT:    kmovq %k0, {{[0-9]+}}(%esp) ## 8-byte Spill
 ; CHECK32-SKX-NEXT:    ## encoding: [0xc4,0xe1,0xf8,0x91,0x84,0x24,0x30,0x02,0x00,0x00]
 ; CHECK32-SKX-NEXT:    vmovups %zmm7, {{[0-9]+}}(%esp) ## 64-byte Spill
-; CHECK32-SKX-NEXT:    ## encoding: [0x62,0xf1,0x7c,0x48,0x11,0xbc,0x24,0xe0,0x01,0x00,0x00]
+; CHECK32-SKX-NEXT:    ## encoding: [0x62,0xf1,0x7c,0x48,0x11,0x7c,0x24,0x07]
 ; CHECK32-SKX-NEXT:    vmovups %zmm6, {{[0-9]+}}(%esp) ## 64-byte Spill
 ; CHECK32-SKX-NEXT:    ## encoding: [0x62,0xf1,0x7c,0x48,0x11,0x74,0x24,0x06]
 ; CHECK32-SKX-NEXT:    vmovups %zmm5, {{[0-9]+}}(%esp) ## 64-byte Spill
@@ -661,7 +661,7 @@ define x86_intrcc void @foo(i8* %frame) {
 ; CHECK32-SKX-NEXT:    .cfi_offset %xmm4, -384
 ; CHECK32-SKX-NEXT:    .cfi_offset %xmm5, -320
 ; CHECK32-SKX-NEXT:    .cfi_offset %xmm6, -256
-; CHECK32-SKX-NEXT:    .cfi_offset %xmm7, -160
+; CHECK32-SKX-NEXT:    .cfi_offset %xmm7, -192
 ; CHECK32-SKX-NEXT:    .cfi_offset %k0, -80
 ; CHECK32-SKX-NEXT:    .cfi_offset %k1, -72
 ; CHECK32-SKX-NEXT:    .cfi_offset %k2, -64
@@ -689,7 +689,7 @@ define x86_intrcc void @foo(i8* %frame) {
 ; CHECK32-SKX-NEXT:    vmovups {{[0-9]+}}(%esp), %zmm6 ## 64-byte Reload
 ; CHECK32-SKX-NEXT:    ## encoding: [0x62,0xf1,0x7c,0x48,0x10,0x74,0x24,0x06]
 ; CHECK32-SKX-NEXT:    vmovups {{[0-9]+}}(%esp), %zmm7 ## 64-byte Reload
-; CHECK32-SKX-NEXT:    ## encoding: [0x62,0xf1,0x7c,0x48,0x10,0xbc,0x24,0xe0,0x01,0x00,0x00]
+; CHECK32-SKX-NEXT:    ## encoding: [0x62,0xf1,0x7c,0x48,0x10,0x7c,0x24,0x07]
 ; CHECK32-SKX-NEXT:    kmovq {{[0-9]+}}(%esp), %k0 ## 8-byte Reload
 ; CHECK32-SKX-NEXT:    ## encoding: [0xc4,0xe1,0xf8,0x90,0x84,0x24,0x30,0x02,0x00,0x00]
 ; CHECK32-SKX-NEXT:    kmovq {{[0-9]+}}(%esp), %k1 ## 8-byte Reload
