[llvm] 0a950a2 - [SystemZ/z/OS] Implement save of non-volatile registers on z/OS XPLINK

Kai Nacke via llvm-commits llvm-commits at lists.llvm.org
Wed Oct 13 09:59:20 PDT 2021


Author: Kai Nacke
Date: 2021-10-13T12:57:57-04:00
New Revision: 0a950a2e94f2f5f1596a9c8af44d3bbd26497927

URL: https://github.com/llvm/llvm-project/commit/0a950a2e94f2f5f1596a9c8af44d3bbd26497927
DIFF: https://github.com/llvm/llvm-project/commit/0a950a2e94f2f5f1596a9c8af44d3bbd26497927.diff

LOG: [SystemZ/z/OS] Implement save of non-volatile registers on z/OS XPLINK

This patch implements saving the XPLINK callee-saved registers
on z/OS.

Reviewed By: uweigand, Kai

Differential Revision: https://reviews.llvm.org/D111653

Added: 
    llvm/test/CodeGen/SystemZ/zos-prologue-epilog.ll

Modified: 
    llvm/lib/Target/SystemZ/SystemZCallingConv.td
    llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp
    llvm/lib/Target/SystemZ/SystemZFrameLowering.h
    llvm/lib/Target/SystemZ/SystemZRegisterInfo.cpp
    llvm/lib/Target/SystemZ/SystemZRegisterInfo.h
    llvm/lib/Target/SystemZ/SystemZSubtarget.h

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/SystemZ/SystemZCallingConv.td b/llvm/lib/Target/SystemZ/SystemZCallingConv.td
index 45e22b07be300..c606e78b69b6c 100644
--- a/llvm/lib/Target/SystemZ/SystemZCallingConv.td
+++ b/llvm/lib/Target/SystemZ/SystemZCallingConv.td
@@ -162,12 +162,14 @@ def CSR_SystemZ_NoRegs : CalleeSavedRegs<(add)>;
 //===----------------------------------------------------------------------===//
 // z/OS XPLINK64 callee-saved registers
 //===----------------------------------------------------------------------===//
-def CSR_SystemZ_XPLINK64 : CalleeSavedRegs<(add (sequence "R%dD", 8, 15),
-                                                (sequence "F%dD", 8, 15))>;
-
-def CSR_SystemZ_XPLINK64_Vector : CalleeSavedRegs<(add (sequence "R%dD", 8, 15),
-                                                       (sequence "F%dD", 15, 8),
-                                                       (sequence "V%d", 23, 16))>;
+// %R7D is volatile by the spec, but it must be saved in the prologue by
+// any non-leaf function and restored in the epilogue for use by the
+// return instruction, so it functions exactly like a callee-saved register.
+def CSR_SystemZ_XPLINK64 : CalleeSavedRegs<(add (sequence "R%dD", 7, 15),
+                                                (sequence "F%dD", 15, 8))>;
+
+def CSR_SystemZ_XPLINK64_Vector : CalleeSavedRegs<(add CSR_SystemZ_XPLINK64,
+                                                   (sequence "V%d", 23, 16))>;
 
 //===----------------------------------------------------------------------===//
 // z/OS XPLINK64 return value calling convention
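
For context: TableGen expands each CalleeSavedRegs record into a
zero-terminated MCPhysReg array plus a matching register mask. Below is a
minimal sketch of how such a save list is typically walked (the emitted
symbol names, e.g. CSR_SystemZ_XPLINK64_SaveList, follow from the record
names above; the helper itself is illustrative, not part of this patch):

  #include "llvm/MC/MCRegister.h"

  // Count the entries in a TableGen-emitted save list; the array carries
  // no explicit length and is instead terminated by a 0 sentinel.
  static unsigned countCalleeSaves(const llvm::MCPhysReg *SaveList) {
    unsigned N = 0;
    for (const llvm::MCPhysReg *R = SaveList; *R; ++R)
      ++N;
    return N;
  }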

diff --git a/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp b/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp
index 83f05e55226e2..d11d118fb8ee9 100644
--- a/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp
@@ -24,7 +24,7 @@ using namespace llvm;
 namespace {
 // The ABI-defined register save slots, relative to the CFA (i.e.
 // incoming stack pointer + SystemZMC::ELFCallFrameSize).
-static const TargetFrameLowering::SpillSlot SpillOffsetTable[] = {
+static const TargetFrameLowering::SpillSlot ELFSpillOffsetTable[] = {
   { SystemZ::R2D,  0x10 },
   { SystemZ::R3D,  0x18 },
   { SystemZ::R4D,  0x20 },
@@ -44,6 +44,12 @@ static const TargetFrameLowering::SpillSlot SpillOffsetTable[] = {
   { SystemZ::F4D,  0x90 },
   { SystemZ::F6D,  0x98 }
 };
+
+static const TargetFrameLowering::SpillSlot XPLINKSpillOffsetTable[] = {
+    {SystemZ::R4D, 0x00},  {SystemZ::R5D, 0x08},  {SystemZ::R6D, 0x10},
+    {SystemZ::R7D, 0x18},  {SystemZ::R8D, 0x20},  {SystemZ::R9D, 0x28},
+    {SystemZ::R10D, 0x30}, {SystemZ::R11D, 0x38}, {SystemZ::R12D, 0x40},
+    {SystemZ::R13D, 0x48}, {SystemZ::R14D, 0x50}, {SystemZ::R15D, 0x58}};
 } // end anonymous namespace
 
 SystemZFrameLowering::SystemZFrameLowering(StackDirection D, Align StackAl,
@@ -201,8 +207,9 @@ void SystemZELFFrameLowering::determineCalleeSaves(MachineFunction &MF,
 
 SystemZELFFrameLowering::SystemZELFFrameLowering()
     : SystemZFrameLowering(TargetFrameLowering::StackGrowsDown, Align(8), 0,
-                           Align(8), false /* StackRealignable */),
+                           Align(8), /* StackRealignable */ false),
       RegSpillOffsets(0) {
+
   // Due to the SystemZ ABI, the DWARF CFA (Canonical Frame Address) is not
   // equal to the incoming stack pointer, but to incoming stack pointer plus
   // 160.  Instead of using a Local Area Offset, the Register save area will
@@ -212,8 +219,8 @@ SystemZELFFrameLowering::SystemZELFFrameLowering()
   // Create a mapping from register number to save slot offset.
   // These offsets are relative to the start of the register save area.
   RegSpillOffsets.grow(SystemZ::NUM_TARGET_REGS);
-  for (unsigned I = 0, E = array_lengthof(SpillOffsetTable); I != E; ++I)
-    RegSpillOffsets[SpillOffsetTable[I].Reg] = SpillOffsetTable[I].Offset;
+  for (unsigned I = 0, E = array_lengthof(ELFSpillOffsetTable); I != E; ++I)
+    RegSpillOffsets[ELFSpillOffsetTable[I].Reg] = ELFSpillOffsetTable[I].Offset;
 }
 
 // Add GPR64 to the save instruction being built by MIB, which is in basic
@@ -812,7 +819,176 @@ bool SystemZELFFrameLowering::usePackedStack(MachineFunction &MF) const {
 
 SystemZXPLINKFrameLowering::SystemZXPLINKFrameLowering()
     : SystemZFrameLowering(TargetFrameLowering::StackGrowsUp, Align(32), 128,
-                           Align(32), false /* StackRealignable */) {}
+                           Align(32), /* StackRealignable */ false),
+      RegSpillOffsets(-1) {
+
+  // Create a mapping from register number to save slot offset.
+  // These offsets are relative to the start of the local area.
+  RegSpillOffsets.grow(SystemZ::NUM_TARGET_REGS);
+  for (unsigned I = 0, E = array_lengthof(XPLINKSpillOffsetTable); I != E; ++I)
+    RegSpillOffsets[XPLINKSpillOffsetTable[I].Reg] =
+        XPLINKSpillOffsetTable[I].Offset;
+}
+
+bool SystemZXPLINKFrameLowering::assignCalleeSavedSpillSlots(
+    MachineFunction &MF, const TargetRegisterInfo *TRI,
+    std::vector<CalleeSavedInfo> &CSI) const {
+  MachineFrameInfo &MFFrame = MF.getFrameInfo();
+  SystemZMachineFunctionInfo *MFI = MF.getInfo<SystemZMachineFunctionInfo>();
+  const SystemZSubtarget &Subtarget = MF.getSubtarget<SystemZSubtarget>();
+  auto &Regs = Subtarget.getSpecialRegisters<SystemZXPLINK64Registers>();
+
+  // Scan the call-saved GPRs and find the bounds of the register spill area.
+  unsigned LowGPR = 0;
+  int LowOffset = INT32_MAX;
+  unsigned HighGPR = LowGPR;
+  int HighOffset = -1;
+
+  unsigned RegSP = Regs.getStackPointerRegister();
+  auto &GRRegClass = SystemZ::GR64BitRegClass;
+  const unsigned RegSize = 8;
+
+  auto ProcessCSI = [&](std::vector<CalleeSavedInfo> &CSIList) {
+    for (auto &CS : CSIList) {
+      unsigned Reg = CS.getReg();
+      int Offset = RegSpillOffsets[Reg];
+      if (Offset >= 0) {
+        if (GRRegClass.contains(Reg)) {
+          if (LowOffset > Offset) {
+            LowOffset = Offset;
+            LowGPR = Reg;
+          }
+
+          if (Offset > HighOffset) {
+            HighOffset = Offset;
+            HighGPR = Reg;
+          }
+        }
+        int FrameIdx = MFFrame.CreateFixedSpillStackObject(RegSize, Offset);
+        CS.setFrameIdx(FrameIdx);
+      } else
+        CS.setFrameIdx(INT32_MAX);
+    }
+  };
+
+  std::vector<CalleeSavedInfo> Spills;
+
+  // For non-leaf functions:
+  // - the address of callee (entry point) register R6 must be saved
+  Spills.push_back(CalleeSavedInfo(Regs.getAddressOfCalleeRegister()));
+
+  // If the function needs a frame pointer, or if the backchain pointer should
+  // be stored, then save the stack pointer register R4.
+  if (hasFP(MF) || MF.getFunction().hasFnAttribute("backchain"))
+    Spills.push_back(CalleeSavedInfo(RegSP));
+
+  // Record the range of call-saved registers to be restored, for use by
+  // the epilogue inserter.
+  ProcessCSI(CSI);
+  MFI->setRestoreGPRRegs(LowGPR, HighGPR, LowOffset);
+
+  // Record the full range to be spilled, including the extra registers
+  // added above, for use by the prologue inserter.
+  ProcessCSI(Spills);
+  MFI->setSpillGPRRegs(LowGPR, HighGPR, LowOffset);
+
+  // Create spill slots for the remaining registers.
+  for (auto &CS : CSI) {
+    if (CS.getFrameIdx() != INT32_MAX)
+      continue;
+    unsigned Reg = CS.getReg();
+    const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
+    Align Alignment = TRI->getSpillAlign(*RC);
+    unsigned Size = TRI->getSpillSize(*RC);
+    Alignment = std::min(Alignment, getStackAlign());
+    int FrameIdx = MFFrame.CreateStackObject(Size, Alignment, true);
+    CS.setFrameIdx(FrameIdx);
+  }
+
+  return true;
+}
+
+void SystemZXPLINKFrameLowering::determineCalleeSaves(MachineFunction &MF,
+                                                      BitVector &SavedRegs,
+                                                      RegScavenger *RS) const {
+  TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
+
+  bool HasFP = hasFP(MF);
+  const SystemZSubtarget &Subtarget = MF.getSubtarget<SystemZSubtarget>();
+  auto &Regs = Subtarget.getSpecialRegisters<SystemZXPLINK64Registers>();
+
+  // If the function requires a frame pointer, record that the hard
+  // frame pointer will be clobbered.
+  if (HasFP)
+    SavedRegs.set(Regs.getFramePointerRegister());
+
+  // If the function is not an XPLeaf function, we need to save the
+  // return address register. We also always use that register for
+  // the return instruction, so it needs to be restored in the
+  // epilogue even though that register is considered to be volatile.
+  // TODO: Implement leaf detection.
+  SavedRegs.set(Regs.getReturnFunctionAddressRegister());
+}
+
+bool SystemZXPLINKFrameLowering::spillCalleeSavedRegisters(
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+    ArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const {
+  if (CSI.empty())
+    return true;
+
+  MachineFunction &MF = *MBB.getParent();
+  SystemZMachineFunctionInfo *ZFI = MF.getInfo<SystemZMachineFunctionInfo>();
+  const SystemZSubtarget &Subtarget = MF.getSubtarget<SystemZSubtarget>();
+  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+  auto &Regs = Subtarget.getSpecialRegisters<SystemZXPLINK64Registers>();
+  SystemZ::GPRRegs SpillGPRs = ZFI->getSpillGPRRegs();
+  DebugLoc DL;
+
+  // Save GPRs
+  if (SpillGPRs.LowGPR) {
+    assert(SpillGPRs.LowGPR != SpillGPRs.HighGPR &&
+           "Should be saving multiple registers");
+
+    // Build an STMG instruction.
+    MachineInstrBuilder MIB = BuildMI(MBB, MBBI, DL, TII->get(SystemZ::STMG));
+
+    // Add the explicit register operands.
+    addSavedGPR(MBB, MIB, SpillGPRs.LowGPR, false);
+    addSavedGPR(MBB, MIB, SpillGPRs.HighGPR, false);
+
+    // Add the base address register (r4).
+    MIB.addReg(Regs.getStackPointerRegister());
+
+    // Add the partial offset; the actual offset cannot be added yet,
+    // as the stack frame is not finalized at this point.
+    MIB.addImm(SpillGPRs.GPROffset);
+
+    // Make sure all call-saved GPRs are included as operands and are
+    // marked as live on entry.
+    auto &GRRegClass = SystemZ::GR64BitRegClass;
+    for (unsigned I = 0, E = CSI.size(); I != E; ++I) {
+      unsigned Reg = CSI[I].getReg();
+      if (GRRegClass.contains(Reg))
+        addSavedGPR(MBB, MIB, Reg, true);
+    }
+  }
+
+  // Spill FPRs to the stack in the normal TargetInstrInfo way
+  for (unsigned I = 0, E = CSI.size(); I != E; ++I) {
+    unsigned Reg = CSI[I].getReg();
+    if (SystemZ::FP64BitRegClass.contains(Reg)) {
+      MBB.addLiveIn(Reg);
+      TII->storeRegToStackSlot(MBB, MBBI, Reg, true, CSI[I].getFrameIdx(),
+                               &SystemZ::FP64BitRegClass, TRI);
+    }
+    if (SystemZ::VR128BitRegClass.contains(Reg)) {
+      MBB.addLiveIn(Reg);
+      TII->storeRegToStackSlot(MBB, MBBI, Reg, true, CSI[I].getFrameIdx(),
+                               &SystemZ::VR128BitRegClass, TRI);
+    }
+  }
+
+  return true;
+}
 
 void SystemZXPLINKFrameLowering::emitPrologue(MachineFunction &MF,
                                               MachineBasicBlock &MBB) const {}
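
The heart of assignCalleeSavedSpillSlots above is the scan that finds the
lowest and highest GPR save-slot offsets so the prologue can cover the whole
block with a single STMG. Here is a standalone sketch of that scan, with
simple illustrative types standing in for the real CalleeSavedInfo machinery:

  #include <climits>
  #include <map>

  struct GPRRange { unsigned Low = 0, High = 0; int LowOffset = INT_MAX; };

  // Given a register -> save-slot-offset map, find the bounding registers.
  GPRRange findSpillRange(const std::map<unsigned, int> &Offsets) {
    GPRRange R;
    int HighOffset = -1;
    for (const auto &[Reg, Offset] : Offsets) {
      if (Offset < R.LowOffset) { R.LowOffset = Offset; R.Low = Reg; }
      if (Offset > HighOffset)  { HighOffset = Offset;  R.High = Reg; }
    }
    return R;
  }

With the XPLINK spill offset table above, saving R7D through R15D yields
Low = R7D (offset 0x18) and High = R15D (offset 0x58), exactly the block a
single STMG stores.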

diff --git a/llvm/lib/Target/SystemZ/SystemZFrameLowering.h b/llvm/lib/Target/SystemZ/SystemZFrameLowering.h
index 9bef45fdc120d..6fddb4f81c416 100644
--- a/llvm/lib/Target/SystemZ/SystemZFrameLowering.h
+++ b/llvm/lib/Target/SystemZ/SystemZFrameLowering.h
@@ -10,6 +10,8 @@
 #define LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZFRAMELOWERING_H
 
 #include "MCTargetDesc/SystemZMCTargetDesc.h"
+#include "SystemZInstrBuilder.h"
+#include "SystemZMachineFunctionInfo.h"
 #include "llvm/ADT/IndexedMap.h"
 #include "llvm/CodeGen/TargetFrameLowering.h"
 #include "llvm/Support/TypeSize.h"
@@ -19,7 +21,6 @@ class SystemZTargetMachine;
 class SystemZSubtarget;
 
 class SystemZFrameLowering : public TargetFrameLowering {
-
 public:
   SystemZFrameLowering(StackDirection D, Align StackAl, int LAO, Align TransAl,
                        bool StackReal);
@@ -86,9 +87,24 @@ class SystemZELFFrameLowering : public SystemZFrameLowering {
 };
 
 class SystemZXPLINKFrameLowering : public SystemZFrameLowering {
+  IndexedMap<unsigned> RegSpillOffsets;
+
 public:
   SystemZXPLINKFrameLowering();
 
+  bool
+  assignCalleeSavedSpillSlots(MachineFunction &MF,
+                              const TargetRegisterInfo *TRI,
+                              std::vector<CalleeSavedInfo> &CSI) const override;
+
+  void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs,
+                            RegScavenger *RS) const override;
+
+  bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
+                                 MachineBasicBlock::iterator MBBI,
+                                 ArrayRef<CalleeSavedInfo> CSI,
+                                 const TargetRegisterInfo *TRI) const override;
+
   void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
 
   void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
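
These overrides are invoked by the PrologEpilogInserter pass. A rough sketch
of the per-function call order (simplified and illustrative, not the actual
PEI source):

  // TFL is the target's SystemZXPLINKFrameLowering instance.
  TFL->determineCalleeSaves(MF, SavedRegs, RS);        // decide what to save
  TFL->assignCalleeSavedSpillSlots(MF, TRI, CSI);      // decide where
  TFL->spillCalleeSavedRegisters(MBB, MBBI, CSI, TRI); // emit the saves
  TFL->emitPrologue(MF, MBB);                          // frame setup
  // ... function body ...
  TFL->emitEpilogue(MF, MBB);                          // frame teardown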

diff --git a/llvm/lib/Target/SystemZ/SystemZRegisterInfo.cpp b/llvm/lib/Target/SystemZ/SystemZRegisterInfo.cpp
index 0062e39602f56..48cec176b0069 100644
--- a/llvm/lib/Target/SystemZ/SystemZRegisterInfo.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZRegisterInfo.cpp
@@ -190,7 +190,9 @@ bool SystemZRegisterInfo::getRegAllocationHints(
 
 const MCPhysReg *
 SystemZXPLINK64Registers::getCalleeSavedRegs(const MachineFunction *MF) const {
-  return CSR_SystemZ_XPLINK64_SaveList;
+  const SystemZSubtarget &Subtarget = MF->getSubtarget<SystemZSubtarget>();
+  return Subtarget.hasVector() ? CSR_SystemZ_XPLINK64_Vector_SaveList
+                               : CSR_SystemZ_XPLINK64_SaveList;
 }
 
 const MCPhysReg *
@@ -211,7 +213,9 @@ SystemZELFRegisters::getCalleeSavedRegs(const MachineFunction *MF) const {
 const uint32_t *
 SystemZXPLINK64Registers::getCallPreservedMask(const MachineFunction &MF,
                                                CallingConv::ID CC) const {
-  return CSR_SystemZ_XPLINK64_RegMask;
+  const SystemZSubtarget &Subtarget = MF.getSubtarget<SystemZSubtarget>();
+  return Subtarget.hasVector() ? CSR_SystemZ_XPLINK64_Vector_RegMask
+                               : CSR_SystemZ_XPLINK64_RegMask;
 }
 
 const uint32_t *

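The two masks returned above differ only in whether V16-V23 are marked as
preserved. A register mask packs one bit per physical register; the helper
below is an illustrative sketch of that convention (it mirrors what
MachineOperand::clobbersPhysReg computes, but is not LLVM API):

  #include <cstdint>

  // A set bit means the register is preserved across the call.
  static bool isPreservedAcrossCall(const uint32_t *Mask, unsigned PhysReg) {
    return (Mask[PhysReg / 32] & (1u << (PhysReg % 32))) != 0;
  }
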
diff --git a/llvm/lib/Target/SystemZ/SystemZRegisterInfo.h b/llvm/lib/Target/SystemZ/SystemZRegisterInfo.h
index 122504d4b44b3..2a4253e2deafa 100644
--- a/llvm/lib/Target/SystemZ/SystemZRegisterInfo.h
+++ b/llvm/lib/Target/SystemZ/SystemZRegisterInfo.h
@@ -10,6 +10,7 @@
 #define LLVM_LIB_TARGET_SYSTEMZ_SYSTEMZREGISTERINFO_H
 
 #include "SystemZ.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
 #include "llvm/CodeGen/TargetRegisterInfo.h"
 
 #define GET_REGINFO_HEADER
@@ -44,9 +45,9 @@ inline bool isHighReg(unsigned int Reg) {
 /// It is abstract, all calling conventions must override and
 /// define the pure virtual member function defined in this class.
 class SystemZCallingConventionRegisters {
+
 public:
-  /// \returns the register that keeps the
-  /// return function address.
+  /// \returns the register that keeps the return function address.
   virtual int getReturnFunctionAddressRegister() = 0;
 
   /// \returns the register that keeps the
@@ -82,6 +83,8 @@ class SystemZXPLINK64Registers : public SystemZCallingConventionRegisters {
 
   int getFramePointerRegister() override final { return SystemZ::R8D; };
 
+  int getAddressOfCalleeRegister() { return SystemZ::R6D; };
+
   const MCPhysReg *
   getCalleeSavedRegs(const MachineFunction *MF) const override final;
 

diff --git a/llvm/lib/Target/SystemZ/SystemZSubtarget.h b/llvm/lib/Target/SystemZ/SystemZSubtarget.h
index 0839ea0a60cb5..67c5b8eb09b69 100644
--- a/llvm/lib/Target/SystemZ/SystemZSubtarget.h
+++ b/llvm/lib/Target/SystemZ/SystemZSubtarget.h
@@ -96,6 +96,10 @@ class SystemZSubtarget : public SystemZGenSubtargetInfo {
     return SpecialRegisters.get();
   }
 
+  template <class SR> SR &getSpecialRegisters() const {
+    return *static_cast<SR *>(getSpecialRegisters());
+  }
+
   const TargetFrameLowering *getFrameLowering() const override {
     return FrameLowering.get();
   }
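
The new templated accessor is a convenience downcast over the existing
getSpecialRegisters(); callers must name the concrete class matching the
current ABI, as the XPLINK frame lowering does with
getSpecialRegisters<SystemZXPLINK64Registers>(). A self-contained miniature
of the same pattern (illustrative types, not LLVM code):

  #include <memory>

  struct RegsBase { virtual ~RegsBase() = default; };
  struct XPLinkRegs : RegsBase { int stackPointer() const { return 4; } };

  class MiniSubtarget {
    std::unique_ptr<RegsBase> SpecialRegisters =
        std::make_unique<XPLinkRegs>();

  public:
    RegsBase *getSpecialRegisters() const { return SpecialRegisters.get(); }

    // Unchecked downcast: only valid if SR matches the stored object.
    template <class SR> SR &getSpecialRegisters() const {
      return *static_cast<SR *>(getSpecialRegisters());
    }
  };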

diff --git a/llvm/test/CodeGen/SystemZ/zos-prologue-epilog.ll b/llvm/test/CodeGen/SystemZ/zos-prologue-epilog.ll
new file mode 100644
index 0000000000000..4934fee8410cc
--- /dev/null
+++ b/llvm/test/CodeGen/SystemZ/zos-prologue-epilog.ll
@@ -0,0 +1,236 @@
+; Test the generated function prologs/epilogs under XPLINK64 on z/OS
+;
+; RUN: llc < %s -mtriple=s390x-ibm-zos -mcpu=z13 | FileCheck --check-prefixes=CHECK64,CHECK %s
+
+; Test prolog/epilog for non-XPLEAF.
+
+; Small stack frame.
+; CHECK-LABEL: func0
+; CHECK64: stmg  6, 7
+define void @func0() {
+  call i64 (i64) @fun(i64 10)
+  ret void
+}
+
+; Spill all GPR CSRs
+; CHECK-LABEL: func1
+; CHECK64: stmg 6, 15
+define void @func1(i64 *%ptr) {
+  %l01 = load volatile i64, i64 *%ptr
+  %l02 = load volatile i64, i64 *%ptr
+  %l03 = load volatile i64, i64 *%ptr
+  %l04 = load volatile i64, i64 *%ptr
+  %l05 = load volatile i64, i64 *%ptr
+  %l06 = load volatile i64, i64 *%ptr
+  %l07 = load volatile i64, i64 *%ptr
+  %l08 = load volatile i64, i64 *%ptr
+  %l09 = load volatile i64, i64 *%ptr
+  %l10 = load volatile i64, i64 *%ptr
+  %l11 = load volatile i64, i64 *%ptr
+  %l12 = load volatile i64, i64 *%ptr
+  %l13 = load volatile i64, i64 *%ptr
+  %l14 = load volatile i64, i64 *%ptr
+  %l15 = load volatile i64, i64 *%ptr
+  %add01 = add i64 %l01, %l01
+  %add02 = add i64 %l02, %add01
+  %add03 = add i64 %l03, %add02
+  %add04 = add i64 %l04, %add03
+  %add05 = add i64 %l05, %add04
+  %add06 = add i64 %l06, %add05
+  %add07 = add i64 %l07, %add06
+  %add08 = add i64 %l08, %add07
+  %add09 = add i64 %l09, %add08
+  %add10 = add i64 %l10, %add09
+  %add11 = add i64 %l11, %add10
+  %add12 = add i64 %l12, %add11
+  %add13 = add i64 %l13, %add12
+  %add14 = add i64 %l14, %add13
+  %add15 = add i64 %l15, %add14
+  store volatile i64 %add01, i64 *%ptr
+  store volatile i64 %add02, i64 *%ptr
+  store volatile i64 %add03, i64 *%ptr
+  store volatile i64 %add04, i64 *%ptr
+  store volatile i64 %add05, i64 *%ptr
+  store volatile i64 %add06, i64 *%ptr
+  store volatile i64 %add07, i64 *%ptr
+  store volatile i64 %add08, i64 *%ptr
+  store volatile i64 %add09, i64 *%ptr
+  store volatile i64 %add10, i64 *%ptr
+  store volatile i64 %add11, i64 *%ptr
+  store volatile i64 %add12, i64 *%ptr
+  store volatile i64 %add13, i64 *%ptr
+  store volatile i64 %add14, i64 *%ptr
+  store volatile i64 %add15, i64 *%ptr
+  ret void
+}
+
+
+; Spill all FPRs and VRs
+; CHECK-LABEL: func2
+; CHECK64: std	15, {{[0-9]+}}(4)                      * 8-byte Folded Spill
+; CHECK64: std	14, {{[0-9]+}}(4)                      * 8-byte Folded Spill
+; CHECK64: std	13, {{[0-9]+}}(4)                      * 8-byte Folded Spill
+; CHECK64: std	12, {{[0-9]+}}(4)                      * 8-byte Folded Spill
+; CHECK64: std	11, {{[0-9]+}}(4)                      * 8-byte Folded Spill
+; CHECK64: std	10, {{[0-9]+}}(4)                      * 8-byte Folded Spill
+; CHECK64: std	9, {{[0-9]+}}(4)                       * 8-byte Folded Spill
+; CHECK64: std	8, {{[0-9]+}}(4)                       * 8-byte Folded Spill
+; CHECK64: vst	23, {{[0-9]+}}(4), 4                   * 16-byte Folded Spill
+; CHECK64: vst	22, {{[0-9]+}}(4), 4                   * 16-byte Folded Spill
+; CHECK64: vst	21, {{[0-9]+}}(4), 4                   * 16-byte Folded Spill
+; CHECK64: vst	20, {{[0-9]+}}(4), 4                   * 16-byte Folded Spill
+; CHECK64: vst	19, {{[0-9]+}}(4), 4                   * 16-byte Folded Spill
+; CHECK64: vst	18, {{[0-9]+}}(4), 4                   * 16-byte Folded Spill
+; CHECK64: vst	17, {{[0-9]+}}(4), 4                   * 16-byte Folded Spill
+; CHECK64: vst	16, {{[0-9]+}}(4), 4                   * 16-byte Folded Spill
+define void @func2(double *%ptr, <2 x i64> *%vec_ptr) {
+  %l00 = load volatile double, double *%ptr
+  %l01 = load volatile double, double *%ptr
+  %l02 = load volatile double, double *%ptr
+  %l03 = load volatile double, double *%ptr
+  %l04 = load volatile double, double *%ptr
+  %l05 = load volatile double, double *%ptr
+  %l06 = load volatile double, double *%ptr
+  %l07 = load volatile double, double *%ptr
+  %l08 = load volatile double, double *%ptr
+  %l09 = load volatile double, double *%ptr
+  %l10 = load volatile double, double *%ptr
+  %l11 = load volatile double, double *%ptr
+  %l12 = load volatile double, double *%ptr
+  %l13 = load volatile double, double *%ptr
+  %l14 = load volatile double, double *%ptr
+  %l15 = load volatile double, double *%ptr
+  %add00 = fadd double %l01, %l00
+  %add01 = fadd double %l01, %add00
+  %add02 = fadd double %l02, %add01
+  %add03 = fadd double %l03, %add02
+  %add04 = fadd double %l04, %add03
+  %add05 = fadd double %l05, %add04
+  %add06 = fadd double %l06, %add05
+  %add07 = fadd double %l07, %add06
+  %add08 = fadd double %l08, %add07
+  %add09 = fadd double %l09, %add08
+  %add10 = fadd double %l10, %add09
+  %add11 = fadd double %l11, %add10
+  %add12 = fadd double %l12, %add11
+  %add13 = fadd double %l13, %add12
+  %add14 = fadd double %l14, %add13
+  %add15 = fadd double %l15, %add14
+  store volatile double %add00, double *%ptr
+  store volatile double %add01, double *%ptr
+  store volatile double %add02, double *%ptr
+  store volatile double %add03, double *%ptr
+  store volatile double %add04, double *%ptr
+  store volatile double %add05, double *%ptr
+  store volatile double %add06, double *%ptr
+  store volatile double %add07, double *%ptr
+  store volatile double %add08, double *%ptr
+  store volatile double %add09, double *%ptr
+  store volatile double %add10, double *%ptr
+  store volatile double %add11, double *%ptr
+  store volatile double %add12, double *%ptr
+  store volatile double %add13, double *%ptr
+  store volatile double %add14, double *%ptr
+  store volatile double %add15, double *%ptr
+
+  %v00 = load volatile <2 x i64>, <2 x i64> *%vec_ptr
+  %v01 = load volatile <2 x i64>, <2 x i64> *%vec_ptr
+  %v02 = load volatile <2 x i64>, <2 x i64> *%vec_ptr
+  %v03 = load volatile <2 x i64>, <2 x i64> *%vec_ptr
+  %v04 = load volatile <2 x i64>, <2 x i64> *%vec_ptr
+  %v05 = load volatile <2 x i64>, <2 x i64> *%vec_ptr
+  %v06 = load volatile <2 x i64>, <2 x i64> *%vec_ptr
+  %v07 = load volatile <2 x i64>, <2 x i64> *%vec_ptr
+  %v08 = load volatile <2 x i64>, <2 x i64> *%vec_ptr
+  %v09 = load volatile <2 x i64>, <2 x i64> *%vec_ptr
+  %v10 = load volatile <2 x i64>, <2 x i64> *%vec_ptr
+  %v11 = load volatile <2 x i64>, <2 x i64> *%vec_ptr
+  %v12 = load volatile <2 x i64>, <2 x i64> *%vec_ptr
+  %v13 = load volatile <2 x i64>, <2 x i64> *%vec_ptr
+  %v14 = load volatile <2 x i64>, <2 x i64> *%vec_ptr
+  %v15 = load volatile <2 x i64>, <2 x i64> *%vec_ptr
+  %v16 = load volatile <2 x i64>, <2 x i64> *%vec_ptr
+  %v17 = load volatile <2 x i64>, <2 x i64> *%vec_ptr
+  %v18 = load volatile <2 x i64>, <2 x i64> *%vec_ptr
+  %v19 = load volatile <2 x i64>, <2 x i64> *%vec_ptr
+  %v20 = load volatile <2 x i64>, <2 x i64> *%vec_ptr
+  %v21 = load volatile <2 x i64>, <2 x i64> *%vec_ptr
+  %v22 = load volatile <2 x i64>, <2 x i64> *%vec_ptr
+  %v23 = load volatile <2 x i64>, <2 x i64> *%vec_ptr
+  %v24 = load volatile <2 x i64>, <2 x i64> *%vec_ptr
+  %v25 = load volatile <2 x i64>, <2 x i64> *%vec_ptr
+  %v26 = load volatile <2 x i64>, <2 x i64> *%vec_ptr
+  %v27 = load volatile <2 x i64>, <2 x i64> *%vec_ptr
+  %v28 = load volatile <2 x i64>, <2 x i64> *%vec_ptr
+  %v29 = load volatile <2 x i64>, <2 x i64> *%vec_ptr
+  %v30 = load volatile <2 x i64>, <2 x i64> *%vec_ptr
+  %v31 = load volatile <2 x i64>, <2 x i64> *%vec_ptr
+  %vadd00 = add <2 x i64> %v00, %v00
+  %vadd01 = add <2 x i64> %v01, %vadd00
+  %vadd02 = add <2 x i64> %v02, %vadd01
+  %vadd03 = add <2 x i64> %v03, %vadd02
+  %vadd04 = add <2 x i64> %v04, %vadd03
+  %vadd05 = add <2 x i64> %v05, %vadd04
+  %vadd06 = add <2 x i64> %v06, %vadd05
+  %vadd07 = add <2 x i64> %v07, %vadd06
+  %vadd08 = add <2 x i64> %v08, %vadd07
+  %vadd09 = add <2 x i64> %v09, %vadd08
+  %vadd10 = add <2 x i64> %v10, %vadd09
+  %vadd11 = add <2 x i64> %v11, %vadd10
+  %vadd12 = add <2 x i64> %v12, %vadd11
+  %vadd13 = add <2 x i64> %v13, %vadd12
+  %vadd14 = add <2 x i64> %v14, %vadd13
+  %vadd15 = add <2 x i64> %v15, %vadd14
+  %vadd16 = add <2 x i64> %v16, %vadd15
+  %vadd17 = add <2 x i64> %v17, %vadd16
+  %vadd18 = add <2 x i64> %v18, %vadd17
+  %vadd19 = add <2 x i64> %v19, %vadd18
+  %vadd20 = add <2 x i64> %v20, %vadd19
+  %vadd21 = add <2 x i64> %v21, %vadd20
+  %vadd22 = add <2 x i64> %v22, %vadd21
+  %vadd23 = add <2 x i64> %v23, %vadd22
+  %vadd24 = add <2 x i64> %v24, %vadd23
+  %vadd25 = add <2 x i64> %v25, %vadd24
+  %vadd26 = add <2 x i64> %v26, %vadd25
+  %vadd27 = add <2 x i64> %v27, %vadd26
+  %vadd28 = add <2 x i64> %v28, %vadd27
+  %vadd29 = add <2 x i64> %v29, %vadd28
+  %vadd30 = add <2 x i64> %v30, %vadd29
+  %vadd31 = add <2 x i64> %v31, %vadd30
+  store volatile <2 x i64> %vadd00, <2 x i64> *%vec_ptr
+  store volatile <2 x i64> %vadd01, <2 x i64> *%vec_ptr
+  store volatile <2 x i64> %vadd02, <2 x i64> *%vec_ptr
+  store volatile <2 x i64> %vadd03, <2 x i64> *%vec_ptr
+  store volatile <2 x i64> %vadd04, <2 x i64> *%vec_ptr
+  store volatile <2 x i64> %vadd05, <2 x i64> *%vec_ptr
+  store volatile <2 x i64> %vadd06, <2 x i64> *%vec_ptr
+  store volatile <2 x i64> %vadd07, <2 x i64> *%vec_ptr
+  store volatile <2 x i64> %vadd08, <2 x i64> *%vec_ptr
+  store volatile <2 x i64> %vadd09, <2 x i64> *%vec_ptr
+  store volatile <2 x i64> %vadd10, <2 x i64> *%vec_ptr
+  store volatile <2 x i64> %vadd11, <2 x i64> *%vec_ptr
+  store volatile <2 x i64> %vadd12, <2 x i64> *%vec_ptr
+  store volatile <2 x i64> %vadd13, <2 x i64> *%vec_ptr
+  store volatile <2 x i64> %vadd14, <2 x i64> *%vec_ptr
+  store volatile <2 x i64> %vadd15, <2 x i64> *%vec_ptr
+  store volatile <2 x i64> %vadd16, <2 x i64> *%vec_ptr
+  store volatile <2 x i64> %vadd17, <2 x i64> *%vec_ptr
+  store volatile <2 x i64> %vadd18, <2 x i64> *%vec_ptr
+  store volatile <2 x i64> %vadd19, <2 x i64> *%vec_ptr
+  store volatile <2 x i64> %vadd20, <2 x i64> *%vec_ptr
+  store volatile <2 x i64> %vadd21, <2 x i64> *%vec_ptr
+  store volatile <2 x i64> %vadd22, <2 x i64> *%vec_ptr
+  store volatile <2 x i64> %vadd23, <2 x i64> *%vec_ptr
+  store volatile <2 x i64> %vadd24, <2 x i64> *%vec_ptr
+  store volatile <2 x i64> %vadd25, <2 x i64> *%vec_ptr
+  store volatile <2 x i64> %vadd26, <2 x i64> *%vec_ptr
+  store volatile <2 x i64> %vadd27, <2 x i64> *%vec_ptr
+  store volatile <2 x i64> %vadd28, <2 x i64> *%vec_ptr
+  store volatile <2 x i64> %vadd29, <2 x i64> *%vec_ptr
+  store volatile <2 x i64> %vadd30, <2 x i64> *%vec_ptr
+  store volatile <2 x i64> %vadd31, <2 x i64> *%vec_ptr
+  ret void
+}
+
+declare i64 @fun(i64 %arg0)
+