[llvm-branch-commits] [llvm] [AMDGPU] Implement -amdgpu-spill-cfi-saved-regs (PR #164725)

Scott Linder via llvm-branch-commits llvm-branch-commits at lists.llvm.org
Mon Nov 24 10:18:30 PST 2025


https://github.com/slinder1 updated https://github.com/llvm/llvm-project/pull/164725

>From 9875b295293783fd6bb216ee71515fa9246a6d0a Mon Sep 17 00:00:00 2001
From: Emma Pilkington <Emma.Pilkington at amd.com>
Date: Thu, 19 Jun 2025 11:01:51 -0400
Subject: [PATCH] [AMDGPU] Implement -amdgpu-spill-cfi-saved-regs

These spills need special CFI anyway, so implementing them directly
where CFI is emitted avoids the need to invent a mechanism to track them
from ISel.

Co-authored-by: Scott Linder <scott.linder at amd.com>
Co-authored-by: Venkata Ramanaiah Nalamothu <VenkataRamanaiah.Nalamothu at amd.com>
---
 llvm/lib/Target/AMDGPU/SIFrameLowering.cpp    |   45 +-
 llvm/lib/Target/AMDGPU/SIFrameLowering.h      |    7 +
 llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp  |    2 +-
 .../Target/AMDGPU/SIMachineFunctionInfo.cpp   |    3 +-
 .../lib/Target/AMDGPU/SIMachineFunctionInfo.h |   13 +-
 llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp     |    9 +
 llvm/lib/Target/AMDGPU/SIRegisterInfo.h       |    2 +
 .../AMDGPU/amdgpu-spill-cfi-saved-regs.ll     | 2556 +++++++++++++++++
 8 files changed, 2623 insertions(+), 14 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/amdgpu-spill-cfi-saved-regs.ll

diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
index eb8fd0ac34b24..04b5f483953fa 100644
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -663,12 +663,21 @@ void SIFrameLowering::emitEntryFunctionFlatScratchInit(
 }
 
 // Note SGPRSpill stack IDs should only be used for SGPR spilling to VGPRs, not
-// memory. They should have been removed by now.
-static bool allStackObjectsAreDead(const MachineFrameInfo &MFI) {
+// memory. They should have been removed by now, except CFI Saved Reg spills.
+static bool allStackObjectsAreDead(const MachineFunction &MF) {
+  const MachineFrameInfo &MFI = MF.getFrameInfo();
+  const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
   for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd();
        I != E; ++I) {
-    if (!MFI.isDeadObjectIndex(I))
+    if (!MFI.isDeadObjectIndex(I)) {
+      // determineCalleeSaves() might have added SGPRSpill stack IDs for CFI
+      // saves into a scratch VGPR; ignore them.
+      if (MFI.getStackID(I) == TargetStackID::SGPRSpill &&
+          FuncInfo->checkIndexInPrologEpilogSGPRSpills(I)) {
+        continue;
+      }
       return false;
+    }
   }
 
   return true;
@@ -688,8 +697,8 @@ Register SIFrameLowering::getEntryFunctionReservedScratchRsrcReg(
 
   Register ScratchRsrcReg = MFI->getScratchRSrcReg();
 
-  if (!ScratchRsrcReg || (!MRI.isPhysRegUsed(ScratchRsrcReg) &&
-                          allStackObjectsAreDead(MF.getFrameInfo())))
+  if (!ScratchRsrcReg ||
+      (!MRI.isPhysRegUsed(ScratchRsrcReg) && allStackObjectsAreDead(MF)))
     return Register();
 
   if (ST.hasSGPRInitBug() ||
@@ -916,7 +925,7 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
   bool NeedsFlatScratchInit =
       MFI->getUserSGPRInfo().hasFlatScratchInit() &&
       (MRI.isPhysRegUsed(AMDGPU::FLAT_SCR) || FrameInfo.hasCalls() ||
-       (!allStackObjectsAreDead(FrameInfo) && ST.enableFlatScratch()));
+       (!allStackObjectsAreDead(MF) && ST.enableFlatScratch()));
 
   if ((NeedsFlatScratchInit || ScratchRsrcReg) &&
       PreloadedScratchWaveOffsetReg && !ST.flatScratchIsArchitected()) {
@@ -1309,6 +1318,11 @@ void SIFrameLowering::emitCSRSpillStores(MachineFunction &MF,
         LiveUnits.addReg(Reg);
     }
   }
+
+  // Remove the spill entry created for EXEC. It is needed only for CFISaves in
+  // the prologue.
+  if (TRI.isCFISavedRegsSpillEnabled())
+    FuncInfo->removePrologEpilogSGPRSpillEntry(TRI.getExec());
 }
 
 void SIFrameLowering::emitCSRSpillRestores(
@@ -1790,14 +1804,14 @@ void SIFrameLowering::processFunctionBeforeFrameFinalized(
   // can. Any remaining SGPR spills will go to memory, so move them back to the
   // default stack.
   bool HaveSGPRToVMemSpill =
-      FuncInfo->removeDeadFrameIndices(MFI, /*ResetSGPRSpillStackIDs*/ true);
+      FuncInfo->removeDeadFrameIndices(MF, /*ResetSGPRSpillStackIDs*/ true);
   assert(allSGPRSpillsAreDead(MF) &&
          "SGPR spill should have been removed in SILowerSGPRSpills");
 
   // FIXME: The other checks should be redundant with allStackObjectsAreDead,
   // but currently hasNonSpillStackObjects is set only from source
   // allocas. Stack temps produced from legalization are not counted currently.
-  if (!allStackObjectsAreDead(MFI)) {
+  if (!allStackObjectsAreDead(MF)) {
     assert(RS && "RegScavenger required if spilling");
 
     // Add an emergency spill slot
@@ -1897,6 +1911,18 @@ void SIFrameLowering::determinePrologEpilogSGPRSaves(
     MFI->setSGPRForEXECCopy(AMDGPU::NoRegister);
   }
 
+  if (TRI->isCFISavedRegsSpillEnabled()) {
+    Register Exec = TRI->getExec();
+    assert(!MFI->hasPrologEpilogSGPRSpillEntry(Exec) &&
+           "Re-reserving spill slot for EXEC");
+    // FIXME: Machine Copy Propagation currently optimizes away the EXEC copy to
+    // the scratch SGPR as we emit it only in the prolog. This optimization
+    // should not happen for frame-related instructions. Until this is fixed,
+    // skip the copy to a scratch SGPR.
+    getVGPRSpillLaneOrTempRegister(MF, LiveUnits, Exec, RC,
+                                   /*IncludeScratchCopy=*/false);
+  }
+
   // hasFP only knows about stack objects that already exist. We're now
   // determining the stack slots that will be created, so we have to predict
   // them. Stack objects force FP usage with calls.
@@ -1906,8 +1932,7 @@ void SIFrameLowering::determinePrologEpilogSGPRSaves(
   //
   // FIXME: Is this really hasReservedCallFrame?
   const bool WillHaveFP =
-      FrameInfo.hasCalls() &&
-      (SavedVGPRs.any() || !allStackObjectsAreDead(FrameInfo));
+      FrameInfo.hasCalls() && (SavedVGPRs.any() || !allStackObjectsAreDead(MF));
 
   if (WillHaveFP || hasFP(MF)) {
     Register FramePtrReg = MFI->getFrameOffsetReg();
diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.h b/llvm/lib/Target/AMDGPU/SIFrameLowering.h
index 2b716db0b7a22..526404eb83b4f 100644
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.h
@@ -114,6 +114,13 @@ class SIFrameLowering final : public AMDGPUFrameLowering {
 public:
   bool requiresStackPointerReference(const MachineFunction &MF) const;
 
+  /// If '-amdgpu-spill-cfi-saved-regs' is enabled, emit RA/EXEC spills to
+  /// a free VGPR (lanes) or memory and corresponding CFI rules.
+  void emitCFISavedRegSpills(MachineFunction &MF, MachineBasicBlock &MBB,
+                             MachineBasicBlock::iterator MBBI,
+                             LiveRegUnits &LiveRegs,
+                             bool emitSpillsToMem) const;
+
   /// Create a CFI index for CFIInst and build a MachineInstr around it.
   MachineInstr *
   buildCFI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
diff --git a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
index 62386da94d854..57ff52334a470 100644
--- a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
@@ -531,7 +531,7 @@ bool SILowerSGPRSpills::run(MachineFunction &MF) {
     // free frame index ids by the later pass(es) like "stack slot coloring"
     // which in turn could mess-up with the book keeping of "frame index to VGPR
     // lane".
-    FuncInfo->removeDeadFrameIndices(MFI, /*ResetSGPRSpillStackIDs*/ false);
+    FuncInfo->removeDeadFrameIndices(MF, /*ResetSGPRSpillStackIDs*/ false);
 
     MadeChange = true;
   }
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index 9abda275d7e42..770398f428417 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -566,7 +566,8 @@ bool SIMachineFunctionInfo::allocateVGPRSpillToAGPR(MachineFunction &MF,
 }
 
 bool SIMachineFunctionInfo::removeDeadFrameIndices(
-    MachineFrameInfo &MFI, bool ResetSGPRSpillStackIDs) {
+    MachineFunction &MF, bool ResetSGPRSpillStackIDs) {
+  MachineFrameInfo &MFI = MF.getFrameInfo();
   // Remove dead frame indices from function frame, however keep FP & BP since
   // spills for them haven't been inserted yet. And also make sure to remove the
   // frame indices from `SGPRSpillsToVirtualVGPRLanes` data structure,
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index d901f4c216551..e3df712fc07c1 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -757,6 +757,16 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
                    }) != PrologEpilogSGPRSpills.end();
   }
 
+  // Remove the entry created for \p Reg, if one exists.
+  void removePrologEpilogSGPRSpillEntry(Register Reg) {
+    auto I = find_if(PrologEpilogSGPRSpills,
+                     [&Reg](const auto &Spill) { return Spill.first == Reg; });
+    if (I == PrologEpilogSGPRSpills.end())
+      return;
+
+    PrologEpilogSGPRSpills.erase(I);
+  }
+
   const PrologEpilogSGPRSaveRestoreInfo &
   getPrologEpilogSGPRSaveRestoreInfo(Register Reg) const {
     const auto *I = find_if(PrologEpilogSGPRSpills, [&Reg](const auto &Spill) {
@@ -835,8 +845,7 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
 
   /// If \p ResetSGPRSpillStackIDs is true, reset the stack ID from sgpr-spill
   /// to the default stack.
-  bool removeDeadFrameIndices(MachineFrameInfo &MFI,
-                              bool ResetSGPRSpillStackIDs);
+  bool removeDeadFrameIndices(MachineFunction &MF, bool ResetSGPRSpillStackIDs);
 
   int getScavengeFI(MachineFrameInfo &MFI, const SIRegisterInfo &TRI);
   std::optional<int> getOptionalScavengeFI() const { return ScavengeFI; }
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index 1e3062e974408..8e1f5a8657752 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -35,6 +35,11 @@ static cl::opt<bool> EnableSpillSGPRToVGPR(
   cl::ReallyHidden,
   cl::init(true));
 
+static cl::opt<bool> EnableSpillCFISavedRegs(
+    "amdgpu-spill-cfi-saved-regs",
+    cl::desc("Enable spilling the registers required for CFI emission"),
+    cl::ReallyHidden, cl::init(false), cl::ZeroOrMore);
+
 std::array<std::vector<int16_t>, 32> SIRegisterInfo::RegSplitParts;
 std::array<std::array<uint16_t, 32>, 9> SIRegisterInfo::SubRegFromChannelTable;
 
@@ -561,6 +566,10 @@ unsigned SIRegisterInfo::getSubRegFromChannel(unsigned Channel,
   return SubRegFromChannelTable[NumRegIndex - 1][Channel];
 }
 
+bool SIRegisterInfo::isCFISavedRegsSpillEnabled() const {
+  return EnableSpillCFISavedRegs;
+}
+
 MCRegister
 SIRegisterInfo::getAlignedHighSGPRForRC(const MachineFunction &MF,
                                         const unsigned Align,
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
index b45b08cce78ad..303eb801550b6 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -80,6 +80,8 @@ class SIRegisterInfo final : public AMDGPUGenRegisterInfo {
     return SpillSGPRToVGPR;
   }
 
+  bool isCFISavedRegsSpillEnabled() const;
+
   /// Return the largest available SGPR aligned to \p Align for the register
   /// class \p RC.
   MCRegister getAlignedHighSGPRForRC(const MachineFunction &MF,
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-spill-cfi-saved-regs.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-spill-cfi-saved-regs.ll
new file mode 100644
index 0000000000000..c804c75ae7d2c
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-spill-cfi-saved-regs.ll
@@ -0,0 +1,2556 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-spill-cfi-saved-regs -verify-machineinstrs -o - %s | FileCheck --check-prefixes=CHECK,WAVE64 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-spill-cfi-saved-regs -verify-machineinstrs -o - %s | FileCheck --check-prefixes=CHECK,WAVE32 %s
+
+define protected amdgpu_kernel void @kern() #0 {
+; CHECK-LABEL: kern:
+; CHECK:       .Lfunc_begin0:
+; CHECK-NEXT:    .cfi_sections .debug_frame
+; CHECK-NEXT:    .cfi_startproc
+; CHECK-NEXT:  ; %bb.0: ; %entry
+; CHECK-NEXT:    .cfi_escape 0x0f, 0x04, 0x30, 0x36, 0xe9, 0x02 ;
+; CHECK-NEXT:    .cfi_undefined 16
+; CHECK-NEXT:    s_endpgm
+entry:
+  ret void
+}
+
+define hidden void @func_saved_in_clobbered_vgpr() #0 {
+; WAVE64-LABEL: func_saved_in_clobbered_vgpr:
+; WAVE64:       .Lfunc_begin1:
+; WAVE64-NEXT:    .cfi_startproc
+; WAVE64-NEXT:  ; %bb.0: ; %entry
+; WAVE64-NEXT:    .cfi_llvm_def_aspace_cfa 64, 0, 6
+; WAVE64-NEXT:    .cfi_llvm_register_pair 16, 62, 32, 63, 32
+; WAVE64-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; WAVE64-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; WAVE64-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_offset 2560, 0
+; WAVE64-NEXT:    s_mov_b64 exec, s[4:5]
+; WAVE64-NEXT:    v_writelane_b32 v0, exec_lo, 0
+; WAVE64-NEXT:    v_writelane_b32 v0, exec_hi, 1
+; WAVE64-NEXT:    .cfi_llvm_vector_registers 17, 2560, 0, 32, 2560, 1, 32
+; WAVE64-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; WAVE64-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
+; WAVE64-NEXT:    s_mov_b64 exec, s[4:5]
+; WAVE64-NEXT:    s_waitcnt vmcnt(0)
+; WAVE64-NEXT:    s_setpc_b64 s[30:31]
+;
+; WAVE32-LABEL: func_saved_in_clobbered_vgpr:
+; WAVE32:       .Lfunc_begin1:
+; WAVE32-NEXT:    .cfi_startproc
+; WAVE32-NEXT:  ; %bb.0: ; %entry
+; WAVE32-NEXT:    .cfi_llvm_def_aspace_cfa 64, 0, 6
+; WAVE32-NEXT:    .cfi_llvm_register_pair 16, 62, 32, 63, 32
+; WAVE32-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; WAVE32-NEXT:    s_xor_saveexec_b32 s4, -1
+; WAVE32-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_offset 1536, 0
+; WAVE32-NEXT:    s_waitcnt_depctr 0xffe3
+; WAVE32-NEXT:    s_mov_b32 exec_lo, s4
+; WAVE32-NEXT:    v_writelane_b32 v0, exec_lo, 0
+; WAVE32-NEXT:    .cfi_llvm_vector_registers 1, 1536, 0, 32
+; WAVE32-NEXT:    s_xor_saveexec_b32 s4, -1
+; WAVE32-NEXT:    buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
+; WAVE32-NEXT:    s_waitcnt_depctr 0xffe3
+; WAVE32-NEXT:    s_mov_b32 exec_lo, s4
+; WAVE32-NEXT:    s_waitcnt vmcnt(0)
+; WAVE32-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  ret void
+}
+
+; Check that the option causes a CSR VGPR to spill when needed.
+define hidden void @func_saved_in_preserved_vgpr() #0 {
+; WAVE64-LABEL: func_saved_in_preserved_vgpr:
+; WAVE64:       .Lfunc_begin2:
+; WAVE64-NEXT:    .cfi_startproc
+; WAVE64-NEXT:  ; %bb.0: ; %entry
+; WAVE64-NEXT:    .cfi_llvm_def_aspace_cfa 64, 0, 6
+; WAVE64-NEXT:    .cfi_llvm_register_pair 16, 62, 32, 63, 32
+; WAVE64-NEXT:    .cfi_undefined 2560
+; WAVE64-NEXT:    .cfi_undefined 2561
+; WAVE64-NEXT:    .cfi_undefined 2562
+; WAVE64-NEXT:    .cfi_undefined 2563
+; WAVE64-NEXT:    .cfi_undefined 2564
+; WAVE64-NEXT:    .cfi_undefined 2565
+; WAVE64-NEXT:    .cfi_undefined 2566
+; WAVE64-NEXT:    .cfi_undefined 2567
+; WAVE64-NEXT:    .cfi_undefined 2568
+; WAVE64-NEXT:    .cfi_undefined 2569
+; WAVE64-NEXT:    .cfi_undefined 2570
+; WAVE64-NEXT:    .cfi_undefined 2571
+; WAVE64-NEXT:    .cfi_undefined 2572
+; WAVE64-NEXT:    .cfi_undefined 2573
+; WAVE64-NEXT:    .cfi_undefined 2574
+; WAVE64-NEXT:    .cfi_undefined 2575
+; WAVE64-NEXT:    .cfi_undefined 2576
+; WAVE64-NEXT:    .cfi_undefined 2577
+; WAVE64-NEXT:    .cfi_undefined 2578
+; WAVE64-NEXT:    .cfi_undefined 2579
+; WAVE64-NEXT:    .cfi_undefined 2580
+; WAVE64-NEXT:    .cfi_undefined 2581
+; WAVE64-NEXT:    .cfi_undefined 2582
+; WAVE64-NEXT:    .cfi_undefined 2583
+; WAVE64-NEXT:    .cfi_undefined 2584
+; WAVE64-NEXT:    .cfi_undefined 2585
+; WAVE64-NEXT:    .cfi_undefined 2586
+; WAVE64-NEXT:    .cfi_undefined 2587
+; WAVE64-NEXT:    .cfi_undefined 2588
+; WAVE64-NEXT:    .cfi_undefined 2589
+; WAVE64-NEXT:    .cfi_undefined 2590
+; WAVE64-NEXT:    .cfi_undefined 2591
+; WAVE64-NEXT:    .cfi_undefined 2592
+; WAVE64-NEXT:    .cfi_undefined 2593
+; WAVE64-NEXT:    .cfi_undefined 2594
+; WAVE64-NEXT:    .cfi_undefined 2595
+; WAVE64-NEXT:    .cfi_undefined 2596
+; WAVE64-NEXT:    .cfi_undefined 2597
+; WAVE64-NEXT:    .cfi_undefined 2598
+; WAVE64-NEXT:    .cfi_undefined 2599
+; WAVE64-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; WAVE64-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; WAVE64-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_offset 2600, 0
+; WAVE64-NEXT:    s_mov_b64 exec, s[4:5]
+; WAVE64-NEXT:    v_writelane_b32 v40, exec_lo, 0
+; WAVE64-NEXT:    v_writelane_b32 v40, exec_hi, 1
+; WAVE64-NEXT:    .cfi_llvm_vector_registers 17, 2600, 0, 32, 2600, 1, 32
+; WAVE64-NEXT:    ;;#ASMSTART
+; WAVE64-NEXT:    ; clobber nonpreserved VGPRs
+; WAVE64-NEXT:    ;;#ASMEND
+; WAVE64-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; WAVE64-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; WAVE64-NEXT:    s_mov_b64 exec, s[4:5]
+; WAVE64-NEXT:    s_waitcnt vmcnt(0)
+; WAVE64-NEXT:    s_setpc_b64 s[30:31]
+;
+; WAVE32-LABEL: func_saved_in_preserved_vgpr:
+; WAVE32:       .Lfunc_begin2:
+; WAVE32-NEXT:    .cfi_startproc
+; WAVE32-NEXT:  ; %bb.0: ; %entry
+; WAVE32-NEXT:    .cfi_llvm_def_aspace_cfa 64, 0, 6
+; WAVE32-NEXT:    .cfi_llvm_register_pair 16, 62, 32, 63, 32
+; WAVE32-NEXT:    .cfi_undefined 1536
+; WAVE32-NEXT:    .cfi_undefined 1537
+; WAVE32-NEXT:    .cfi_undefined 1538
+; WAVE32-NEXT:    .cfi_undefined 1539
+; WAVE32-NEXT:    .cfi_undefined 1540
+; WAVE32-NEXT:    .cfi_undefined 1541
+; WAVE32-NEXT:    .cfi_undefined 1542
+; WAVE32-NEXT:    .cfi_undefined 1543
+; WAVE32-NEXT:    .cfi_undefined 1544
+; WAVE32-NEXT:    .cfi_undefined 1545
+; WAVE32-NEXT:    .cfi_undefined 1546
+; WAVE32-NEXT:    .cfi_undefined 1547
+; WAVE32-NEXT:    .cfi_undefined 1548
+; WAVE32-NEXT:    .cfi_undefined 1549
+; WAVE32-NEXT:    .cfi_undefined 1550
+; WAVE32-NEXT:    .cfi_undefined 1551
+; WAVE32-NEXT:    .cfi_undefined 1552
+; WAVE32-NEXT:    .cfi_undefined 1553
+; WAVE32-NEXT:    .cfi_undefined 1554
+; WAVE32-NEXT:    .cfi_undefined 1555
+; WAVE32-NEXT:    .cfi_undefined 1556
+; WAVE32-NEXT:    .cfi_undefined 1557
+; WAVE32-NEXT:    .cfi_undefined 1558
+; WAVE32-NEXT:    .cfi_undefined 1559
+; WAVE32-NEXT:    .cfi_undefined 1560
+; WAVE32-NEXT:    .cfi_undefined 1561
+; WAVE32-NEXT:    .cfi_undefined 1562
+; WAVE32-NEXT:    .cfi_undefined 1563
+; WAVE32-NEXT:    .cfi_undefined 1564
+; WAVE32-NEXT:    .cfi_undefined 1565
+; WAVE32-NEXT:    .cfi_undefined 1566
+; WAVE32-NEXT:    .cfi_undefined 1567
+; WAVE32-NEXT:    .cfi_undefined 1568
+; WAVE32-NEXT:    .cfi_undefined 1569
+; WAVE32-NEXT:    .cfi_undefined 1570
+; WAVE32-NEXT:    .cfi_undefined 1571
+; WAVE32-NEXT:    .cfi_undefined 1572
+; WAVE32-NEXT:    .cfi_undefined 1573
+; WAVE32-NEXT:    .cfi_undefined 1574
+; WAVE32-NEXT:    .cfi_undefined 1575
+; WAVE32-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; WAVE32-NEXT:    s_or_saveexec_b32 s4, -1
+; WAVE32-NEXT:    buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_offset 1576, 0
+; WAVE32-NEXT:    s_waitcnt_depctr 0xffe3
+; WAVE32-NEXT:    s_mov_b32 exec_lo, s4
+; WAVE32-NEXT:    v_writelane_b32 v40, exec_lo, 0
+; WAVE32-NEXT:    .cfi_llvm_vector_registers 1, 1576, 0, 32
+; WAVE32-NEXT:    ;;#ASMSTART
+; WAVE32-NEXT:    ; clobber nonpreserved VGPRs
+; WAVE32-NEXT:    ;;#ASMEND
+; WAVE32-NEXT:    s_or_saveexec_b32 s4, -1
+; WAVE32-NEXT:    buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; WAVE32-NEXT:    s_waitcnt_depctr 0xffe3
+; WAVE32-NEXT:    s_mov_b32 exec_lo, s4
+; WAVE32-NEXT:    s_waitcnt vmcnt(0)
+; WAVE32-NEXT:    s_setpc_b64 s[30:31]
+entry:
+  call void asm sideeffect "; clobber nonpreserved VGPRs",
+    "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9}
+    ,~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19}
+    ,~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29}
+    ,~{v30},~{v31},~{v32},~{v33},~{v34},~{v35},~{v36},~{v37},~{v38},~{v39}"()
+  ret void
+}
+
+; There's no return here, so the return address live-in was deleted.
+define void @empty_func() {
+; WAVE64-LABEL: empty_func:
+; WAVE64:       .Lfunc_begin3:
+; WAVE64-NEXT:    .cfi_startproc
+; WAVE64-NEXT:  ; %bb.0:
+; WAVE64-NEXT:    .cfi_llvm_def_aspace_cfa 64, 0, 6
+; WAVE64-NEXT:    .cfi_llvm_register_pair 16, 62, 32, 63, 32
+; WAVE64-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; WAVE64-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; WAVE64-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_offset 2560, 0
+; WAVE64-NEXT:    s_mov_b64 exec, s[4:5]
+; WAVE64-NEXT:    v_writelane_b32 v0, exec_lo, 0
+; WAVE64-NEXT:    v_writelane_b32 v0, exec_hi, 1
+;
+; WAVE32-LABEL: empty_func:
+; WAVE32:       .Lfunc_begin3:
+; WAVE32-NEXT:    .cfi_startproc
+; WAVE32-NEXT:  ; %bb.0:
+; WAVE32-NEXT:    .cfi_llvm_def_aspace_cfa 64, 0, 6
+; WAVE32-NEXT:    .cfi_llvm_register_pair 16, 62, 32, 63, 32
+; WAVE32-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; WAVE32-NEXT:    s_xor_saveexec_b32 s4, -1
+; WAVE32-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_offset 1536, 0
+; WAVE32-NEXT:    s_waitcnt_depctr 0xffe3
+; WAVE32-NEXT:    s_mov_b32 exec_lo, s4
+; WAVE32-NEXT:    v_writelane_b32 v0, exec_lo, 0
+  unreachable
+}
+
+; Check that the option causes RA and EXEC to be spilled to memory.
+define void @no_vgprs_to_spill_into() #1 {
+; WAVE64-LABEL: no_vgprs_to_spill_into:
+; WAVE64:       .Lfunc_begin4:
+; WAVE64-NEXT:    .cfi_startproc
+; WAVE64-NEXT:  ; %bb.0:
+; WAVE64-NEXT:    .cfi_llvm_def_aspace_cfa 64, 0, 6
+; WAVE64-NEXT:    .cfi_llvm_register_pair 16, 62, 32, 63, 32
+; WAVE64-NEXT:    .cfi_undefined 2560
+; WAVE64-NEXT:    .cfi_undefined 2561
+; WAVE64-NEXT:    .cfi_undefined 2562
+; WAVE64-NEXT:    .cfi_undefined 2563
+; WAVE64-NEXT:    .cfi_undefined 2564
+; WAVE64-NEXT:    .cfi_undefined 2565
+; WAVE64-NEXT:    .cfi_undefined 2566
+; WAVE64-NEXT:    .cfi_undefined 2567
+; WAVE64-NEXT:    .cfi_undefined 2568
+; WAVE64-NEXT:    .cfi_undefined 2569
+; WAVE64-NEXT:    .cfi_undefined 2570
+; WAVE64-NEXT:    .cfi_undefined 2571
+; WAVE64-NEXT:    .cfi_undefined 2572
+; WAVE64-NEXT:    .cfi_undefined 2573
+; WAVE64-NEXT:    .cfi_undefined 2574
+; WAVE64-NEXT:    .cfi_undefined 2575
+; WAVE64-NEXT:    .cfi_undefined 2576
+; WAVE64-NEXT:    .cfi_undefined 2577
+; WAVE64-NEXT:    .cfi_undefined 2578
+; WAVE64-NEXT:    .cfi_undefined 2579
+; WAVE64-NEXT:    .cfi_undefined 2580
+; WAVE64-NEXT:    .cfi_undefined 2581
+; WAVE64-NEXT:    .cfi_undefined 2582
+; WAVE64-NEXT:    .cfi_undefined 2583
+; WAVE64-NEXT:    .cfi_undefined 2584
+; WAVE64-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; WAVE64-NEXT:    v_mov_b32_e32 v0, exec_lo
+; WAVE64-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
+; WAVE64-NEXT:    v_mov_b32_e32 v0, exec_hi
+; WAVE64-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_offset 17, 0
+; WAVE64-NEXT:    ;;#ASMSTART
+; WAVE64-NEXT:    ;;#ASMEND
+; WAVE64-NEXT:    s_waitcnt vmcnt(0)
+; WAVE64-NEXT:    s_setpc_b64 s[30:31]
+;
+; WAVE32-LABEL: no_vgprs_to_spill_into:
+; WAVE32:       .Lfunc_begin4:
+; WAVE32-NEXT:    .cfi_startproc
+; WAVE32-NEXT:  ; %bb.0:
+; WAVE32-NEXT:    .cfi_llvm_def_aspace_cfa 64, 0, 6
+; WAVE32-NEXT:    .cfi_llvm_register_pair 16, 62, 32, 63, 32
+; WAVE32-NEXT:    .cfi_undefined 1536
+; WAVE32-NEXT:    .cfi_undefined 1537
+; WAVE32-NEXT:    .cfi_undefined 1538
+; WAVE32-NEXT:    .cfi_undefined 1539
+; WAVE32-NEXT:    .cfi_undefined 1540
+; WAVE32-NEXT:    .cfi_undefined 1541
+; WAVE32-NEXT:    .cfi_undefined 1542
+; WAVE32-NEXT:    .cfi_undefined 1543
+; WAVE32-NEXT:    .cfi_undefined 1544
+; WAVE32-NEXT:    .cfi_undefined 1545
+; WAVE32-NEXT:    .cfi_undefined 1546
+; WAVE32-NEXT:    .cfi_undefined 1547
+; WAVE32-NEXT:    .cfi_undefined 1548
+; WAVE32-NEXT:    .cfi_undefined 1549
+; WAVE32-NEXT:    .cfi_undefined 1550
+; WAVE32-NEXT:    .cfi_undefined 1551
+; WAVE32-NEXT:    .cfi_undefined 1552
+; WAVE32-NEXT:    .cfi_undefined 1553
+; WAVE32-NEXT:    .cfi_undefined 1554
+; WAVE32-NEXT:    .cfi_undefined 1555
+; WAVE32-NEXT:    .cfi_undefined 1556
+; WAVE32-NEXT:    .cfi_undefined 1557
+; WAVE32-NEXT:    .cfi_undefined 1558
+; WAVE32-NEXT:    .cfi_undefined 1559
+; WAVE32-NEXT:    .cfi_undefined 1560
+; WAVE32-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; WAVE32-NEXT:    s_xor_saveexec_b32 s4, -1
+; WAVE32-NEXT:    buffer_store_dword v25, off, s[0:3], s32 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_offset 1561, 0
+; WAVE32-NEXT:    s_waitcnt_depctr 0xffe3
+; WAVE32-NEXT:    s_mov_b32 exec_lo, s4
+; WAVE32-NEXT:    v_writelane_b32 v25, exec_lo, 0
+; WAVE32-NEXT:    .cfi_llvm_vector_registers 1, 1561, 0, 32
+; WAVE32-NEXT:    ;;#ASMSTART
+; WAVE32-NEXT:    ;;#ASMEND
+; WAVE32-NEXT:    s_xor_saveexec_b32 s4, -1
+; WAVE32-NEXT:    buffer_load_dword v25, off, s[0:3], s32 ; 4-byte Folded Reload
+; WAVE32-NEXT:    s_waitcnt_depctr 0xffe3
+; WAVE32-NEXT:    s_mov_b32 exec_lo, s4
+; WAVE32-NEXT:    s_waitcnt vmcnt(0)
+; WAVE32-NEXT:    s_setpc_b64 s[30:31]
+  call void asm sideeffect "",
+    "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9}
+    ,~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19}
+    ,~{v20},~{v21},~{v22},~{v23},~{v24}"()
+
+  ret void
+}
+
+; Check that FP and EXEC need to be spilled to memory: even though we have
+; a reserved VGPR, there are no free lanes available in it.
+define void @callee_need_to_spill_fp_exec_to_memory() #2 {
+; WAVE64-LABEL: callee_need_to_spill_fp_exec_to_memory:
+; WAVE64:       .Lfunc_begin5:
+; WAVE64-NEXT:    .cfi_startproc
+; WAVE64-NEXT:  ; %bb.0:
+; WAVE64-NEXT:    .cfi_llvm_def_aspace_cfa 64, 0, 6
+; WAVE64-NEXT:    .cfi_llvm_register_pair 16, 62, 32, 63, 32
+; WAVE64-NEXT:    .cfi_undefined 2560
+; WAVE64-NEXT:    .cfi_undefined 2561
+; WAVE64-NEXT:    .cfi_undefined 2562
+; WAVE64-NEXT:    .cfi_undefined 2563
+; WAVE64-NEXT:    .cfi_undefined 2564
+; WAVE64-NEXT:    .cfi_undefined 2565
+; WAVE64-NEXT:    .cfi_undefined 2566
+; WAVE64-NEXT:    .cfi_undefined 2567
+; WAVE64-NEXT:    .cfi_undefined 2568
+; WAVE64-NEXT:    .cfi_undefined 2569
+; WAVE64-NEXT:    .cfi_undefined 2570
+; WAVE64-NEXT:    .cfi_undefined 2571
+; WAVE64-NEXT:    .cfi_undefined 2572
+; WAVE64-NEXT:    .cfi_undefined 2573
+; WAVE64-NEXT:    .cfi_undefined 2574
+; WAVE64-NEXT:    .cfi_undefined 2575
+; WAVE64-NEXT:    .cfi_undefined 2576
+; WAVE64-NEXT:    .cfi_undefined 2577
+; WAVE64-NEXT:    .cfi_undefined 2578
+; WAVE64-NEXT:    .cfi_undefined 2579
+; WAVE64-NEXT:    .cfi_undefined 2580
+; WAVE64-NEXT:    .cfi_undefined 2581
+; WAVE64-NEXT:    .cfi_undefined 2582
+; WAVE64-NEXT:    .cfi_undefined 2583
+; WAVE64-NEXT:    .cfi_undefined 2584
+; WAVE64-NEXT:    .cfi_undefined 2585
+; WAVE64-NEXT:    .cfi_undefined 2586
+; WAVE64-NEXT:    .cfi_undefined 2587
+; WAVE64-NEXT:    .cfi_undefined 2588
+; WAVE64-NEXT:    .cfi_undefined 2589
+; WAVE64-NEXT:    .cfi_undefined 2590
+; WAVE64-NEXT:    .cfi_undefined 2591
+; WAVE64-NEXT:    .cfi_undefined 2592
+; WAVE64-NEXT:    .cfi_undefined 2593
+; WAVE64-NEXT:    .cfi_undefined 2594
+; WAVE64-NEXT:    .cfi_undefined 2595
+; WAVE64-NEXT:    .cfi_undefined 2596
+; WAVE64-NEXT:    .cfi_undefined 2597
+; WAVE64-NEXT:    .cfi_undefined 2598
+; WAVE64-NEXT:    .cfi_undefined 2599
+; WAVE64-NEXT:    .cfi_undefined 2608
+; WAVE64-NEXT:    .cfi_undefined 2609
+; WAVE64-NEXT:    .cfi_undefined 2610
+; WAVE64-NEXT:    .cfi_undefined 2611
+; WAVE64-NEXT:    .cfi_undefined 2612
+; WAVE64-NEXT:    .cfi_undefined 2613
+; WAVE64-NEXT:    .cfi_undefined 2614
+; WAVE64-NEXT:    .cfi_undefined 2615
+; WAVE64-NEXT:    .cfi_undefined 2624
+; WAVE64-NEXT:    .cfi_undefined 2625
+; WAVE64-NEXT:    .cfi_undefined 2626
+; WAVE64-NEXT:    .cfi_undefined 2627
+; WAVE64-NEXT:    .cfi_undefined 2628
+; WAVE64-NEXT:    .cfi_undefined 2629
+; WAVE64-NEXT:    .cfi_undefined 2630
+; WAVE64-NEXT:    .cfi_undefined 2631
+; WAVE64-NEXT:    .cfi_undefined 2640
+; WAVE64-NEXT:    .cfi_undefined 2641
+; WAVE64-NEXT:    .cfi_undefined 2642
+; WAVE64-NEXT:    .cfi_undefined 2643
+; WAVE64-NEXT:    .cfi_undefined 2644
+; WAVE64-NEXT:    .cfi_undefined 2645
+; WAVE64-NEXT:    .cfi_undefined 2646
+; WAVE64-NEXT:    .cfi_undefined 2647
+; WAVE64-NEXT:    .cfi_undefined 2656
+; WAVE64-NEXT:    .cfi_undefined 2657
+; WAVE64-NEXT:    .cfi_undefined 2658
+; WAVE64-NEXT:    .cfi_undefined 2659
+; WAVE64-NEXT:    .cfi_undefined 2660
+; WAVE64-NEXT:    .cfi_undefined 2661
+; WAVE64-NEXT:    .cfi_undefined 2662
+; WAVE64-NEXT:    .cfi_undefined 2663
+; WAVE64-NEXT:    .cfi_undefined 2672
+; WAVE64-NEXT:    .cfi_undefined 2673
+; WAVE64-NEXT:    .cfi_undefined 2674
+; WAVE64-NEXT:    .cfi_undefined 2675
+; WAVE64-NEXT:    .cfi_undefined 2676
+; WAVE64-NEXT:    .cfi_undefined 2677
+; WAVE64-NEXT:    .cfi_undefined 2678
+; WAVE64-NEXT:    .cfi_undefined 2679
+; WAVE64-NEXT:    .cfi_undefined 2688
+; WAVE64-NEXT:    .cfi_undefined 2689
+; WAVE64-NEXT:    .cfi_undefined 36
+; WAVE64-NEXT:    .cfi_undefined 37
+; WAVE64-NEXT:    .cfi_undefined 38
+; WAVE64-NEXT:    .cfi_undefined 39
+; WAVE64-NEXT:    .cfi_undefined 40
+; WAVE64-NEXT:    .cfi_undefined 41
+; WAVE64-NEXT:    .cfi_undefined 42
+; WAVE64-NEXT:    .cfi_undefined 43
+; WAVE64-NEXT:    .cfi_undefined 44
+; WAVE64-NEXT:    .cfi_undefined 45
+; WAVE64-NEXT:    .cfi_undefined 46
+; WAVE64-NEXT:    .cfi_undefined 47
+; WAVE64-NEXT:    .cfi_undefined 48
+; WAVE64-NEXT:    .cfi_undefined 49
+; WAVE64-NEXT:    .cfi_undefined 50
+; WAVE64-NEXT:    .cfi_undefined 51
+; WAVE64-NEXT:    .cfi_undefined 52
+; WAVE64-NEXT:    .cfi_undefined 53
+; WAVE64-NEXT:    .cfi_undefined 54
+; WAVE64-NEXT:    .cfi_undefined 55
+; WAVE64-NEXT:    .cfi_undefined 56
+; WAVE64-NEXT:    .cfi_undefined 57
+; WAVE64-NEXT:    .cfi_undefined 58
+; WAVE64-NEXT:    .cfi_undefined 59
+; WAVE64-NEXT:    .cfi_undefined 60
+; WAVE64-NEXT:    .cfi_undefined 61
+; WAVE64-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; WAVE64-NEXT:    s_mov_b32 s40, s33
+; WAVE64-NEXT:    .cfi_register 65, 72
+; WAVE64-NEXT:    s_mov_b32 s33, s32
+; WAVE64-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; WAVE64-NEXT:    buffer_store_dword v39, off, s[0:3], s33 offset:192 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_offset 2599, 12288
+; WAVE64-NEXT:    s_mov_b64 exec, s[4:5]
+; WAVE64-NEXT:    v_writelane_b32 v39, exec_lo, 32
+; WAVE64-NEXT:    v_writelane_b32 v39, exec_hi, 33
+; WAVE64-NEXT:    .cfi_llvm_vector_registers 17, 2599, 32, 32, 2599, 33, 32
+; WAVE64-NEXT:    .cfi_def_cfa_register 65
+; WAVE64-NEXT:    s_addk_i32 s32, 0x3200
+; WAVE64-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:188 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2600, 32, 17, 64, 12032
+; WAVE64-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:184 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2601, 32, 17, 64, 11776
+; WAVE64-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:180 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2602, 32, 17, 64, 11520
+; WAVE64-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:176 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2603, 32, 17, 64, 11264
+; WAVE64-NEXT:    buffer_store_dword v44, off, s[0:3], s33 offset:172 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2604, 32, 17, 64, 11008
+; WAVE64-NEXT:    buffer_store_dword v45, off, s[0:3], s33 offset:168 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2605, 32, 17, 64, 10752
+; WAVE64-NEXT:    buffer_store_dword v46, off, s[0:3], s33 offset:164 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2606, 32, 17, 64, 10496
+; WAVE64-NEXT:    buffer_store_dword v47, off, s[0:3], s33 offset:160 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2607, 32, 17, 64, 10240
+; WAVE64-NEXT:    buffer_store_dword v56, off, s[0:3], s33 offset:156 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2616, 32, 17, 64, 9984
+; WAVE64-NEXT:    buffer_store_dword v57, off, s[0:3], s33 offset:152 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2617, 32, 17, 64, 9728
+; WAVE64-NEXT:    buffer_store_dword v58, off, s[0:3], s33 offset:148 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2618, 32, 17, 64, 9472
+; WAVE64-NEXT:    buffer_store_dword v59, off, s[0:3], s33 offset:144 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2619, 32, 17, 64, 9216
+; WAVE64-NEXT:    buffer_store_dword v60, off, s[0:3], s33 offset:140 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2620, 32, 17, 64, 8960
+; WAVE64-NEXT:    buffer_store_dword v61, off, s[0:3], s33 offset:136 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2621, 32, 17, 64, 8704
+; WAVE64-NEXT:    buffer_store_dword v62, off, s[0:3], s33 offset:132 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2622, 32, 17, 64, 8448
+; WAVE64-NEXT:    buffer_store_dword v63, off, s[0:3], s33 offset:128 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2623, 32, 17, 64, 8192
+; WAVE64-NEXT:    buffer_store_dword v72, off, s[0:3], s33 offset:124 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2632, 32, 17, 64, 7936
+; WAVE64-NEXT:    buffer_store_dword v73, off, s[0:3], s33 offset:120 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2633, 32, 17, 64, 7680
+; WAVE64-NEXT:    buffer_store_dword v74, off, s[0:3], s33 offset:116 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2634, 32, 17, 64, 7424
+; WAVE64-NEXT:    buffer_store_dword v75, off, s[0:3], s33 offset:112 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2635, 32, 17, 64, 7168
+; WAVE64-NEXT:    buffer_store_dword v76, off, s[0:3], s33 offset:108 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2636, 32, 17, 64, 6912
+; WAVE64-NEXT:    buffer_store_dword v77, off, s[0:3], s33 offset:104 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2637, 32, 17, 64, 6656
+; WAVE64-NEXT:    buffer_store_dword v78, off, s[0:3], s33 offset:100 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2638, 32, 17, 64, 6400
+; WAVE64-NEXT:    buffer_store_dword v79, off, s[0:3], s33 offset:96 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2639, 32, 17, 64, 6144
+; WAVE64-NEXT:    buffer_store_dword v88, off, s[0:3], s33 offset:92 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2648, 32, 17, 64, 5888
+; WAVE64-NEXT:    buffer_store_dword v89, off, s[0:3], s33 offset:88 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2649, 32, 17, 64, 5632
+; WAVE64-NEXT:    buffer_store_dword v90, off, s[0:3], s33 offset:84 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2650, 32, 17, 64, 5376
+; WAVE64-NEXT:    buffer_store_dword v91, off, s[0:3], s33 offset:80 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2651, 32, 17, 64, 5120
+; WAVE64-NEXT:    buffer_store_dword v92, off, s[0:3], s33 offset:76 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2652, 32, 17, 64, 4864
+; WAVE64-NEXT:    buffer_store_dword v93, off, s[0:3], s33 offset:72 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2653, 32, 17, 64, 4608
+; WAVE64-NEXT:    buffer_store_dword v94, off, s[0:3], s33 offset:68 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2654, 32, 17, 64, 4352
+; WAVE64-NEXT:    buffer_store_dword v95, off, s[0:3], s33 offset:64 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2655, 32, 17, 64, 4096
+; WAVE64-NEXT:    buffer_store_dword v104, off, s[0:3], s33 offset:60 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2664, 32, 17, 64, 3840
+; WAVE64-NEXT:    buffer_store_dword v105, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2665, 32, 17, 64, 3584
+; WAVE64-NEXT:    buffer_store_dword v106, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2666, 32, 17, 64, 3328
+; WAVE64-NEXT:    buffer_store_dword v107, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2667, 32, 17, 64, 3072
+; WAVE64-NEXT:    buffer_store_dword v108, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2668, 32, 17, 64, 2816
+; WAVE64-NEXT:    buffer_store_dword v109, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2669, 32, 17, 64, 2560
+; WAVE64-NEXT:    buffer_store_dword v110, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2670, 32, 17, 64, 2304
+; WAVE64-NEXT:    buffer_store_dword v111, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2671, 32, 17, 64, 2048
+; WAVE64-NEXT:    buffer_store_dword v120, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2680, 32, 17, 64, 1792
+; WAVE64-NEXT:    buffer_store_dword v121, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2681, 32, 17, 64, 1536
+; WAVE64-NEXT:    buffer_store_dword v122, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2682, 32, 17, 64, 1280
+; WAVE64-NEXT:    buffer_store_dword v123, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2683, 32, 17, 64, 1024
+; WAVE64-NEXT:    buffer_store_dword v124, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2684, 32, 17, 64, 768
+; WAVE64-NEXT:    buffer_store_dword v125, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2685, 32, 17, 64, 512
+; WAVE64-NEXT:    buffer_store_dword v126, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2686, 32, 17, 64, 256
+; WAVE64-NEXT:    buffer_store_dword v127, off, s[0:3], s33 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2687, 32, 17, 64, 0
+; WAVE64-NEXT:    v_writelane_b32 v39, s34, 0
+; WAVE64-NEXT:    .cfi_llvm_vector_registers 66, 2599, 0, 32
+; WAVE64-NEXT:    v_writelane_b32 v39, s35, 1
+; WAVE64-NEXT:    .cfi_llvm_vector_registers 67, 2599, 1, 32
+; WAVE64-NEXT:    v_writelane_b32 v39, s36, 2
+; WAVE64-NEXT:    .cfi_llvm_vector_registers 68, 2599, 2, 32
+; WAVE64-NEXT:    v_writelane_b32 v39, s37, 3
+; WAVE64-NEXT:    .cfi_llvm_vector_registers 69, 2599, 3, 32
+; WAVE64-NEXT:    v_writelane_b32 v39, s38, 4
+; WAVE64-NEXT:    .cfi_llvm_vector_registers 70, 2599, 4, 32
+; WAVE64-NEXT:    v_writelane_b32 v39, s39, 5
+; WAVE64-NEXT:    .cfi_llvm_vector_registers 71, 2599, 5, 32
+; WAVE64-NEXT:    v_writelane_b32 v39, s48, 6
+; WAVE64-NEXT:    .cfi_llvm_vector_registers 80, 2599, 6, 32
+; WAVE64-NEXT:    v_writelane_b32 v39, s49, 7
+; WAVE64-NEXT:    .cfi_llvm_vector_registers 81, 2599, 7, 32
+; WAVE64-NEXT:    v_writelane_b32 v39, s50, 8
+; WAVE64-NEXT:    .cfi_llvm_vector_registers 82, 2599, 8, 32
+; WAVE64-NEXT:    v_writelane_b32 v39, s51, 9
+; WAVE64-NEXT:    .cfi_llvm_vector_registers 83, 2599, 9, 32
+; WAVE64-NEXT:    v_writelane_b32 v39, s52, 10
+; WAVE64-NEXT:    .cfi_llvm_vector_registers 84, 2599, 10, 32
+; WAVE64-NEXT:    v_writelane_b32 v39, s53, 11
+; WAVE64-NEXT:    .cfi_llvm_vector_registers 85, 2599, 11, 32
+; WAVE64-NEXT:    v_writelane_b32 v39, s54, 12
+; WAVE64-NEXT:    .cfi_llvm_vector_registers 86, 2599, 12, 32
+; WAVE64-NEXT:    v_writelane_b32 v39, s55, 13
+; WAVE64-NEXT:    .cfi_llvm_vector_registers 87, 2599, 13, 32
+; WAVE64-NEXT:    v_writelane_b32 v39, s64, 14
+; WAVE64-NEXT:    .cfi_llvm_vector_registers 1088, 2599, 14, 32
+; WAVE64-NEXT:    v_writelane_b32 v39, s65, 15
+; WAVE64-NEXT:    .cfi_llvm_vector_registers 1089, 2599, 15, 32
+; WAVE64-NEXT:    v_writelane_b32 v39, s66, 16
+; WAVE64-NEXT:    .cfi_llvm_vector_registers 1090, 2599, 16, 32
+; WAVE64-NEXT:    v_writelane_b32 v39, s67, 17
+; WAVE64-NEXT:    .cfi_llvm_vector_registers 1091, 2599, 17, 32
+; WAVE64-NEXT:    v_writelane_b32 v39, s68, 18
+; WAVE64-NEXT:    .cfi_llvm_vector_registers 1092, 2599, 18, 32
+; WAVE64-NEXT:    v_writelane_b32 v39, s69, 19
+; WAVE64-NEXT:    .cfi_llvm_vector_registers 1093, 2599, 19, 32
+; WAVE64-NEXT:    v_writelane_b32 v39, s70, 20
+; WAVE64-NEXT:    .cfi_llvm_vector_registers 1094, 2599, 20, 32
+; WAVE64-NEXT:    v_writelane_b32 v39, s71, 21
+; WAVE64-NEXT:    .cfi_llvm_vector_registers 1095, 2599, 21, 32
+; WAVE64-NEXT:    v_writelane_b32 v39, s80, 22
+; WAVE64-NEXT:    .cfi_llvm_vector_registers 1104, 2599, 22, 32
+; WAVE64-NEXT:    v_writelane_b32 v39, s81, 23
+; WAVE64-NEXT:    .cfi_llvm_vector_registers 1105, 2599, 23, 32
+; WAVE64-NEXT:    v_writelane_b32 v39, s82, 24
+; WAVE64-NEXT:    .cfi_llvm_vector_registers 1106, 2599, 24, 32
+; WAVE64-NEXT:    v_writelane_b32 v39, s83, 25
+; WAVE64-NEXT:    .cfi_llvm_vector_registers 1107, 2599, 25, 32
+; WAVE64-NEXT:    v_writelane_b32 v39, s84, 26
+; WAVE64-NEXT:    .cfi_llvm_vector_registers 1108, 2599, 26, 32
+; WAVE64-NEXT:    v_writelane_b32 v39, s85, 27
+; WAVE64-NEXT:    .cfi_llvm_vector_registers 1109, 2599, 27, 32
+; WAVE64-NEXT:    v_writelane_b32 v39, s86, 28
+; WAVE64-NEXT:    .cfi_llvm_vector_registers 1110, 2599, 28, 32
+; WAVE64-NEXT:    v_writelane_b32 v39, s87, 29
+; WAVE64-NEXT:    .cfi_llvm_vector_registers 1111, 2599, 29, 32
+; WAVE64-NEXT:    v_writelane_b32 v39, s96, 30
+; WAVE64-NEXT:    .cfi_llvm_vector_registers 1120, 2599, 30, 32
+; WAVE64-NEXT:    v_writelane_b32 v39, s97, 31
+; WAVE64-NEXT:    .cfi_llvm_vector_registers 1121, 2599, 31, 32
+; WAVE64-NEXT:    ;;#ASMSTART
+; WAVE64-NEXT:    ; clobber nonpreserved and 32 CSR SGPRs
+; WAVE64-NEXT:    ;;#ASMEND
+; WAVE64-NEXT:    ;;#ASMSTART
+; WAVE64-NEXT:    ; clobber all VGPRs except v39
+; WAVE64-NEXT:    ;;#ASMEND
+; WAVE64-NEXT:    buffer_load_dword v127, off, s[0:3], s33 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v126, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v125, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v124, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v123, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v122, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v121, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v120, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v111, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v110, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v109, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v108, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v107, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v106, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v105, off, s[0:3], s33 offset:56 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v104, off, s[0:3], s33 offset:60 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v95, off, s[0:3], s33 offset:64 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v94, off, s[0:3], s33 offset:68 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v93, off, s[0:3], s33 offset:72 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v92, off, s[0:3], s33 offset:76 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v91, off, s[0:3], s33 offset:80 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v90, off, s[0:3], s33 offset:84 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v89, off, s[0:3], s33 offset:88 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v88, off, s[0:3], s33 offset:92 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v79, off, s[0:3], s33 offset:96 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v78, off, s[0:3], s33 offset:100 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v77, off, s[0:3], s33 offset:104 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v76, off, s[0:3], s33 offset:108 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v75, off, s[0:3], s33 offset:112 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v74, off, s[0:3], s33 offset:116 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v73, off, s[0:3], s33 offset:120 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v72, off, s[0:3], s33 offset:124 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v63, off, s[0:3], s33 offset:128 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v62, off, s[0:3], s33 offset:132 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v61, off, s[0:3], s33 offset:136 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v60, off, s[0:3], s33 offset:140 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v59, off, s[0:3], s33 offset:144 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v58, off, s[0:3], s33 offset:148 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v57, off, s[0:3], s33 offset:152 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v56, off, s[0:3], s33 offset:156 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v47, off, s[0:3], s33 offset:160 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v46, off, s[0:3], s33 offset:164 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v45, off, s[0:3], s33 offset:168 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v44, off, s[0:3], s33 offset:172 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v43, off, s[0:3], s33 offset:176 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:180 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:184 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:188 ; 4-byte Folded Reload
+; WAVE64-NEXT:    v_readlane_b32 s97, v39, 31
+; WAVE64-NEXT:    v_readlane_b32 s96, v39, 30
+; WAVE64-NEXT:    v_readlane_b32 s87, v39, 29
+; WAVE64-NEXT:    v_readlane_b32 s86, v39, 28
+; WAVE64-NEXT:    v_readlane_b32 s85, v39, 27
+; WAVE64-NEXT:    v_readlane_b32 s84, v39, 26
+; WAVE64-NEXT:    v_readlane_b32 s83, v39, 25
+; WAVE64-NEXT:    v_readlane_b32 s82, v39, 24
+; WAVE64-NEXT:    v_readlane_b32 s81, v39, 23
+; WAVE64-NEXT:    v_readlane_b32 s80, v39, 22
+; WAVE64-NEXT:    v_readlane_b32 s71, v39, 21
+; WAVE64-NEXT:    v_readlane_b32 s70, v39, 20
+; WAVE64-NEXT:    v_readlane_b32 s69, v39, 19
+; WAVE64-NEXT:    v_readlane_b32 s68, v39, 18
+; WAVE64-NEXT:    v_readlane_b32 s67, v39, 17
+; WAVE64-NEXT:    v_readlane_b32 s66, v39, 16
+; WAVE64-NEXT:    v_readlane_b32 s65, v39, 15
+; WAVE64-NEXT:    v_readlane_b32 s64, v39, 14
+; WAVE64-NEXT:    v_readlane_b32 s55, v39, 13
+; WAVE64-NEXT:    v_readlane_b32 s54, v39, 12
+; WAVE64-NEXT:    v_readlane_b32 s53, v39, 11
+; WAVE64-NEXT:    v_readlane_b32 s52, v39, 10
+; WAVE64-NEXT:    v_readlane_b32 s51, v39, 9
+; WAVE64-NEXT:    v_readlane_b32 s50, v39, 8
+; WAVE64-NEXT:    v_readlane_b32 s49, v39, 7
+; WAVE64-NEXT:    v_readlane_b32 s48, v39, 6
+; WAVE64-NEXT:    v_readlane_b32 s39, v39, 5
+; WAVE64-NEXT:    v_readlane_b32 s38, v39, 4
+; WAVE64-NEXT:    v_readlane_b32 s37, v39, 3
+; WAVE64-NEXT:    v_readlane_b32 s36, v39, 2
+; WAVE64-NEXT:    v_readlane_b32 s35, v39, 1
+; WAVE64-NEXT:    v_readlane_b32 s34, v39, 0
+; WAVE64-NEXT:    s_mov_b32 s32, s33
+; WAVE64-NEXT:    s_xor_saveexec_b64 s[4:5], -1
+; WAVE64-NEXT:    buffer_load_dword v39, off, s[0:3], s33 offset:192 ; 4-byte Folded Reload
+; WAVE64-NEXT:    s_mov_b64 exec, s[4:5]
+; WAVE64-NEXT:    .cfi_def_cfa_register 64
+; WAVE64-NEXT:    s_mov_b32 s33, s40
+; WAVE64-NEXT:    s_waitcnt vmcnt(0)
+; WAVE64-NEXT:    s_setpc_b64 s[30:31]
+;
+; WAVE32-LABEL: callee_need_to_spill_fp_exec_to_memory:
+; WAVE32:       .Lfunc_begin5:
+; WAVE32-NEXT:    .cfi_startproc
+; WAVE32-NEXT:  ; %bb.0:
+; WAVE32-NEXT:    .cfi_llvm_def_aspace_cfa 64, 0, 6
+; WAVE32-NEXT:    .cfi_llvm_register_pair 16, 62, 32, 63, 32
+; WAVE32-NEXT:    .cfi_undefined 1536
+; WAVE32-NEXT:    .cfi_undefined 1537
+; WAVE32-NEXT:    .cfi_undefined 1538
+; WAVE32-NEXT:    .cfi_undefined 1539
+; WAVE32-NEXT:    .cfi_undefined 1540
+; WAVE32-NEXT:    .cfi_undefined 1541
+; WAVE32-NEXT:    .cfi_undefined 1542
+; WAVE32-NEXT:    .cfi_undefined 1543
+; WAVE32-NEXT:    .cfi_undefined 1544
+; WAVE32-NEXT:    .cfi_undefined 1545
+; WAVE32-NEXT:    .cfi_undefined 1546
+; WAVE32-NEXT:    .cfi_undefined 1547
+; WAVE32-NEXT:    .cfi_undefined 1548
+; WAVE32-NEXT:    .cfi_undefined 1549
+; WAVE32-NEXT:    .cfi_undefined 1550
+; WAVE32-NEXT:    .cfi_undefined 1551
+; WAVE32-NEXT:    .cfi_undefined 1552
+; WAVE32-NEXT:    .cfi_undefined 1553
+; WAVE32-NEXT:    .cfi_undefined 1554
+; WAVE32-NEXT:    .cfi_undefined 1555
+; WAVE32-NEXT:    .cfi_undefined 1556
+; WAVE32-NEXT:    .cfi_undefined 1557
+; WAVE32-NEXT:    .cfi_undefined 1558
+; WAVE32-NEXT:    .cfi_undefined 1559
+; WAVE32-NEXT:    .cfi_undefined 1560
+; WAVE32-NEXT:    .cfi_undefined 1561
+; WAVE32-NEXT:    .cfi_undefined 1562
+; WAVE32-NEXT:    .cfi_undefined 1563
+; WAVE32-NEXT:    .cfi_undefined 1564
+; WAVE32-NEXT:    .cfi_undefined 1565
+; WAVE32-NEXT:    .cfi_undefined 1566
+; WAVE32-NEXT:    .cfi_undefined 1567
+; WAVE32-NEXT:    .cfi_undefined 1568
+; WAVE32-NEXT:    .cfi_undefined 1569
+; WAVE32-NEXT:    .cfi_undefined 1570
+; WAVE32-NEXT:    .cfi_undefined 1571
+; WAVE32-NEXT:    .cfi_undefined 1572
+; WAVE32-NEXT:    .cfi_undefined 1573
+; WAVE32-NEXT:    .cfi_undefined 1574
+; WAVE32-NEXT:    .cfi_undefined 1575
+; WAVE32-NEXT:    .cfi_undefined 1584
+; WAVE32-NEXT:    .cfi_undefined 1585
+; WAVE32-NEXT:    .cfi_undefined 1586
+; WAVE32-NEXT:    .cfi_undefined 1587
+; WAVE32-NEXT:    .cfi_undefined 1588
+; WAVE32-NEXT:    .cfi_undefined 1589
+; WAVE32-NEXT:    .cfi_undefined 1590
+; WAVE32-NEXT:    .cfi_undefined 1591
+; WAVE32-NEXT:    .cfi_undefined 1600
+; WAVE32-NEXT:    .cfi_undefined 1601
+; WAVE32-NEXT:    .cfi_undefined 1602
+; WAVE32-NEXT:    .cfi_undefined 1603
+; WAVE32-NEXT:    .cfi_undefined 1604
+; WAVE32-NEXT:    .cfi_undefined 1605
+; WAVE32-NEXT:    .cfi_undefined 1606
+; WAVE32-NEXT:    .cfi_undefined 1607
+; WAVE32-NEXT:    .cfi_undefined 1616
+; WAVE32-NEXT:    .cfi_undefined 1617
+; WAVE32-NEXT:    .cfi_undefined 1618
+; WAVE32-NEXT:    .cfi_undefined 1619
+; WAVE32-NEXT:    .cfi_undefined 1620
+; WAVE32-NEXT:    .cfi_undefined 1621
+; WAVE32-NEXT:    .cfi_undefined 1622
+; WAVE32-NEXT:    .cfi_undefined 1623
+; WAVE32-NEXT:    .cfi_undefined 1632
+; WAVE32-NEXT:    .cfi_undefined 1633
+; WAVE32-NEXT:    .cfi_undefined 1634
+; WAVE32-NEXT:    .cfi_undefined 1635
+; WAVE32-NEXT:    .cfi_undefined 1636
+; WAVE32-NEXT:    .cfi_undefined 1637
+; WAVE32-NEXT:    .cfi_undefined 1638
+; WAVE32-NEXT:    .cfi_undefined 1639
+; WAVE32-NEXT:    .cfi_undefined 1648
+; WAVE32-NEXT:    .cfi_undefined 1649
+; WAVE32-NEXT:    .cfi_undefined 1650
+; WAVE32-NEXT:    .cfi_undefined 1651
+; WAVE32-NEXT:    .cfi_undefined 1652
+; WAVE32-NEXT:    .cfi_undefined 1653
+; WAVE32-NEXT:    .cfi_undefined 1654
+; WAVE32-NEXT:    .cfi_undefined 1655
+; WAVE32-NEXT:    .cfi_undefined 1664
+; WAVE32-NEXT:    .cfi_undefined 1665
+; WAVE32-NEXT:    .cfi_undefined 36
+; WAVE32-NEXT:    .cfi_undefined 37
+; WAVE32-NEXT:    .cfi_undefined 38
+; WAVE32-NEXT:    .cfi_undefined 39
+; WAVE32-NEXT:    .cfi_undefined 40
+; WAVE32-NEXT:    .cfi_undefined 41
+; WAVE32-NEXT:    .cfi_undefined 42
+; WAVE32-NEXT:    .cfi_undefined 43
+; WAVE32-NEXT:    .cfi_undefined 44
+; WAVE32-NEXT:    .cfi_undefined 45
+; WAVE32-NEXT:    .cfi_undefined 46
+; WAVE32-NEXT:    .cfi_undefined 47
+; WAVE32-NEXT:    .cfi_undefined 48
+; WAVE32-NEXT:    .cfi_undefined 49
+; WAVE32-NEXT:    .cfi_undefined 50
+; WAVE32-NEXT:    .cfi_undefined 51
+; WAVE32-NEXT:    .cfi_undefined 52
+; WAVE32-NEXT:    .cfi_undefined 53
+; WAVE32-NEXT:    .cfi_undefined 54
+; WAVE32-NEXT:    .cfi_undefined 55
+; WAVE32-NEXT:    .cfi_undefined 56
+; WAVE32-NEXT:    .cfi_undefined 57
+; WAVE32-NEXT:    .cfi_undefined 58
+; WAVE32-NEXT:    .cfi_undefined 59
+; WAVE32-NEXT:    .cfi_undefined 60
+; WAVE32-NEXT:    .cfi_undefined 61
+; WAVE32-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; WAVE32-NEXT:    s_mov_b32 s40, s33
+; WAVE32-NEXT:    .cfi_register 65, 72
+; WAVE32-NEXT:    s_mov_b32 s33, s32
+; WAVE32-NEXT:    s_xor_saveexec_b32 s4, -1
+; WAVE32-NEXT:    buffer_store_dword v39, off, s[0:3], s33 offset:192 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_offset 1575, 6144
+; WAVE32-NEXT:    s_waitcnt_depctr 0xffe3
+; WAVE32-NEXT:    s_mov_b32 exec_lo, s4
+; WAVE32-NEXT:    v_mov_b32_e32 v0, exec_lo
+; WAVE32-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:196 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_offset 1, 6272
+; WAVE32-NEXT:    .cfi_def_cfa_register 65
+; WAVE32-NEXT:    s_addk_i32 s32, 0x1980
+; WAVE32-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:188 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1576, 32, 1, 32, 6016
+; WAVE32-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:184 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1577, 32, 1, 32, 5888
+; WAVE32-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:180 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1578, 32, 1, 32, 5760
+; WAVE32-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:176 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1579, 32, 1, 32, 5632
+; WAVE32-NEXT:    buffer_store_dword v44, off, s[0:3], s33 offset:172 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1580, 32, 1, 32, 5504
+; WAVE32-NEXT:    buffer_store_dword v45, off, s[0:3], s33 offset:168 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1581, 32, 1, 32, 5376
+; WAVE32-NEXT:    buffer_store_dword v46, off, s[0:3], s33 offset:164 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1582, 32, 1, 32, 5248
+; WAVE32-NEXT:    buffer_store_dword v47, off, s[0:3], s33 offset:160 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1583, 32, 1, 32, 5120
+; WAVE32-NEXT:    buffer_store_dword v56, off, s[0:3], s33 offset:156 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1592, 32, 1, 32, 4992
+; WAVE32-NEXT:    buffer_store_dword v57, off, s[0:3], s33 offset:152 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1593, 32, 1, 32, 4864
+; WAVE32-NEXT:    buffer_store_dword v58, off, s[0:3], s33 offset:148 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1594, 32, 1, 32, 4736
+; WAVE32-NEXT:    buffer_store_dword v59, off, s[0:3], s33 offset:144 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1595, 32, 1, 32, 4608
+; WAVE32-NEXT:    buffer_store_dword v60, off, s[0:3], s33 offset:140 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1596, 32, 1, 32, 4480
+; WAVE32-NEXT:    buffer_store_dword v61, off, s[0:3], s33 offset:136 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1597, 32, 1, 32, 4352
+; WAVE32-NEXT:    buffer_store_dword v62, off, s[0:3], s33 offset:132 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1598, 32, 1, 32, 4224
+; WAVE32-NEXT:    buffer_store_dword v63, off, s[0:3], s33 offset:128 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1599, 32, 1, 32, 4096
+; WAVE32-NEXT:    buffer_store_dword v72, off, s[0:3], s33 offset:124 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1608, 32, 1, 32, 3968
+; WAVE32-NEXT:    buffer_store_dword v73, off, s[0:3], s33 offset:120 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1609, 32, 1, 32, 3840
+; WAVE32-NEXT:    buffer_store_dword v74, off, s[0:3], s33 offset:116 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1610, 32, 1, 32, 3712
+; WAVE32-NEXT:    buffer_store_dword v75, off, s[0:3], s33 offset:112 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1611, 32, 1, 32, 3584
+; WAVE32-NEXT:    buffer_store_dword v76, off, s[0:3], s33 offset:108 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1612, 32, 1, 32, 3456
+; WAVE32-NEXT:    buffer_store_dword v77, off, s[0:3], s33 offset:104 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1613, 32, 1, 32, 3328
+; WAVE32-NEXT:    buffer_store_dword v78, off, s[0:3], s33 offset:100 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1614, 32, 1, 32, 3200
+; WAVE32-NEXT:    buffer_store_dword v79, off, s[0:3], s33 offset:96 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1615, 32, 1, 32, 3072
+; WAVE32-NEXT:    buffer_store_dword v88, off, s[0:3], s33 offset:92 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1624, 32, 1, 32, 2944
+; WAVE32-NEXT:    buffer_store_dword v89, off, s[0:3], s33 offset:88 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1625, 32, 1, 32, 2816
+; WAVE32-NEXT:    buffer_store_dword v90, off, s[0:3], s33 offset:84 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1626, 32, 1, 32, 2688
+; WAVE32-NEXT:    buffer_store_dword v91, off, s[0:3], s33 offset:80 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1627, 32, 1, 32, 2560
+; WAVE32-NEXT:    buffer_store_dword v92, off, s[0:3], s33 offset:76 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1628, 32, 1, 32, 2432
+; WAVE32-NEXT:    buffer_store_dword v93, off, s[0:3], s33 offset:72 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1629, 32, 1, 32, 2304
+; WAVE32-NEXT:    buffer_store_dword v94, off, s[0:3], s33 offset:68 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1630, 32, 1, 32, 2176
+; WAVE32-NEXT:    buffer_store_dword v95, off, s[0:3], s33 offset:64 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1631, 32, 1, 32, 2048
+; WAVE32-NEXT:    buffer_store_dword v104, off, s[0:3], s33 offset:60 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1640, 32, 1, 32, 1920
+; WAVE32-NEXT:    buffer_store_dword v105, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1641, 32, 1, 32, 1792
+; WAVE32-NEXT:    buffer_store_dword v106, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1642, 32, 1, 32, 1664
+; WAVE32-NEXT:    buffer_store_dword v107, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1643, 32, 1, 32, 1536
+; WAVE32-NEXT:    buffer_store_dword v108, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1644, 32, 1, 32, 1408
+; WAVE32-NEXT:    buffer_store_dword v109, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1645, 32, 1, 32, 1280
+; WAVE32-NEXT:    buffer_store_dword v110, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1646, 32, 1, 32, 1152
+; WAVE32-NEXT:    buffer_store_dword v111, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1647, 32, 1, 32, 1024
+; WAVE32-NEXT:    buffer_store_dword v120, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1656, 32, 1, 32, 896
+; WAVE32-NEXT:    buffer_store_dword v121, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1657, 32, 1, 32, 768
+; WAVE32-NEXT:    buffer_store_dword v122, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1658, 32, 1, 32, 640
+; WAVE32-NEXT:    buffer_store_dword v123, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1659, 32, 1, 32, 512
+; WAVE32-NEXT:    buffer_store_dword v124, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1660, 32, 1, 32, 384
+; WAVE32-NEXT:    buffer_store_dword v125, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1661, 32, 1, 32, 256
+; WAVE32-NEXT:    buffer_store_dword v126, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1662, 32, 1, 32, 128
+; WAVE32-NEXT:    buffer_store_dword v127, off, s[0:3], s33 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1663, 32, 1, 32, 0
+; WAVE32-NEXT:    v_writelane_b32 v39, s34, 0
+; WAVE32-NEXT:    .cfi_llvm_vector_registers 66, 1575, 0, 32
+; WAVE32-NEXT:    v_writelane_b32 v39, s35, 1
+; WAVE32-NEXT:    .cfi_llvm_vector_registers 67, 1575, 1, 32
+; WAVE32-NEXT:    v_writelane_b32 v39, s36, 2
+; WAVE32-NEXT:    .cfi_llvm_vector_registers 68, 1575, 2, 32
+; WAVE32-NEXT:    v_writelane_b32 v39, s37, 3
+; WAVE32-NEXT:    .cfi_llvm_vector_registers 69, 1575, 3, 32
+; WAVE32-NEXT:    v_writelane_b32 v39, s38, 4
+; WAVE32-NEXT:    .cfi_llvm_vector_registers 70, 1575, 4, 32
+; WAVE32-NEXT:    v_writelane_b32 v39, s39, 5
+; WAVE32-NEXT:    .cfi_llvm_vector_registers 71, 1575, 5, 32
+; WAVE32-NEXT:    v_writelane_b32 v39, s48, 6
+; WAVE32-NEXT:    .cfi_llvm_vector_registers 80, 1575, 6, 32
+; WAVE32-NEXT:    v_writelane_b32 v39, s49, 7
+; WAVE32-NEXT:    .cfi_llvm_vector_registers 81, 1575, 7, 32
+; WAVE32-NEXT:    v_writelane_b32 v39, s50, 8
+; WAVE32-NEXT:    .cfi_llvm_vector_registers 82, 1575, 8, 32
+; WAVE32-NEXT:    v_writelane_b32 v39, s51, 9
+; WAVE32-NEXT:    .cfi_llvm_vector_registers 83, 1575, 9, 32
+; WAVE32-NEXT:    v_writelane_b32 v39, s52, 10
+; WAVE32-NEXT:    .cfi_llvm_vector_registers 84, 1575, 10, 32
+; WAVE32-NEXT:    v_writelane_b32 v39, s53, 11
+; WAVE32-NEXT:    .cfi_llvm_vector_registers 85, 1575, 11, 32
+; WAVE32-NEXT:    v_writelane_b32 v39, s54, 12
+; WAVE32-NEXT:    .cfi_llvm_vector_registers 86, 1575, 12, 32
+; WAVE32-NEXT:    v_writelane_b32 v39, s55, 13
+; WAVE32-NEXT:    .cfi_llvm_vector_registers 87, 1575, 13, 32
+; WAVE32-NEXT:    v_writelane_b32 v39, s64, 14
+; WAVE32-NEXT:    .cfi_llvm_vector_registers 1088, 1575, 14, 32
+; WAVE32-NEXT:    v_writelane_b32 v39, s65, 15
+; WAVE32-NEXT:    .cfi_llvm_vector_registers 1089, 1575, 15, 32
+; WAVE32-NEXT:    v_writelane_b32 v39, s66, 16
+; WAVE32-NEXT:    .cfi_llvm_vector_registers 1090, 1575, 16, 32
+; WAVE32-NEXT:    v_writelane_b32 v39, s67, 17
+; WAVE32-NEXT:    .cfi_llvm_vector_registers 1091, 1575, 17, 32
+; WAVE32-NEXT:    v_writelane_b32 v39, s68, 18
+; WAVE32-NEXT:    .cfi_llvm_vector_registers 1092, 1575, 18, 32
+; WAVE32-NEXT:    v_writelane_b32 v39, s69, 19
+; WAVE32-NEXT:    .cfi_llvm_vector_registers 1093, 1575, 19, 32
+; WAVE32-NEXT:    v_writelane_b32 v39, s70, 20
+; WAVE32-NEXT:    .cfi_llvm_vector_registers 1094, 1575, 20, 32
+; WAVE32-NEXT:    v_writelane_b32 v39, s71, 21
+; WAVE32-NEXT:    .cfi_llvm_vector_registers 1095, 1575, 21, 32
+; WAVE32-NEXT:    v_writelane_b32 v39, s80, 22
+; WAVE32-NEXT:    .cfi_llvm_vector_registers 1104, 1575, 22, 32
+; WAVE32-NEXT:    v_writelane_b32 v39, s81, 23
+; WAVE32-NEXT:    .cfi_llvm_vector_registers 1105, 1575, 23, 32
+; WAVE32-NEXT:    v_writelane_b32 v39, s82, 24
+; WAVE32-NEXT:    .cfi_llvm_vector_registers 1106, 1575, 24, 32
+; WAVE32-NEXT:    v_writelane_b32 v39, s83, 25
+; WAVE32-NEXT:    .cfi_llvm_vector_registers 1107, 1575, 25, 32
+; WAVE32-NEXT:    v_writelane_b32 v39, s84, 26
+; WAVE32-NEXT:    .cfi_llvm_vector_registers 1108, 1575, 26, 32
+; WAVE32-NEXT:    v_writelane_b32 v39, s85, 27
+; WAVE32-NEXT:    .cfi_llvm_vector_registers 1109, 1575, 27, 32
+; WAVE32-NEXT:    v_writelane_b32 v39, s86, 28
+; WAVE32-NEXT:    .cfi_llvm_vector_registers 1110, 1575, 28, 32
+; WAVE32-NEXT:    v_writelane_b32 v39, s87, 29
+; WAVE32-NEXT:    .cfi_llvm_vector_registers 1111, 1575, 29, 32
+; WAVE32-NEXT:    v_writelane_b32 v39, s96, 30
+; WAVE32-NEXT:    .cfi_llvm_vector_registers 1120, 1575, 30, 32
+; WAVE32-NEXT:    v_writelane_b32 v39, s97, 31
+; WAVE32-NEXT:    .cfi_llvm_vector_registers 1121, 1575, 31, 32
+; WAVE32-NEXT:    ;;#ASMSTART
+; WAVE32-NEXT:    ; clobber nonpreserved and 32 CSR SGPRs
+; WAVE32-NEXT:    ;;#ASMEND
+; WAVE32-NEXT:    ;;#ASMSTART
+; WAVE32-NEXT:    ; clobber all VGPRs except v39
+; WAVE32-NEXT:    ;;#ASMEND
+; WAVE32-NEXT:    s_clause 0x2f
+; WAVE32-NEXT:    buffer_load_dword v127, off, s[0:3], s33
+; WAVE32-NEXT:    buffer_load_dword v126, off, s[0:3], s33 offset:4
+; WAVE32-NEXT:    buffer_load_dword v125, off, s[0:3], s33 offset:8
+; WAVE32-NEXT:    buffer_load_dword v124, off, s[0:3], s33 offset:12
+; WAVE32-NEXT:    buffer_load_dword v123, off, s[0:3], s33 offset:16
+; WAVE32-NEXT:    buffer_load_dword v122, off, s[0:3], s33 offset:20
+; WAVE32-NEXT:    buffer_load_dword v121, off, s[0:3], s33 offset:24
+; WAVE32-NEXT:    buffer_load_dword v120, off, s[0:3], s33 offset:28
+; WAVE32-NEXT:    buffer_load_dword v111, off, s[0:3], s33 offset:32
+; WAVE32-NEXT:    buffer_load_dword v110, off, s[0:3], s33 offset:36
+; WAVE32-NEXT:    buffer_load_dword v109, off, s[0:3], s33 offset:40
+; WAVE32-NEXT:    buffer_load_dword v108, off, s[0:3], s33 offset:44
+; WAVE32-NEXT:    buffer_load_dword v107, off, s[0:3], s33 offset:48
+; WAVE32-NEXT:    buffer_load_dword v106, off, s[0:3], s33 offset:52
+; WAVE32-NEXT:    buffer_load_dword v105, off, s[0:3], s33 offset:56
+; WAVE32-NEXT:    buffer_load_dword v104, off, s[0:3], s33 offset:60
+; WAVE32-NEXT:    buffer_load_dword v95, off, s[0:3], s33 offset:64
+; WAVE32-NEXT:    buffer_load_dword v94, off, s[0:3], s33 offset:68
+; WAVE32-NEXT:    buffer_load_dword v93, off, s[0:3], s33 offset:72
+; WAVE32-NEXT:    buffer_load_dword v92, off, s[0:3], s33 offset:76
+; WAVE32-NEXT:    buffer_load_dword v91, off, s[0:3], s33 offset:80
+; WAVE32-NEXT:    buffer_load_dword v90, off, s[0:3], s33 offset:84
+; WAVE32-NEXT:    buffer_load_dword v89, off, s[0:3], s33 offset:88
+; WAVE32-NEXT:    buffer_load_dword v88, off, s[0:3], s33 offset:92
+; WAVE32-NEXT:    buffer_load_dword v79, off, s[0:3], s33 offset:96
+; WAVE32-NEXT:    buffer_load_dword v78, off, s[0:3], s33 offset:100
+; WAVE32-NEXT:    buffer_load_dword v77, off, s[0:3], s33 offset:104
+; WAVE32-NEXT:    buffer_load_dword v76, off, s[0:3], s33 offset:108
+; WAVE32-NEXT:    buffer_load_dword v75, off, s[0:3], s33 offset:112
+; WAVE32-NEXT:    buffer_load_dword v74, off, s[0:3], s33 offset:116
+; WAVE32-NEXT:    buffer_load_dword v73, off, s[0:3], s33 offset:120
+; WAVE32-NEXT:    buffer_load_dword v72, off, s[0:3], s33 offset:124
+; WAVE32-NEXT:    buffer_load_dword v63, off, s[0:3], s33 offset:128
+; WAVE32-NEXT:    buffer_load_dword v62, off, s[0:3], s33 offset:132
+; WAVE32-NEXT:    buffer_load_dword v61, off, s[0:3], s33 offset:136
+; WAVE32-NEXT:    buffer_load_dword v60, off, s[0:3], s33 offset:140
+; WAVE32-NEXT:    buffer_load_dword v59, off, s[0:3], s33 offset:144
+; WAVE32-NEXT:    buffer_load_dword v58, off, s[0:3], s33 offset:148
+; WAVE32-NEXT:    buffer_load_dword v57, off, s[0:3], s33 offset:152
+; WAVE32-NEXT:    buffer_load_dword v56, off, s[0:3], s33 offset:156
+; WAVE32-NEXT:    buffer_load_dword v47, off, s[0:3], s33 offset:160
+; WAVE32-NEXT:    buffer_load_dword v46, off, s[0:3], s33 offset:164
+; WAVE32-NEXT:    buffer_load_dword v45, off, s[0:3], s33 offset:168
+; WAVE32-NEXT:    buffer_load_dword v44, off, s[0:3], s33 offset:172
+; WAVE32-NEXT:    buffer_load_dword v43, off, s[0:3], s33 offset:176
+; WAVE32-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:180
+; WAVE32-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:184
+; WAVE32-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:188
+; WAVE32-NEXT:    v_readlane_b32 s97, v39, 31
+; WAVE32-NEXT:    v_readlane_b32 s96, v39, 30
+; WAVE32-NEXT:    v_readlane_b32 s87, v39, 29
+; WAVE32-NEXT:    v_readlane_b32 s86, v39, 28
+; WAVE32-NEXT:    v_readlane_b32 s85, v39, 27
+; WAVE32-NEXT:    v_readlane_b32 s84, v39, 26
+; WAVE32-NEXT:    v_readlane_b32 s83, v39, 25
+; WAVE32-NEXT:    v_readlane_b32 s82, v39, 24
+; WAVE32-NEXT:    v_readlane_b32 s81, v39, 23
+; WAVE32-NEXT:    v_readlane_b32 s80, v39, 22
+; WAVE32-NEXT:    v_readlane_b32 s71, v39, 21
+; WAVE32-NEXT:    v_readlane_b32 s70, v39, 20
+; WAVE32-NEXT:    v_readlane_b32 s69, v39, 19
+; WAVE32-NEXT:    v_readlane_b32 s68, v39, 18
+; WAVE32-NEXT:    v_readlane_b32 s67, v39, 17
+; WAVE32-NEXT:    v_readlane_b32 s66, v39, 16
+; WAVE32-NEXT:    v_readlane_b32 s65, v39, 15
+; WAVE32-NEXT:    v_readlane_b32 s64, v39, 14
+; WAVE32-NEXT:    v_readlane_b32 s55, v39, 13
+; WAVE32-NEXT:    v_readlane_b32 s54, v39, 12
+; WAVE32-NEXT:    v_readlane_b32 s53, v39, 11
+; WAVE32-NEXT:    v_readlane_b32 s52, v39, 10
+; WAVE32-NEXT:    v_readlane_b32 s51, v39, 9
+; WAVE32-NEXT:    v_readlane_b32 s50, v39, 8
+; WAVE32-NEXT:    v_readlane_b32 s49, v39, 7
+; WAVE32-NEXT:    v_readlane_b32 s48, v39, 6
+; WAVE32-NEXT:    v_readlane_b32 s39, v39, 5
+; WAVE32-NEXT:    v_readlane_b32 s38, v39, 4
+; WAVE32-NEXT:    v_readlane_b32 s37, v39, 3
+; WAVE32-NEXT:    v_readlane_b32 s36, v39, 2
+; WAVE32-NEXT:    v_readlane_b32 s35, v39, 1
+; WAVE32-NEXT:    v_readlane_b32 s34, v39, 0
+; WAVE32-NEXT:    s_mov_b32 s32, s33
+; WAVE32-NEXT:    s_xor_saveexec_b32 s4, -1
+; WAVE32-NEXT:    buffer_load_dword v39, off, s[0:3], s33 offset:192 ; 4-byte Folded Reload
+; WAVE32-NEXT:    s_waitcnt_depctr 0xffe3
+; WAVE32-NEXT:    s_mov_b32 exec_lo, s4
+; WAVE32-NEXT:    .cfi_def_cfa_register 64
+; WAVE32-NEXT:    s_mov_b32 s33, s40
+; WAVE32-NEXT:    s_waitcnt vmcnt(0)
+; WAVE32-NEXT:    s_setpc_b64 s[30:31]
+  call void asm sideeffect "; clobber nonpreserved and 32 CSR SGPRs",
+    "~{s4},~{s5},~{s6},~{s7},~{s8},~{s9}
+    ,~{s10},~{s11},~{s12},~{s13},~{s14},~{s15},~{s16},~{s17},~{s18},~{s19}
+    ,~{s20},~{s21},~{s22},~{s23},~{s24},~{s25},~{s26},~{s27},~{s28},~{s29}
+    ,~{s34},~{s35},~{s36},~{s37},~{s38},~{s39}
+    ,~{s48},~{s49},~{s50},~{s51},~{s52},~{s53},~{s54},~{s55},~{s64},~{s65}
+    ,~{s66},~{s67},~{s68},~{s69},~{s70},~{s71},~{s80},~{s81},~{s82},~{s83}
+    ,~{s84},~{s85},~{s86},~{s87},~{s96},~{s97}
+    ,~{vcc}"()
+
+  call void asm sideeffect "; clobber all VGPRs except v39",
+    "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9}
+    ,~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19}
+    ,~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29}
+    ,~{v30},~{v31},~{v32},~{v33},~{v34},~{v35},~{v36},~{v37},~{v38}
+    ,~{v40},~{v41},~{v42},~{v43},~{v44},~{v45},~{v46},~{v47},~{v48},~{v49}
+    ,~{v50},~{v51},~{v52},~{v53},~{v54},~{v55},~{v56},~{v57},~{v58},~{v59}
+    ,~{v60},~{v61},~{v62},~{v63},~{v64},~{v65},~{v66},~{v67},~{v68},~{v69}
+    ,~{v70},~{v71},~{v72},~{v73},~{v74},~{v75},~{v76},~{v77},~{v78},~{v79}
+    ,~{v80},~{v81},~{v82},~{v83},~{v84},~{v85},~{v86},~{v87},~{v88},~{v89}
+    ,~{v90},~{v91},~{v92},~{v93},~{v94},~{v95},~{v96},~{v97},~{v98},~{v99}
+    ,~{v100},~{v101},~{v102},~{v103},~{v104},~{v105},~{v106},~{v107},~{v108},~{v109}
+    ,~{v110},~{v111},~{v112},~{v113},~{v114},~{v115},~{v116},~{v117},~{v118},~{v119}
+    ,~{v120},~{v121},~{v122},~{v123},~{v124},~{v125},~{v126},~{v127},~{v128},~{v129}"()
+  ret void
+}
+
+define internal void @caller_needs_to_spill_pc_to_memory() #3 { ; Makes no calls; per the checks below, both check prefixes expect EXEC saved to scratch at s32 by copying it through v0.
+; WAVE64-LABEL: caller_needs_to_spill_pc_to_memory:
+; WAVE64:       .Lfunc_begin6:
+; WAVE64-NEXT:    .cfi_startproc
+; WAVE64-NEXT:  ; %bb.0:
+; WAVE64-NEXT:    .cfi_llvm_def_aspace_cfa 64, 0, 6
+; WAVE64-NEXT:    .cfi_llvm_register_pair 16, 62, 32, 63, 32
+; WAVE64-NEXT:    .cfi_undefined 2560
+; WAVE64-NEXT:    .cfi_undefined 2561
+; WAVE64-NEXT:    .cfi_undefined 2562
+; WAVE64-NEXT:    .cfi_undefined 2563
+; WAVE64-NEXT:    .cfi_undefined 2564
+; WAVE64-NEXT:    .cfi_undefined 2565
+; WAVE64-NEXT:    .cfi_undefined 2566
+; WAVE64-NEXT:    .cfi_undefined 2567
+; WAVE64-NEXT:    .cfi_undefined 2568
+; WAVE64-NEXT:    .cfi_undefined 2569
+; WAVE64-NEXT:    .cfi_undefined 2570
+; WAVE64-NEXT:    .cfi_undefined 2571
+; WAVE64-NEXT:    .cfi_undefined 2572
+; WAVE64-NEXT:    .cfi_undefined 2573
+; WAVE64-NEXT:    .cfi_undefined 2574
+; WAVE64-NEXT:    .cfi_undefined 2575
+; WAVE64-NEXT:    .cfi_undefined 2576
+; WAVE64-NEXT:    .cfi_undefined 2577
+; WAVE64-NEXT:    .cfi_undefined 2578
+; WAVE64-NEXT:    .cfi_undefined 2579
+; WAVE64-NEXT:    .cfi_undefined 2580
+; WAVE64-NEXT:    .cfi_undefined 2581
+; WAVE64-NEXT:    .cfi_undefined 2582
+; WAVE64-NEXT:    .cfi_undefined 2583
+; WAVE64-NEXT:    .cfi_undefined 2584
+; WAVE64-NEXT:    .cfi_undefined 2585
+; WAVE64-NEXT:    .cfi_undefined 2586
+; WAVE64-NEXT:    .cfi_undefined 2587
+; WAVE64-NEXT:    .cfi_undefined 2588
+; WAVE64-NEXT:    .cfi_undefined 2589
+; WAVE64-NEXT:    .cfi_undefined 2590
+; WAVE64-NEXT:    .cfi_undefined 2591
+; WAVE64-NEXT:    .cfi_undefined 2592
+; WAVE64-NEXT:    .cfi_undefined 2593
+; WAVE64-NEXT:    .cfi_undefined 2594
+; WAVE64-NEXT:    .cfi_undefined 2595
+; WAVE64-NEXT:    .cfi_undefined 2596
+; WAVE64-NEXT:    .cfi_undefined 2597
+; WAVE64-NEXT:    .cfi_undefined 2598
+; WAVE64-NEXT:    .cfi_undefined 2599
+; WAVE64-NEXT:    .cfi_undefined 2608
+; WAVE64-NEXT:    .cfi_undefined 2609
+; WAVE64-NEXT:    .cfi_undefined 2610
+; WAVE64-NEXT:    .cfi_undefined 2611
+; WAVE64-NEXT:    .cfi_undefined 2612
+; WAVE64-NEXT:    .cfi_undefined 2613
+; WAVE64-NEXT:    .cfi_undefined 2614
+; WAVE64-NEXT:    .cfi_undefined 2615
+; WAVE64-NEXT:    .cfi_undefined 2624
+; WAVE64-NEXT:    .cfi_undefined 2625
+; WAVE64-NEXT:    .cfi_undefined 2626
+; WAVE64-NEXT:    .cfi_undefined 2627
+; WAVE64-NEXT:    .cfi_undefined 2628
+; WAVE64-NEXT:    .cfi_undefined 2629
+; WAVE64-NEXT:    .cfi_undefined 2630
+; WAVE64-NEXT:    .cfi_undefined 2631
+; WAVE64-NEXT:    .cfi_undefined 2640
+; WAVE64-NEXT:    .cfi_undefined 2641
+; WAVE64-NEXT:    .cfi_undefined 2642
+; WAVE64-NEXT:    .cfi_undefined 2643
+; WAVE64-NEXT:    .cfi_undefined 2644
+; WAVE64-NEXT:    .cfi_undefined 2645
+; WAVE64-NEXT:    .cfi_undefined 2646
+; WAVE64-NEXT:    .cfi_undefined 2647
+; WAVE64-NEXT:    .cfi_undefined 2656
+; WAVE64-NEXT:    .cfi_undefined 2657
+; WAVE64-NEXT:    .cfi_undefined 2658
+; WAVE64-NEXT:    .cfi_undefined 2659
+; WAVE64-NEXT:    .cfi_undefined 2660
+; WAVE64-NEXT:    .cfi_undefined 2661
+; WAVE64-NEXT:    .cfi_undefined 2662
+; WAVE64-NEXT:    .cfi_undefined 2663
+; WAVE64-NEXT:    .cfi_undefined 2672
+; WAVE64-NEXT:    .cfi_undefined 2673
+; WAVE64-NEXT:    .cfi_undefined 2674
+; WAVE64-NEXT:    .cfi_undefined 2675
+; WAVE64-NEXT:    .cfi_undefined 2676
+; WAVE64-NEXT:    .cfi_undefined 2677
+; WAVE64-NEXT:    .cfi_undefined 2678
+; WAVE64-NEXT:    .cfi_undefined 2679
+; WAVE64-NEXT:    .cfi_undefined 2688
+; WAVE64-NEXT:    .cfi_undefined 2689
+; WAVE64-NEXT:    .cfi_undefined 2690
+; WAVE64-NEXT:    .cfi_undefined 2691
+; WAVE64-NEXT:    .cfi_undefined 2692
+; WAVE64-NEXT:    .cfi_undefined 2693
+; WAVE64-NEXT:    .cfi_undefined 2694
+; WAVE64-NEXT:    .cfi_undefined 2695
+; WAVE64-NEXT:    .cfi_undefined 2704
+; WAVE64-NEXT:    .cfi_undefined 2705
+; WAVE64-NEXT:    .cfi_undefined 2706
+; WAVE64-NEXT:    .cfi_undefined 2707
+; WAVE64-NEXT:    .cfi_undefined 2708
+; WAVE64-NEXT:    .cfi_undefined 2709
+; WAVE64-NEXT:    .cfi_undefined 2710
+; WAVE64-NEXT:    .cfi_undefined 2711
+; WAVE64-NEXT:    .cfi_undefined 2720
+; WAVE64-NEXT:    .cfi_undefined 2721
+; WAVE64-NEXT:    .cfi_undefined 2722
+; WAVE64-NEXT:    .cfi_undefined 2723
+; WAVE64-NEXT:    .cfi_undefined 2724
+; WAVE64-NEXT:    .cfi_undefined 2725
+; WAVE64-NEXT:    .cfi_undefined 2726
+; WAVE64-NEXT:    .cfi_undefined 2727
+; WAVE64-NEXT:    .cfi_undefined 2736
+; WAVE64-NEXT:    .cfi_undefined 2737
+; WAVE64-NEXT:    .cfi_undefined 2738
+; WAVE64-NEXT:    .cfi_undefined 2739
+; WAVE64-NEXT:    .cfi_undefined 2740
+; WAVE64-NEXT:    .cfi_undefined 2741
+; WAVE64-NEXT:    .cfi_undefined 2742
+; WAVE64-NEXT:    .cfi_undefined 2743
+; WAVE64-NEXT:    .cfi_undefined 2752
+; WAVE64-NEXT:    .cfi_undefined 2753
+; WAVE64-NEXT:    .cfi_undefined 2754
+; WAVE64-NEXT:    .cfi_undefined 2755
+; WAVE64-NEXT:    .cfi_undefined 2756
+; WAVE64-NEXT:    .cfi_undefined 2757
+; WAVE64-NEXT:    .cfi_undefined 2758
+; WAVE64-NEXT:    .cfi_undefined 2759
+; WAVE64-NEXT:    .cfi_undefined 2768
+; WAVE64-NEXT:    .cfi_undefined 2769
+; WAVE64-NEXT:    .cfi_undefined 2770
+; WAVE64-NEXT:    .cfi_undefined 2771
+; WAVE64-NEXT:    .cfi_undefined 2772
+; WAVE64-NEXT:    .cfi_undefined 2773
+; WAVE64-NEXT:    .cfi_undefined 2774
+; WAVE64-NEXT:    .cfi_undefined 2775
+; WAVE64-NEXT:    .cfi_undefined 2784
+; WAVE64-NEXT:    .cfi_undefined 2785
+; WAVE64-NEXT:    .cfi_undefined 2786
+; WAVE64-NEXT:    .cfi_undefined 2787
+; WAVE64-NEXT:    .cfi_undefined 2788
+; WAVE64-NEXT:    .cfi_undefined 2789
+; WAVE64-NEXT:    .cfi_undefined 2790
+; WAVE64-NEXT:    .cfi_undefined 2791
+; WAVE64-NEXT:    .cfi_undefined 2800
+; WAVE64-NEXT:    .cfi_undefined 2801
+; WAVE64-NEXT:    .cfi_undefined 2802
+; WAVE64-NEXT:    .cfi_undefined 2803
+; WAVE64-NEXT:    .cfi_undefined 2804
+; WAVE64-NEXT:    .cfi_undefined 2805
+; WAVE64-NEXT:    .cfi_undefined 2806
+; WAVE64-NEXT:    .cfi_undefined 2807
+; WAVE64-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; WAVE64-NEXT:    v_mov_b32_e32 v0, exec_lo
+; WAVE64-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
+; WAVE64-NEXT:    v_mov_b32_e32 v0, exec_hi
+; WAVE64-NEXT:    buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_offset 17, 0
+; WAVE64-NEXT:    ;;#ASMSTART
+; WAVE64-NEXT:    ; clobber all VGPRs
+; WAVE64-NEXT:    ;;#ASMEND
+; WAVE64-NEXT:    s_waitcnt vmcnt(0)
+; WAVE64-NEXT:    s_setpc_b64 s[30:31]
+;
+; WAVE32-LABEL: caller_needs_to_spill_pc_to_memory:
+; WAVE32:       .Lfunc_begin6:
+; WAVE32-NEXT:    .cfi_startproc
+; WAVE32-NEXT:  ; %bb.0:
+; WAVE32-NEXT:    .cfi_llvm_def_aspace_cfa 64, 0, 6
+; WAVE32-NEXT:    .cfi_llvm_register_pair 16, 62, 32, 63, 32
+; WAVE32-NEXT:    .cfi_undefined 1536
+; WAVE32-NEXT:    .cfi_undefined 1537
+; WAVE32-NEXT:    .cfi_undefined 1538
+; WAVE32-NEXT:    .cfi_undefined 1539
+; WAVE32-NEXT:    .cfi_undefined 1540
+; WAVE32-NEXT:    .cfi_undefined 1541
+; WAVE32-NEXT:    .cfi_undefined 1542
+; WAVE32-NEXT:    .cfi_undefined 1543
+; WAVE32-NEXT:    .cfi_undefined 1544
+; WAVE32-NEXT:    .cfi_undefined 1545
+; WAVE32-NEXT:    .cfi_undefined 1546
+; WAVE32-NEXT:    .cfi_undefined 1547
+; WAVE32-NEXT:    .cfi_undefined 1548
+; WAVE32-NEXT:    .cfi_undefined 1549
+; WAVE32-NEXT:    .cfi_undefined 1550
+; WAVE32-NEXT:    .cfi_undefined 1551
+; WAVE32-NEXT:    .cfi_undefined 1552
+; WAVE32-NEXT:    .cfi_undefined 1553
+; WAVE32-NEXT:    .cfi_undefined 1554
+; WAVE32-NEXT:    .cfi_undefined 1555
+; WAVE32-NEXT:    .cfi_undefined 1556
+; WAVE32-NEXT:    .cfi_undefined 1557
+; WAVE32-NEXT:    .cfi_undefined 1558
+; WAVE32-NEXT:    .cfi_undefined 1559
+; WAVE32-NEXT:    .cfi_undefined 1560
+; WAVE32-NEXT:    .cfi_undefined 1561
+; WAVE32-NEXT:    .cfi_undefined 1562
+; WAVE32-NEXT:    .cfi_undefined 1563
+; WAVE32-NEXT:    .cfi_undefined 1564
+; WAVE32-NEXT:    .cfi_undefined 1565
+; WAVE32-NEXT:    .cfi_undefined 1566
+; WAVE32-NEXT:    .cfi_undefined 1567
+; WAVE32-NEXT:    .cfi_undefined 1568
+; WAVE32-NEXT:    .cfi_undefined 1569
+; WAVE32-NEXT:    .cfi_undefined 1570
+; WAVE32-NEXT:    .cfi_undefined 1571
+; WAVE32-NEXT:    .cfi_undefined 1572
+; WAVE32-NEXT:    .cfi_undefined 1573
+; WAVE32-NEXT:    .cfi_undefined 1574
+; WAVE32-NEXT:    .cfi_undefined 1575
+; WAVE32-NEXT:    .cfi_undefined 1584
+; WAVE32-NEXT:    .cfi_undefined 1585
+; WAVE32-NEXT:    .cfi_undefined 1586
+; WAVE32-NEXT:    .cfi_undefined 1587
+; WAVE32-NEXT:    .cfi_undefined 1588
+; WAVE32-NEXT:    .cfi_undefined 1589
+; WAVE32-NEXT:    .cfi_undefined 1590
+; WAVE32-NEXT:    .cfi_undefined 1591
+; WAVE32-NEXT:    .cfi_undefined 1600
+; WAVE32-NEXT:    .cfi_undefined 1601
+; WAVE32-NEXT:    .cfi_undefined 1602
+; WAVE32-NEXT:    .cfi_undefined 1603
+; WAVE32-NEXT:    .cfi_undefined 1604
+; WAVE32-NEXT:    .cfi_undefined 1605
+; WAVE32-NEXT:    .cfi_undefined 1606
+; WAVE32-NEXT:    .cfi_undefined 1607
+; WAVE32-NEXT:    .cfi_undefined 1616
+; WAVE32-NEXT:    .cfi_undefined 1617
+; WAVE32-NEXT:    .cfi_undefined 1618
+; WAVE32-NEXT:    .cfi_undefined 1619
+; WAVE32-NEXT:    .cfi_undefined 1620
+; WAVE32-NEXT:    .cfi_undefined 1621
+; WAVE32-NEXT:    .cfi_undefined 1622
+; WAVE32-NEXT:    .cfi_undefined 1623
+; WAVE32-NEXT:    .cfi_undefined 1632
+; WAVE32-NEXT:    .cfi_undefined 1633
+; WAVE32-NEXT:    .cfi_undefined 1634
+; WAVE32-NEXT:    .cfi_undefined 1635
+; WAVE32-NEXT:    .cfi_undefined 1636
+; WAVE32-NEXT:    .cfi_undefined 1637
+; WAVE32-NEXT:    .cfi_undefined 1638
+; WAVE32-NEXT:    .cfi_undefined 1639
+; WAVE32-NEXT:    .cfi_undefined 1648
+; WAVE32-NEXT:    .cfi_undefined 1649
+; WAVE32-NEXT:    .cfi_undefined 1650
+; WAVE32-NEXT:    .cfi_undefined 1651
+; WAVE32-NEXT:    .cfi_undefined 1652
+; WAVE32-NEXT:    .cfi_undefined 1653
+; WAVE32-NEXT:    .cfi_undefined 1654
+; WAVE32-NEXT:    .cfi_undefined 1655
+; WAVE32-NEXT:    .cfi_undefined 1664
+; WAVE32-NEXT:    .cfi_undefined 1665
+; WAVE32-NEXT:    .cfi_undefined 1666
+; WAVE32-NEXT:    .cfi_undefined 1667
+; WAVE32-NEXT:    .cfi_undefined 1668
+; WAVE32-NEXT:    .cfi_undefined 1669
+; WAVE32-NEXT:    .cfi_undefined 1670
+; WAVE32-NEXT:    .cfi_undefined 1671
+; WAVE32-NEXT:    .cfi_undefined 1680
+; WAVE32-NEXT:    .cfi_undefined 1681
+; WAVE32-NEXT:    .cfi_undefined 1682
+; WAVE32-NEXT:    .cfi_undefined 1683
+; WAVE32-NEXT:    .cfi_undefined 1684
+; WAVE32-NEXT:    .cfi_undefined 1685
+; WAVE32-NEXT:    .cfi_undefined 1686
+; WAVE32-NEXT:    .cfi_undefined 1687
+; WAVE32-NEXT:    .cfi_undefined 1696
+; WAVE32-NEXT:    .cfi_undefined 1697
+; WAVE32-NEXT:    .cfi_undefined 1698
+; WAVE32-NEXT:    .cfi_undefined 1699
+; WAVE32-NEXT:    .cfi_undefined 1700
+; WAVE32-NEXT:    .cfi_undefined 1701
+; WAVE32-NEXT:    .cfi_undefined 1702
+; WAVE32-NEXT:    .cfi_undefined 1703
+; WAVE32-NEXT:    .cfi_undefined 1712
+; WAVE32-NEXT:    .cfi_undefined 1713
+; WAVE32-NEXT:    .cfi_undefined 1714
+; WAVE32-NEXT:    .cfi_undefined 1715
+; WAVE32-NEXT:    .cfi_undefined 1716
+; WAVE32-NEXT:    .cfi_undefined 1717
+; WAVE32-NEXT:    .cfi_undefined 1718
+; WAVE32-NEXT:    .cfi_undefined 1719
+; WAVE32-NEXT:    .cfi_undefined 1728
+; WAVE32-NEXT:    .cfi_undefined 1729
+; WAVE32-NEXT:    .cfi_undefined 1730
+; WAVE32-NEXT:    .cfi_undefined 1731
+; WAVE32-NEXT:    .cfi_undefined 1732
+; WAVE32-NEXT:    .cfi_undefined 1733
+; WAVE32-NEXT:    .cfi_undefined 1734
+; WAVE32-NEXT:    .cfi_undefined 1735
+; WAVE32-NEXT:    .cfi_undefined 1744
+; WAVE32-NEXT:    .cfi_undefined 1745
+; WAVE32-NEXT:    .cfi_undefined 1746
+; WAVE32-NEXT:    .cfi_undefined 1747
+; WAVE32-NEXT:    .cfi_undefined 1748
+; WAVE32-NEXT:    .cfi_undefined 1749
+; WAVE32-NEXT:    .cfi_undefined 1750
+; WAVE32-NEXT:    .cfi_undefined 1751
+; WAVE32-NEXT:    .cfi_undefined 1760
+; WAVE32-NEXT:    .cfi_undefined 1761
+; WAVE32-NEXT:    .cfi_undefined 1762
+; WAVE32-NEXT:    .cfi_undefined 1763
+; WAVE32-NEXT:    .cfi_undefined 1764
+; WAVE32-NEXT:    .cfi_undefined 1765
+; WAVE32-NEXT:    .cfi_undefined 1766
+; WAVE32-NEXT:    .cfi_undefined 1767
+; WAVE32-NEXT:    .cfi_undefined 1776
+; WAVE32-NEXT:    .cfi_undefined 1777
+; WAVE32-NEXT:    .cfi_undefined 1778
+; WAVE32-NEXT:    .cfi_undefined 1779
+; WAVE32-NEXT:    .cfi_undefined 1780
+; WAVE32-NEXT:    .cfi_undefined 1781
+; WAVE32-NEXT:    .cfi_undefined 1782
+; WAVE32-NEXT:    .cfi_undefined 1783
+; WAVE32-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; WAVE32-NEXT:    v_mov_b32_e32 v0, exec_lo
+; WAVE32-NEXT:    buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_offset 1, 0
+; WAVE32-NEXT:    ;;#ASMSTART
+; WAVE32-NEXT:    ; clobber all VGPRs
+; WAVE32-NEXT:    ;;#ASMEND
+; WAVE32-NEXT:    s_setpc_b64 s[30:31]
+  call void asm sideeffect "; clobber all VGPRs", ; The constraint string that follows marks v0 through v255 as clobbered.
+  "~{v0},~{v1},~{v2},~{v3},~{v4},~{v5},~{v6},~{v7},~{v8},~{v9}
+  ,~{v10},~{v11},~{v12},~{v13},~{v14},~{v15},~{v16},~{v17},~{v18},~{v19}
+  ,~{v20},~{v21},~{v22},~{v23},~{v24},~{v25},~{v26},~{v27},~{v28},~{v29}
+  ,~{v30},~{v31},~{v32},~{v33},~{v34},~{v35},~{v36},~{v37},~{v38},~{v39}
+  ,~{v40},~{v41},~{v42},~{v43},~{v44},~{v45},~{v46},~{v47},~{v48},~{v49}
+  ,~{v50},~{v51},~{v52},~{v53},~{v54},~{v55},~{v56},~{v57},~{v58},~{v59}
+  ,~{v60},~{v61},~{v62},~{v63},~{v64},~{v65},~{v66},~{v67},~{v68},~{v69}
+  ,~{v70},~{v71},~{v72},~{v73},~{v74},~{v75},~{v76},~{v77},~{v78},~{v79}
+  ,~{v80},~{v81},~{v82},~{v83},~{v84},~{v85},~{v86},~{v87},~{v88},~{v89}
+  ,~{v90},~{v91},~{v92},~{v93},~{v94},~{v95},~{v96},~{v97},~{v98},~{v99}
+  ,~{v100},~{v101},~{v102},~{v103},~{v104},~{v105},~{v106},~{v107},~{v108},~{v109}
+  ,~{v110},~{v111},~{v112},~{v113},~{v114},~{v115},~{v116},~{v117},~{v118},~{v119}
+  ,~{v120},~{v121},~{v122},~{v123},~{v124},~{v125},~{v126},~{v127},~{v128},~{v129}
+  ,~{v130},~{v131},~{v132},~{v133},~{v134},~{v135},~{v136},~{v137},~{v138},~{v139}
+  ,~{v140},~{v141},~{v142},~{v143},~{v144},~{v145},~{v146},~{v147},~{v148},~{v149}
+  ,~{v150},~{v151},~{v152},~{v153},~{v154},~{v155},~{v156},~{v157},~{v158},~{v159}
+  ,~{v160},~{v161},~{v162},~{v163},~{v164},~{v165},~{v166},~{v167},~{v168},~{v169}
+  ,~{v170},~{v171},~{v172},~{v173},~{v174},~{v175},~{v176},~{v177},~{v178},~{v179}
+  ,~{v180},~{v181},~{v182},~{v183},~{v184},~{v185},~{v186},~{v187},~{v188},~{v189}
+  ,~{v190},~{v191},~{v192},~{v193},~{v194},~{v195},~{v196},~{v197},~{v198},~{v199}
+  ,~{v200},~{v201},~{v202},~{v203},~{v204},~{v205},~{v206},~{v207},~{v208},~{v209}
+  ,~{v210},~{v211},~{v212},~{v213},~{v214},~{v215},~{v216},~{v217},~{v218},~{v219}
+  ,~{v220},~{v221},~{v222},~{v223},~{v224},~{v225},~{v226},~{v227},~{v228},~{v229}
+  ,~{v230},~{v231},~{v232},~{v233},~{v234},~{v235},~{v236},~{v237},~{v238},~{v239}
+  ,~{v240},~{v241},~{v242},~{v243},~{v244},~{v245},~{v246},~{v247},~{v248},~{v249}
+  ,~{v250},~{v251},~{v252},~{v253},~{v254},~{v255}" () #3
+  ret void ; The checks contain no store of s[30:31] in this function; presumably the name describes what its caller has to do -- TODO confirm.
+}
+
+define void @need_to_spill_pc_to_mem() #3 {
+; WAVE64-LABEL: need_to_spill_pc_to_mem:
+; WAVE64:       .Lfunc_begin7:
+; WAVE64-NEXT:    .cfi_startproc
+; WAVE64-NEXT:  ; %bb.0:
+; WAVE64-NEXT:    .cfi_llvm_def_aspace_cfa 64, 0, 6
+; WAVE64-NEXT:    .cfi_llvm_register_pair 16, 62, 32, 63, 32
+; WAVE64-NEXT:    .cfi_undefined 2560
+; WAVE64-NEXT:    .cfi_undefined 2561
+; WAVE64-NEXT:    .cfi_undefined 2562
+; WAVE64-NEXT:    .cfi_undefined 2563
+; WAVE64-NEXT:    .cfi_undefined 2564
+; WAVE64-NEXT:    .cfi_undefined 2565
+; WAVE64-NEXT:    .cfi_undefined 2566
+; WAVE64-NEXT:    .cfi_undefined 2567
+; WAVE64-NEXT:    .cfi_undefined 2568
+; WAVE64-NEXT:    .cfi_undefined 2569
+; WAVE64-NEXT:    .cfi_undefined 2570
+; WAVE64-NEXT:    .cfi_undefined 2571
+; WAVE64-NEXT:    .cfi_undefined 2572
+; WAVE64-NEXT:    .cfi_undefined 2573
+; WAVE64-NEXT:    .cfi_undefined 2574
+; WAVE64-NEXT:    .cfi_undefined 2575
+; WAVE64-NEXT:    .cfi_undefined 2576
+; WAVE64-NEXT:    .cfi_undefined 2577
+; WAVE64-NEXT:    .cfi_undefined 2578
+; WAVE64-NEXT:    .cfi_undefined 2579
+; WAVE64-NEXT:    .cfi_undefined 2580
+; WAVE64-NEXT:    .cfi_undefined 2581
+; WAVE64-NEXT:    .cfi_undefined 2582
+; WAVE64-NEXT:    .cfi_undefined 2583
+; WAVE64-NEXT:    .cfi_undefined 2584
+; WAVE64-NEXT:    .cfi_undefined 2585
+; WAVE64-NEXT:    .cfi_undefined 2586
+; WAVE64-NEXT:    .cfi_undefined 2587
+; WAVE64-NEXT:    .cfi_undefined 2588
+; WAVE64-NEXT:    .cfi_undefined 2589
+; WAVE64-NEXT:    .cfi_undefined 2590
+; WAVE64-NEXT:    .cfi_undefined 2591
+; WAVE64-NEXT:    .cfi_undefined 2592
+; WAVE64-NEXT:    .cfi_undefined 2593
+; WAVE64-NEXT:    .cfi_undefined 2594
+; WAVE64-NEXT:    .cfi_undefined 2595
+; WAVE64-NEXT:    .cfi_undefined 2596
+; WAVE64-NEXT:    .cfi_undefined 2597
+; WAVE64-NEXT:    .cfi_undefined 2598
+; WAVE64-NEXT:    .cfi_undefined 2599
+; WAVE64-NEXT:    .cfi_undefined 2608
+; WAVE64-NEXT:    .cfi_undefined 2609
+; WAVE64-NEXT:    .cfi_undefined 2610
+; WAVE64-NEXT:    .cfi_undefined 2611
+; WAVE64-NEXT:    .cfi_undefined 2612
+; WAVE64-NEXT:    .cfi_undefined 2613
+; WAVE64-NEXT:    .cfi_undefined 2614
+; WAVE64-NEXT:    .cfi_undefined 2615
+; WAVE64-NEXT:    .cfi_undefined 2624
+; WAVE64-NEXT:    .cfi_undefined 2625
+; WAVE64-NEXT:    .cfi_undefined 2626
+; WAVE64-NEXT:    .cfi_undefined 2627
+; WAVE64-NEXT:    .cfi_undefined 2628
+; WAVE64-NEXT:    .cfi_undefined 2629
+; WAVE64-NEXT:    .cfi_undefined 2630
+; WAVE64-NEXT:    .cfi_undefined 2631
+; WAVE64-NEXT:    .cfi_undefined 2640
+; WAVE64-NEXT:    .cfi_undefined 2641
+; WAVE64-NEXT:    .cfi_undefined 2642
+; WAVE64-NEXT:    .cfi_undefined 2643
+; WAVE64-NEXT:    .cfi_undefined 2644
+; WAVE64-NEXT:    .cfi_undefined 2645
+; WAVE64-NEXT:    .cfi_undefined 2646
+; WAVE64-NEXT:    .cfi_undefined 2647
+; WAVE64-NEXT:    .cfi_undefined 2656
+; WAVE64-NEXT:    .cfi_undefined 2657
+; WAVE64-NEXT:    .cfi_undefined 2658
+; WAVE64-NEXT:    .cfi_undefined 2659
+; WAVE64-NEXT:    .cfi_undefined 2660
+; WAVE64-NEXT:    .cfi_undefined 2661
+; WAVE64-NEXT:    .cfi_undefined 2662
+; WAVE64-NEXT:    .cfi_undefined 2663
+; WAVE64-NEXT:    .cfi_undefined 2672
+; WAVE64-NEXT:    .cfi_undefined 2673
+; WAVE64-NEXT:    .cfi_undefined 2674
+; WAVE64-NEXT:    .cfi_undefined 2675
+; WAVE64-NEXT:    .cfi_undefined 2676
+; WAVE64-NEXT:    .cfi_undefined 2677
+; WAVE64-NEXT:    .cfi_undefined 2678
+; WAVE64-NEXT:    .cfi_undefined 2679
+; WAVE64-NEXT:    .cfi_undefined 2688
+; WAVE64-NEXT:    .cfi_undefined 2689
+; WAVE64-NEXT:    .cfi_undefined 2690
+; WAVE64-NEXT:    .cfi_undefined 2691
+; WAVE64-NEXT:    .cfi_undefined 2692
+; WAVE64-NEXT:    .cfi_undefined 2693
+; WAVE64-NEXT:    .cfi_undefined 2694
+; WAVE64-NEXT:    .cfi_undefined 2695
+; WAVE64-NEXT:    .cfi_undefined 2704
+; WAVE64-NEXT:    .cfi_undefined 2705
+; WAVE64-NEXT:    .cfi_undefined 2706
+; WAVE64-NEXT:    .cfi_undefined 2707
+; WAVE64-NEXT:    .cfi_undefined 2708
+; WAVE64-NEXT:    .cfi_undefined 2709
+; WAVE64-NEXT:    .cfi_undefined 2710
+; WAVE64-NEXT:    .cfi_undefined 2711
+; WAVE64-NEXT:    .cfi_undefined 2720
+; WAVE64-NEXT:    .cfi_undefined 2721
+; WAVE64-NEXT:    .cfi_undefined 2722
+; WAVE64-NEXT:    .cfi_undefined 2723
+; WAVE64-NEXT:    .cfi_undefined 2724
+; WAVE64-NEXT:    .cfi_undefined 2725
+; WAVE64-NEXT:    .cfi_undefined 2726
+; WAVE64-NEXT:    .cfi_undefined 2727
+; WAVE64-NEXT:    .cfi_undefined 2736
+; WAVE64-NEXT:    .cfi_undefined 2737
+; WAVE64-NEXT:    .cfi_undefined 2738
+; WAVE64-NEXT:    .cfi_undefined 2739
+; WAVE64-NEXT:    .cfi_undefined 2740
+; WAVE64-NEXT:    .cfi_undefined 2741
+; WAVE64-NEXT:    .cfi_undefined 2742
+; WAVE64-NEXT:    .cfi_undefined 2743
+; WAVE64-NEXT:    .cfi_undefined 2752
+; WAVE64-NEXT:    .cfi_undefined 2753
+; WAVE64-NEXT:    .cfi_undefined 2754
+; WAVE64-NEXT:    .cfi_undefined 2755
+; WAVE64-NEXT:    .cfi_undefined 2756
+; WAVE64-NEXT:    .cfi_undefined 2757
+; WAVE64-NEXT:    .cfi_undefined 2758
+; WAVE64-NEXT:    .cfi_undefined 2759
+; WAVE64-NEXT:    .cfi_undefined 2768
+; WAVE64-NEXT:    .cfi_undefined 2769
+; WAVE64-NEXT:    .cfi_undefined 2770
+; WAVE64-NEXT:    .cfi_undefined 2771
+; WAVE64-NEXT:    .cfi_undefined 2772
+; WAVE64-NEXT:    .cfi_undefined 2773
+; WAVE64-NEXT:    .cfi_undefined 2774
+; WAVE64-NEXT:    .cfi_undefined 2775
+; WAVE64-NEXT:    .cfi_undefined 2784
+; WAVE64-NEXT:    .cfi_undefined 2785
+; WAVE64-NEXT:    .cfi_undefined 2786
+; WAVE64-NEXT:    .cfi_undefined 2787
+; WAVE64-NEXT:    .cfi_undefined 2788
+; WAVE64-NEXT:    .cfi_undefined 2789
+; WAVE64-NEXT:    .cfi_undefined 2790
+; WAVE64-NEXT:    .cfi_undefined 2791
+; WAVE64-NEXT:    .cfi_undefined 2800
+; WAVE64-NEXT:    .cfi_undefined 2801
+; WAVE64-NEXT:    .cfi_undefined 2802
+; WAVE64-NEXT:    .cfi_undefined 2803
+; WAVE64-NEXT:    .cfi_undefined 2804
+; WAVE64-NEXT:    .cfi_undefined 2805
+; WAVE64-NEXT:    .cfi_undefined 2806
+; WAVE64-NEXT:    .cfi_undefined 2807
+; WAVE64-NEXT:    .cfi_undefined 48
+; WAVE64-NEXT:    .cfi_undefined 49
+; WAVE64-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; WAVE64-NEXT:    s_mov_b32 s18, s33
+; WAVE64-NEXT:    .cfi_register 65, 50
+; WAVE64-NEXT:    s_mov_b32 s33, s32
+; WAVE64-NEXT:    v_mov_b32_e32 v0, exec_lo
+; WAVE64-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:456 ; 4-byte Folded Spill
+; WAVE64-NEXT:    v_mov_b32_e32 v0, exec_hi
+; WAVE64-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:460 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_offset 17, 29184
+; WAVE64-NEXT:    .cfi_def_cfa_register 65
+; WAVE64-NEXT:    s_addk_i32 s32, 0x7800
+; WAVE64-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:444 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2600, 32, 17, 64, 28416
+; WAVE64-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:440 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2601, 32, 17, 64, 28160
+; WAVE64-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:436 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2602, 32, 17, 64, 27904
+; WAVE64-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:432 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2603, 32, 17, 64, 27648
+; WAVE64-NEXT:    buffer_store_dword v44, off, s[0:3], s33 offset:428 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2604, 32, 17, 64, 27392
+; WAVE64-NEXT:    buffer_store_dword v45, off, s[0:3], s33 offset:424 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2605, 32, 17, 64, 27136
+; WAVE64-NEXT:    buffer_store_dword v46, off, s[0:3], s33 offset:420 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2606, 32, 17, 64, 26880
+; WAVE64-NEXT:    buffer_store_dword v47, off, s[0:3], s33 offset:416 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2607, 32, 17, 64, 26624
+; WAVE64-NEXT:    buffer_store_dword v56, off, s[0:3], s33 offset:412 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2616, 32, 17, 64, 26368
+; WAVE64-NEXT:    buffer_store_dword v57, off, s[0:3], s33 offset:408 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2617, 32, 17, 64, 26112
+; WAVE64-NEXT:    buffer_store_dword v58, off, s[0:3], s33 offset:404 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2618, 32, 17, 64, 25856
+; WAVE64-NEXT:    buffer_store_dword v59, off, s[0:3], s33 offset:400 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2619, 32, 17, 64, 25600
+; WAVE64-NEXT:    buffer_store_dword v60, off, s[0:3], s33 offset:396 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2620, 32, 17, 64, 25344
+; WAVE64-NEXT:    buffer_store_dword v61, off, s[0:3], s33 offset:392 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2621, 32, 17, 64, 25088
+; WAVE64-NEXT:    buffer_store_dword v62, off, s[0:3], s33 offset:388 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2622, 32, 17, 64, 24832
+; WAVE64-NEXT:    buffer_store_dword v63, off, s[0:3], s33 offset:384 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2623, 32, 17, 64, 24576
+; WAVE64-NEXT:    buffer_store_dword v72, off, s[0:3], s33 offset:380 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2632, 32, 17, 64, 24320
+; WAVE64-NEXT:    buffer_store_dword v73, off, s[0:3], s33 offset:376 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2633, 32, 17, 64, 24064
+; WAVE64-NEXT:    buffer_store_dword v74, off, s[0:3], s33 offset:372 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2634, 32, 17, 64, 23808
+; WAVE64-NEXT:    buffer_store_dword v75, off, s[0:3], s33 offset:368 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2635, 32, 17, 64, 23552
+; WAVE64-NEXT:    buffer_store_dword v76, off, s[0:3], s33 offset:364 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2636, 32, 17, 64, 23296
+; WAVE64-NEXT:    buffer_store_dword v77, off, s[0:3], s33 offset:360 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2637, 32, 17, 64, 23040
+; WAVE64-NEXT:    buffer_store_dword v78, off, s[0:3], s33 offset:356 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2638, 32, 17, 64, 22784
+; WAVE64-NEXT:    buffer_store_dword v79, off, s[0:3], s33 offset:352 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2639, 32, 17, 64, 22528
+; WAVE64-NEXT:    buffer_store_dword v88, off, s[0:3], s33 offset:348 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2648, 32, 17, 64, 22272
+; WAVE64-NEXT:    buffer_store_dword v89, off, s[0:3], s33 offset:344 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2649, 32, 17, 64, 22016
+; WAVE64-NEXT:    buffer_store_dword v90, off, s[0:3], s33 offset:340 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2650, 32, 17, 64, 21760
+; WAVE64-NEXT:    buffer_store_dword v91, off, s[0:3], s33 offset:336 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2651, 32, 17, 64, 21504
+; WAVE64-NEXT:    buffer_store_dword v92, off, s[0:3], s33 offset:332 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2652, 32, 17, 64, 21248
+; WAVE64-NEXT:    buffer_store_dword v93, off, s[0:3], s33 offset:328 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2653, 32, 17, 64, 20992
+; WAVE64-NEXT:    buffer_store_dword v94, off, s[0:3], s33 offset:324 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2654, 32, 17, 64, 20736
+; WAVE64-NEXT:    buffer_store_dword v95, off, s[0:3], s33 offset:320 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2655, 32, 17, 64, 20480
+; WAVE64-NEXT:    buffer_store_dword v104, off, s[0:3], s33 offset:316 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2664, 32, 17, 64, 20224
+; WAVE64-NEXT:    buffer_store_dword v105, off, s[0:3], s33 offset:312 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2665, 32, 17, 64, 19968
+; WAVE64-NEXT:    buffer_store_dword v106, off, s[0:3], s33 offset:308 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2666, 32, 17, 64, 19712
+; WAVE64-NEXT:    buffer_store_dword v107, off, s[0:3], s33 offset:304 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2667, 32, 17, 64, 19456
+; WAVE64-NEXT:    buffer_store_dword v108, off, s[0:3], s33 offset:300 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2668, 32, 17, 64, 19200
+; WAVE64-NEXT:    buffer_store_dword v109, off, s[0:3], s33 offset:296 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2669, 32, 17, 64, 18944
+; WAVE64-NEXT:    buffer_store_dword v110, off, s[0:3], s33 offset:292 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2670, 32, 17, 64, 18688
+; WAVE64-NEXT:    buffer_store_dword v111, off, s[0:3], s33 offset:288 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2671, 32, 17, 64, 18432
+; WAVE64-NEXT:    buffer_store_dword v120, off, s[0:3], s33 offset:284 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2680, 32, 17, 64, 18176
+; WAVE64-NEXT:    buffer_store_dword v121, off, s[0:3], s33 offset:280 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2681, 32, 17, 64, 17920
+; WAVE64-NEXT:    buffer_store_dword v122, off, s[0:3], s33 offset:276 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2682, 32, 17, 64, 17664
+; WAVE64-NEXT:    buffer_store_dword v123, off, s[0:3], s33 offset:272 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2683, 32, 17, 64, 17408
+; WAVE64-NEXT:    buffer_store_dword v124, off, s[0:3], s33 offset:268 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2684, 32, 17, 64, 17152
+; WAVE64-NEXT:    buffer_store_dword v125, off, s[0:3], s33 offset:264 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2685, 32, 17, 64, 16896
+; WAVE64-NEXT:    buffer_store_dword v126, off, s[0:3], s33 offset:260 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2686, 32, 17, 64, 16640
+; WAVE64-NEXT:    buffer_store_dword v127, off, s[0:3], s33 offset:256 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2687, 32, 17, 64, 16384
+; WAVE64-NEXT:    buffer_store_dword v136, off, s[0:3], s33 offset:252 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2696, 32, 17, 64, 16128
+; WAVE64-NEXT:    buffer_store_dword v137, off, s[0:3], s33 offset:248 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2697, 32, 17, 64, 15872
+; WAVE64-NEXT:    buffer_store_dword v138, off, s[0:3], s33 offset:244 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2698, 32, 17, 64, 15616
+; WAVE64-NEXT:    buffer_store_dword v139, off, s[0:3], s33 offset:240 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2699, 32, 17, 64, 15360
+; WAVE64-NEXT:    buffer_store_dword v140, off, s[0:3], s33 offset:236 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2700, 32, 17, 64, 15104
+; WAVE64-NEXT:    buffer_store_dword v141, off, s[0:3], s33 offset:232 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2701, 32, 17, 64, 14848
+; WAVE64-NEXT:    buffer_store_dword v142, off, s[0:3], s33 offset:228 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2702, 32, 17, 64, 14592
+; WAVE64-NEXT:    buffer_store_dword v143, off, s[0:3], s33 offset:224 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2703, 32, 17, 64, 14336
+; WAVE64-NEXT:    buffer_store_dword v152, off, s[0:3], s33 offset:220 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2712, 32, 17, 64, 14080
+; WAVE64-NEXT:    buffer_store_dword v153, off, s[0:3], s33 offset:216 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2713, 32, 17, 64, 13824
+; WAVE64-NEXT:    buffer_store_dword v154, off, s[0:3], s33 offset:212 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2714, 32, 17, 64, 13568
+; WAVE64-NEXT:    buffer_store_dword v155, off, s[0:3], s33 offset:208 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2715, 32, 17, 64, 13312
+; WAVE64-NEXT:    buffer_store_dword v156, off, s[0:3], s33 offset:204 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2716, 32, 17, 64, 13056
+; WAVE64-NEXT:    buffer_store_dword v157, off, s[0:3], s33 offset:200 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2717, 32, 17, 64, 12800
+; WAVE64-NEXT:    buffer_store_dword v158, off, s[0:3], s33 offset:196 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2718, 32, 17, 64, 12544
+; WAVE64-NEXT:    buffer_store_dword v159, off, s[0:3], s33 offset:192 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2719, 32, 17, 64, 12288
+; WAVE64-NEXT:    buffer_store_dword v168, off, s[0:3], s33 offset:188 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2728, 32, 17, 64, 12032
+; WAVE64-NEXT:    buffer_store_dword v169, off, s[0:3], s33 offset:184 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2729, 32, 17, 64, 11776
+; WAVE64-NEXT:    buffer_store_dword v170, off, s[0:3], s33 offset:180 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2730, 32, 17, 64, 11520
+; WAVE64-NEXT:    buffer_store_dword v171, off, s[0:3], s33 offset:176 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2731, 32, 17, 64, 11264
+; WAVE64-NEXT:    buffer_store_dword v172, off, s[0:3], s33 offset:172 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2732, 32, 17, 64, 11008
+; WAVE64-NEXT:    buffer_store_dword v173, off, s[0:3], s33 offset:168 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2733, 32, 17, 64, 10752
+; WAVE64-NEXT:    buffer_store_dword v174, off, s[0:3], s33 offset:164 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2734, 32, 17, 64, 10496
+; WAVE64-NEXT:    buffer_store_dword v175, off, s[0:3], s33 offset:160 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2735, 32, 17, 64, 10240
+; WAVE64-NEXT:    buffer_store_dword v184, off, s[0:3], s33 offset:156 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2744, 32, 17, 64, 9984
+; WAVE64-NEXT:    buffer_store_dword v185, off, s[0:3], s33 offset:152 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2745, 32, 17, 64, 9728
+; WAVE64-NEXT:    buffer_store_dword v186, off, s[0:3], s33 offset:148 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2746, 32, 17, 64, 9472
+; WAVE64-NEXT:    buffer_store_dword v187, off, s[0:3], s33 offset:144 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2747, 32, 17, 64, 9216
+; WAVE64-NEXT:    buffer_store_dword v188, off, s[0:3], s33 offset:140 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2748, 32, 17, 64, 8960
+; WAVE64-NEXT:    buffer_store_dword v189, off, s[0:3], s33 offset:136 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2749, 32, 17, 64, 8704
+; WAVE64-NEXT:    buffer_store_dword v190, off, s[0:3], s33 offset:132 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2750, 32, 17, 64, 8448
+; WAVE64-NEXT:    buffer_store_dword v191, off, s[0:3], s33 offset:128 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2751, 32, 17, 64, 8192
+; WAVE64-NEXT:    buffer_store_dword v200, off, s[0:3], s33 offset:124 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2760, 32, 17, 64, 7936
+; WAVE64-NEXT:    buffer_store_dword v201, off, s[0:3], s33 offset:120 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2761, 32, 17, 64, 7680
+; WAVE64-NEXT:    buffer_store_dword v202, off, s[0:3], s33 offset:116 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2762, 32, 17, 64, 7424
+; WAVE64-NEXT:    buffer_store_dword v203, off, s[0:3], s33 offset:112 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2763, 32, 17, 64, 7168
+; WAVE64-NEXT:    buffer_store_dword v204, off, s[0:3], s33 offset:108 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2764, 32, 17, 64, 6912
+; WAVE64-NEXT:    buffer_store_dword v205, off, s[0:3], s33 offset:104 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2765, 32, 17, 64, 6656
+; WAVE64-NEXT:    buffer_store_dword v206, off, s[0:3], s33 offset:100 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2766, 32, 17, 64, 6400
+; WAVE64-NEXT:    buffer_store_dword v207, off, s[0:3], s33 offset:96 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2767, 32, 17, 64, 6144
+; WAVE64-NEXT:    buffer_store_dword v216, off, s[0:3], s33 offset:92 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2776, 32, 17, 64, 5888
+; WAVE64-NEXT:    buffer_store_dword v217, off, s[0:3], s33 offset:88 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2777, 32, 17, 64, 5632
+; WAVE64-NEXT:    buffer_store_dword v218, off, s[0:3], s33 offset:84 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2778, 32, 17, 64, 5376
+; WAVE64-NEXT:    buffer_store_dword v219, off, s[0:3], s33 offset:80 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2779, 32, 17, 64, 5120
+; WAVE64-NEXT:    buffer_store_dword v220, off, s[0:3], s33 offset:76 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2780, 32, 17, 64, 4864
+; WAVE64-NEXT:    buffer_store_dword v221, off, s[0:3], s33 offset:72 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2781, 32, 17, 64, 4608
+; WAVE64-NEXT:    buffer_store_dword v222, off, s[0:3], s33 offset:68 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2782, 32, 17, 64, 4352
+; WAVE64-NEXT:    buffer_store_dword v223, off, s[0:3], s33 offset:64 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2783, 32, 17, 64, 4096
+; WAVE64-NEXT:    buffer_store_dword v232, off, s[0:3], s33 offset:60 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2792, 32, 17, 64, 3840
+; WAVE64-NEXT:    buffer_store_dword v233, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2793, 32, 17, 64, 3584
+; WAVE64-NEXT:    buffer_store_dword v234, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2794, 32, 17, 64, 3328
+; WAVE64-NEXT:    buffer_store_dword v235, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2795, 32, 17, 64, 3072
+; WAVE64-NEXT:    buffer_store_dword v236, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2796, 32, 17, 64, 2816
+; WAVE64-NEXT:    buffer_store_dword v237, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2797, 32, 17, 64, 2560
+; WAVE64-NEXT:    buffer_store_dword v238, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2798, 32, 17, 64, 2304
+; WAVE64-NEXT:    buffer_store_dword v239, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2799, 32, 17, 64, 2048
+; WAVE64-NEXT:    buffer_store_dword v248, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2808, 32, 17, 64, 1792
+; WAVE64-NEXT:    buffer_store_dword v249, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2809, 32, 17, 64, 1536
+; WAVE64-NEXT:    buffer_store_dword v250, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2810, 32, 17, 64, 1280
+; WAVE64-NEXT:    buffer_store_dword v251, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2811, 32, 17, 64, 1024
+; WAVE64-NEXT:    buffer_store_dword v252, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2812, 32, 17, 64, 768
+; WAVE64-NEXT:    buffer_store_dword v253, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2813, 32, 17, 64, 512
+; WAVE64-NEXT:    buffer_store_dword v254, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2814, 32, 17, 64, 256
+; WAVE64-NEXT:    buffer_store_dword v255, off, s[0:3], s33 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_llvm_vector_offset 2815, 32, 17, 64, 0
+; WAVE64-NEXT:    s_mov_b64 s[16:17], exec
+; WAVE64-NEXT:    s_mov_b64 exec, 3
+; WAVE64-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:464
+; WAVE64-NEXT:    v_writelane_b32 v0, s30, 0
+; WAVE64-NEXT:    v_writelane_b32 v0, s31, 1
+; WAVE64-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:448 ; 4-byte Folded Spill
+; WAVE64-NEXT:    .cfi_offset 16, 28672
+; WAVE64-NEXT:    buffer_load_dword v0, off, s[0:3], s33 offset:464
+; WAVE64-NEXT:    s_waitcnt vmcnt(0)
+; WAVE64-NEXT:    s_mov_b64 exec, s[16:17]
+; WAVE64-NEXT:    s_getpc_b64 s[16:17]
+; WAVE64-NEXT:    s_add_u32 s16, s16, caller_needs_to_spill_pc_to_memory@rel32@lo+4
+; WAVE64-NEXT:    s_addc_u32 s17, s17, caller_needs_to_spill_pc_to_memory@rel32@hi+12
+; WAVE64-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; WAVE64-NEXT:    s_mov_b64 s[4:5], exec
+; WAVE64-NEXT:    s_mov_b64 exec, 3
+; WAVE64-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:464
+; WAVE64-NEXT:    buffer_load_dword v0, off, s[0:3], s33 offset:448 ; 4-byte Folded Reload
+; WAVE64-NEXT:    s_waitcnt vmcnt(0)
+; WAVE64-NEXT:    v_readlane_b32 s30, v0, 0
+; WAVE64-NEXT:    v_readlane_b32 s31, v0, 1
+; WAVE64-NEXT:    buffer_load_dword v0, off, s[0:3], s33 offset:464
+; WAVE64-NEXT:    s_waitcnt vmcnt(0)
+; WAVE64-NEXT:    s_mov_b64 exec, s[4:5]
+; WAVE64-NEXT:    buffer_load_dword v255, off, s[0:3], s33 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v254, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v253, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v252, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v251, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v250, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v249, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v248, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v239, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v238, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v237, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v236, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v235, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v234, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v233, off, s[0:3], s33 offset:56 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v232, off, s[0:3], s33 offset:60 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v223, off, s[0:3], s33 offset:64 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v222, off, s[0:3], s33 offset:68 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v221, off, s[0:3], s33 offset:72 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v220, off, s[0:3], s33 offset:76 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v219, off, s[0:3], s33 offset:80 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v218, off, s[0:3], s33 offset:84 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v217, off, s[0:3], s33 offset:88 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v216, off, s[0:3], s33 offset:92 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v207, off, s[0:3], s33 offset:96 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v206, off, s[0:3], s33 offset:100 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v205, off, s[0:3], s33 offset:104 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v204, off, s[0:3], s33 offset:108 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v203, off, s[0:3], s33 offset:112 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v202, off, s[0:3], s33 offset:116 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v201, off, s[0:3], s33 offset:120 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v200, off, s[0:3], s33 offset:124 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v191, off, s[0:3], s33 offset:128 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v190, off, s[0:3], s33 offset:132 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v189, off, s[0:3], s33 offset:136 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v188, off, s[0:3], s33 offset:140 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v187, off, s[0:3], s33 offset:144 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v186, off, s[0:3], s33 offset:148 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v185, off, s[0:3], s33 offset:152 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v184, off, s[0:3], s33 offset:156 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v175, off, s[0:3], s33 offset:160 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v174, off, s[0:3], s33 offset:164 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v173, off, s[0:3], s33 offset:168 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v172, off, s[0:3], s33 offset:172 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v171, off, s[0:3], s33 offset:176 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v170, off, s[0:3], s33 offset:180 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v169, off, s[0:3], s33 offset:184 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v168, off, s[0:3], s33 offset:188 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v159, off, s[0:3], s33 offset:192 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v158, off, s[0:3], s33 offset:196 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v157, off, s[0:3], s33 offset:200 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v156, off, s[0:3], s33 offset:204 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v155, off, s[0:3], s33 offset:208 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v154, off, s[0:3], s33 offset:212 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v153, off, s[0:3], s33 offset:216 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v152, off, s[0:3], s33 offset:220 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v143, off, s[0:3], s33 offset:224 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v142, off, s[0:3], s33 offset:228 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v141, off, s[0:3], s33 offset:232 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v140, off, s[0:3], s33 offset:236 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v139, off, s[0:3], s33 offset:240 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v138, off, s[0:3], s33 offset:244 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v137, off, s[0:3], s33 offset:248 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v136, off, s[0:3], s33 offset:252 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v127, off, s[0:3], s33 offset:256 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v126, off, s[0:3], s33 offset:260 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v125, off, s[0:3], s33 offset:264 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v124, off, s[0:3], s33 offset:268 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v123, off, s[0:3], s33 offset:272 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v122, off, s[0:3], s33 offset:276 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v121, off, s[0:3], s33 offset:280 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v120, off, s[0:3], s33 offset:284 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v111, off, s[0:3], s33 offset:288 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v110, off, s[0:3], s33 offset:292 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v109, off, s[0:3], s33 offset:296 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v108, off, s[0:3], s33 offset:300 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v107, off, s[0:3], s33 offset:304 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v106, off, s[0:3], s33 offset:308 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v105, off, s[0:3], s33 offset:312 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v104, off, s[0:3], s33 offset:316 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v95, off, s[0:3], s33 offset:320 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v94, off, s[0:3], s33 offset:324 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v93, off, s[0:3], s33 offset:328 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v92, off, s[0:3], s33 offset:332 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v91, off, s[0:3], s33 offset:336 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v90, off, s[0:3], s33 offset:340 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v89, off, s[0:3], s33 offset:344 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v88, off, s[0:3], s33 offset:348 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v79, off, s[0:3], s33 offset:352 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v78, off, s[0:3], s33 offset:356 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v77, off, s[0:3], s33 offset:360 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v76, off, s[0:3], s33 offset:364 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v75, off, s[0:3], s33 offset:368 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v74, off, s[0:3], s33 offset:372 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v73, off, s[0:3], s33 offset:376 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v72, off, s[0:3], s33 offset:380 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v63, off, s[0:3], s33 offset:384 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v62, off, s[0:3], s33 offset:388 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v61, off, s[0:3], s33 offset:392 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v60, off, s[0:3], s33 offset:396 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v59, off, s[0:3], s33 offset:400 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v58, off, s[0:3], s33 offset:404 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v57, off, s[0:3], s33 offset:408 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v56, off, s[0:3], s33 offset:412 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v47, off, s[0:3], s33 offset:416 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v46, off, s[0:3], s33 offset:420 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v45, off, s[0:3], s33 offset:424 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v44, off, s[0:3], s33 offset:428 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v43, off, s[0:3], s33 offset:432 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:436 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:440 ; 4-byte Folded Reload
+; WAVE64-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:444 ; 4-byte Folded Reload
+; WAVE64-NEXT:    s_mov_b32 s32, s33
+; WAVE64-NEXT:    .cfi_def_cfa_register 64
+; WAVE64-NEXT:    s_mov_b32 s33, s18
+; WAVE64-NEXT:    s_waitcnt vmcnt(0)
+; WAVE64-NEXT:    s_setpc_b64 s[30:31]
+;
+; WAVE32-LABEL: need_to_spill_pc_to_mem:
+; WAVE32:       .Lfunc_begin7:
+; WAVE32-NEXT:    .cfi_startproc
+; WAVE32-NEXT:  ; %bb.0:
+; WAVE32-NEXT:    .cfi_llvm_def_aspace_cfa 64, 0, 6
+; WAVE32-NEXT:    .cfi_llvm_register_pair 16, 62, 32, 63, 32
+; WAVE32-NEXT:    .cfi_undefined 1536
+; WAVE32-NEXT:    .cfi_undefined 1537
+; WAVE32-NEXT:    .cfi_undefined 1538
+; WAVE32-NEXT:    .cfi_undefined 1539
+; WAVE32-NEXT:    .cfi_undefined 1540
+; WAVE32-NEXT:    .cfi_undefined 1541
+; WAVE32-NEXT:    .cfi_undefined 1542
+; WAVE32-NEXT:    .cfi_undefined 1543
+; WAVE32-NEXT:    .cfi_undefined 1544
+; WAVE32-NEXT:    .cfi_undefined 1545
+; WAVE32-NEXT:    .cfi_undefined 1546
+; WAVE32-NEXT:    .cfi_undefined 1547
+; WAVE32-NEXT:    .cfi_undefined 1548
+; WAVE32-NEXT:    .cfi_undefined 1549
+; WAVE32-NEXT:    .cfi_undefined 1550
+; WAVE32-NEXT:    .cfi_undefined 1551
+; WAVE32-NEXT:    .cfi_undefined 1552
+; WAVE32-NEXT:    .cfi_undefined 1553
+; WAVE32-NEXT:    .cfi_undefined 1554
+; WAVE32-NEXT:    .cfi_undefined 1555
+; WAVE32-NEXT:    .cfi_undefined 1556
+; WAVE32-NEXT:    .cfi_undefined 1557
+; WAVE32-NEXT:    .cfi_undefined 1558
+; WAVE32-NEXT:    .cfi_undefined 1559
+; WAVE32-NEXT:    .cfi_undefined 1560
+; WAVE32-NEXT:    .cfi_undefined 1561
+; WAVE32-NEXT:    .cfi_undefined 1562
+; WAVE32-NEXT:    .cfi_undefined 1563
+; WAVE32-NEXT:    .cfi_undefined 1564
+; WAVE32-NEXT:    .cfi_undefined 1565
+; WAVE32-NEXT:    .cfi_undefined 1566
+; WAVE32-NEXT:    .cfi_undefined 1567
+; WAVE32-NEXT:    .cfi_undefined 1568
+; WAVE32-NEXT:    .cfi_undefined 1569
+; WAVE32-NEXT:    .cfi_undefined 1570
+; WAVE32-NEXT:    .cfi_undefined 1571
+; WAVE32-NEXT:    .cfi_undefined 1572
+; WAVE32-NEXT:    .cfi_undefined 1573
+; WAVE32-NEXT:    .cfi_undefined 1574
+; WAVE32-NEXT:    .cfi_undefined 1575
+; WAVE32-NEXT:    .cfi_undefined 1584
+; WAVE32-NEXT:    .cfi_undefined 1585
+; WAVE32-NEXT:    .cfi_undefined 1586
+; WAVE32-NEXT:    .cfi_undefined 1587
+; WAVE32-NEXT:    .cfi_undefined 1588
+; WAVE32-NEXT:    .cfi_undefined 1589
+; WAVE32-NEXT:    .cfi_undefined 1590
+; WAVE32-NEXT:    .cfi_undefined 1591
+; WAVE32-NEXT:    .cfi_undefined 1600
+; WAVE32-NEXT:    .cfi_undefined 1601
+; WAVE32-NEXT:    .cfi_undefined 1602
+; WAVE32-NEXT:    .cfi_undefined 1603
+; WAVE32-NEXT:    .cfi_undefined 1604
+; WAVE32-NEXT:    .cfi_undefined 1605
+; WAVE32-NEXT:    .cfi_undefined 1606
+; WAVE32-NEXT:    .cfi_undefined 1607
+; WAVE32-NEXT:    .cfi_undefined 1616
+; WAVE32-NEXT:    .cfi_undefined 1617
+; WAVE32-NEXT:    .cfi_undefined 1618
+; WAVE32-NEXT:    .cfi_undefined 1619
+; WAVE32-NEXT:    .cfi_undefined 1620
+; WAVE32-NEXT:    .cfi_undefined 1621
+; WAVE32-NEXT:    .cfi_undefined 1622
+; WAVE32-NEXT:    .cfi_undefined 1623
+; WAVE32-NEXT:    .cfi_undefined 1632
+; WAVE32-NEXT:    .cfi_undefined 1633
+; WAVE32-NEXT:    .cfi_undefined 1634
+; WAVE32-NEXT:    .cfi_undefined 1635
+; WAVE32-NEXT:    .cfi_undefined 1636
+; WAVE32-NEXT:    .cfi_undefined 1637
+; WAVE32-NEXT:    .cfi_undefined 1638
+; WAVE32-NEXT:    .cfi_undefined 1639
+; WAVE32-NEXT:    .cfi_undefined 1648
+; WAVE32-NEXT:    .cfi_undefined 1649
+; WAVE32-NEXT:    .cfi_undefined 1650
+; WAVE32-NEXT:    .cfi_undefined 1651
+; WAVE32-NEXT:    .cfi_undefined 1652
+; WAVE32-NEXT:    .cfi_undefined 1653
+; WAVE32-NEXT:    .cfi_undefined 1654
+; WAVE32-NEXT:    .cfi_undefined 1655
+; WAVE32-NEXT:    .cfi_undefined 1664
+; WAVE32-NEXT:    .cfi_undefined 1665
+; WAVE32-NEXT:    .cfi_undefined 1666
+; WAVE32-NEXT:    .cfi_undefined 1667
+; WAVE32-NEXT:    .cfi_undefined 1668
+; WAVE32-NEXT:    .cfi_undefined 1669
+; WAVE32-NEXT:    .cfi_undefined 1670
+; WAVE32-NEXT:    .cfi_undefined 1671
+; WAVE32-NEXT:    .cfi_undefined 1680
+; WAVE32-NEXT:    .cfi_undefined 1681
+; WAVE32-NEXT:    .cfi_undefined 1682
+; WAVE32-NEXT:    .cfi_undefined 1683
+; WAVE32-NEXT:    .cfi_undefined 1684
+; WAVE32-NEXT:    .cfi_undefined 1685
+; WAVE32-NEXT:    .cfi_undefined 1686
+; WAVE32-NEXT:    .cfi_undefined 1687
+; WAVE32-NEXT:    .cfi_undefined 1696
+; WAVE32-NEXT:    .cfi_undefined 1697
+; WAVE32-NEXT:    .cfi_undefined 1698
+; WAVE32-NEXT:    .cfi_undefined 1699
+; WAVE32-NEXT:    .cfi_undefined 1700
+; WAVE32-NEXT:    .cfi_undefined 1701
+; WAVE32-NEXT:    .cfi_undefined 1702
+; WAVE32-NEXT:    .cfi_undefined 1703
+; WAVE32-NEXT:    .cfi_undefined 1712
+; WAVE32-NEXT:    .cfi_undefined 1713
+; WAVE32-NEXT:    .cfi_undefined 1714
+; WAVE32-NEXT:    .cfi_undefined 1715
+; WAVE32-NEXT:    .cfi_undefined 1716
+; WAVE32-NEXT:    .cfi_undefined 1717
+; WAVE32-NEXT:    .cfi_undefined 1718
+; WAVE32-NEXT:    .cfi_undefined 1719
+; WAVE32-NEXT:    .cfi_undefined 1728
+; WAVE32-NEXT:    .cfi_undefined 1729
+; WAVE32-NEXT:    .cfi_undefined 1730
+; WAVE32-NEXT:    .cfi_undefined 1731
+; WAVE32-NEXT:    .cfi_undefined 1732
+; WAVE32-NEXT:    .cfi_undefined 1733
+; WAVE32-NEXT:    .cfi_undefined 1734
+; WAVE32-NEXT:    .cfi_undefined 1735
+; WAVE32-NEXT:    .cfi_undefined 1744
+; WAVE32-NEXT:    .cfi_undefined 1745
+; WAVE32-NEXT:    .cfi_undefined 1746
+; WAVE32-NEXT:    .cfi_undefined 1747
+; WAVE32-NEXT:    .cfi_undefined 1748
+; WAVE32-NEXT:    .cfi_undefined 1749
+; WAVE32-NEXT:    .cfi_undefined 1750
+; WAVE32-NEXT:    .cfi_undefined 1751
+; WAVE32-NEXT:    .cfi_undefined 1760
+; WAVE32-NEXT:    .cfi_undefined 1761
+; WAVE32-NEXT:    .cfi_undefined 1762
+; WAVE32-NEXT:    .cfi_undefined 1763
+; WAVE32-NEXT:    .cfi_undefined 1764
+; WAVE32-NEXT:    .cfi_undefined 1765
+; WAVE32-NEXT:    .cfi_undefined 1766
+; WAVE32-NEXT:    .cfi_undefined 1767
+; WAVE32-NEXT:    .cfi_undefined 1776
+; WAVE32-NEXT:    .cfi_undefined 1777
+; WAVE32-NEXT:    .cfi_undefined 1778
+; WAVE32-NEXT:    .cfi_undefined 1779
+; WAVE32-NEXT:    .cfi_undefined 1780
+; WAVE32-NEXT:    .cfi_undefined 1781
+; WAVE32-NEXT:    .cfi_undefined 1782
+; WAVE32-NEXT:    .cfi_undefined 1783
+; WAVE32-NEXT:    .cfi_undefined 48
+; WAVE32-NEXT:    .cfi_undefined 49
+; WAVE32-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; WAVE32-NEXT:    s_mov_b32 s18, s33
+; WAVE32-NEXT:    .cfi_register 65, 50
+; WAVE32-NEXT:    v_mov_b32_e32 v0, exec_lo
+; WAVE32-NEXT:    s_mov_b32 s33, s32
+; WAVE32-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:456 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_offset 1, 14592
+; WAVE32-NEXT:    .cfi_def_cfa_register 65
+; WAVE32-NEXT:    s_addk_i32 s32, 0x3a00
+; WAVE32-NEXT:    buffer_store_dword v40, off, s[0:3], s33 offset:444 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1576, 32, 1, 32, 14208
+; WAVE32-NEXT:    buffer_store_dword v41, off, s[0:3], s33 offset:440 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1577, 32, 1, 32, 14080
+; WAVE32-NEXT:    buffer_store_dword v42, off, s[0:3], s33 offset:436 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1578, 32, 1, 32, 13952
+; WAVE32-NEXT:    buffer_store_dword v43, off, s[0:3], s33 offset:432 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1579, 32, 1, 32, 13824
+; WAVE32-NEXT:    buffer_store_dword v44, off, s[0:3], s33 offset:428 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1580, 32, 1, 32, 13696
+; WAVE32-NEXT:    buffer_store_dword v45, off, s[0:3], s33 offset:424 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1581, 32, 1, 32, 13568
+; WAVE32-NEXT:    buffer_store_dword v46, off, s[0:3], s33 offset:420 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1582, 32, 1, 32, 13440
+; WAVE32-NEXT:    buffer_store_dword v47, off, s[0:3], s33 offset:416 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1583, 32, 1, 32, 13312
+; WAVE32-NEXT:    buffer_store_dword v56, off, s[0:3], s33 offset:412 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1592, 32, 1, 32, 13184
+; WAVE32-NEXT:    buffer_store_dword v57, off, s[0:3], s33 offset:408 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1593, 32, 1, 32, 13056
+; WAVE32-NEXT:    buffer_store_dword v58, off, s[0:3], s33 offset:404 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1594, 32, 1, 32, 12928
+; WAVE32-NEXT:    buffer_store_dword v59, off, s[0:3], s33 offset:400 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1595, 32, 1, 32, 12800
+; WAVE32-NEXT:    buffer_store_dword v60, off, s[0:3], s33 offset:396 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1596, 32, 1, 32, 12672
+; WAVE32-NEXT:    buffer_store_dword v61, off, s[0:3], s33 offset:392 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1597, 32, 1, 32, 12544
+; WAVE32-NEXT:    buffer_store_dword v62, off, s[0:3], s33 offset:388 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1598, 32, 1, 32, 12416
+; WAVE32-NEXT:    buffer_store_dword v63, off, s[0:3], s33 offset:384 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1599, 32, 1, 32, 12288
+; WAVE32-NEXT:    buffer_store_dword v72, off, s[0:3], s33 offset:380 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1608, 32, 1, 32, 12160
+; WAVE32-NEXT:    buffer_store_dword v73, off, s[0:3], s33 offset:376 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1609, 32, 1, 32, 12032
+; WAVE32-NEXT:    buffer_store_dword v74, off, s[0:3], s33 offset:372 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1610, 32, 1, 32, 11904
+; WAVE32-NEXT:    buffer_store_dword v75, off, s[0:3], s33 offset:368 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1611, 32, 1, 32, 11776
+; WAVE32-NEXT:    buffer_store_dword v76, off, s[0:3], s33 offset:364 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1612, 32, 1, 32, 11648
+; WAVE32-NEXT:    buffer_store_dword v77, off, s[0:3], s33 offset:360 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1613, 32, 1, 32, 11520
+; WAVE32-NEXT:    buffer_store_dword v78, off, s[0:3], s33 offset:356 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1614, 32, 1, 32, 11392
+; WAVE32-NEXT:    buffer_store_dword v79, off, s[0:3], s33 offset:352 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1615, 32, 1, 32, 11264
+; WAVE32-NEXT:    buffer_store_dword v88, off, s[0:3], s33 offset:348 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1624, 32, 1, 32, 11136
+; WAVE32-NEXT:    buffer_store_dword v89, off, s[0:3], s33 offset:344 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1625, 32, 1, 32, 11008
+; WAVE32-NEXT:    buffer_store_dword v90, off, s[0:3], s33 offset:340 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1626, 32, 1, 32, 10880
+; WAVE32-NEXT:    buffer_store_dword v91, off, s[0:3], s33 offset:336 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1627, 32, 1, 32, 10752
+; WAVE32-NEXT:    buffer_store_dword v92, off, s[0:3], s33 offset:332 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1628, 32, 1, 32, 10624
+; WAVE32-NEXT:    buffer_store_dword v93, off, s[0:3], s33 offset:328 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1629, 32, 1, 32, 10496
+; WAVE32-NEXT:    buffer_store_dword v94, off, s[0:3], s33 offset:324 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1630, 32, 1, 32, 10368
+; WAVE32-NEXT:    buffer_store_dword v95, off, s[0:3], s33 offset:320 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1631, 32, 1, 32, 10240
+; WAVE32-NEXT:    buffer_store_dword v104, off, s[0:3], s33 offset:316 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1640, 32, 1, 32, 10112
+; WAVE32-NEXT:    buffer_store_dword v105, off, s[0:3], s33 offset:312 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1641, 32, 1, 32, 9984
+; WAVE32-NEXT:    buffer_store_dword v106, off, s[0:3], s33 offset:308 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1642, 32, 1, 32, 9856
+; WAVE32-NEXT:    buffer_store_dword v107, off, s[0:3], s33 offset:304 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1643, 32, 1, 32, 9728
+; WAVE32-NEXT:    buffer_store_dword v108, off, s[0:3], s33 offset:300 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1644, 32, 1, 32, 9600
+; WAVE32-NEXT:    buffer_store_dword v109, off, s[0:3], s33 offset:296 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1645, 32, 1, 32, 9472
+; WAVE32-NEXT:    buffer_store_dword v110, off, s[0:3], s33 offset:292 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1646, 32, 1, 32, 9344
+; WAVE32-NEXT:    buffer_store_dword v111, off, s[0:3], s33 offset:288 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1647, 32, 1, 32, 9216
+; WAVE32-NEXT:    buffer_store_dword v120, off, s[0:3], s33 offset:284 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1656, 32, 1, 32, 9088
+; WAVE32-NEXT:    buffer_store_dword v121, off, s[0:3], s33 offset:280 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1657, 32, 1, 32, 8960
+; WAVE32-NEXT:    buffer_store_dword v122, off, s[0:3], s33 offset:276 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1658, 32, 1, 32, 8832
+; WAVE32-NEXT:    buffer_store_dword v123, off, s[0:3], s33 offset:272 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1659, 32, 1, 32, 8704
+; WAVE32-NEXT:    buffer_store_dword v124, off, s[0:3], s33 offset:268 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1660, 32, 1, 32, 8576
+; WAVE32-NEXT:    buffer_store_dword v125, off, s[0:3], s33 offset:264 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1661, 32, 1, 32, 8448
+; WAVE32-NEXT:    buffer_store_dword v126, off, s[0:3], s33 offset:260 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1662, 32, 1, 32, 8320
+; WAVE32-NEXT:    buffer_store_dword v127, off, s[0:3], s33 offset:256 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1663, 32, 1, 32, 8192
+; WAVE32-NEXT:    buffer_store_dword v136, off, s[0:3], s33 offset:252 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1672, 32, 1, 32, 8064
+; WAVE32-NEXT:    buffer_store_dword v137, off, s[0:3], s33 offset:248 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1673, 32, 1, 32, 7936
+; WAVE32-NEXT:    buffer_store_dword v138, off, s[0:3], s33 offset:244 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1674, 32, 1, 32, 7808
+; WAVE32-NEXT:    buffer_store_dword v139, off, s[0:3], s33 offset:240 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1675, 32, 1, 32, 7680
+; WAVE32-NEXT:    buffer_store_dword v140, off, s[0:3], s33 offset:236 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1676, 32, 1, 32, 7552
+; WAVE32-NEXT:    buffer_store_dword v141, off, s[0:3], s33 offset:232 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1677, 32, 1, 32, 7424
+; WAVE32-NEXT:    buffer_store_dword v142, off, s[0:3], s33 offset:228 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1678, 32, 1, 32, 7296
+; WAVE32-NEXT:    buffer_store_dword v143, off, s[0:3], s33 offset:224 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1679, 32, 1, 32, 7168
+; WAVE32-NEXT:    buffer_store_dword v152, off, s[0:3], s33 offset:220 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1688, 32, 1, 32, 7040
+; WAVE32-NEXT:    buffer_store_dword v153, off, s[0:3], s33 offset:216 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1689, 32, 1, 32, 6912
+; WAVE32-NEXT:    buffer_store_dword v154, off, s[0:3], s33 offset:212 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1690, 32, 1, 32, 6784
+; WAVE32-NEXT:    buffer_store_dword v155, off, s[0:3], s33 offset:208 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1691, 32, 1, 32, 6656
+; WAVE32-NEXT:    buffer_store_dword v156, off, s[0:3], s33 offset:204 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1692, 32, 1, 32, 6528
+; WAVE32-NEXT:    buffer_store_dword v157, off, s[0:3], s33 offset:200 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1693, 32, 1, 32, 6400
+; WAVE32-NEXT:    buffer_store_dword v158, off, s[0:3], s33 offset:196 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1694, 32, 1, 32, 6272
+; WAVE32-NEXT:    buffer_store_dword v159, off, s[0:3], s33 offset:192 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1695, 32, 1, 32, 6144
+; WAVE32-NEXT:    buffer_store_dword v168, off, s[0:3], s33 offset:188 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1704, 32, 1, 32, 6016
+; WAVE32-NEXT:    buffer_store_dword v169, off, s[0:3], s33 offset:184 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1705, 32, 1, 32, 5888
+; WAVE32-NEXT:    buffer_store_dword v170, off, s[0:3], s33 offset:180 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1706, 32, 1, 32, 5760
+; WAVE32-NEXT:    buffer_store_dword v171, off, s[0:3], s33 offset:176 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1707, 32, 1, 32, 5632
+; WAVE32-NEXT:    buffer_store_dword v172, off, s[0:3], s33 offset:172 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1708, 32, 1, 32, 5504
+; WAVE32-NEXT:    buffer_store_dword v173, off, s[0:3], s33 offset:168 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1709, 32, 1, 32, 5376
+; WAVE32-NEXT:    buffer_store_dword v174, off, s[0:3], s33 offset:164 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1710, 32, 1, 32, 5248
+; WAVE32-NEXT:    buffer_store_dword v175, off, s[0:3], s33 offset:160 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1711, 32, 1, 32, 5120
+; WAVE32-NEXT:    buffer_store_dword v184, off, s[0:3], s33 offset:156 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1720, 32, 1, 32, 4992
+; WAVE32-NEXT:    buffer_store_dword v185, off, s[0:3], s33 offset:152 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1721, 32, 1, 32, 4864
+; WAVE32-NEXT:    buffer_store_dword v186, off, s[0:3], s33 offset:148 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1722, 32, 1, 32, 4736
+; WAVE32-NEXT:    buffer_store_dword v187, off, s[0:3], s33 offset:144 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1723, 32, 1, 32, 4608
+; WAVE32-NEXT:    buffer_store_dword v188, off, s[0:3], s33 offset:140 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1724, 32, 1, 32, 4480
+; WAVE32-NEXT:    buffer_store_dword v189, off, s[0:3], s33 offset:136 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1725, 32, 1, 32, 4352
+; WAVE32-NEXT:    buffer_store_dword v190, off, s[0:3], s33 offset:132 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1726, 32, 1, 32, 4224
+; WAVE32-NEXT:    buffer_store_dword v191, off, s[0:3], s33 offset:128 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1727, 32, 1, 32, 4096
+; WAVE32-NEXT:    buffer_store_dword v200, off, s[0:3], s33 offset:124 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1736, 32, 1, 32, 3968
+; WAVE32-NEXT:    buffer_store_dword v201, off, s[0:3], s33 offset:120 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1737, 32, 1, 32, 3840
+; WAVE32-NEXT:    buffer_store_dword v202, off, s[0:3], s33 offset:116 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1738, 32, 1, 32, 3712
+; WAVE32-NEXT:    buffer_store_dword v203, off, s[0:3], s33 offset:112 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1739, 32, 1, 32, 3584
+; WAVE32-NEXT:    buffer_store_dword v204, off, s[0:3], s33 offset:108 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1740, 32, 1, 32, 3456
+; WAVE32-NEXT:    buffer_store_dword v205, off, s[0:3], s33 offset:104 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1741, 32, 1, 32, 3328
+; WAVE32-NEXT:    buffer_store_dword v206, off, s[0:3], s33 offset:100 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1742, 32, 1, 32, 3200
+; WAVE32-NEXT:    buffer_store_dword v207, off, s[0:3], s33 offset:96 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1743, 32, 1, 32, 3072
+; WAVE32-NEXT:    buffer_store_dword v216, off, s[0:3], s33 offset:92 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1752, 32, 1, 32, 2944
+; WAVE32-NEXT:    buffer_store_dword v217, off, s[0:3], s33 offset:88 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1753, 32, 1, 32, 2816
+; WAVE32-NEXT:    buffer_store_dword v218, off, s[0:3], s33 offset:84 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1754, 32, 1, 32, 2688
+; WAVE32-NEXT:    buffer_store_dword v219, off, s[0:3], s33 offset:80 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1755, 32, 1, 32, 2560
+; WAVE32-NEXT:    buffer_store_dword v220, off, s[0:3], s33 offset:76 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1756, 32, 1, 32, 2432
+; WAVE32-NEXT:    buffer_store_dword v221, off, s[0:3], s33 offset:72 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1757, 32, 1, 32, 2304
+; WAVE32-NEXT:    buffer_store_dword v222, off, s[0:3], s33 offset:68 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1758, 32, 1, 32, 2176
+; WAVE32-NEXT:    buffer_store_dword v223, off, s[0:3], s33 offset:64 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1759, 32, 1, 32, 2048
+; WAVE32-NEXT:    buffer_store_dword v232, off, s[0:3], s33 offset:60 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1768, 32, 1, 32, 1920
+; WAVE32-NEXT:    buffer_store_dword v233, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1769, 32, 1, 32, 1792
+; WAVE32-NEXT:    buffer_store_dword v234, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1770, 32, 1, 32, 1664
+; WAVE32-NEXT:    buffer_store_dword v235, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1771, 32, 1, 32, 1536
+; WAVE32-NEXT:    buffer_store_dword v236, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1772, 32, 1, 32, 1408
+; WAVE32-NEXT:    buffer_store_dword v237, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1773, 32, 1, 32, 1280
+; WAVE32-NEXT:    buffer_store_dword v238, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1774, 32, 1, 32, 1152
+; WAVE32-NEXT:    buffer_store_dword v239, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1775, 32, 1, 32, 1024
+; WAVE32-NEXT:    buffer_store_dword v248, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1784, 32, 1, 32, 896
+; WAVE32-NEXT:    buffer_store_dword v249, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1785, 32, 1, 32, 768
+; WAVE32-NEXT:    buffer_store_dword v250, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1786, 32, 1, 32, 640
+; WAVE32-NEXT:    buffer_store_dword v251, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1787, 32, 1, 32, 512
+; WAVE32-NEXT:    buffer_store_dword v252, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1788, 32, 1, 32, 384
+; WAVE32-NEXT:    buffer_store_dword v253, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1789, 32, 1, 32, 256
+; WAVE32-NEXT:    buffer_store_dword v254, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1790, 32, 1, 32, 128
+; WAVE32-NEXT:    buffer_store_dword v255, off, s[0:3], s33 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_llvm_vector_offset 1791, 32, 1, 32, 0
+; WAVE32-NEXT:    s_mov_b32 s16, exec_lo
+; WAVE32-NEXT:    s_waitcnt_depctr 0xffe3
+; WAVE32-NEXT:    s_mov_b32 exec_lo, 3
+; WAVE32-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:460
+; WAVE32-NEXT:    v_writelane_b32 v0, s30, 0
+; WAVE32-NEXT:    v_writelane_b32 v0, s31, 1
+; WAVE32-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:448 ; 4-byte Folded Spill
+; WAVE32-NEXT:    .cfi_offset 16, 14336
+; WAVE32-NEXT:    buffer_load_dword v0, off, s[0:3], s33 offset:460
+; WAVE32-NEXT:    s_waitcnt vmcnt(0)
+; WAVE32-NEXT:    s_waitcnt_depctr 0xffe3
+; WAVE32-NEXT:    s_mov_b32 exec_lo, s16
+; WAVE32-NEXT:    s_getpc_b64 s[16:17]
+; WAVE32-NEXT:    s_add_u32 s16, s16, caller_needs_to_spill_pc_to_memory@rel32@lo+4
+; WAVE32-NEXT:    s_addc_u32 s17, s17, caller_needs_to_spill_pc_to_memory@rel32@hi+12
+; WAVE32-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; WAVE32-NEXT:    s_mov_b32 s4, exec_lo
+; WAVE32-NEXT:    s_mov_b32 exec_lo, 3
+; WAVE32-NEXT:    buffer_store_dword v0, off, s[0:3], s33 offset:460
+; WAVE32-NEXT:    buffer_load_dword v0, off, s[0:3], s33 offset:448 ; 4-byte Folded Reload
+; WAVE32-NEXT:    s_waitcnt vmcnt(0)
+; WAVE32-NEXT:    v_readlane_b32 s30, v0, 0
+; WAVE32-NEXT:    v_readlane_b32 s31, v0, 1
+; WAVE32-NEXT:    buffer_load_dword v0, off, s[0:3], s33 offset:460
+; WAVE32-NEXT:    s_waitcnt vmcnt(0)
+; WAVE32-NEXT:    s_waitcnt_depctr 0xffe3
+; WAVE32-NEXT:    s_mov_b32 exec_lo, s4
+; WAVE32-NEXT:    s_clause 0x3e
+; WAVE32-NEXT:    buffer_load_dword v255, off, s[0:3], s33
+; WAVE32-NEXT:    buffer_load_dword v254, off, s[0:3], s33 offset:4
+; WAVE32-NEXT:    buffer_load_dword v253, off, s[0:3], s33 offset:8
+; WAVE32-NEXT:    buffer_load_dword v252, off, s[0:3], s33 offset:12
+; WAVE32-NEXT:    buffer_load_dword v251, off, s[0:3], s33 offset:16
+; WAVE32-NEXT:    buffer_load_dword v250, off, s[0:3], s33 offset:20
+; WAVE32-NEXT:    buffer_load_dword v249, off, s[0:3], s33 offset:24
+; WAVE32-NEXT:    buffer_load_dword v248, off, s[0:3], s33 offset:28
+; WAVE32-NEXT:    buffer_load_dword v239, off, s[0:3], s33 offset:32
+; WAVE32-NEXT:    buffer_load_dword v238, off, s[0:3], s33 offset:36
+; WAVE32-NEXT:    buffer_load_dword v237, off, s[0:3], s33 offset:40
+; WAVE32-NEXT:    buffer_load_dword v236, off, s[0:3], s33 offset:44
+; WAVE32-NEXT:    buffer_load_dword v235, off, s[0:3], s33 offset:48
+; WAVE32-NEXT:    buffer_load_dword v234, off, s[0:3], s33 offset:52
+; WAVE32-NEXT:    buffer_load_dword v233, off, s[0:3], s33 offset:56
+; WAVE32-NEXT:    buffer_load_dword v232, off, s[0:3], s33 offset:60
+; WAVE32-NEXT:    buffer_load_dword v223, off, s[0:3], s33 offset:64
+; WAVE32-NEXT:    buffer_load_dword v222, off, s[0:3], s33 offset:68
+; WAVE32-NEXT:    buffer_load_dword v221, off, s[0:3], s33 offset:72
+; WAVE32-NEXT:    buffer_load_dword v220, off, s[0:3], s33 offset:76
+; WAVE32-NEXT:    buffer_load_dword v219, off, s[0:3], s33 offset:80
+; WAVE32-NEXT:    buffer_load_dword v218, off, s[0:3], s33 offset:84
+; WAVE32-NEXT:    buffer_load_dword v217, off, s[0:3], s33 offset:88
+; WAVE32-NEXT:    buffer_load_dword v216, off, s[0:3], s33 offset:92
+; WAVE32-NEXT:    buffer_load_dword v207, off, s[0:3], s33 offset:96
+; WAVE32-NEXT:    buffer_load_dword v206, off, s[0:3], s33 offset:100
+; WAVE32-NEXT:    buffer_load_dword v205, off, s[0:3], s33 offset:104
+; WAVE32-NEXT:    buffer_load_dword v204, off, s[0:3], s33 offset:108
+; WAVE32-NEXT:    buffer_load_dword v203, off, s[0:3], s33 offset:112
+; WAVE32-NEXT:    buffer_load_dword v202, off, s[0:3], s33 offset:116
+; WAVE32-NEXT:    buffer_load_dword v201, off, s[0:3], s33 offset:120
+; WAVE32-NEXT:    buffer_load_dword v200, off, s[0:3], s33 offset:124
+; WAVE32-NEXT:    buffer_load_dword v191, off, s[0:3], s33 offset:128
+; WAVE32-NEXT:    buffer_load_dword v190, off, s[0:3], s33 offset:132
+; WAVE32-NEXT:    buffer_load_dword v189, off, s[0:3], s33 offset:136
+; WAVE32-NEXT:    buffer_load_dword v188, off, s[0:3], s33 offset:140
+; WAVE32-NEXT:    buffer_load_dword v187, off, s[0:3], s33 offset:144
+; WAVE32-NEXT:    buffer_load_dword v186, off, s[0:3], s33 offset:148
+; WAVE32-NEXT:    buffer_load_dword v185, off, s[0:3], s33 offset:152
+; WAVE32-NEXT:    buffer_load_dword v184, off, s[0:3], s33 offset:156
+; WAVE32-NEXT:    buffer_load_dword v175, off, s[0:3], s33 offset:160
+; WAVE32-NEXT:    buffer_load_dword v174, off, s[0:3], s33 offset:164
+; WAVE32-NEXT:    buffer_load_dword v173, off, s[0:3], s33 offset:168
+; WAVE32-NEXT:    buffer_load_dword v172, off, s[0:3], s33 offset:172
+; WAVE32-NEXT:    buffer_load_dword v171, off, s[0:3], s33 offset:176
+; WAVE32-NEXT:    buffer_load_dword v170, off, s[0:3], s33 offset:180
+; WAVE32-NEXT:    buffer_load_dword v169, off, s[0:3], s33 offset:184
+; WAVE32-NEXT:    buffer_load_dword v168, off, s[0:3], s33 offset:188
+; WAVE32-NEXT:    buffer_load_dword v159, off, s[0:3], s33 offset:192
+; WAVE32-NEXT:    buffer_load_dword v158, off, s[0:3], s33 offset:196
+; WAVE32-NEXT:    buffer_load_dword v157, off, s[0:3], s33 offset:200
+; WAVE32-NEXT:    buffer_load_dword v156, off, s[0:3], s33 offset:204
+; WAVE32-NEXT:    buffer_load_dword v155, off, s[0:3], s33 offset:208
+; WAVE32-NEXT:    buffer_load_dword v154, off, s[0:3], s33 offset:212
+; WAVE32-NEXT:    buffer_load_dword v153, off, s[0:3], s33 offset:216
+; WAVE32-NEXT:    buffer_load_dword v152, off, s[0:3], s33 offset:220
+; WAVE32-NEXT:    buffer_load_dword v143, off, s[0:3], s33 offset:224
+; WAVE32-NEXT:    buffer_load_dword v142, off, s[0:3], s33 offset:228
+; WAVE32-NEXT:    buffer_load_dword v141, off, s[0:3], s33 offset:232
+; WAVE32-NEXT:    buffer_load_dword v140, off, s[0:3], s33 offset:236
+; WAVE32-NEXT:    buffer_load_dword v139, off, s[0:3], s33 offset:240
+; WAVE32-NEXT:    buffer_load_dword v138, off, s[0:3], s33 offset:244
+; WAVE32-NEXT:    buffer_load_dword v137, off, s[0:3], s33 offset:248
+; WAVE32-NEXT:    s_clause 0x30
+; WAVE32-NEXT:    buffer_load_dword v136, off, s[0:3], s33 offset:252
+; WAVE32-NEXT:    buffer_load_dword v127, off, s[0:3], s33 offset:256
+; WAVE32-NEXT:    buffer_load_dword v126, off, s[0:3], s33 offset:260
+; WAVE32-NEXT:    buffer_load_dword v125, off, s[0:3], s33 offset:264
+; WAVE32-NEXT:    buffer_load_dword v124, off, s[0:3], s33 offset:268
+; WAVE32-NEXT:    buffer_load_dword v123, off, s[0:3], s33 offset:272
+; WAVE32-NEXT:    buffer_load_dword v122, off, s[0:3], s33 offset:276
+; WAVE32-NEXT:    buffer_load_dword v121, off, s[0:3], s33 offset:280
+; WAVE32-NEXT:    buffer_load_dword v120, off, s[0:3], s33 offset:284
+; WAVE32-NEXT:    buffer_load_dword v111, off, s[0:3], s33 offset:288
+; WAVE32-NEXT:    buffer_load_dword v110, off, s[0:3], s33 offset:292
+; WAVE32-NEXT:    buffer_load_dword v109, off, s[0:3], s33 offset:296
+; WAVE32-NEXT:    buffer_load_dword v108, off, s[0:3], s33 offset:300
+; WAVE32-NEXT:    buffer_load_dword v107, off, s[0:3], s33 offset:304
+; WAVE32-NEXT:    buffer_load_dword v106, off, s[0:3], s33 offset:308
+; WAVE32-NEXT:    buffer_load_dword v105, off, s[0:3], s33 offset:312
+; WAVE32-NEXT:    buffer_load_dword v104, off, s[0:3], s33 offset:316
+; WAVE32-NEXT:    buffer_load_dword v95, off, s[0:3], s33 offset:320
+; WAVE32-NEXT:    buffer_load_dword v94, off, s[0:3], s33 offset:324
+; WAVE32-NEXT:    buffer_load_dword v93, off, s[0:3], s33 offset:328
+; WAVE32-NEXT:    buffer_load_dword v92, off, s[0:3], s33 offset:332
+; WAVE32-NEXT:    buffer_load_dword v91, off, s[0:3], s33 offset:336
+; WAVE32-NEXT:    buffer_load_dword v90, off, s[0:3], s33 offset:340
+; WAVE32-NEXT:    buffer_load_dword v89, off, s[0:3], s33 offset:344
+; WAVE32-NEXT:    buffer_load_dword v88, off, s[0:3], s33 offset:348
+; WAVE32-NEXT:    buffer_load_dword v79, off, s[0:3], s33 offset:352
+; WAVE32-NEXT:    buffer_load_dword v78, off, s[0:3], s33 offset:356
+; WAVE32-NEXT:    buffer_load_dword v77, off, s[0:3], s33 offset:360
+; WAVE32-NEXT:    buffer_load_dword v76, off, s[0:3], s33 offset:364
+; WAVE32-NEXT:    buffer_load_dword v75, off, s[0:3], s33 offset:368
+; WAVE32-NEXT:    buffer_load_dword v74, off, s[0:3], s33 offset:372
+; WAVE32-NEXT:    buffer_load_dword v73, off, s[0:3], s33 offset:376
+; WAVE32-NEXT:    buffer_load_dword v72, off, s[0:3], s33 offset:380
+; WAVE32-NEXT:    buffer_load_dword v63, off, s[0:3], s33 offset:384
+; WAVE32-NEXT:    buffer_load_dword v62, off, s[0:3], s33 offset:388
+; WAVE32-NEXT:    buffer_load_dword v61, off, s[0:3], s33 offset:392
+; WAVE32-NEXT:    buffer_load_dword v60, off, s[0:3], s33 offset:396
+; WAVE32-NEXT:    buffer_load_dword v59, off, s[0:3], s33 offset:400
+; WAVE32-NEXT:    buffer_load_dword v58, off, s[0:3], s33 offset:404
+; WAVE32-NEXT:    buffer_load_dword v57, off, s[0:3], s33 offset:408
+; WAVE32-NEXT:    buffer_load_dword v56, off, s[0:3], s33 offset:412
+; WAVE32-NEXT:    buffer_load_dword v47, off, s[0:3], s33 offset:416
+; WAVE32-NEXT:    buffer_load_dword v46, off, s[0:3], s33 offset:420
+; WAVE32-NEXT:    buffer_load_dword v45, off, s[0:3], s33 offset:424
+; WAVE32-NEXT:    buffer_load_dword v44, off, s[0:3], s33 offset:428
+; WAVE32-NEXT:    buffer_load_dword v43, off, s[0:3], s33 offset:432
+; WAVE32-NEXT:    buffer_load_dword v42, off, s[0:3], s33 offset:436
+; WAVE32-NEXT:    buffer_load_dword v41, off, s[0:3], s33 offset:440
+; WAVE32-NEXT:    buffer_load_dword v40, off, s[0:3], s33 offset:444
+; WAVE32-NEXT:    s_mov_b32 s32, s33
+; WAVE32-NEXT:    .cfi_def_cfa_register 64
+; WAVE32-NEXT:    s_waitcnt_depctr 0xffe3
+; WAVE32-NEXT:    s_mov_b32 s33, s18
+; WAVE32-NEXT:    s_waitcnt vmcnt(0)
+; WAVE32-NEXT:    s_setpc_b64 s[30:31]
+  call void @caller_needs_to_spill_pc_to_memory()
+  ret void
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind "amdgpu-waves-per-eu"="10,10" }
+attributes #2 = { nounwind "frame-pointer"="all" "amdgpu-waves-per-eu"="12,12" }
+attributes #3 = { nounwind norecurse }
+
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2, !3}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, emissionKind: FullDebug)
+!1 = !DIFile(filename: "filename", directory: "directory")
+!2 = !{i32 7, !"Dwarf Version", i32 4}
+!3 = !{i32 2, !"Debug Info Version", i32 3}



More information about the llvm-branch-commits mailing list