[llvm] 7c4e711 - [AMDGPU] Enable base pointer.

Christudasan Devadasan via llvm-commits llvm-commits at lists.llvm.org
Sun May 17 04:16:27 PDT 2020


Author: Christudasan Devadasan
Date: 2020-05-17T16:13:55+05:30
New Revision: 7c4e711ef8d8500349d7b33910f53edbb676fa67

URL: https://github.com/llvm/llvm-project/commit/7c4e711ef8d8500349d7b33910f53edbb676fa67
DIFF: https://github.com/llvm/llvm-project/commit/7c4e711ef8d8500349d7b33910f53edbb676fa67.diff

LOG: [AMDGPU] Enable base pointer.

When the callee requires dynamic stack realignment,
it is not possible to correctly access the incoming
stack arguments using the stack pointer. We reserve a
base pointer in such cases so that the incoming stack
arguments can be accessed inside the callee. The base
pointer holds the incoming stack pointer value before
any delta is added to it.
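For illustration, here is a minimal sketch of the kind of function this
change targets, modeled on the new stack-realign.ll tests below (the
function and value names are hypothetical): the byval struct is a fixed
object at a known offset from the incoming stack pointer, while the
align-1024 alloca forces the prologue to realign the stack, so the
argument can only be addressed reliably through the base pointer
(SGPR34).

  %struct.Data = type { [9 x i32] }

  ; The overaligned alloca triggers dynamic stack realignment; the byval
  ; argument is then accessed relative to the BP rather than the SP/FP.
  define i32 @bp_sketch(%struct.Data addrspace(5)* byval(%struct.Data) align 4 %arg) {
    %big = alloca i32, align 1024, addrspace(5)
    store volatile i32 0, i32 addrspace(5)* %big, align 1024
    %ptr = getelementptr inbounds %struct.Data, %struct.Data addrspace(5)* %arg, i32 0, i32 0, i32 0
    %val = load i32, i32 addrspace(5)* %ptr, align 4
    ret i32 %val
  }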

Reviewed By: arsenm, scott.linder

Differential Revision: https://reviews.llvm.org/D78811

Added: 
    

Modified: 
    llvm/docs/AMDGPUUsage.rst
    llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
    llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
    llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
    llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
    llvm/lib/Target/AMDGPU/SIRegisterInfo.h
    llvm/test/CodeGen/AMDGPU/fix-frame-ptr-reg-copy-livein.ll
    llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-carry-out.mir
    llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-gfx9.mir
    llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr.mir
    llvm/test/CodeGen/AMDGPU/stack-realign.ll

Removed: 
    


################################################################################
diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index 01e0c65f98b8..976fd4894a04 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -6717,16 +6717,13 @@ describes how the AMDGPU implements function calls:
 1.  SGPR33 is used as a frame pointer (FP) if necessary. Like the SP it is an
     unswizzled scratch address. It is only needed if runtime sized ``alloca``
     are used, or for the reasons defined in ``SIFrameLowering``.
-2.  Runtime stack alignment is not currently supported.
+2.  Runtime stack alignment is supported. SGPR34 is used as a base pointer (BP)
+    to access the incoming stack arguments in the function. The BP is needed
+    only when the function requires runtime stack alignment.
 
-    .. TODO::
-
-      - If runtime stack alignment is supported, then will an extra argument
-        pointer register be used?
-
-2.  Allocating SGPR arguments on the stack are not supported.
+3.  Allocating SGPR arguments on the stack is not supported.
 
-3.  No CFI is currently generated. See
+4.  No CFI is currently generated. See
     :ref:`amdgpu-dwarf-call-frame-information`.
 
     ..note::
@@ -6745,12 +6742,12 @@ describes how the AMDGPU implements function calls:
       local variables and register spill slots are accessed as positive offsets
       relative to ``DW_AT_frame_base``.
 
-4.  Function argument passing is implemented by copying the input physical
+5.  Function argument passing is implemented by copying the input physical
     registers to virtual registers on entry. The register allocator can spill if
     necessary. These are copied back to physical registers at call sites. The
     net effect is that each function call can have these values in entirely
     distinct locations. The IPRA can help avoid shuffling argument registers.
-5.  Call sites are implemented by setting up the arguments at positive offsets
+6.  Call sites are implemented by setting up the arguments at positive offsets
     from SP. Then SP is incremented to account for the known frame size before
     the call and decremented after the call.
 
@@ -6759,7 +6756,7 @@ describes how the AMDGPU implements function calls:
       The CFI will reflect the changed calculation needed to compute the CFA
       from SP.
 
-6.  4 byte spill slots are used in the stack frame. One slot is allocated for an
+7.  4 byte spill slots are used in the stack frame. One slot is allocated for an
     emergency spill slot. Buffer instructions are used for stack accesses and
     not the ``flat_scratch`` instruction.
 

diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
index f4db9d249889..96da89d82605 100644
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -78,11 +78,64 @@ static MCRegister findScratchNonCalleeSaveRegister(MachineRegisterInfo &MRI,
   return MCRegister();
 }
 
-static MCPhysReg findUnusedSGPRNonCalleeSaved(MachineRegisterInfo &MRI) {
-  LivePhysRegs LiveRegs;
-  LiveRegs.init(*MRI.getTargetRegisterInfo());
-  return findScratchNonCalleeSaveRegister(
-    MRI, LiveRegs, AMDGPU::SReg_32_XM0_XEXECRegClass, true);
+static void getVGPRSpillLaneOrTempRegister(MachineFunction &MF,
+                                           LivePhysRegs &LiveRegs,
+                                           Register &TempSGPR,
+                                           Optional<int> &FrameIndex,
+                                           bool IsFP) {
+  SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+  MachineFrameInfo &FrameInfo = MF.getFrameInfo();
+
+#ifndef NDEBUG
+  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+  const SIRegisterInfo *TRI = ST.getRegisterInfo();
+#endif
+
+  // We need to save and restore the current FP/BP.
+
+  // 1: If there is already a VGPR with free lanes, use it. We
+  // may already have to pay the penalty for spilling a CSR VGPR.
+  if (MFI->haveFreeLanesForSGPRSpill(MF, 1)) {
+    int NewFI = FrameInfo.CreateStackObject(4, 4, true, nullptr,
+                                            TargetStackID::SGPRSpill);
+
+    if (!MFI->allocateSGPRSpillToVGPR(MF, NewFI))
+      llvm_unreachable("allocate SGPR spill should have worked");
+
+    FrameIndex = NewFI;
+
+    LLVM_DEBUG(auto Spill = MFI->getSGPRToVGPRSpills(NewFI).front();
+               dbgs() << "Spilling " << (IsFP ? "FP" : "BP") << " to  "
+                      << printReg(Spill.VGPR, TRI) << ':' << Spill.Lane
+                      << '\n');
+    return;
+  }
+
+  // 2: Next, try to save the FP/BP in an unused SGPR.
+  TempSGPR = findScratchNonCalleeSaveRegister(
+      MF.getRegInfo(), LiveRegs, AMDGPU::SReg_32_XM0_XEXECRegClass, true);
+
+  if (!TempSGPR) {
+    int NewFI = FrameInfo.CreateStackObject(4, 4, true, nullptr,
+                                            TargetStackID::SGPRSpill);
+
+    if (MFI->allocateSGPRSpillToVGPR(MF, NewFI)) {
+      // 3: There's no free lane to spill, and no free register to save FP/BP,
+      // so we're forced to spill another VGPR to use for the spill.
+      FrameIndex = NewFI;
+    } else {
+      // 4: If all else fails, spill the FP/BP to memory.
+      FrameIndex = FrameInfo.CreateSpillStackObject(4, Align(4));
+    }
+
+    LLVM_DEBUG(auto Spill = MFI->getSGPRToVGPRSpills(NewFI).front();
+               dbgs() << (IsFP ? "FP" : "BP") << " requires fallback spill to "
+                      << printReg(Spill.VGPR, TRI) << ':' << Spill.Lane
+                      << '\n';);
+  } else {
+    LLVM_DEBUG(dbgs() << "Saving " << (IsFP ? "FP" : "BP") << " with copy to "
+                      << printReg(TempSGPR, TRI) << '\n');
+  }
 }
 
 // We need to specially emit stack operations here because a different frame
@@ -613,6 +666,9 @@ static Register buildScratchExecCopy(LivePhysRegs &LiveRegs,
       LiveRegs.addLiveIns(MBB);
       if (FuncInfo->SGPRForFPSaveRestoreCopy)
         LiveRegs.removeReg(FuncInfo->SGPRForFPSaveRestoreCopy);
+
+      if (FuncInfo->SGPRForBPSaveRestoreCopy)
+        LiveRegs.removeReg(FuncInfo->SGPRForBPSaveRestoreCopy);
     } else {
       // In epilog.
       LiveRegs.init(*ST.getRegisterInfo());
@@ -650,12 +706,15 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
 
   Register StackPtrReg = FuncInfo->getStackPtrOffsetReg();
   Register FramePtrReg = FuncInfo->getFrameOffsetReg();
+  Register BasePtrReg =
+      TRI.hasBasePointer(MF) ? TRI.getBaseRegister() : Register();
   LivePhysRegs LiveRegs;
 
   MachineBasicBlock::iterator MBBI = MBB.begin();
   DebugLoc DL;
 
   bool HasFP = false;
+  bool HasBP = false;
   uint32_t NumBytes = MFI.getStackSize();
   uint32_t RoundedSize = NumBytes;
   // To avoid clobbering VGPRs in lanes that weren't active on function entry,
@@ -671,14 +730,46 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
                       TargetStackID::SGPRSpill;
   }
 
+  bool HasBPSaveIndex = FuncInfo->BasePointerSaveIndex.hasValue();
+  bool SpillBPToMemory = false;
+  // A StackID of SGPRSpill implies that this is a spill from SGPR to VGPR.
+  // Otherwise we are spilling the BP to memory.
+  if (HasBPSaveIndex) {
+    SpillBPToMemory = MFI.getStackID(*FuncInfo->BasePointerSaveIndex) !=
+                      TargetStackID::SGPRSpill;
+  }
+
   // Emit the copy if we need an FP, and are using a free SGPR to save it.
   if (FuncInfo->SGPRForFPSaveRestoreCopy) {
     BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FuncInfo->SGPRForFPSaveRestoreCopy)
       .addReg(FramePtrReg)
       .setMIFlag(MachineInstr::FrameSetup);
-    // Make the register live throughout the function.
-    for (MachineBasicBlock &MBB : MF)
-      MBB.addLiveIn(FuncInfo->SGPRForFPSaveRestoreCopy);
+  }
+
+  // Emit the copy if we need a BP, and are using a free SGPR to save it.
+  if (FuncInfo->SGPRForBPSaveRestoreCopy) {
+    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY),
+            FuncInfo->SGPRForBPSaveRestoreCopy)
+        .addReg(BasePtrReg)
+        .setMIFlag(MachineInstr::FrameSetup);
+  }
+
+  // If a copy has been emitted for FP and/or BP, make the SGPRs
+  // used in the copy instructions live throughout the function.
+  SmallVector<MCPhysReg, 2> TempSGPRs;
+  if (FuncInfo->SGPRForFPSaveRestoreCopy)
+    TempSGPRs.push_back(FuncInfo->SGPRForFPSaveRestoreCopy);
+
+  if (FuncInfo->SGPRForBPSaveRestoreCopy)
+    TempSGPRs.push_back(FuncInfo->SGPRForBPSaveRestoreCopy);
+
+  if (!TempSGPRs.empty()) {
+    for (MachineBasicBlock &MBB : MF) {
+      for (MCPhysReg Reg : TempSGPRs)
+        MBB.addLiveIn(Reg);
+
+      MBB.sortUniqueLiveIns();
+    }
   }
 
   for (const SIMachineFunctionInfo::SGPRSpillVGPRCSR &Reg
@@ -712,6 +803,23 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
                      FuncInfo->FramePointerSaveIndex.getValue());
   }
 
+  if (HasBPSaveIndex && SpillBPToMemory) {
+    assert(!MFI.isDeadObjectIndex(*FuncInfo->BasePointerSaveIndex));
+
+    if (!ScratchExecCopy)
+      ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, true);
+
+    MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister(
+        MRI, LiveRegs, AMDGPU::VGPR_32RegClass);
+
+    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR)
+        .addReg(BasePtrReg);
+
+    buildPrologSpill(LiveRegs, MBB, MBBI, TII, TmpVGPR,
+                     FuncInfo->getScratchRSrcReg(), StackPtrReg,
+                     *FuncInfo->BasePointerSaveIndex);
+  }
+
   if (ScratchExecCopy) {
     // FIXME: Split block and make terminator.
     unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
@@ -740,6 +848,25 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
         .addReg(Spill[0].VGPR, RegState::Undef);
   }
 
+  // In this case, spill the BP to a reserved VGPR.
+  if (HasBPSaveIndex && !SpillBPToMemory) {
+    const int BasePtrFI = *FuncInfo->BasePointerSaveIndex;
+    assert(!MFI.isDeadObjectIndex(BasePtrFI));
+
+    assert(MFI.getStackID(BasePtrFI) == TargetStackID::SGPRSpill);
+    ArrayRef<SIMachineFunctionInfo::SpilledReg> Spill =
+        FuncInfo->getSGPRToVGPRSpills(BasePtrFI);
+    assert(Spill.size() == 1);
+
+    // Save BP before setting it up.
+    // FIXME: This should respect spillSGPRToVGPR;
+    BuildMI(MBB, MBBI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_WRITELANE_B32),
+            Spill[0].VGPR)
+        .addReg(BasePtrReg)
+        .addImm(Spill[0].Lane)
+        .addReg(Spill[0].VGPR, RegState::Undef);
+  }
+
   if (TRI.needsStackRealignment(MF)) {
     HasFP = true;
     const unsigned Alignment = MFI.getMaxAlign().value();
@@ -749,11 +876,13 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
       LiveRegs.init(TRI);
       LiveRegs.addLiveIns(MBB);
       LiveRegs.addReg(FuncInfo->SGPRForFPSaveRestoreCopy);
+      LiveRegs.addReg(FuncInfo->SGPRForBPSaveRestoreCopy);
     }
 
     Register ScratchSPReg = findScratchNonCalleeSaveRegister(
         MRI, LiveRegs, AMDGPU::SReg_32_XM0RegClass);
-    assert(ScratchSPReg && ScratchSPReg != FuncInfo->SGPRForFPSaveRestoreCopy);
+    assert(ScratchSPReg && ScratchSPReg != FuncInfo->SGPRForFPSaveRestoreCopy &&
+           ScratchSPReg != FuncInfo->SGPRForBPSaveRestoreCopy);
 
     // s_add_u32 tmp_reg, s32, NumBytes
     // s_and_b32 s32, tmp_reg, 0b111...0000
@@ -767,15 +896,21 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
         .setMIFlag(MachineInstr::FrameSetup);
     FuncInfo->setIsStackRealigned(true);
   } else if ((HasFP = hasFP(MF))) {
-    // If we need a base pointer, set it up here. It's whatever the value of
-    // the stack pointer is at this point. Any variable size objects will be
-    // allocated after this, so we can still use the base pointer to reference
-    // locals.
     BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FramePtrReg)
         .addReg(StackPtrReg)
         .setMIFlag(MachineInstr::FrameSetup);
   }
 
+  // If we need a base pointer, set it up here. It's whatever the value of
+  // the stack pointer is at this point. Any variable size objects will be
+  // allocated after this, so we can still use the base pointer to reference
+  // the incoming arguments.
+  if ((HasBP = TRI.hasBasePointer(MF))) {
+    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), BasePtrReg)
+        .addReg(StackPtrReg)
+        .setMIFlag(MachineInstr::FrameSetup);
+  }
+
   if (HasFP && RoundedSize != 0) {
     BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_U32), StackPtrReg)
         .addReg(StackPtrReg)
@@ -790,6 +925,14 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
   assert((HasFP || (!FuncInfo->SGPRForFPSaveRestoreCopy &&
                     !FuncInfo->FramePointerSaveIndex)) &&
          "Saved FP but didn't need it");
+
+  assert((!HasBP || (FuncInfo->SGPRForBPSaveRestoreCopy ||
+                     FuncInfo->BasePointerSaveIndex)) &&
+         "Needed to save BP but didn't save it anywhere");
+
+  assert((HasBP || (!FuncInfo->SGPRForBPSaveRestoreCopy &&
+                    !FuncInfo->BasePointerSaveIndex)) &&
+         "Saved BP but didn't need it");
 }
 
 void SIFrameLowering::emitEpilogue(MachineFunction &MF,
@@ -801,6 +944,7 @@ void SIFrameLowering::emitEpilogue(MachineFunction &MF,
   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
   const SIInstrInfo *TII = ST.getInstrInfo();
   MachineRegisterInfo &MRI = MF.getRegInfo();
+  const SIRegisterInfo &TRI = TII->getRegisterInfo();
   MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator();
   LivePhysRegs LiveRegs;
   DebugLoc DL;
@@ -812,6 +956,8 @@ void SIFrameLowering::emitEpilogue(MachineFunction &MF,
                              : NumBytes;
   const Register StackPtrReg = FuncInfo->getStackPtrOffsetReg();
   const Register FramePtrReg = FuncInfo->getFrameOffsetReg();
+  const Register BasePtrReg =
+      TRI.hasBasePointer(MF) ? TRI.getBaseRegister() : Register();
 
   bool HasFPSaveIndex = FuncInfo->FramePointerSaveIndex.hasValue();
   bool SpillFPToMemory = false;
@@ -820,6 +966,13 @@ void SIFrameLowering::emitEpilogue(MachineFunction &MF,
                       TargetStackID::SGPRSpill;
   }
 
+  bool HasBPSaveIndex = FuncInfo->BasePointerSaveIndex.hasValue();
+  bool SpillBPToMemory = false;
+  if (HasBPSaveIndex) {
+    SpillBPToMemory = MFI.getStackID(*FuncInfo->BasePointerSaveIndex) !=
+                      TargetStackID::SGPRSpill;
+  }
+
   if (RoundedSize != 0 && hasFP(MF)) {
     BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_SUB_U32), StackPtrReg)
       .addReg(StackPtrReg)
@@ -833,6 +986,12 @@ void SIFrameLowering::emitEpilogue(MachineFunction &MF,
         .setMIFlag(MachineInstr::FrameSetup);
   }
 
+  if (FuncInfo->SGPRForBPSaveRestoreCopy) {
+    BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), BasePtrReg)
+        .addReg(FuncInfo->SGPRForBPSaveRestoreCopy)
+        .setMIFlag(MachineInstr::FrameSetup);
+  }
+
   Register ScratchExecCopy;
   if (HasFPSaveIndex) {
     const int FI = FuncInfo->FramePointerSaveIndex.getValue();
@@ -860,6 +1019,32 @@ void SIFrameLowering::emitEpilogue(MachineFunction &MF,
     }
   }
 
+  if (HasBPSaveIndex) {
+    const int BasePtrFI = *FuncInfo->BasePointerSaveIndex;
+    assert(!MFI.isDeadObjectIndex(BasePtrFI));
+    if (SpillBPToMemory) {
+      if (!ScratchExecCopy)
+        ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, false);
+
+      MCPhysReg TempVGPR = findScratchNonCalleeSaveRegister(
+          MRI, LiveRegs, AMDGPU::VGPR_32RegClass);
+      buildEpilogReload(LiveRegs, MBB, MBBI, TII, TempVGPR,
+                        FuncInfo->getScratchRSrcReg(), StackPtrReg, BasePtrFI);
+      BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), BasePtrReg)
+          .addReg(TempVGPR, RegState::Kill);
+    } else {
+      // Reload from VGPR spill.
+      assert(MFI.getStackID(BasePtrFI) == TargetStackID::SGPRSpill);
+      ArrayRef<SIMachineFunctionInfo::SpilledReg> Spill =
+          FuncInfo->getSGPRToVGPRSpills(BasePtrFI);
+      assert(Spill.size() == 1);
+      BuildMI(MBB, MBBI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_READLANE_B32),
+              BasePtrReg)
+          .addReg(Spill[0].VGPR)
+          .addImm(Spill[0].Lane);
+    }
+  }
+
   for (const SIMachineFunctionInfo::SGPRSpillVGPRCSR &Reg :
        FuncInfo->getSGPRSpillVGPRs()) {
     if (!Reg.FI.hasValue())
@@ -896,12 +1081,14 @@ static bool allStackObjectsAreDead(const MachineFrameInfo &MFI) {
 
 #ifndef NDEBUG
 static bool allSGPRSpillsAreDead(const MachineFrameInfo &MFI,
-                                 Optional<int> FramePointerSaveIndex) {
+                                 Optional<int> FramePointerSaveIndex,
+                                 Optional<int> BasePointerSaveIndex) {
   for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd();
        I != E; ++I) {
     if (!MFI.isDeadObjectIndex(I) &&
         MFI.getStackID(I) == TargetStackID::SGPRSpill &&
-        FramePointerSaveIndex && I != FramePointerSaveIndex) {
+        ((FramePointerSaveIndex && I != FramePointerSaveIndex) ||
+         (BasePointerSaveIndex && I != BasePointerSaveIndex))) {
       return false;
     }
   }
@@ -928,7 +1115,7 @@ void SIFrameLowering::processFunctionBeforeFrameFinalized(
   SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
 
   FuncInfo->removeDeadFrameIndices(MFI);
-  assert(allSGPRSpillsAreDead(MFI, None) &&
+  assert(allSGPRSpillsAreDead(MFI, None, None) &&
          "SGPR spill should have been removed in SILowerSGPRSpills");
 
   // FIXME: The other checks should be redundant with allStackObjectsAreDead,
@@ -984,54 +1171,19 @@ void SIFrameLowering::determineCalleeSaves(MachineFunction &MF,
   for (auto SSpill : MFI->getSGPRSpillVGPRs())
     SavedVGPRs.reset(SSpill.VGPR);
 
-  const bool HasFP = WillHaveFP || hasFP(MF);
-  if (!HasFP)
-    return;
-
-  // We need to save and restore the current FP.
-
-  // 1: If there is already a VGPR with free lanes, use it. We
-  // may already have to pay the penalty for spilling a CSR VGPR.
-  if (MFI->haveFreeLanesForSGPRSpill(MF, 1)) {
-    int NewFI = MF.getFrameInfo().CreateStackObject(4, 4, true, nullptr,
-                                                    TargetStackID::SGPRSpill);
-
-    if (!MFI->allocateSGPRSpillToVGPR(MF, NewFI))
-      llvm_unreachable("allocate SGPR spill should have worked");
-
-    MFI->FramePointerSaveIndex = NewFI;
+  LivePhysRegs LiveRegs;
+  LiveRegs.init(*TRI);
 
-    LLVM_DEBUG(
-      auto Spill = MFI->getSGPRToVGPRSpills(NewFI).front();
-      dbgs() << "Spilling FP to  " << printReg(Spill.VGPR, TRI)
-             << ':' << Spill.Lane << '\n');
-    return;
+  if (WillHaveFP || hasFP(MF)) {
+    getVGPRSpillLaneOrTempRegister(MF, LiveRegs, MFI->SGPRForFPSaveRestoreCopy,
+                                   MFI->FramePointerSaveIndex, true);
   }
 
-  // 2: Next, try to save the FP in an unused SGPR.
-  MFI->SGPRForFPSaveRestoreCopy = findUnusedSGPRNonCalleeSaved(MF.getRegInfo());
-
-  if (!MFI->SGPRForFPSaveRestoreCopy) {
-    int NewFI = MF.getFrameInfo().CreateStackObject(4, 4, true, nullptr,
-                                                    TargetStackID::SGPRSpill);
-
-    if (MFI->allocateSGPRSpillToVGPR(MF, NewFI)) {
-      // 3: There's no free lane to spill, and no free register to save FP, so
-      // we're forced to spill another VGPR to use for the spill.
-      MFI->FramePointerSaveIndex = NewFI;
-    } else {
-      // 4: If all else fails, spill the FP to memory.
-      MFI->FramePointerSaveIndex =
-          FrameInfo.CreateSpillStackObject(4, Align(4));
-    }
-
-    LLVM_DEBUG(
-      auto Spill = MFI->getSGPRToVGPRSpills(NewFI).front();
-      dbgs() << "FP requires fallback spill to " << printReg(Spill.VGPR, TRI)
-             << ':' << Spill.Lane << '\n';);
-  } else {
-    LLVM_DEBUG(dbgs() << "Saving FP with copy to " <<
-               printReg(MFI->SGPRForFPSaveRestoreCopy, TRI) << '\n');
+  if (TRI->hasBasePointer(MF)) {
+    if (MFI->SGPRForFPSaveRestoreCopy)
+      LiveRegs.addReg(MFI->SGPRForFPSaveRestoreCopy);
+    getVGPRSpillLaneOrTempRegister(MF, LiveRegs, MFI->SGPRForBPSaveRestoreCopy,
+                                   MFI->BasePointerSaveIndex, false);
   }
 }
 
@@ -1058,14 +1210,31 @@ bool SIFrameLowering::assignCalleeSavedSpillSlots(
     return true; // Early exit if no callee saved registers are modified!
 
   const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
-  if (!FuncInfo->SGPRForFPSaveRestoreCopy)
+  if (!FuncInfo->SGPRForFPSaveRestoreCopy &&
+      !FuncInfo->SGPRForBPSaveRestoreCopy)
     return false;
 
+  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+  const SIRegisterInfo *RI = ST.getRegisterInfo();
+  Register FramePtrReg = FuncInfo->getFrameOffsetReg();
+  Register BasePtrReg = RI->getBaseRegister();
+  unsigned NumModifiedRegs = 0;
+
+  if (FuncInfo->SGPRForFPSaveRestoreCopy)
+    NumModifiedRegs++;
+  if (FuncInfo->SGPRForBPSaveRestoreCopy)
+    NumModifiedRegs++;
+
   for (auto &CS : CSI) {
-    if (CS.getReg() == FuncInfo->getFrameOffsetReg()) {
-      if (FuncInfo->SGPRForFPSaveRestoreCopy)
-        CS.setDstReg(FuncInfo->SGPRForFPSaveRestoreCopy);
-      break;
+    if (CS.getReg() == FramePtrReg && FuncInfo->SGPRForFPSaveRestoreCopy) {
+      CS.setDstReg(FuncInfo->SGPRForFPSaveRestoreCopy);
+      if (--NumModifiedRegs == 0)
+        break;
+    } else if (CS.getReg() == BasePtrReg &&
+               FuncInfo->SGPRForBPSaveRestoreCopy) {
+      CS.setDstReg(FuncInfo->SGPRForBPSaveRestoreCopy);
+      if (--NumModifiedRegs == 0)
+        break;
     }
   }
 

diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index 8f25ebd3437e..06681471bf90 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -426,9 +426,9 @@ bool SIMachineFunctionInfo::allocateVGPRSpillToAGPR(MachineFunction &MF,
 }
 
 void SIMachineFunctionInfo::removeDeadFrameIndices(MachineFrameInfo &MFI) {
-  // The FP spill hasn't been inserted yet, so keep it around.
+  // The FP & BP spills haven't been inserted yet, so keep them around.
   for (auto &R : SGPRToVGPRSpills) {
-    if (R.first != FramePointerSaveIndex)
+    if (R.first != FramePointerSaveIndex && R.first != BasePointerSaveIndex)
       MFI.RemoveStackObject(R.first);
   }
 
@@ -436,7 +436,7 @@ void SIMachineFunctionInfo::removeDeadFrameIndices(MachineFrameInfo &MFI) {
   // ID.
   for (int i = MFI.getObjectIndexBegin(), e = MFI.getObjectIndexEnd(); i != e;
        ++i)
-    if (i != FramePointerSaveIndex)
+    if (i != FramePointerSaveIndex && i != BasePointerSaveIndex)
       MFI.setStackID(i, TargetStackID::Default);
 
   for (auto &R : VGPRToAGPRSpills) {

diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index 0d85136e23e0..7221e0157522 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -485,6 +485,11 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction {
   Register SGPRForFPSaveRestoreCopy;
   Optional<int> FramePointerSaveIndex;
 
+  /// If this is set, an SGPR used for save/restore of the register used for
+  /// the base pointer.
+  Register SGPRForBPSaveRestoreCopy;
+  Optional<int> BasePointerSaveIndex;
+
   Register VGPRReservedForSGPRSpill;
   bool isCalleeSavedReg(const MCPhysReg *CSRegs, MCPhysReg Reg);
 

diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index f528b804b1d1..255a1642fb63 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -112,6 +112,15 @@ Register SIRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
                         : FuncInfo->getStackPtrOffsetReg();
 }
 
+bool SIRegisterInfo::hasBasePointer(const MachineFunction &MF) const {
+  // When we need stack realignment, we can't reference off of the
+  // stack pointer, so we reserve a base pointer.
+  const MachineFrameInfo &MFI = MF.getFrameInfo();
+  return MFI.getNumFixedObjects() && needsStackRealignment(MF);
+}
+
+Register SIRegisterInfo::getBaseRegister() const { return AMDGPU::SGPR34; }
+
 const uint32_t *SIRegisterInfo::getAllVGPRRegMask() const {
   return CSR_AMDGPU_AllVGPRs_RegMask;
 }
@@ -309,6 +318,12 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
     assert(!isSubRegister(ScratchRSrcReg, FrameReg));
   }
 
+  if (hasBasePointer(MF)) {
+    MCRegister BasePtrReg = getBaseRegister();
+    reserveRegisterTuples(Reserved, BasePtrReg);
+    assert(!isSubRegister(ScratchRSrcReg, BasePtrReg));
+  }
+
   for (MCRegister Reg : MFI->WWMReservedRegs) {
     reserveRegisterTuples(Reserved, Reg);
   }
@@ -1058,7 +1073,9 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
   MachineOperand &FIOp = MI->getOperand(FIOperandNum);
   int Index = MI->getOperand(FIOperandNum).getIndex();
 
-  Register FrameReg = getFrameRegister(*MF);
+  Register FrameReg = FrameInfo.isFixedObjectIndex(Index) && hasBasePointer(*MF)
+                          ? getBaseRegister()
+                          : getFrameRegister(*MF);
 
   switch (MI->getOpcode()) {
     // SGPR register spill

diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
index d8c3ce584453..55ea00a246a1 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -65,6 +65,9 @@ class SIRegisterInfo final : public AMDGPUGenRegisterInfo {
 
   Register getFrameRegister(const MachineFunction &MF) const override;
 
+  bool hasBasePointer(const MachineFunction &MF) const;
+  Register getBaseRegister() const;
+
   bool canRealignStack(const MachineFunction &MF) const override;
   bool requiresRegisterScavenging(const MachineFunction &Fn) const override;
 

diff --git a/llvm/test/CodeGen/AMDGPU/fix-frame-ptr-reg-copy-livein.ll b/llvm/test/CodeGen/AMDGPU/fix-frame-ptr-reg-copy-livein.ll
index 0f5ad75da555..46a390327a7d 100644
--- a/llvm/test/CodeGen/AMDGPU/fix-frame-ptr-reg-copy-livein.ll
+++ b/llvm/test/CodeGen/AMDGPU/fix-frame-ptr-reg-copy-livein.ll
@@ -10,17 +10,17 @@
 define i32 @fp_save_restore_in_temp_sgpr(%struct.Data addrspace(5)* nocapture readonly byval(%struct.Data) align 4 %arg) #0 {
   ; GCN-LABEL: name: fp_save_restore_in_temp_sgpr
   ; GCN: bb.0.begin:
-  ; GCN:   liveins: $sgpr30_sgpr31, $sgpr7
+  ; GCN:   liveins: $sgpr7, $sgpr30_sgpr31
   ; GCN:   $sgpr7 = frame-setup COPY $sgpr33
   ; GCN:   $sgpr33 = frame-setup COPY $sgpr32
   ; GCN: bb.1.lp_end:
-  ; GCN:   liveins: $sgpr6, $vgpr1, $sgpr4_sgpr5, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr30_sgpr31, $sgpr7
+  ; GCN:   liveins: $sgpr6, $sgpr7, $vgpr1, $sgpr4_sgpr5, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr30_sgpr31
   ; GCN: bb.2.lp_begin:
-  ; GCN:   liveins: $sgpr6, $vgpr1, $sgpr4_sgpr5, $sgpr8_sgpr9, $sgpr30_sgpr31, $sgpr7
+  ; GCN:   liveins: $sgpr6, $sgpr7, $vgpr1, $sgpr4_sgpr5, $sgpr8_sgpr9, $sgpr30_sgpr31
   ; GCN: bb.3.Flow:
-  ; GCN:   liveins: $sgpr6, $vgpr0, $vgpr1, $sgpr4_sgpr5, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr30_sgpr31, $sgpr7
+  ; GCN:   liveins: $sgpr6, $sgpr7, $vgpr0, $vgpr1, $sgpr4_sgpr5, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr30_sgpr31
   ; GCN: bb.4.end:
-  ; GCN:   liveins: $vgpr0, $sgpr4_sgpr5, $sgpr30_sgpr31, $sgpr7
+  ; GCN:   liveins: $sgpr7, $vgpr0, $sgpr4_sgpr5, $sgpr30_sgpr31
   ; GCN:   $sgpr33 = frame-setup COPY $sgpr7
 begin:
   br label %lp_begin

diff --git a/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-carry-out.mir b/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-carry-out.mir
index f22acffb5909..cbb5fa2b68e0 100644
--- a/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-carry-out.mir
+++ b/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-carry-out.mir
@@ -27,7 +27,7 @@ body:             |
     liveins: $vgpr1
 
     ; CHECK-LABEL: name: scavenge_sgpr_pei_no_sgprs
-    ; CHECK: liveins: $vgpr1
+    ; CHECK: liveins: $sgpr27, $vgpr1
     ; CHECK: $sgpr27 = frame-setup COPY $sgpr33
     ; CHECK: $sgpr4 = frame-setup S_ADD_U32 $sgpr32, 524224, implicit-def $scc
     ; CHECK: $sgpr33 = frame-setup S_AND_B32 killed $sgpr4, 4294443008, implicit-def $scc
@@ -69,7 +69,7 @@ body:             |
     liveins: $vgpr1
 
     ; CHECK-LABEL: name: scavenge_sgpr_pei_one_sgpr
-    ; CHECK: liveins: $vgpr1
+    ; CHECK: liveins: $sgpr27, $vgpr1
     ; CHECK: $sgpr27 = frame-setup COPY $sgpr33
     ; CHECK: $sgpr4 = frame-setup S_ADD_U32 $sgpr32, 524224, implicit-def $scc
     ; CHECK: $sgpr33 = frame-setup S_AND_B32 killed $sgpr4, 4294443008, implicit-def $scc
@@ -109,7 +109,7 @@ body:             |
     liveins: $vgpr1
 
     ; CHECK-LABEL: name: scavenge_sgpr_pei_one_sgpr_64
-    ; CHECK: liveins: $vgpr1
+    ; CHECK: liveins: $sgpr27, $vgpr1
     ; CHECK: $sgpr27 = frame-setup COPY $sgpr33
     ; CHECK: $sgpr4 = frame-setup S_ADD_U32 $sgpr32, 524224, implicit-def $scc
     ; CHECK: $sgpr33 = frame-setup S_AND_B32 killed $sgpr4, 4294443008, implicit-def $scc
@@ -148,7 +148,7 @@ body:             |
     liveins: $vgpr1
 
     ; CHECK-LABEL: name: scavenge_sgpr_pei_prefer_vcc
-    ; CHECK: liveins: $vgpr1
+    ; CHECK: liveins: $sgpr27, $vgpr1
     ; CHECK: $sgpr27 = frame-setup COPY $sgpr33
     ; CHECK: $sgpr4 = frame-setup S_ADD_U32 $sgpr32, 524224, implicit-def $scc
     ; CHECK: $sgpr33 = frame-setup S_AND_B32 killed $sgpr4, 4294443008, implicit-def $scc

diff --git a/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-gfx9.mir b/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-gfx9.mir
index ec3c0b7042fe..4f47ce4745de 100644
--- a/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-gfx9.mir
+++ b/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr-gfx9.mir
@@ -22,7 +22,7 @@ body:             |
     liveins: $vgpr1
 
     ; CHECK-LABEL: name: scavenge_sgpr_pei_no_sgprs
-    ; CHECK: liveins: $vgpr1
+    ; CHECK: liveins: $sgpr27, $vgpr1
     ; CHECK: $sgpr27 = frame-setup COPY $sgpr33
     ; CHECK: $sgpr4 = frame-setup S_ADD_U32 $sgpr32, 524224, implicit-def $scc
     ; CHECK: $sgpr33 = frame-setup S_AND_B32 killed $sgpr4, 4294443008, implicit-def $scc

diff --git a/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr.mir b/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr.mir
index 2fbc51f036f0..ae1a921f02c2 100644
--- a/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr.mir
+++ b/llvm/test/CodeGen/AMDGPU/pei-scavenge-sgpr.mir
@@ -22,7 +22,7 @@ body:             |
     liveins: $vgpr1
 
     ; CHECK-LABEL: name: scavenge_sgpr_pei
-    ; CHECK: liveins: $vgpr1
+    ; CHECK: liveins: $sgpr27, $vgpr1
     ; CHECK: $sgpr27 = frame-setup COPY $sgpr33
     ; CHECK: $sgpr4 = frame-setup S_ADD_U32 $sgpr32, 262080, implicit-def $scc
     ; CHECK: $sgpr33 = frame-setup S_AND_B32 killed $sgpr4, 4294705152, implicit-def $scc

diff --git a/llvm/test/CodeGen/AMDGPU/stack-realign.ll b/llvm/test/CodeGen/AMDGPU/stack-realign.ll
index 2a3cfe7a0992..8b7557d5deb8 100644
--- a/llvm/test/CodeGen/AMDGPU/stack-realign.ll
+++ b/llvm/test/CodeGen/AMDGPU/stack-realign.ll
@@ -148,7 +148,114 @@ define void @disable_realign_align128(i32 %idx) #3 {
   ret void
 }
 
+declare void @extern_func(<32 x i32>, i32) #0
+define void @func_call_align1024_bp_gets_vgpr_spill(<32 x i32> %a, i32 %b) #0 {
+; The test forces the stack to be realigned to a new boundary
+; since there is a local object with an alignment of 1024.
+; Should use BP to access the incoming stack arguments.
+; The BP value is saved/restored with a VGPR spill.
+
+; GCN-LABEL: func_call_align1024_bp_gets_vgpr_spill:
+; GCN: buffer_store_dword [[VGPR_REG:v[0-9]+]], off, s[0:3], s32 offset:1028 ; 4-byte Folded Spill
+; GCN-NEXT: s_mov_b64 exec, s[4:5]
+; GCN-NEXT: v_writelane_b32 [[VGPR_REG]], s33, 2
+; GCN-NEXT: v_writelane_b32 [[VGPR_REG]], s34, 3
+; GCN: s_add_u32 [[SCRATCH_REG:s[0-9]+]], s32, 0xffc0
+; GCN: s_and_b32 s33, [[SCRATCH_REG]], 0xffff0000
+
+; GCN: s_mov_b32 s34, s32
+; GCN-NEXT: v_mov_b32_e32 v32, 0
+
+; GCN: buffer_store_dword v32, off, s[0:3], s33 offset:1024
+; GCN-NEXT: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s34
+; GCN-NEXT: s_add_u32 s32, s32, 0x30000
+
+; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32
+; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5]
+
+; GCN: v_readlane_b32 s33, [[VGPR_REG]], 2
+; GCN-NEXT: s_sub_u32 s32, s32, 0x30000
+; GCN-NEXT: v_readlane_b32 s34, [[VGPR_REG]], 3
+; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
+; GCN-NEXT: buffer_load_dword [[VGPR_REG]], off, s[0:3], s32 offset:1028 ; 4-byte Folded Reload
+; GCN-NEXT: s_mov_b64 exec, s[6:7]
+  %temp = alloca i32, align 1024, addrspace(5)
+  store volatile i32 0, i32 addrspace(5)* %temp, align 1024
+  call void @extern_func(<32 x i32> %a, i32 %b)
+  ret void
+}
+
+%struct.Data = type { [9 x i32] }
+define i32 @needs_align1024_stack_args_used_inside_loop(%struct.Data addrspace(5)* nocapture readonly byval(%struct.Data) align 8 %arg) local_unnamed_addr #4 {
+; The local object allocation needs an alignment of 1024.
+; Since the function argument is accessed in a loop with an
+; index variable, the base pointer first gets loaded into a VGPR,
+; and that value is then referenced to load the incoming values.
+; The BP value is saved/restored in an SGPR at the prologue/epilogue.
+
+; GCN-LABEL: needs_align1024_stack_args_used_inside_loop:
+; GCN: s_mov_b32 [[BP_COPY:s[0-9]+]], s34
+; GCN-NEXT: s_mov_b32 s34, s32
+; GCN-NEXT: s_add_u32 [[SCRATCH_REG:s[0-9]+]], s32, 0xffc0
+; GCN-NEXT: s_mov_b32 [[FP_COPY:s[0-9]+]], s33
+; GCN-NEXT: s_and_b32 s33, [[SCRATCH_REG]], 0xffff0000
+; GCN-NEXT: v_mov_b32_e32 v{{[0-9]+}}, 0
+; GCN-NEXT: v_lshrrev_b32_e64 [[VGPR_REG:v[0-9]+]], 6, s34
+; GCN: s_add_u32 s32, s32, 0x30000
+; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s33 offset:1024
+; GCN: buffer_load_dword v{{[0-9]+}}, [[VGPR_REG]], s[0:3], 0 offen
+; GCN: v_add_u32_e32 [[VGPR_REG]], vcc, 4, [[VGPR_REG]]
+; GCN: s_sub_u32 s32, s32, 0x30000
+; GCN-NEXT: s_mov_b32 s33, [[FP_COPY]]
+; GCN-NEXT: s_mov_b32 s34, [[BP_COPY]]
+; GCN-NEXT: s_setpc_b64 s[30:31]
+begin:
+  %local_var = alloca i32, align 1024, addrspace(5)
+  store volatile i32 0, i32 addrspace(5)* %local_var, align 1024
+  br label %loop_body
+
+loop_end:                                                ; preds = %loop_body
+  %idx_next = add nuw nsw i32 %lp_idx, 1
+  %lp_exit_cond = icmp eq i32 %idx_next, 9
+  br i1 %lp_exit_cond, label %exit, label %loop_body
+
+loop_body:                                                ; preds = %loop_end, %begin
+  %lp_idx = phi i32 [ 0, %begin ], [ %idx_next, %loop_end ]
+  %ptr = getelementptr inbounds %struct.Data, %struct.Data addrspace(5)* %arg, i32 0, i32 0, i32 %lp_idx
+  %val = load i32, i32 addrspace(5)* %ptr, align 8
+  %lp_cond = icmp eq i32 %val, %lp_idx
+  br i1 %lp_cond, label %loop_end, label %exit
+
+exit:                                               ; preds = %loop_end, %loop_body
+  %out = phi i32 [ 0, %loop_body ], [ 1, %loop_end ]
+  ret i32 %out
+}
+
+define void @no_free_scratch_sgpr_for_bp_copy(<32 x i32> %a, i32 %b) #0 {
+; GCN-LABEL: no_free_scratch_sgpr_for_bp_copy:
+; GCN: ; %bb.0:
+; GCN: v_writelane_b32 [[VGPR_REG:v[0-9]+]], s34, 0
+; GCN-NEXT: s_mov_b32 s34, s32
+; GCN-NEXT: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s34
+; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s33 offset:128
+; GCN-NEXT: ;;#ASMSTART
+; GCN-NEXT: ;;#ASMEND
+; GCN: v_readlane_b32 s34, [[VGPR_REG:v[0-9]+]], 0
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+  %local_val = alloca i32, align 128, addrspace(5)
+  store volatile i32 %b, i32 addrspace(5)* %local_val, align 128
+  ; Use all clobberable registers, so BP has to spill to a VGPR.
+  call void asm sideeffect "",
+    "~{s0},~{s1},~{s2},~{s3},~{s4},~{s5},~{s6},~{s7},~{s8},~{s9}
+    ,~{s10},~{s11},~{s12},~{s13},~{s14},~{s15},~{s16},~{s17},~{s18},~{s19}
+    ,~{s20},~{s21},~{s22},~{s23},~{s24},~{s25},~{s26},~{s27},~{s28},~{s29}
+    ,~{vcc_hi}"() #0
+  ret void
+}
+
 attributes #0 = { noinline nounwind }
 attributes #1 = { noinline nounwind "stackrealign" }
 attributes #2 = { noinline nounwind alignstack=4 }
 attributes #3 = { noinline nounwind "no-realign-stack" }
+attributes #4 = { noinline nounwind "frame-pointer"="all"}


        

