[llvm] r285435 - AMDGPU: Fix using incorrect private resource with no allocation

Matt Arsenault via llvm-commits llvm-commits at lists.llvm.org
Fri Oct 28 12:43:31 PDT 2016


Author: arsenm
Date: Fri Oct 28 14:43:31 2016
New Revision: 285435

URL: http://llvm.org/viewvc/llvm-project?rev=285435&view=rev
Log:
AMDGPU: Fix using incorrect private resource with no allocation

It's possible to have a use of the private resource descriptor or
scratch wave offset registers even though there are no allocated
stack objects. This would result in continuing to use the maximum
number of reserved registers. This could exceed the number of SGPRs
available on VI, or violate the SGPR limit requested by
the function attributes.
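
The fix, in essence (a condensed paraphrase of the emitPrologue changes
below, not the verbatim patch): rather than returning early when there
are no stack objects, key the scratch setup on whether the reserved
registers actually have uses.

    // Nothing reserved means nothing to initialize.
    if (ScratchRsrcReg == AMDGPU::NoRegister) {
      assert(ScratchWaveOffsetReg == AMDGPU::NoRegister);
      return;
    }

    // Private accesses (e.g. stores to undef) can exist without any
    // allocated stack objects, so check for real register uses instead
    // of MF.getFrameInfo().hasStackObjects().
    bool OffsetRegUsed = !MRI.use_empty(ScratchWaveOffsetReg);
    bool ResourceRegUsed = !MRI.use_empty(ScratchRsrcReg);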

Modified:
    llvm/trunk/lib/Target/AMDGPU/SIFrameLowering.cpp
    llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp
    llvm/trunk/lib/Target/AMDGPU/SIRegisterInfo.cpp
    llvm/trunk/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll
    llvm/trunk/test/CodeGen/AMDGPU/private-access-no-objects.ll
    llvm/trunk/test/CodeGen/AMDGPU/si-sgpr-spill.ll
    llvm/trunk/test/CodeGen/AMDGPU/spill-alloc-sgpr-init-bug.ll
    llvm/trunk/test/CodeGen/AMDGPU/spill-m0.ll
    llvm/trunk/test/CodeGen/AMDGPU/wqm.ll

Modified: llvm/trunk/lib/Target/AMDGPU/SIFrameLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIFrameLowering.cpp?rev=285435&r1=285434&r2=285435&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIFrameLowering.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIFrameLowering.cpp Fri Oct 28 14:43:31 2016
@@ -21,12 +21,6 @@
 using namespace llvm;
 
 
-static bool hasOnlySGPRSpills(const SIMachineFunctionInfo *FuncInfo,
-                              const MachineFrameInfo &MFI) {
-  return FuncInfo->hasSpilledSGPRs() &&
-    (!FuncInfo->hasSpilledVGPRs() && !FuncInfo->hasNonSpillStackObjects());
-}
-
 static ArrayRef<MCPhysReg> getAllSGPR128(const MachineFunction &MF,
                                          const SIRegisterInfo *TRI) {
   return makeArrayRef(AMDGPU::SGPR_128RegClass.begin(),
@@ -75,7 +69,6 @@ void SIFrameLowering::emitFlatScratchIni
   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
   unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg();
 
-
   // Add wave offset in bytes to private base offset.
   // See comment in AMDKernelCodeT.h for enable_sgpr_flat_scratch_init.
   BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), FlatScrInitLo)
@@ -97,7 +90,8 @@ unsigned SIFrameLowering::getReservedPri
 
   // We need to insert initialization of the scratch resource descriptor.
   unsigned ScratchRsrcReg = MFI->getScratchRSrcReg();
-  assert(ScratchRsrcReg != AMDGPU::NoRegister);
+  if (ScratchRsrcReg == AMDGPU::NoRegister)
+    return AMDGPU::NoRegister;
 
   if (ST.hasSGPRInitBug() ||
       ScratchRsrcReg != TRI->reservedPrivateSegmentBufferReg(MF))
@@ -116,14 +110,17 @@ unsigned SIFrameLowering::getReservedPri
 
   MachineRegisterInfo &MRI = MF.getRegInfo();
 
-  unsigned NumPreloaded = MFI->getNumPreloadedSGPRs() / 4;
+  unsigned NumPreloaded = (MFI->getNumPreloadedSGPRs() + 3) / 4;
+  ArrayRef<MCPhysReg> AllSGPR128s = getAllSGPR128(MF, TRI);
+  AllSGPR128s = AllSGPR128s.slice(
+    std::min(static_cast<unsigned>(AllSGPR128s.size()), NumPreloaded));
+
   // Skip the last 2 elements because the last one is reserved for VCC, and
   // this is the 2nd to last element already.
-  for (MCPhysReg Reg : getAllSGPR128(MF, TRI).drop_back(2).slice(NumPreloaded)) {
+  for (MCPhysReg Reg : AllSGPR128s) {
     // Pick the first unallocated one. Make sure we don't clobber the other
     // reserved input we needed.
-    if (!MRI.isPhysRegUsed(Reg)) {
-      assert(MRI.isAllocatable(Reg));
+    if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg)) {
       MRI.replaceRegWith(ScratchRsrcReg, Reg);
       MFI->setScratchRSrcReg(Reg);
       return Reg;
@@ -146,8 +143,15 @@ unsigned SIFrameLowering::getReservedPri
 
   unsigned ScratchRsrcReg = MFI->getScratchRSrcReg();
   MachineRegisterInfo &MRI = MF.getRegInfo();
+
   unsigned NumPreloaded = MFI->getNumPreloadedSGPRs();
 
+  ArrayRef<MCPhysReg> AllSGPRs = getAllSGPRs(MF, TRI);
+  if (NumPreloaded > AllSGPRs.size())
+    return ScratchWaveOffsetReg;
+
+  AllSGPRs = AllSGPRs.slice(NumPreloaded);
+
   // We need to drop registers from the end of the list that we cannot use
   // for the scratch wave offset.
   // + 2 s102 and s103 do not exist on VI.
@@ -161,7 +165,10 @@ unsigned SIFrameLowering::getReservedPri
   //     are no other free SGPRs, then the value will stay in this register.
   // ----
   //  13
-  for (MCPhysReg Reg : getAllSGPRs(MF, TRI).drop_back(13).slice(NumPreloaded)) {
+  if (AllSGPRs.size() < 13)
+    return ScratchWaveOffsetReg;
+
+  for (MCPhysReg Reg : AllSGPRs.drop_back(13)) {
     // Pick the first unallocated SGPR. Be careful not to pick an alias of the
     // scratch descriptor, since we haven't added its uses yet.
     if (!MRI.isPhysRegUsed(Reg)) {
@@ -186,9 +193,6 @@ void SIFrameLowering::emitPrologue(Machi
   if (ST.debuggerEmitPrologue())
     emitDebuggerPrologue(MF, MBB);
 
-  if (!MF.getFrameInfo().hasStackObjects())
-    return;
-
   assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported");
 
   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
@@ -198,8 +202,6 @@ void SIFrameLowering::emitPrologue(Machi
   //
   // FIXME: We should be cleaning up these unused SGPR spill frame indices
   // somewhere.
-  if (hasOnlySGPRSpills(MFI, MF.getFrameInfo()))
-    return;
 
   const SIInstrInfo *TII = ST.getInstrInfo();
   const SIRegisterInfo *TRI = &TII->getRegisterInfo();
@@ -209,38 +211,51 @@ void SIFrameLowering::emitPrologue(Machi
     = getReservedPrivateSegmentBufferReg(ST, TII, TRI, MFI, MF);
   unsigned ScratchWaveOffsetReg
     = getReservedPrivateSegmentWaveByteOffsetReg(ST, TII, TRI, MFI, MF);
-  assert(ScratchRsrcReg != AMDGPU::NoRegister);
-  assert(ScratchWaveOffsetReg != AMDGPU::NoRegister);
+
+  if (ScratchRsrcReg == AMDGPU::NoRegister) {
+    assert(ScratchWaveOffsetReg == AMDGPU::NoRegister);
+    return;
+  }
+
   assert(!TRI->isSubRegister(ScratchRsrcReg, ScratchWaveOffsetReg));
 
-  if (MFI->hasFlatScratchInit())
+  // We need to do the replacement of the private segment buffer and wave offset
+  // registers even if there are no stack objects. There could be stores to
+  // undef or to a constant address without an associated object.
+
+  // FIXME: We still have implicit uses on SGPR spill instructions in case they
+  // need to spill to vector memory. It's likely that this will not happen, but at
+  // this point it appears we need the setup. This part of the prolog should be
+  // emitted after frame indices are eliminated.
+
+  if (MF.getFrameInfo().hasStackObjects() && MFI->hasFlatScratchInit())
     emitFlatScratchInit(TII, TRI, MF, MBB);
 
   // We need to insert initialization of the scratch resource descriptor.
   unsigned PreloadedScratchWaveOffsetReg = TRI->getPreloadedValue(
     MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
 
+
   unsigned PreloadedPrivateBufferReg = AMDGPU::NoRegister;
   if (ST.isAmdCodeObjectV2()) {
     PreloadedPrivateBufferReg = TRI->getPreloadedValue(
       MF, SIRegisterInfo::PRIVATE_SEGMENT_BUFFER);
   }
 
-  // If we reserved the original input registers, we don't need to copy to the
-  // reserved registers.
-  if (ScratchRsrcReg == PreloadedPrivateBufferReg) {
-    // We should always reserve these 5 registers at the same time.
-    assert(ScratchWaveOffsetReg == PreloadedScratchWaveOffsetReg &&
-           "scratch wave offset and private segment buffer inconsistent");
-    return;
-  }
+  bool OffsetRegUsed = !MRI.use_empty(ScratchWaveOffsetReg);
+  bool ResourceRegUsed = !MRI.use_empty(ScratchRsrcReg);
 
   // We added live-ins during argument lowering, but since they were not used
   // they were deleted. We're adding the uses now, so add them back.
-  MRI.addLiveIn(PreloadedScratchWaveOffsetReg);
-  MBB.addLiveIn(PreloadedScratchWaveOffsetReg);
+  if (OffsetRegUsed) {
+    assert(PreloadedScratchWaveOffsetReg != AMDGPU::NoRegister &&
+           "scratch wave offset input is required");
+    MRI.addLiveIn(PreloadedScratchWaveOffsetReg);
+    MBB.addLiveIn(PreloadedScratchWaveOffsetReg);
+  }
 
-  if (ST.isAmdCodeObjectV2()) {
+  if (ResourceRegUsed && PreloadedPrivateBufferReg != AMDGPU::NoRegister) {
+    assert(ST.isAmdCodeObjectV2());
     MRI.addLiveIn(PreloadedPrivateBufferReg);
     MBB.addLiveIn(PreloadedPrivateBufferReg);
   }
@@ -250,30 +265,46 @@ void SIFrameLowering::emitPrologue(Machi
     if (&OtherBB == &MBB)
       continue;
 
-    OtherBB.addLiveIn(ScratchRsrcReg);
-    OtherBB.addLiveIn(ScratchWaveOffsetReg);
+    if (OffsetRegUsed)
+      OtherBB.addLiveIn(ScratchWaveOffsetReg);
+
+    if (ResourceRegUsed)
+      OtherBB.addLiveIn(ScratchRsrcReg);
   }
 
   DebugLoc DL;
   MachineBasicBlock::iterator I = MBB.begin();
 
-  if (PreloadedScratchWaveOffsetReg != ScratchWaveOffsetReg) {
-    // Make sure we emit the copy for the offset first. We may have chosen to
-    // copy the buffer resource into a register that aliases the input offset
-    // register.
+  // If we reserved the original input registers, we don't need to copy to the
+  // reserved registers.
+
+  bool CopyBuffer = ResourceRegUsed &&
+    PreloadedPrivateBufferReg != AMDGPU::NoRegister &&
+    ScratchRsrcReg != PreloadedPrivateBufferReg;
+
+  // This needs to be careful of the copying order to avoid overwriting one of
+  // the input registers before it's been copied to its final
+  // destination. Usually the offset should be copied first.
+  bool CopyBufferFirst = TRI->isSubRegisterEq(PreloadedPrivateBufferReg,
+                                              ScratchWaveOffsetReg);
+  if (CopyBuffer && CopyBufferFirst) {
+    BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchRsrcReg)
+      .addReg(PreloadedPrivateBufferReg, RegState::Kill);
+  }
+
+  if (OffsetRegUsed &&
+      PreloadedScratchWaveOffsetReg != ScratchWaveOffsetReg) {
     BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchWaveOffsetReg)
       .addReg(PreloadedScratchWaveOffsetReg, RegState::Kill);
   }
 
-  if (ST.isAmdCodeObjectV2()) {
-    // Insert copies from argument register.
-    assert(
-      !TRI->isSubRegisterEq(PreloadedPrivateBufferReg, ScratchRsrcReg) &&
-      !TRI->isSubRegisterEq(PreloadedPrivateBufferReg, ScratchWaveOffsetReg));
-
+  if (CopyBuffer && !CopyBufferFirst) {
     BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchRsrcReg)
       .addReg(PreloadedPrivateBufferReg, RegState::Kill);
-  } else {
+  }
+
+  if (ResourceRegUsed && PreloadedPrivateBufferReg == AMDGPU::NoRegister) {
+    assert(!ST.isAmdCodeObjectV2());
     const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);
 
     unsigned Rsrc0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
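
An aside on the ArrayRef handling introduced above: slice(N) asserts when
N exceeds the array size, which is why the candidate list is now clamped
(or the function returns early) before slicing off the preloaded
registers. A self-contained illustration of the pattern, with a
hypothetical helper name:

    #include "llvm/ADT/ArrayRef.h"
    #include <algorithm>

    // Drop the preloaded registers from the candidate list without
    // tripping ArrayRef::slice's bounds assertion when NumPreloaded
    // exceeds the number of candidates.
    static llvm::ArrayRef<unsigned> dropPreloaded(llvm::ArrayRef<unsigned> Regs,
                                                  unsigned NumPreloaded) {
      return Regs.slice(
          std::min(static_cast<unsigned>(Regs.size()), NumPreloaded));
    }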

Modified: llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp?rev=285435&r1=285434&r2=285435&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp Fri Oct 28 14:43:31 2016
@@ -614,7 +614,12 @@ void SIInstrInfo::storeRegToStackSlot(Ma
     BuildMI(MBB, MI, DL, OpDesc)
       .addReg(SrcReg, getKillRegState(isKill)) // data
       .addFrameIndex(FrameIndex)               // addr
-      .addMemOperand(MMO);
+      .addMemOperand(MMO)
+      .addReg(MFI->getScratchRSrcReg(), RegState::Implicit)
+      .addReg(MFI->getScratchWaveOffsetReg(), RegState::Implicit);
+    // Add the scratch resource registers as implicit uses because we may end up
+    // needing them, and need to ensure that the reserved registers are
+    // correctly handled.
 
     return;
   }
@@ -707,7 +712,9 @@ void SIInstrInfo::loadRegFromStackSlot(M
 
     BuildMI(MBB, MI, DL, OpDesc, DestReg)
       .addFrameIndex(FrameIndex) // addr
-      .addMemOperand(MMO);
+      .addMemOperand(MMO)
+      .addReg(MFI->getScratchRSrcReg(), RegState::Implicit)
+      .addReg(MFI->getScratchWaveOffsetReg(), RegState::Implicit);
 
     return;
   }
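
The RegState::Implicit operands added above are what make the spill
pseudos count as uses of the scratch setup registers, which in turn is
what the new !MRI.use_empty() checks in SIFrameLowering observe. A
hypothetical query (not from the patch) showing the connection:

    #include "llvm/CodeGen/MachineInstr.h"
    #include "llvm/Target/TargetRegisterInfo.h"

    // After storeRegToStackSlot adds the implicit operands, the spill
    // pseudo reads the scratch resource register.
    static bool usesScratchRsrc(const llvm::MachineInstr &MI,
                                unsigned ScratchRsrcReg,
                                const llvm::TargetRegisterInfo *TRI) {
      return MI.readsRegister(ScratchRsrcReg, TRI);
    }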

Modified: llvm/trunk/lib/Target/AMDGPU/SIRegisterInfo.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIRegisterInfo.cpp?rev=285435&r1=285434&r2=285435&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIRegisterInfo.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIRegisterInfo.cpp Fri Oct 28 14:43:31 2016
@@ -1059,9 +1059,20 @@ unsigned SIRegisterInfo::getMaxNumSGPRs(
       F, "amdgpu-num-sgpr", MaxNumSGPRs);
 
     // Make sure requested value does not violate subtarget's specifications.
-    if (Requested && Requested <= getNumReservedSGPRs(ST))
+    if (Requested && (Requested <= getNumReservedSGPRs(ST)))
       Requested = 0;
 
+    // If more SGPRs are required to support the input user/system SGPRs,
+    // increase to accommodate them.
+    //
+    // FIXME: This really ends up using the requested number of SGPRs + number
+    // of reserved special registers in total. Theoretically you could re-use
+    // the last input registers for these special registers, but this would
+    // require a lot of complexity to deal with the weird aliasing.
+    unsigned NumInputSGPRs = MFI.getNumPreloadedSGPRs();
+    if (Requested && Requested < NumInputSGPRs)
+      Requested = NumInputSGPRs;
+
     // Make sure requested value is compatible with values implied by
     // default/requested minimum/maximum number of waves per execution unit.
     if (Requested && Requested > getMaxNumSGPRs(ST, WavesPerEU.first))
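
The net effect of the two adjustments above is roughly the following
standalone sketch (hypothetical helper; the real code additionally
clamps against the waves-per-EU limits shown in the context lines):

    // A request at or below the reserved count is ignored; a request
    // smaller than the preloaded input SGPRs is raised to cover them.
    static unsigned clampRequestedSGPRs(unsigned Requested,
                                        unsigned NumReserved,
                                        unsigned NumInputSGPRs) {
      if (Requested && Requested <= NumReserved)
        return 0;
      if (Requested && Requested < NumInputSGPRs)
        return NumInputSGPRs;
      return Requested;
    }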

Modified: llvm/trunk/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll?rev=285435&r1=285434&r2=285435&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll Fri Oct 28 14:43:31 2016
@@ -1,9 +1,16 @@
 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck %s
 
-; CHECK-LABEL: {{^}}max_18_sgprs:
+; CHECK-LABEL: {{^}}max_14_sgprs:
+
+; FIXME: Should be able to skip this copying of the private segment
+; buffer because all the SGPR spills are to VGPRs.
+
+; CHECK: s_mov_b64 s[6:7], s[2:3]
+; CHECK: s_mov_b64 s[4:5], s[0:1]
+
 ; CHECK: SGPRBlocks: 1
-; CHECK: NumSGPRsForWavesPerEU: 13
-define void @max_18_sgprs(i32 addrspace(1)* %out1,
+; CHECK: NumSGPRsForWavesPerEU: 14
+define void @max_14_sgprs(i32 addrspace(1)* %out1,
                           i32 addrspace(1)* %out2,
                           i32 addrspace(1)* %out3,
                           i32 addrspace(1)* %out4,
@@ -14,4 +21,102 @@ define void @max_18_sgprs(i32 addrspace(
   store i32 %four, i32 addrspace(1)* %out4
   ret void
 }
-attributes #0 = {"amdgpu-num-sgpr"="18"}
+
+; private resource: 4
+; scratch wave offset: 1
+; workgroup ids: 3
+; dispatch id: 2
+; queue ptr: 2
+; flat scratch init: 2
+; ---------------------
+; total: 14
+
+; + reserved vcc, flat_scratch = 18
+
+; Because we can't re-use the last few input registers as the special
+; vcc etc. registers (or decide not to use unused features once the
+; number of registers is frozen), this ends up using more than
+; expected.
+
+; ALL-LABEL: {{^}}max_12_sgprs_14_input_sgprs:
+; TOSGPR: SGPRBlocks: 2
+; TOSGPR: NumSGPRsForWavesPerEU: 18
+
+; TOSMEM: s_mov_b64 s[6:7], s[2:3]
+; TOSMEM: s_mov_b32 s9, s13
+; TOSMEM: s_mov_b64 s[4:5], s[0:1]
+
+; TOSMEM: SGPRBlocks: 2
+; TOSMEM: NumSGPRsForWavesPerEU: 18
+define void @max_12_sgprs_14_input_sgprs(i32 addrspace(1)* %out1,
+                                        i32 addrspace(1)* %out2,
+                                        i32 addrspace(1)* %out3,
+                                        i32 addrspace(1)* %out4,
+                                        i32 %one, i32 %two, i32 %three, i32 %four) #2 {
+  store volatile i32 0, i32* undef
+  %x.0 = call i32 @llvm.amdgcn.workgroup.id.x()
+  store volatile i32 %x.0, i32 addrspace(1)* undef
+  %x.1 = call i32 @llvm.amdgcn.workgroup.id.y()
+  store volatile i32 %x.0, i32 addrspace(1)* undef
+  %x.2 = call i32 @llvm.amdgcn.workgroup.id.z()
+  store volatile i32 %x.0, i32 addrspace(1)* undef
+  %x.3 = call i64 @llvm.amdgcn.dispatch.id()
+  store volatile i64 %x.3, i64 addrspace(1)* undef
+  %x.4 = call i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr()
+  store volatile i8 addrspace(2)* %x.4, i8 addrspace(2)* addrspace(1)* undef
+  %x.5 = call i8 addrspace(2)* @llvm.amdgcn.queue.ptr()
+  store volatile i8 addrspace(2)* %x.5, i8 addrspace(2)* addrspace(1)* undef
+
+  store i32 %one, i32 addrspace(1)* %out1
+  store i32 %two, i32 addrspace(1)* %out2
+  store i32 %three, i32 addrspace(1)* %out3
+  store i32 %four, i32 addrspace(1)* %out4
+  ret void
+}
+
+; ALL-LABEL: max_12_sgprs_12_input_sgprs{{$}}
+; Make sure copies for the input buffer are not clobbered. This
+; requires swapping the register copy order from what normally
+; happens.
+
+; TOSMEM: s_mov_b64 s[6:7], s[2:3]
+; TOSMEM: s_mov_b64 s[4:5], s[0:1]
+; TOSMEM: s_mov_b32 s3, s11
+
+; ALL: SGPRBlocks: 1
+; ALL: NumSGPRsForWavesPerEU: 16
+define void @max_12_sgprs_12_input_sgprs(i32 addrspace(1)* %out1,
+                                        i32 addrspace(1)* %out2,
+                                        i32 addrspace(1)* %out3,
+                                        i32 addrspace(1)* %out4,
+                                        i32 %one, i32 %two, i32 %three, i32 %four) #2 {
+  store volatile i32 0, i32* undef
+  %x.0 = call i32 @llvm.amdgcn.workgroup.id.x()
+  store volatile i32 %x.0, i32 addrspace(1)* undef
+  %x.1 = call i32 @llvm.amdgcn.workgroup.id.y()
+  store volatile i32 %x.0, i32 addrspace(1)* undef
+  %x.2 = call i32 @llvm.amdgcn.workgroup.id.z()
+  store volatile i32 %x.0, i32 addrspace(1)* undef
+  %x.3 = call i64 @llvm.amdgcn.dispatch.id()
+  store volatile i64 %x.3, i64 addrspace(1)* undef
+  %x.4 = call i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr()
+  store volatile i8 addrspace(2)* %x.4, i8 addrspace(2)* addrspace(1)* undef
+
+  store i32 %one, i32 addrspace(1)* %out1
+  store i32 %two, i32 addrspace(1)* %out2
+  store i32 %three, i32 addrspace(1)* %out3
+  store i32 %four, i32 addrspace(1)* %out4
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workgroup.id.x() #1
+declare i32 @llvm.amdgcn.workgroup.id.y() #1
+declare i32 @llvm.amdgcn.workgroup.id.z() #1
+declare i64 @llvm.amdgcn.dispatch.id() #1
+declare i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #1
+declare i8 addrspace(2)* @llvm.amdgcn.queue.ptr() #1
+
+attributes #0 = { nounwind "amdgpu-num-sgpr"="14" }
+attributes #1 = { nounwind readnone }
+attributes #2 = { nounwind "amdgpu-num-sgpr"="12" }
+attributes #3 = { nounwind "amdgpu-num-sgpr"="11" }

Modified: llvm/trunk/test/CodeGen/AMDGPU/private-access-no-objects.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/private-access-no-objects.ll?rev=285435&r1=285434&r2=285435&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/private-access-no-objects.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/private-access-no-objects.ll Fri Oct 28 14:43:31 2016
@@ -1,6 +1,17 @@
-; RUN: llc -O0 -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=OPTNONE %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=OPT %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=OPT %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=iceland -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=OPT %s
+; RUN: llc -O0 -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=OPTNONE %s
+
+; There are no stack objects, but still a private memory access. The
+; private access registers need to be correctly initialized anyway, and
+; shifted down to the end of the used registers.
 
 ; GCN-LABEL: {{^}}store_to_undef:
+; OPT-DAG: s_mov_b64 s{{\[}}[[RSRC_LO:[0-9]+]]:{{[0-9]+\]}}, s[0:1]
+; OPT-DAG: s_mov_b64 s{{\[[0-9]+}}:[[RSRC_HI:[0-9]+]]{{\]}}, s[2:3]
+; OPT-DAG: s_mov_b32 [[SOFFSET:s[0-9]+]], s7{{$}}
+; OPT: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[}}[[RSRC_LO]]:[[RSRC_HI]]{{\]}}, [[SOFFSET]] offen{{$}}
 
 ; -O0 should assume spilling, so the input scratch resource descriptor
 ; should be used directly without any copies.
@@ -13,18 +24,30 @@ define void @store_to_undef() #0 {
 }
 
 ; GCN-LABEL: {{^}}store_to_inttoptr:
+; OPT-DAG: s_mov_b64 s{{\[}}[[RSRC_LO:[0-9]+]]:{{[0-9]+\]}}, s[0:1]
+; OPT-DAG: s_mov_b64 s{{\[[0-9]+}}:[[RSRC_HI:[0-9]+]]{{\]}}, s[2:3]
+; OPT-DAG: s_mov_b32 [[SOFFSET:s[0-9]+]], s7{{$}}
+; OPT: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[}}[[RSRC_LO]]:[[RSRC_HI]]{{\]}}, [[SOFFSET]] offen{{$}}
 define void @store_to_inttoptr() #0 {
  store volatile i32 0, i32* inttoptr (i32 123 to i32*)
  ret void
 }
 
 ; GCN-LABEL: {{^}}load_from_undef:
+; OPT-DAG: s_mov_b64 s{{\[}}[[RSRC_LO:[0-9]+]]:{{[0-9]+\]}}, s[0:1]
+; OPT-DAG: s_mov_b64 s{{\[[0-9]+}}:[[RSRC_HI:[0-9]+]]{{\]}}, s[2:3]
+; OPT-DAG: s_mov_b32 [[SOFFSET:s[0-9]+]], s7{{$}}
+; OPT: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[}}[[RSRC_LO]]:[[RSRC_HI]]{{\]}}, [[SOFFSET]] offen{{$}}
 define void @load_from_undef() #0 {
   %ld = load volatile i32, i32* undef
   ret void
 }
 
 ; GCN-LABEL: {{^}}load_from_inttoptr:
+; OPT-DAG: s_mov_b64 s{{\[}}[[RSRC_LO:[0-9]+]]:{{[0-9]+\]}}, s[0:1]
+; OPT-DAG: s_mov_b64 s{{\[[0-9]+}}:[[RSRC_HI:[0-9]+]]{{\]}}, s[2:3]
+; OPT-DAG: s_mov_b32 [[SOFFSET:s[0-9]+]], s7{{$}}
+; OPT: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[}}[[RSRC_LO]]:[[RSRC_HI]]{{\]}}, [[SOFFSET]] offen{{$}}
 define void @load_from_inttoptr() #0 {
   %ld = load volatile i32, i32* inttoptr (i32 123 to i32*)
   ret void

Modified: llvm/trunk/test/CodeGen/AMDGPU/si-sgpr-spill.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/si-sgpr-spill.ll?rev=285435&r1=285434&r2=285435&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/si-sgpr-spill.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/si-sgpr-spill.ll Fri Oct 28 14:43:31 2016
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -march=amdgcn -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck %s
 
 ; These tests check that the compiler won't crash when it needs to spill
 ; SGPRs.

Modified: llvm/trunk/test/CodeGen/AMDGPU/spill-alloc-sgpr-init-bug.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/spill-alloc-sgpr-init-bug.ll?rev=285435&r1=285434&r2=285435&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/spill-alloc-sgpr-init-bug.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/spill-alloc-sgpr-init-bug.ll Fri Oct 28 14:43:31 2016
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck --check-prefix=TONGA %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck --check-prefix=TONGA %s
 
 ; On Tonga and Iceland, limited SGPR availability means care must be taken to
 ; allocate scratch registers correctly. Check that this test compiles without

Modified: llvm/trunk/test/CodeGen/AMDGPU/spill-m0.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/spill-m0.ll?rev=285435&r1=285434&r2=285435&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/spill-m0.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/spill-m0.ll Fri Oct 28 14:43:31 2016
@@ -6,6 +6,8 @@
 ; XXX - Why does it like to use vcc?
 
 ; GCN-LABEL: {{^}}spill_m0:
+; TOSMEM: s_mov_b32 s88, SCRATCH_RSRC_DWORD0
+
 ; GCN: s_cmp_lg_u32
 
 ; TOVGPR: s_mov_b32 vcc_hi, m0

Modified: llvm/trunk/test/CodeGen/AMDGPU/wqm.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/wqm.ll?rev=285435&r1=285434&r2=285435&view=diff
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/wqm.ll (original)
+++ llvm/trunk/test/CodeGen/AMDGPU/wqm.ll Fri Oct 28 14:43:31 2016
@@ -459,7 +459,7 @@ entry:
   br i1 %cc, label %if, label %else
 
 if:
-  store volatile <4 x float> %dtex, <4 x float>* undef
+  store volatile <4 x float> %dtex, <4 x float> addrspace(1)* undef
   unreachable
 
 else:



