[llvm] r365833 - [AMDGPU] gfx908 agpr spilling

Stanislav Mekhanoshin via llvm-commits llvm-commits at lists.llvm.org
Thu Jul 11 14:54:13 PDT 2019


Author: rampitec
Date: Thu Jul 11 14:54:13 2019
New Revision: 365833

URL: http://llvm.org/viewvc/llvm-project?rev=365833&view=rev
Log:
[AMDGPU] gfx908 agpr spilling

Add spill/restore pseudo instructions for AGPRs and for 1024-bit SGPRs and
VGPRs, and teach SILowerSGPRSpills to spill VGPRs into free AGPRs (and AGPRs
into free VGPRs) on subtargets with MAI instructions instead of using scratch
memory where possible. VGPR-to-AGPR spilling is guarded by the new
-amdgpu-spill-vgpr-to-agpr option (default on). AGPR spills that still have
to go through scratch are staged through a temporary VGPR using
v_accvgpr_read_b32/v_accvgpr_write_b32.

Differential Revision: https://reviews.llvm.org/D64594

Added:
    llvm/trunk/test/CodeGen/AMDGPU/spill-agpr.ll
    llvm/trunk/test/CodeGen/AMDGPU/spill-vgpr-to-agpr.ll
Modified:
    llvm/trunk/lib/Target/AMDGPU/SIFrameLowering.cpp
    llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp
    llvm/trunk/lib/Target/AMDGPU/SIInstructions.td
    llvm/trunk/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
    llvm/trunk/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
    llvm/trunk/lib/Target/AMDGPU/SIMachineFunctionInfo.h
    llvm/trunk/lib/Target/AMDGPU/SIRegisterInfo.cpp

Modified: llvm/trunk/lib/Target/AMDGPU/SIFrameLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIFrameLowering.cpp?rev=365833&r1=365832&r2=365833&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIFrameLowering.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIFrameLowering.cpp Thu Jul 11 14:54:13 2019
@@ -913,7 +913,6 @@ static bool allStackObjectsAreDead(const
   return true;
 }
 
-
 #ifndef NDEBUG
 static bool allSGPRSpillsAreDead(const MachineFrameInfo &MFI,
                                  Optional<int> FramePointerSaveIndex) {
@@ -947,6 +946,7 @@ void SIFrameLowering::processFunctionBef
   const SIRegisterInfo *TRI = ST.getRegisterInfo();
   SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
 
+  FuncInfo->removeDeadFrameIndices(MFI);
   assert(allSGPRSpillsAreDead(MFI, None) &&
          "SGPR spill should have been removed in SILowerSGPRSpills");
 

Modified: llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp?rev=365833&r1=365832&r2=365833&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp Thu Jul 11 14:54:13 2019
@@ -976,6 +976,8 @@ static unsigned getSGPRSpillSaveOpcode(u
     return AMDGPU::SI_SPILL_S256_SAVE;
   case 64:
     return AMDGPU::SI_SPILL_S512_SAVE;
+  case 128:
+    return AMDGPU::SI_SPILL_S1024_SAVE;
   default:
     llvm_unreachable("unknown register size");
   }
@@ -997,6 +999,25 @@ static unsigned getVGPRSpillSaveOpcode(u
     return AMDGPU::SI_SPILL_V256_SAVE;
   case 64:
     return AMDGPU::SI_SPILL_V512_SAVE;
+  case 128:
+    return AMDGPU::SI_SPILL_V1024_SAVE;
+  default:
+    llvm_unreachable("unknown register size");
+  }
+}
+
+static unsigned getAGPRSpillSaveOpcode(unsigned Size) {
+  switch (Size) {
+  case 4:
+    return AMDGPU::SI_SPILL_A32_SAVE;
+  case 8:
+    return AMDGPU::SI_SPILL_A64_SAVE;
+  case 16:
+    return AMDGPU::SI_SPILL_A128_SAVE;
+  case 64:
+    return AMDGPU::SI_SPILL_A512_SAVE;
+  case 128:
+    return AMDGPU::SI_SPILL_A1024_SAVE;
   default:
     llvm_unreachable("unknown register size");
   }
@@ -1055,17 +1076,22 @@ void SIInstrInfo::storeRegToStackSlot(Ma
     return;
   }
 
-  assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected");
-
-  unsigned Opcode = getVGPRSpillSaveOpcode(SpillSize);
+  unsigned Opcode = RI.hasAGPRs(RC) ? getAGPRSpillSaveOpcode(SpillSize)
+                                    : getVGPRSpillSaveOpcode(SpillSize);
   MFI->setHasSpilledVGPRs();
-  BuildMI(MBB, MI, DL, get(Opcode))
-    .addReg(SrcReg, getKillRegState(isKill)) // data
-    .addFrameIndex(FrameIndex)               // addr
-    .addReg(MFI->getScratchRSrcReg())        // scratch_rsrc
-    .addReg(MFI->getStackPtrOffsetReg())     // scratch_offset
-    .addImm(0)                               // offset
-    .addMemOperand(MMO);
+
+  auto MIB = BuildMI(MBB, MI, DL, get(Opcode));
+  if (RI.hasAGPRs(RC)) {
+    MachineRegisterInfo &MRI = MF->getRegInfo();
+    unsigned Tmp = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+    MIB.addReg(Tmp, RegState::Define);
+  }
+  MIB.addReg(SrcReg, getKillRegState(isKill)) // data
+     .addFrameIndex(FrameIndex)               // addr
+     .addReg(MFI->getScratchRSrcReg())        // scratch_rsrc
+     .addReg(MFI->getStackPtrOffsetReg())     // scratch_offset
+     .addImm(0)                               // offset
+     .addMemOperand(MMO);
 }
 
 static unsigned getSGPRSpillRestoreOpcode(unsigned Size) {
@@ -1084,6 +1110,8 @@ static unsigned getSGPRSpillRestoreOpcod
     return AMDGPU::SI_SPILL_S256_RESTORE;
   case 64:
     return AMDGPU::SI_SPILL_S512_RESTORE;
+  case 128:
+    return AMDGPU::SI_SPILL_S1024_RESTORE;
   default:
     llvm_unreachable("unknown register size");
   }
@@ -1105,6 +1133,25 @@ static unsigned getVGPRSpillRestoreOpcod
     return AMDGPU::SI_SPILL_V256_RESTORE;
   case 64:
     return AMDGPU::SI_SPILL_V512_RESTORE;
+  case 128:
+    return AMDGPU::SI_SPILL_V1024_RESTORE;
+  default:
+    llvm_unreachable("unknown register size");
+  }
+}
+
+static unsigned getAGPRSpillRestoreOpcode(unsigned Size) {
+  switch (Size) {
+  case 4:
+    return AMDGPU::SI_SPILL_A32_RESTORE;
+  case 8:
+    return AMDGPU::SI_SPILL_A64_RESTORE;
+  case 16:
+    return AMDGPU::SI_SPILL_A128_RESTORE;
+  case 64:
+    return AMDGPU::SI_SPILL_A512_RESTORE;
+  case 128:
+    return AMDGPU::SI_SPILL_A1024_RESTORE;
   default:
     llvm_unreachable("unknown register size");
   }
@@ -1156,15 +1203,19 @@ void SIInstrInfo::loadRegFromStackSlot(M
     return;
   }
 
-  assert(RI.hasVGPRs(RC) && "Only VGPR spilling expected");
-
-  unsigned Opcode = getVGPRSpillRestoreOpcode(SpillSize);
-  BuildMI(MBB, MI, DL, get(Opcode), DestReg)
-    .addFrameIndex(FrameIndex)           // vaddr
-    .addReg(MFI->getScratchRSrcReg())    // scratch_rsrc
-    .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset
-    .addImm(0)                           // offset
-    .addMemOperand(MMO);
+  unsigned Opcode = RI.hasAGPRs(RC) ? getAGPRSpillRestoreOpcode(SpillSize)
+                                    : getVGPRSpillRestoreOpcode(SpillSize);
+  auto MIB = BuildMI(MBB, MI, DL, get(Opcode), DestReg);
+  if (RI.hasAGPRs(RC)) {
+    MachineRegisterInfo &MRI = MF->getRegInfo();
+    unsigned Tmp = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+    MIB.addReg(Tmp, RegState::Define);
+  }
+  MIB.addFrameIndex(FrameIndex)        // vaddr
+     .addReg(MFI->getScratchRSrcReg()) // scratch_rsrc
+     .addReg(MFI->getStackPtrOffsetReg()) // scratch_offset
+     .addImm(0)                           // offset
+     .addMemOperand(MMO);
 }
 
 /// \param @Offset Offset in bytes of the FrameIndex being spilled

Modified: llvm/trunk/lib/Target/AMDGPU/SIInstructions.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIInstructions.td?rev=365833&r1=365832&r2=365833&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIInstructions.td (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIInstructions.td Thu Jul 11 14:54:13 2019
@@ -513,6 +513,7 @@ defm SI_SPILL_S128 : SI_SPILL_SGPR <SReg
 defm SI_SPILL_S160 : SI_SPILL_SGPR <SReg_160>;
 defm SI_SPILL_S256 : SI_SPILL_SGPR <SReg_256>;
 defm SI_SPILL_S512 : SI_SPILL_SGPR <SReg_512>;
+defm SI_SPILL_S1024 : SI_SPILL_SGPR <SReg_1024>;
 
 multiclass SI_SPILL_VGPR <RegisterClass vgpr_class> {
   let UseNamedOperandTable = 1, VGPRSpill = 1,
@@ -524,7 +525,9 @@ multiclass SI_SPILL_VGPR <RegisterClass
       let mayStore = 1;
       let mayLoad = 0;
       // (2 * 4) + (8 * num_subregs) bytes maximum
-      let Size = !add(!shl(!srl(vgpr_class.Size, 5), 3), 8);
+      int MaxSize = !add(!shl(!srl(vgpr_class.Size, 5), 3), 8);
+      // Size field is unsigned char and cannot fit more.
+      let Size = !if(!le(MaxSize, 256), MaxSize, 252);
     }
 
     def _RESTORE : VPseudoInstSI <
@@ -535,7 +538,9 @@ multiclass SI_SPILL_VGPR <RegisterClass
       let mayLoad = 1;
 
       // (2 * 4) + (8 * num_subregs) bytes maximum
-      let Size = !add(!shl(!srl(vgpr_class.Size, 5), 3), 8);
+      int MaxSize = !add(!shl(!srl(vgpr_class.Size, 5), 3), 8);
+      // Size field is unsigned char and cannot fit more.
+      let Size = !if(!le(MaxSize, 256), MaxSize, 252);
     }
   } // End UseNamedOperandTable = 1, VGPRSpill = 1, SchedRW = [WriteVMEM]
 }
@@ -547,6 +552,44 @@ defm SI_SPILL_V128 : SI_SPILL_VGPR <VReg
 defm SI_SPILL_V160 : SI_SPILL_VGPR <VReg_160>;
 defm SI_SPILL_V256 : SI_SPILL_VGPR <VReg_256>;
 defm SI_SPILL_V512 : SI_SPILL_VGPR <VReg_512>;
+defm SI_SPILL_V1024 : SI_SPILL_VGPR <VReg_1024>;
+
+multiclass SI_SPILL_AGPR <RegisterClass vgpr_class> {
+  let UseNamedOperandTable = 1, VGPRSpill = 1,
+      Constraints = "@earlyclobber $tmp",
+      SchedRW = [WriteVMEM] in {
+    def _SAVE : VPseudoInstSI <
+      (outs VGPR_32:$tmp),
+      (ins vgpr_class:$vdata, i32imm:$vaddr, SReg_128:$srsrc,
+           SReg_32:$soffset, i32imm:$offset)> {
+      let mayStore = 1;
+      let mayLoad = 0;
+      // (2 * 4) + (16 * num_subregs) bytes maximum
+      int MaxSize = !add(!shl(!srl(vgpr_class.Size, 5), 4), 8);
+      // Size field is unsigned char and cannot fit more.
+      let Size = !if(!le(MaxSize, 256), MaxSize, 252);
+    }
+
+    def _RESTORE : VPseudoInstSI <
+      (outs vgpr_class:$vdata, VGPR_32:$tmp),
+      (ins i32imm:$vaddr, SReg_128:$srsrc, SReg_32:$soffset,
+           i32imm:$offset)> {
+      let mayStore = 0;
+      let mayLoad = 1;
+
+      // (2 * 4) + (16 * num_subregs) bytes maximum
+      int MaxSize = !add(!shl(!srl(vgpr_class.Size, 5), 4), 8);
+      // Size field is unsigned char and cannot fit more.
+      let Size = !if(!le(MaxSize, 256), MaxSize, 252);
+    }
+  } // End UseNamedOperandTable = 1, VGPRSpill = 1, SchedRW = [WriteVMEM]
+}
+
+defm SI_SPILL_A32  : SI_SPILL_AGPR <AGPR_32>;
+defm SI_SPILL_A64  : SI_SPILL_AGPR <AReg_64>;
+defm SI_SPILL_A128 : SI_SPILL_AGPR <AReg_128>;
+defm SI_SPILL_A512 : SI_SPILL_AGPR <AReg_512>;
+defm SI_SPILL_A1024 : SI_SPILL_AGPR <AReg_1024>;
 
 def SI_PC_ADD_REL_OFFSET : SPseudoInstSI <
   (outs SReg_64:$dst),

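For reference, the Size computation in the spill multiclasses above works out
as follows: each 32-bit subregister of the spilled class costs 8 encoded bytes
in the plain VGPR case and 16 bytes in the AGPR case (per the comments), plus
8 bytes of fixed overhead, and the result is clamped because the pseudo's Size
field is an unsigned char. A standalone sketch of the arithmetic (not LLVM
code, hypothetical helper name):

  // Standalone sketch of the TableGen Size formula above (not LLVM code).
  #include <cstdio>

  unsigned spillPseudoSize(unsigned RegClassSizeInBits, unsigned BytesPerSubReg) {
    unsigned NumSubRegs = RegClassSizeInBits / 32;       // !srl(vgpr_class.Size, 5)
    unsigned MaxSize = 8 + BytesPerSubReg * NumSubRegs;  // !add(!shl(...), 8)
    return MaxSize <= 256 ? MaxSize : 252;               // mirrors !if(!le(MaxSize, 256), ...)
  }

  int main() {
    std::printf("SI_SPILL_V512:  %u\n", spillPseudoSize(512, 8));   // 136
    std::printf("SI_SPILL_V1024: %u\n", spillPseudoSize(1024, 8));  // 264 -> clamped to 252
    std::printf("SI_SPILL_A1024: %u\n", spillPseudoSize(1024, 16)); // 520 -> clamped to 252
  }
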
Modified: llvm/trunk/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SILowerSGPRSpills.cpp?rev=365833&r1=365832&r2=365833&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SILowerSGPRSpills.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/SILowerSGPRSpills.cpp Thu Jul 11 14:54:13 2019
@@ -37,6 +37,12 @@ using MBBVector = SmallVector<MachineBas
 
 namespace {
 
+static cl::opt<bool> EnableSpillVGPRToAGPR(
+  "amdgpu-spill-vgpr-to-agpr",
+  cl::desc("Enable spilling VGPRs to AGPRs"),
+  cl::ReallyHidden,
+  cl::init(true));
+
 class SILowerSGPRSpills : public MachineFunctionPass {
 private:
   const SIRegisterInfo *TRI = nullptr;
@@ -242,10 +248,22 @@ bool SILowerSGPRSpills::runOnMachineFunc
     return false;
   }
 
+  MachineRegisterInfo &MRI = MF.getRegInfo();
   SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
+  bool AllSGPRSpilledToVGPRs = false;
+  const bool SpillVGPRToAGPR = ST.hasMAIInsts() && FuncInfo->hasSpilledVGPRs()
+    && EnableSpillVGPRToAGPR;
+
   bool MadeChange = false;
 
-  if (TRI->spillSGPRToVGPR() && (HasCSRs || FuncInfo->hasSpilledSGPRs())) {
+  const bool SpillToAGPR = EnableSpillVGPRToAGPR && ST.hasMAIInsts();
+
+  // TODO: CSR VGPRs will never be spilled to AGPRs. These can probably be
+  // handled as SpilledToReg in regular PrologEpilogInserter.
+  if ((TRI->spillSGPRToVGPR() && (HasCSRs || FuncInfo->hasSpilledSGPRs())) ||
+      SpillVGPRToAGPR) {
+    AllSGPRSpilledToVGPRs = true;
+
     // Process all SGPR spills before frame offsets are finalized. Ideally SGPRs
     // are spilled to VGPRs, in which case we can eliminate the stack usage.
     //
@@ -257,6 +275,18 @@ bool SILowerSGPRSpills::runOnMachineFunc
         MachineInstr &MI = *I;
         Next = std::next(I);
 
+        if (SpillToAGPR && TII->isVGPRSpill(MI)) {
+          // Try to eliminate stack used by VGPR spills before frame
+          // finalization.
+          unsigned FIOp = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
+                                                     AMDGPU::OpName::vaddr);
+          int FI = MI.getOperand(FIOp).getIndex();
+          unsigned VReg = TII->getNamedOperand(MI, AMDGPU::OpName::vdata)
+            ->getReg();
+          if (FuncInfo->allocateVGPRSpillToAGPR(MF, FI, TRI->isAGPR(MRI, VReg)))
+            TRI->eliminateFrameIndex(MI, 0, FIOp, nullptr);
+        }
+
         if (!TII->isSGPRSpill(MI))
           continue;
 
@@ -266,18 +296,24 @@ bool SILowerSGPRSpills::runOnMachineFunc
           bool Spilled = TRI->eliminateSGPRToVGPRSpillFrameIndex(MI, FI, nullptr);
           (void)Spilled;
           assert(Spilled && "failed to spill SGPR to VGPR when allocated");
-        }
-
+        } else
+          AllSGPRSpilledToVGPRs = false;
       }
     }
 
     for (MachineBasicBlock &MBB : MF) {
       for (auto SSpill : FuncInfo->getSGPRSpillVGPRs())
         MBB.addLiveIn(SSpill.VGPR);
+
+      for (MCPhysReg Reg : FuncInfo->getVGPRSpillAGPRs())
+        MBB.addLiveIn(Reg);
+
+      for (MCPhysReg Reg : FuncInfo->getAGPRSpillVGPRs())
+        MBB.addLiveIn(Reg);
+
       MBB.sortUniqueLiveIns();
     }
 
-    FuncInfo->removeSGPRToVGPRFrameIndices(MFI);
     MadeChange = true;
   }
 

Modified: llvm/trunk/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp?rev=365833&r1=365832&r2=365833&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp Thu Jul 11 14:54:13 2019
@@ -319,7 +319,75 @@ bool SIMachineFunctionInfo::allocateSGPR
   return true;
 }
 
-void SIMachineFunctionInfo::removeSGPRToVGPRFrameIndices(MachineFrameInfo &MFI) {
+/// Reserve AGPRs or VGPRs to support spilling for FrameIndex \p FI.
+/// Either an AGPR is spilled to a VGPR or vice versa.
+/// Returns true if a \p FI can be eliminated completely.
+bool SIMachineFunctionInfo::allocateVGPRSpillToAGPR(MachineFunction &MF,
+                                                    int FI,
+                                                    bool isAGPRtoVGPR) {
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  MachineFrameInfo &FrameInfo = MF.getFrameInfo();
+  const GCNSubtarget &ST =  MF.getSubtarget<GCNSubtarget>();
+
+  assert(ST.hasMAIInsts() && FrameInfo.isSpillSlotObjectIndex(FI));
+
+  auto &Spill = VGPRToAGPRSpills[FI];
+
+  // This has already been allocated.
+  if (!Spill.Lanes.empty())
+    return Spill.FullyAllocated;
+
+  unsigned Size = FrameInfo.getObjectSize(FI);
+  unsigned NumLanes = Size / 4;
+  Spill.Lanes.resize(NumLanes, AMDGPU::NoRegister);
+
+  const TargetRegisterClass &RC =
+      isAGPRtoVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::AGPR_32RegClass;
+  auto Regs = RC.getRegisters();
+
+  auto &SpillRegs = isAGPRtoVGPR ? SpillAGPR : SpillVGPR;
+  const SIRegisterInfo *TRI = ST.getRegisterInfo();
+  Spill.FullyAllocated = true;
+
+  // FIXME: Move allocation logic out of MachineFunctionInfo and initialize
+  // once.
+  BitVector OtherUsedRegs;
+  OtherUsedRegs.resize(TRI->getNumRegs());
+
+  const uint32_t *CSRMask =
+      TRI->getCallPreservedMask(MF, MF.getFunction().getCallingConv());
+  if (CSRMask)
+    OtherUsedRegs.setBitsInMask(CSRMask);
+
+  // TODO: Should include register tuples, but doesn't matter with current
+  // usage.
+  for (MCPhysReg Reg : SpillAGPR)
+    OtherUsedRegs.set(Reg);
+  for (MCPhysReg Reg : SpillVGPR)
+    OtherUsedRegs.set(Reg);
+
+  SmallVectorImpl<MCPhysReg>::const_iterator NextSpillReg = Regs.begin();
+  for (unsigned I = 0; I < NumLanes; ++I) {
+    NextSpillReg = std::find_if(
+        NextSpillReg, Regs.end(), [&MRI, &OtherUsedRegs](MCPhysReg Reg) {
+          return MRI.isAllocatable(Reg) && !MRI.isPhysRegUsed(Reg) &&
+                 !OtherUsedRegs[Reg];
+        });
+
+    if (NextSpillReg == Regs.end()) { // Registers exhausted
+      Spill.FullyAllocated = false;
+      break;
+    }
+
+    OtherUsedRegs.set(*NextSpillReg);
+    SpillRegs.push_back(*NextSpillReg);
+    Spill.Lanes[I] = *NextSpillReg++;
+  }
+
+  return Spill.FullyAllocated;
+}
+
+void SIMachineFunctionInfo::removeDeadFrameIndices(MachineFrameInfo &MFI) {
   // The FP spill hasn't been inserted yet, so keep it around.
   for (auto &R : SGPRToVGPRSpills) {
     if (R.first != FramePointerSaveIndex)
@@ -332,6 +400,11 @@ void SIMachineFunctionInfo::removeSGPRTo
        ++i)
     if (i != FramePointerSaveIndex)
       MFI.setStackID(i, TargetStackID::Default);
+
+  for (auto &R : VGPRToAGPRSpills) {
+    if (R.second.FullyAllocated)
+      MFI.RemoveStackObject(R.first);
+  }
 }
 
 MCPhysReg SIMachineFunctionInfo::getNextUserSGPR() const {

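A note on allocateVGPRSpillToAGPR above: the allocation is a first-fit scan
over the 32-bit AGPR or VGPR register list, and NextSpillReg is deliberately
not reset between lanes, so every 4-byte lane of the spill slot gets a
distinct, previously unused register; FullyAllocated is cleared as soon as the
pool is exhausted and the remaining lanes fall back to memory. A minimal
standalone sketch of that loop (hypothetical names, not LLVM code):

  // Standalone sketch of the first-fit lane allocation above (not LLVM code).
  #include <cstddef>
  #include <vector>

  struct LaneAllocation {
    std::vector<unsigned> Lanes;   // one physical register per 4-byte lane, 0 = none
    bool FullyAllocated = true;
  };

  // FreeRegs stands in for the AGPR_32/VGPR_32 list already filtered by
  // MRI.isAllocatable/isPhysRegUsed and the OtherUsedRegs bit vector.
  LaneAllocation allocateLanes(const std::vector<unsigned> &FreeRegs,
                               unsigned NumLanes) {
    LaneAllocation Result;
    Result.Lanes.assign(NumLanes, 0);
    std::size_t Next = 0;                  // mirrors NextSpillReg: kept across lanes
    for (unsigned I = 0; I < NumLanes; ++I) {
      if (Next == FreeRegs.size()) {       // registers exhausted
        Result.FullyAllocated = false;
        break;
      }
      Result.Lanes[I] = FreeRegs[Next++];  // each lane gets a distinct register
    }
    return Result;
  }
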
Modified: llvm/trunk/lib/Target/AMDGPU/SIMachineFunctionInfo.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIMachineFunctionInfo.h?rev=365833&r1=365832&r2=365833&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIMachineFunctionInfo.h (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIMachineFunctionInfo.h Thu Jul 11 14:54:13 2019
@@ -442,6 +442,11 @@ public:
     SGPRSpillVGPRCSR(unsigned V, Optional<int> F) : VGPR(V), FI(F) {}
   };
 
+  struct VGPRSpillToAGPR {
+    SmallVector<MCPhysReg, 32> Lanes;
+    bool FullyAllocated = false;
+  };
+
   SparseBitVector<> WWMReservedRegs;
 
   void ReserveWWMRegister(unsigned reg) { WWMReservedRegs.set(reg); }
@@ -456,6 +461,14 @@ private:
   unsigned NumVGPRSpillLanes = 0;
   SmallVector<SGPRSpillVGPRCSR, 2> SpillVGPRs;
 
+  DenseMap<int, VGPRSpillToAGPR> VGPRToAGPRSpills;
+
+  // AGPRs used for VGPR spills.
+  SmallVector<MCPhysReg, 32> SpillAGPR;
+
+  // VGPRs used for AGPR spills.
+  SmallVector<MCPhysReg, 32> SpillVGPR;
+
 public: // FIXME
   /// If this is set, an SGPR used for save/restore of the register used for the
   /// frame pointer.
@@ -477,6 +490,20 @@ public:
     return SpillVGPRs;
   }
 
+  ArrayRef<MCPhysReg> getAGPRSpillVGPRs() const {
+    return SpillAGPR;
+  }
+
+  ArrayRef<MCPhysReg> getVGPRSpillAGPRs() const {
+    return SpillVGPR;
+  }
+
+  MCPhysReg getVGPRToAGPRSpill(int FrameIndex, unsigned Lane) const {
+    auto I = VGPRToAGPRSpills.find(FrameIndex);
+    return (I == VGPRToAGPRSpills.end()) ? (MCPhysReg)AMDGPU::NoRegister
+                                         : I->second.Lanes[Lane];
+  }
+
   AMDGPU::SIModeRegisterDefaults getMode() const {
     return Mode;
   }
@@ -484,7 +511,8 @@ public:
   bool haveFreeLanesForSGPRSpill(const MachineFunction &MF,
                                  unsigned NumLane) const;
   bool allocateSGPRSpillToVGPR(MachineFunction &MF, int FI);
-  void removeSGPRToVGPRFrameIndices(MachineFrameInfo &MFI);
+  bool allocateVGPRSpillToAGPR(MachineFunction &MF, int FI, bool isAGPRtoVGPR);
+  void removeDeadFrameIndices(MachineFrameInfo &MFI);
 
   bool hasCalculatedTID() const { return TIDReg != 0; };
   unsigned getTIDReg() const { return TIDReg; };

Modified: llvm/trunk/lib/Target/AMDGPU/SIRegisterInfo.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIRegisterInfo.cpp?rev=365833&r1=365832&r2=365833&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIRegisterInfo.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIRegisterInfo.cpp Thu Jul 11 14:54:13 2019
@@ -256,6 +256,13 @@ BitVector SIRegisterInfo::getReservedReg
     reserveRegisterTuples(Reserved, Reg);
   }
 
+  // FIXME: Stop using reserved registers for this.
+  for (MCPhysReg Reg : MFI->getAGPRSpillVGPRs())
+    reserveRegisterTuples(Reserved, Reg);
+
+  for (MCPhysReg Reg : MFI->getVGPRSpillAGPRs())
+    reserveRegisterTuples(Reserved, Reg);
+
   return Reserved;
 }
 
@@ -448,10 +455,19 @@ const TargetRegisterClass *SIRegisterInf
 static unsigned getNumSubRegsForSpillOp(unsigned Op) {
 
   switch (Op) {
+  case AMDGPU::SI_SPILL_S1024_SAVE:
+  case AMDGPU::SI_SPILL_S1024_RESTORE:
+  case AMDGPU::SI_SPILL_V1024_SAVE:
+  case AMDGPU::SI_SPILL_V1024_RESTORE:
+  case AMDGPU::SI_SPILL_A1024_SAVE:
+  case AMDGPU::SI_SPILL_A1024_RESTORE:
+    return 32;
   case AMDGPU::SI_SPILL_S512_SAVE:
   case AMDGPU::SI_SPILL_S512_RESTORE:
   case AMDGPU::SI_SPILL_V512_SAVE:
   case AMDGPU::SI_SPILL_V512_RESTORE:
+  case AMDGPU::SI_SPILL_A512_SAVE:
+  case AMDGPU::SI_SPILL_A512_RESTORE:
     return 16;
   case AMDGPU::SI_SPILL_S256_SAVE:
   case AMDGPU::SI_SPILL_S256_RESTORE:
@@ -467,6 +483,8 @@ static unsigned getNumSubRegsForSpillOp(
   case AMDGPU::SI_SPILL_S128_RESTORE:
   case AMDGPU::SI_SPILL_V128_SAVE:
   case AMDGPU::SI_SPILL_V128_RESTORE:
+  case AMDGPU::SI_SPILL_A128_SAVE:
+  case AMDGPU::SI_SPILL_A128_RESTORE:
     return 4;
   case AMDGPU::SI_SPILL_S96_SAVE:
   case AMDGPU::SI_SPILL_S96_RESTORE:
@@ -477,11 +495,15 @@ static unsigned getNumSubRegsForSpillOp(
   case AMDGPU::SI_SPILL_S64_RESTORE:
   case AMDGPU::SI_SPILL_V64_SAVE:
   case AMDGPU::SI_SPILL_V64_RESTORE:
+  case AMDGPU::SI_SPILL_A64_SAVE:
+  case AMDGPU::SI_SPILL_A64_RESTORE:
     return 2;
   case AMDGPU::SI_SPILL_S32_SAVE:
   case AMDGPU::SI_SPILL_S32_RESTORE:
   case AMDGPU::SI_SPILL_V32_SAVE:
   case AMDGPU::SI_SPILL_V32_RESTORE:
+  case AMDGPU::SI_SPILL_A32_SAVE:
+  case AMDGPU::SI_SPILL_A32_RESTORE:
     return 1;
   default: llvm_unreachable("Invalid spill opcode");
   }
@@ -541,6 +563,35 @@ static int getOffsetMUBUFLoad(unsigned O
   }
 }
 
+static MachineInstrBuilder spillVGPRtoAGPR(MachineBasicBlock::iterator MI,
+                                           int Index,
+                                           unsigned Lane,
+                                           unsigned ValueReg,
+                                           bool IsKill) {
+  MachineBasicBlock *MBB = MI->getParent();
+  MachineFunction *MF = MI->getParent()->getParent();
+  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
+  const GCNSubtarget &ST =  MF->getSubtarget<GCNSubtarget>();
+  const SIInstrInfo *TII = ST.getInstrInfo();
+
+  MCPhysReg Reg = MFI->getVGPRToAGPRSpill(Index, Lane);
+
+  if (Reg == AMDGPU::NoRegister)
+    return MachineInstrBuilder();
+
+  bool IsStore = MI->mayStore();
+  MachineRegisterInfo &MRI = MF->getRegInfo();
+  auto *TRI = static_cast<const SIRegisterInfo*>(MRI.getTargetRegisterInfo());
+
+  unsigned Dst = IsStore ? Reg : ValueReg;
+  unsigned Src = IsStore ? ValueReg : Reg;
+  unsigned Opc = (IsStore ^ TRI->isVGPR(MRI, Reg)) ? AMDGPU::V_ACCVGPR_WRITE_B32
+                                                   : AMDGPU::V_ACCVGPR_READ_B32;
+
+  return BuildMI(*MBB, MI, MI->getDebugLoc(), TII->get(Opc), Dst)
+           .addReg(Src, getKillRegState(IsKill));
+}
+
 // This differs from buildSpillLoadStore by only scavenging a VGPR. It does not
 // need to handle the case where an SGPR may need to be spilled while spilling.
 static bool buildMUBUFOffsetLoadStore(const SIInstrInfo *TII,
@@ -559,6 +610,9 @@ static bool buildMUBUFOffsetLoadStore(co
     return false;
 
   const MachineOperand *Reg = TII->getNamedOperand(*MI, AMDGPU::OpName::vdata);
+  if (spillVGPRtoAGPR(MI, Index, 0, Reg->getReg(), false).getInstr())
+    return true;
+
   MachineInstrBuilder NewMI =
       BuildMI(*MBB, MI, DL, TII->get(LoadStoreOp))
           .add(*Reg)
@@ -611,6 +665,10 @@ void SIRegisterInfo::buildSpillLoadStore
   unsigned Align = MFI.getObjectAlignment(Index);
   const MachinePointerInfo &BasePtrInfo = MMO->getPointerInfo();
 
+  Register TmpReg =
+    hasAGPRs(RC) ? TII->getNamedOperand(*MI, AMDGPU::OpName::tmp)->getReg()
+                 : Register();
+
   assert((Offset % EltSize) == 0 && "unexpected VGPR spill offset");
 
   if (!isUInt<12>(Offset + Size - EltSize)) {
@@ -659,21 +717,38 @@ void SIRegisterInfo::buildSpillLoadStore
       SrcDstRegState |= getKillRegState(IsKill);
     }
 
-    MachinePointerInfo PInfo = BasePtrInfo.getWithOffset(EltSize * i);
-    MachineMemOperand *NewMMO
-      = MF->getMachineMemOperand(PInfo, MMO->getFlags(),
-                                 EltSize, MinAlign(Align, EltSize * i));
-
-    auto MIB = BuildMI(*MBB, MI, DL, Desc)
-      .addReg(SubReg, getDefRegState(!IsStore) | getKillRegState(IsKill))
-      .addReg(ScratchRsrcReg)
-      .addReg(SOffset, SOffsetRegState)
-      .addImm(Offset)
-      .addImm(0) // glc
-      .addImm(0) // slc
-      .addImm(0) // tfe
-      .addImm(0) // dlc
-      .addMemOperand(NewMMO);
+    auto MIB = spillVGPRtoAGPR(MI, Index, i, SubReg, IsKill);
+
+    if (!MIB.getInstr()) {
+      unsigned FinalReg = SubReg;
+      if (TmpReg != AMDGPU::NoRegister) {
+        if (IsStore)
+          BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_ACCVGPR_READ_B32), TmpReg)
+            .addReg(SubReg, getKillRegState(IsKill));
+        SubReg = TmpReg;
+      }
+
+      MachinePointerInfo PInfo = BasePtrInfo.getWithOffset(EltSize * i);
+      MachineMemOperand *NewMMO
+        = MF->getMachineMemOperand(PInfo, MMO->getFlags(),
+                                   EltSize, MinAlign(Align, EltSize * i));
+
+      MIB = BuildMI(*MBB, MI, DL, Desc)
+        .addReg(SubReg, getDefRegState(!IsStore) | getKillRegState(IsKill))
+        .addReg(ScratchRsrcReg)
+        .addReg(SOffset, SOffsetRegState)
+        .addImm(Offset)
+        .addImm(0) // glc
+        .addImm(0) // slc
+        .addImm(0) // tfe
+        .addImm(0) // dlc
+        .addMemOperand(NewMMO);
+
+      if (!IsStore && TmpReg != AMDGPU::NoRegister)
+        MIB = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_ACCVGPR_WRITE_B32),
+                      FinalReg)
+          .addReg(TmpReg, RegState::Kill);
+    }
 
     if (NumSubRegs > 1)
       MIB.addReg(ValueReg, RegState::Implicit | SrcDstRegState);
@@ -1038,6 +1113,7 @@ bool SIRegisterInfo::eliminateSGPRToVGPR
   int FI,
   RegScavenger *RS) const {
   switch (MI->getOpcode()) {
+  case AMDGPU::SI_SPILL_S1024_SAVE:
   case AMDGPU::SI_SPILL_S512_SAVE:
   case AMDGPU::SI_SPILL_S256_SAVE:
   case AMDGPU::SI_SPILL_S160_SAVE:
@@ -1046,6 +1122,7 @@ bool SIRegisterInfo::eliminateSGPRToVGPR
   case AMDGPU::SI_SPILL_S64_SAVE:
   case AMDGPU::SI_SPILL_S32_SAVE:
     return spillSGPR(MI, FI, RS, true);
+  case AMDGPU::SI_SPILL_S1024_RESTORE:
   case AMDGPU::SI_SPILL_S512_RESTORE:
   case AMDGPU::SI_SPILL_S256_RESTORE:
   case AMDGPU::SI_SPILL_S160_RESTORE:
@@ -1080,6 +1157,7 @@ void SIRegisterInfo::eliminateFrameIndex
 
   switch (MI->getOpcode()) {
     // SGPR register spill
+    case AMDGPU::SI_SPILL_S1024_SAVE:
     case AMDGPU::SI_SPILL_S512_SAVE:
     case AMDGPU::SI_SPILL_S256_SAVE:
     case AMDGPU::SI_SPILL_S160_SAVE:
@@ -1092,6 +1170,7 @@ void SIRegisterInfo::eliminateFrameIndex
     }
 
     // SGPR register restore
+    case AMDGPU::SI_SPILL_S1024_RESTORE:
     case AMDGPU::SI_SPILL_S512_RESTORE:
     case AMDGPU::SI_SPILL_S256_RESTORE:
     case AMDGPU::SI_SPILL_S160_RESTORE:
@@ -1104,13 +1183,19 @@ void SIRegisterInfo::eliminateFrameIndex
     }
 
     // VGPR register spill
+    case AMDGPU::SI_SPILL_V1024_SAVE:
     case AMDGPU::SI_SPILL_V512_SAVE:
     case AMDGPU::SI_SPILL_V256_SAVE:
     case AMDGPU::SI_SPILL_V160_SAVE:
     case AMDGPU::SI_SPILL_V128_SAVE:
     case AMDGPU::SI_SPILL_V96_SAVE:
     case AMDGPU::SI_SPILL_V64_SAVE:
-    case AMDGPU::SI_SPILL_V32_SAVE: {
+    case AMDGPU::SI_SPILL_V32_SAVE:
+    case AMDGPU::SI_SPILL_A1024_SAVE:
+    case AMDGPU::SI_SPILL_A512_SAVE:
+    case AMDGPU::SI_SPILL_A128_SAVE:
+    case AMDGPU::SI_SPILL_A64_SAVE:
+    case AMDGPU::SI_SPILL_A32_SAVE: {
       const MachineOperand *VData = TII->getNamedOperand(*MI,
                                                          AMDGPU::OpName::vdata);
       assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() ==
@@ -1134,7 +1219,13 @@ void SIRegisterInfo::eliminateFrameIndex
     case AMDGPU::SI_SPILL_V128_RESTORE:
     case AMDGPU::SI_SPILL_V160_RESTORE:
     case AMDGPU::SI_SPILL_V256_RESTORE:
-    case AMDGPU::SI_SPILL_V512_RESTORE: {
+    case AMDGPU::SI_SPILL_V512_RESTORE:
+    case AMDGPU::SI_SPILL_V1024_RESTORE:
+    case AMDGPU::SI_SPILL_A32_RESTORE:
+    case AMDGPU::SI_SPILL_A64_RESTORE:
+    case AMDGPU::SI_SPILL_A128_RESTORE:
+    case AMDGPU::SI_SPILL_A512_RESTORE:
+    case AMDGPU::SI_SPILL_A1024_RESTORE: {
       const MachineOperand *VData = TII->getNamedOperand(*MI,
                                                          AMDGPU::OpName::vdata);
       assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() ==

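Two notes on the expansion above. spillVGPRtoAGPR handles both directions with
a single XOR: the destination is an AGPR, and hence v_accvgpr_write_b32 is
used, exactly when a store targets an AGPR lane or a reload refills an AGPR
from a VGPR lane; in the other two cases the destination is a VGPR and
v_accvgpr_read_b32 is used. And when no lane was allocated and an AGPR value
really has to go through scratch, buildSpillLoadStore stages each subregister
through the pseudo's $tmp VGPR: v_accvgpr_read_b32 before the buffer_store and
v_accvgpr_write_b32 after the buffer_load. A standalone sketch of the opcode
choice (hypothetical names, not LLVM code):

  // Standalone sketch of the spillVGPRtoAGPR opcode selection above
  // (hypothetical names, not LLVM code).
  enum class AccCopyOpc { AccVGPRWrite, AccVGPRRead };

  // RegIsVGPR describes the register the frame-index lane was remapped to:
  // a VGPR when an AGPR is being spilled, an AGPR when a VGPR is being spilled.
  AccCopyOpc pickAccCopyOpcode(bool IsStore, bool RegIsVGPR) {
    // Store of a VGPR into an AGPR lane, or reload of an AGPR from a VGPR
    // lane: the destination is an AGPR, so v_accvgpr_write_b32.  The other
    // two cases produce a VGPR, so v_accvgpr_read_b32.
    return (IsStore ^ RegIsVGPR) ? AccCopyOpc::AccVGPRWrite
                                 : AccCopyOpc::AccVGPRRead;
  }
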
Added: llvm/trunk/test/CodeGen/AMDGPU/spill-agpr.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/spill-agpr.ll?rev=365833&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/spill-agpr.ll (added)
+++ llvm/trunk/test/CodeGen/AMDGPU/spill-agpr.ll Thu Jul 11 14:54:13 2019
@@ -0,0 +1,108 @@
+; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX908,A2V %s
+; RUN: llc -march=amdgcn -mcpu=gfx908 -amdgpu-spill-vgpr-to-agpr=0 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX908,A2M %s
+
+; GCN-LABEL: {{^}}max_24regs_32a_used:
+; A2M-DAG:    s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0
+; A2M-DAG:    s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1
+; A2V-NOT:    SCRATCH_RSRC
+; GFX908-DAG: v_accvgpr_read_b32 v[[VSPILL:[0-9]+]], a0
+; A2M:        buffer_store_dword v[[VSPILL]], off, s[{{[0-9:]+}}], s{{[0-9]+}} offset:[[FI:[0-9]+]] ; 4-byte Folded Spill
+; A2M:        buffer_load_dword v[[VSPILL:[0-9]+]], off, s[{{[0-9:]+}}], s{{[0-9]+}} offset:[[FI]] ; 4-byte Folded Reload
+; GFX908:     v_accvgpr_write_b32 a{{[0-9]+}}, v[[VSPILL]]
+; A2V:        ScratchSize: 0
+define amdgpu_kernel void @max_24regs_32a_used(<16 x float> addrspace(1)* %arg, float addrspace(1)* %out) #0 {
+bb:
+  %in.1 = load <16 x float>, <16 x float> addrspace(1)* %arg
+  %mai.1 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float 1.0, float 1.0, <16 x float> %in.1, i32 0, i32 0, i32 0)
+  %mai.2 = tail call <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float 1.0, float 1.0, <16 x float> %mai.1, i32 0, i32 0, i32 0)
+  %elt1 = extractelement <16 x float> %mai.2, i32 0
+  %elt2 = extractelement <16 x float> %mai.1, i32 15
+  %elt3 = extractelement <16 x float> %mai.1, i32 14
+  %elt4 = extractelement <16 x float> %mai.2, i32 1
+  store float %elt1, float addrspace(1)* %out
+  %gep1 = getelementptr float, float addrspace(1)* %out, i64 1
+  store float %elt2, float addrspace(1)* %gep1
+  %gep2 = getelementptr float, float addrspace(1)* %out, i64 2
+  store float %elt3, float addrspace(1)* %gep2
+  %gep3 = getelementptr float, float addrspace(1)* %out, i64 3
+  store float %elt4, float addrspace(1)* %gep3
+
+  ret void
+}
+
+; GCN-LABEL: {{^}}max_12regs_13a_used:
+; A2M-DAG:    s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0
+; A2M-DAG:    s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1
+; A2V-NOT:    SCRATCH_RSRC
+; GFX908-DAG: v_accvgpr_read_b32 v[[VSPILL:[0-9]+]], a4
+; A2M:        buffer_store_dword v[[VSPILL]], off, s[{{[0-9:]+}}], s{{[0-9]+}} offset:[[FI:[0-9]+]] ; 4-byte Folded Spill
+; A2M:        buffer_load_dword v[[VSPILL:[0-9]+]], off, s[{{[0-9:]+}}], s{{[0-9]+}} offset:[[FI]] ; 4-byte Folded Reload
+; A2V:        v_accvgpr_write_b32 a{{[0-9]+}}, v[[VSPILL]]
+; A2V:        ScratchSize: 0
+define amdgpu_kernel void @max_12regs_13a_used(<4 x float> addrspace(1)* %arg, <4 x float> addrspace(1)* %out) #2 {
+bb:
+  %in.1 = load <4 x float>, <4 x float> addrspace(1)* %arg
+  %mai.1 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 1.0, <4 x float> %in.1, i32 0, i32 0, i32 0)
+  %mai.2 = tail call <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float 1.0, float 1.0, <4 x float> %mai.1, i32 0, i32 0, i32 0)
+  br label %use
+
+use:
+  call void asm sideeffect "", "a,a,a,a,a"(i32 1, i32 2, i32 3, i32 4, i32 5)
+  store <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, <4 x float> addrspace(1)* %out
+  br label %st
+
+st:
+  %gep1 = getelementptr <4 x float>, <4 x float> addrspace(1)* %out, i64 16
+  %gep2 = getelementptr <4 x float>, <4 x float> addrspace(1)* %out, i64 32
+  store <4 x float> %mai.1, <4 x float> addrspace(1)* %gep1
+  store <4 x float> %mai.2, <4 x float> addrspace(1)* %gep2
+  ret void
+}
+
+; GCN-LABEL: {{^}}max_10_vgprs_used_9a:
+; A2M-DAG:    s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0
+; A2M-DAG:    s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1
+; A2V-NOT:    SCRATCH_RSRC
+; GFX908-DAG: v_accvgpr_read_b32 v[[VSPILL:[0-9]+]], a0
+; A2M:        buffer_store_dword v[[VSPILL]], off, s[{{[0-9:]+}}], s{{[0-9]+}} offset:[[FI:[0-9]+]] ; 4-byte Folded Spill
+; A2M:        buffer_load_dword v[[VSPILL:[0-9]+]], off, s[{{[0-9:]+}}], s{{[0-9]+}} offset:[[FI]] ; 4-byte Folded Reload
+; GFX908:     v_accvgpr_write_b32 a{{[0-9]+}}, v[[VSPILL]]
+; A2V:        ScratchSize: 0
+define amdgpu_kernel void @max_10_vgprs_used_9a(i32 addrspace(1)* %p) #1 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  call void asm sideeffect "", "a,a,a,a"(i32 1, i32 2, i32 3, i32 4)
+  call void asm sideeffect "", "a,a,a,a,a"(i32 5, i32 6, i32 7, i32 8, i32 9)
+  ret void
+}
+
+; GCN-LABEL: {{^}}max_32regs_mfma32:
+; A2M-DAG:    s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0
+; A2M-DAG:    s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1
+; A2V-NOT:    SCRATCH_RSRC
+; GFX908-DAG: v_accvgpr_read_b32 v[[VSPILL:[0-9]+]], a0
+; A2M:        buffer_store_dword v[[VSPILL]], off, s[{{[0-9:]+}}], s{{[0-9]+}} offset:[[FI:[0-9]+]] ; 4-byte Folded Spill
+; A2M:        buffer_load_dword v[[VSPILL:[0-9]+]], off, s[{{[0-9:]+}}], s{{[0-9]+}} offset:[[FI]] ; 4-byte Folded Reload
+; GFX908:     v_accvgpr_write_b32 a{{[0-9]+}}, v[[VSPILL]]
+; A2V:        ScratchSize: 0
+define amdgpu_kernel void @max_32regs_mfma32(i32 addrspace(1)* %arg) #3 {
+bb:
+  %v = call i32 asm sideeffect "", "=a"()
+  br label %use
+
+use:
+  %mai.1 = tail call <32 x i32> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 1.0, <32 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 2>, i32 0, i32 0, i32 0)
+  call void asm sideeffect "", "a"(i32 %v)
+  %elt1 = extractelement <32 x i32> %mai.1, i32 0
+  store i32 %elt1, i32 addrspace(1)* %arg
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x()
+declare <16 x float> @llvm.amdgcn.mfma.f32.16x16x1f32(float, float, <16 x float>, i32, i32, i32)
+declare <4 x float> @llvm.amdgcn.mfma.f32.4x4x1f32(float, float, <4 x float>, i32, i32, i32)
+declare <32 x i32> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x i32>, i32, i32, i32)
+
+attributes #0 = { nounwind "amdgpu-num-vgpr"="24" }
+attributes #1 = { nounwind "amdgpu-num-vgpr"="8" }
+attributes #2 = { nounwind "amdgpu-num-vgpr"="12" }
+attributes #3 = { nounwind "amdgpu-num-vgpr"="32" }

Added: llvm/trunk/test/CodeGen/AMDGPU/spill-vgpr-to-agpr.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/spill-vgpr-to-agpr.ll?rev=365833&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/spill-vgpr-to-agpr.ll (added)
+++ llvm/trunk/test/CodeGen/AMDGPU/spill-vgpr-to-agpr.ll Thu Jul 11 14:54:13 2019
@@ -0,0 +1,288 @@
+; RUN: llc -march=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX908 %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX900 %s
+
+; GCN-LABEL: {{^}}max_10_vgprs:
+; GFX900-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0
+; GFX900-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1
+; GFX908-NOT: SCRATCH_RSRC
+; GFX908-DAG: v_accvgpr_write_b32 a0, v{{[0-9]}}
+; GFX908-DAG: v_accvgpr_write_b32 a1, v{{[0-9]}}
+; GFX900:     buffer_store_dword v{{[0-9]}},
+; GFX900:     buffer_store_dword v{{[0-9]}},
+; GFX900:     buffer_load_dword v{{[0-9]}},
+; GFX900:     buffer_load_dword v{{[0-9]}},
+; GFX908-NOT: buffer_
+; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a0
+; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a1
+
+; GCN:    NumVgprs: 10
+; GFX900: ScratchSize: 12
+; GFX908: ScratchSize: 0
+; GCN:    VGPRBlocks: 2
+; GCN:    NumVGPRsForWavesPerEU: 10
+define amdgpu_kernel void @max_10_vgprs(i32 addrspace(1)* %p) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %p1 = getelementptr inbounds i32, i32 addrspace(1)* %p, i32 %tid
+  %p2 = getelementptr inbounds i32, i32 addrspace(1)* %p1, i32 4
+  %p3 = getelementptr inbounds i32, i32 addrspace(1)* %p2, i32 8
+  %p4 = getelementptr inbounds i32, i32 addrspace(1)* %p3, i32 12
+  %p5 = getelementptr inbounds i32, i32 addrspace(1)* %p4, i32 16
+  %p6 = getelementptr inbounds i32, i32 addrspace(1)* %p5, i32 20
+  %p7 = getelementptr inbounds i32, i32 addrspace(1)* %p6, i32 24
+  %p8 = getelementptr inbounds i32, i32 addrspace(1)* %p7, i32 28
+  %p9 = getelementptr inbounds i32, i32 addrspace(1)* %p8, i32 32
+  %p10 = getelementptr inbounds i32, i32 addrspace(1)* %p9, i32 36
+  %v1 = load volatile i32, i32 addrspace(1)* %p1
+  %v2 = load volatile i32, i32 addrspace(1)* %p2
+  %v3 = load volatile i32, i32 addrspace(1)* %p3
+  %v4 = load volatile i32, i32 addrspace(1)* %p4
+  %v5 = load volatile i32, i32 addrspace(1)* %p5
+  %v6 = load volatile i32, i32 addrspace(1)* %p6
+  %v7 = load volatile i32, i32 addrspace(1)* %p7
+  %v8 = load volatile i32, i32 addrspace(1)* %p8
+  %v9 = load volatile i32, i32 addrspace(1)* %p9
+  %v10 = load volatile i32, i32 addrspace(1)* %p10
+  call void asm sideeffect "", "v,v,v,v,v,v,v,v,v,v"(i32 %v1, i32 %v2, i32 %v3, i32 %v4, i32 %v5, i32 %v6, i32 %v7, i32 %v8, i32 %v9, i32 %v10)
+  store volatile i32 %v1, i32 addrspace(1)* undef
+  store volatile i32 %v2, i32 addrspace(1)* undef
+  store volatile i32 %v3, i32 addrspace(1)* undef
+  store volatile i32 %v4, i32 addrspace(1)* undef
+  store volatile i32 %v5, i32 addrspace(1)* undef
+  store volatile i32 %v6, i32 addrspace(1)* undef
+  store volatile i32 %v7, i32 addrspace(1)* undef
+  store volatile i32 %v8, i32 addrspace(1)* undef
+  store volatile i32 %v9, i32 addrspace(1)* undef
+  store volatile i32 %v10, i32 addrspace(1)* undef
+  ret void
+}
+
+; GCN-LABEL: {{^}}max_10_vgprs_used_9a:
+; GCN-DAG:    s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0
+; GCN-DAG:    s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1
+; GFX908:     v_accvgpr_write_b32 a9, v{{[0-9]}}
+; GCN:        buffer_store_dword v{{[0-9]}},
+; GFX900:     buffer_store_dword v{{[0-9]}},
+; GFX900:     buffer_load_dword v{{[0-9]}},
+; GFX900:     buffer_load_dword v{{[0-9]}},
+; GFX908-NOT: buffer_
+; GFX908:     v_accvgpr_read_b32 v{{[0-9]}}, a9
+; GFX908:     buffer_load_dword v{{[0-9]}},
+; GFX908-NOT: buffer_
+
+; GCN:    NumVgprs: 10
+; GFX900: ScratchSize: 12
+; GFX908: ScratchSize: 8
+; GCN:    VGPRBlocks: 2
+; GCN:    NumVGPRsForWavesPerEU: 10
+define amdgpu_kernel void @max_10_vgprs_used_9a(i32 addrspace(1)* %p) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  call void asm sideeffect "", "a,a,a,a,a,a,a,a,a"(i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9)
+  %p1 = getelementptr inbounds i32, i32 addrspace(1)* %p, i32 %tid
+  %p2 = getelementptr inbounds i32, i32 addrspace(1)* %p1, i32 4
+  %p3 = getelementptr inbounds i32, i32 addrspace(1)* %p2, i32 8
+  %p4 = getelementptr inbounds i32, i32 addrspace(1)* %p3, i32 12
+  %p5 = getelementptr inbounds i32, i32 addrspace(1)* %p4, i32 16
+  %p6 = getelementptr inbounds i32, i32 addrspace(1)* %p5, i32 20
+  %p7 = getelementptr inbounds i32, i32 addrspace(1)* %p6, i32 24
+  %p8 = getelementptr inbounds i32, i32 addrspace(1)* %p7, i32 28
+  %p9 = getelementptr inbounds i32, i32 addrspace(1)* %p8, i32 32
+  %p10 = getelementptr inbounds i32, i32 addrspace(1)* %p9, i32 36
+  %v1 = load volatile i32, i32 addrspace(1)* %p1
+  %v2 = load volatile i32, i32 addrspace(1)* %p2
+  %v3 = load volatile i32, i32 addrspace(1)* %p3
+  %v4 = load volatile i32, i32 addrspace(1)* %p4
+  %v5 = load volatile i32, i32 addrspace(1)* %p5
+  %v6 = load volatile i32, i32 addrspace(1)* %p6
+  %v7 = load volatile i32, i32 addrspace(1)* %p7
+  %v8 = load volatile i32, i32 addrspace(1)* %p8
+  %v9 = load volatile i32, i32 addrspace(1)* %p9
+  %v10 = load volatile i32, i32 addrspace(1)* %p10
+  call void asm sideeffect "", "v,v,v,v,v,v,v,v,v,v"(i32 %v1, i32 %v2, i32 %v3, i32 %v4, i32 %v5, i32 %v6, i32 %v7, i32 %v8, i32 %v9, i32 %v10)
+  store volatile i32 %v1, i32 addrspace(1)* undef
+  store volatile i32 %v2, i32 addrspace(1)* undef
+  store volatile i32 %v3, i32 addrspace(1)* undef
+  store volatile i32 %v4, i32 addrspace(1)* undef
+  store volatile i32 %v5, i32 addrspace(1)* undef
+  store volatile i32 %v6, i32 addrspace(1)* undef
+  store volatile i32 %v7, i32 addrspace(1)* undef
+  store volatile i32 %v8, i32 addrspace(1)* undef
+  store volatile i32 %v9, i32 addrspace(1)* undef
+  store volatile i32 %v10, i32 addrspace(1)* undef
+  ret void
+}
+
+; GCN-LABEL: {{^}}max_10_vgprs_used_1a_partial_spill:
+; GCN-DAG:    s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0
+; GCN-DAG:    s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1
+; GFX908-DAG: v_accvgpr_write_b32 a0, 1
+; GFX908-DAG: v_accvgpr_write_b32 a1, v{{[0-9]}}
+; GFX908-DAG: v_accvgpr_write_b32 a2, v{{[0-9]}}
+; GFX908-DAG: v_accvgpr_write_b32 a3, v{{[0-9]}}
+; GFX908-DAG: v_accvgpr_write_b32 a4, v{{[0-9]}}
+; GFX908-DAG: v_accvgpr_write_b32 a5, v{{[0-9]}}
+; GFX908-DAG: v_accvgpr_write_b32 a6, v{{[0-9]}}
+; GFX908-DAG: v_accvgpr_write_b32 a7, v{{[0-9]}}
+; GFX908-DAG: v_accvgpr_write_b32 a8, v{{[0-9]}}
+; GFX908-DAG: v_accvgpr_write_b32 a9, v{{[0-9]}}
+; GFX900:     buffer_store_dword v{{[0-9]}},
+; GCN-DAG:    buffer_store_dword v{{[0-9]}},
+; GFX900:     buffer_load_dword v{{[0-9]}},
+; GCN-DAG:    buffer_load_dword v{{[0-9]}},
+; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a1
+; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a2
+; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a3
+; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a4
+; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a5
+; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a6
+; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a7
+; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a8
+; GFX908-DAG: v_accvgpr_read_b32 v{{[0-9]}}, a9
+
+; GCN:    NumVgprs: 10
+; GFX900: ScratchSize: 44
+; GFX908: ScratchSize: 20
+; GCN:    VGPRBlocks: 2
+; GCN:    NumVGPRsForWavesPerEU: 10
+define amdgpu_kernel void @max_10_vgprs_used_1a_partial_spill(i64 addrspace(1)* %p) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  call void asm sideeffect "", "a"(i32 1)
+  %p1 = getelementptr inbounds i64, i64 addrspace(1)* %p, i32 %tid
+  %p2 = getelementptr inbounds i64, i64 addrspace(1)* %p1, i32 8
+  %p3 = getelementptr inbounds i64, i64 addrspace(1)* %p2, i32 16
+  %p4 = getelementptr inbounds i64, i64 addrspace(1)* %p3, i32 24
+  %p5 = getelementptr inbounds i64, i64 addrspace(1)* %p4, i32 32
+  %v1 = load volatile i64, i64 addrspace(1)* %p1
+  %v2 = load volatile i64, i64 addrspace(1)* %p2
+  %v3 = load volatile i64, i64 addrspace(1)* %p3
+  %v4 = load volatile i64, i64 addrspace(1)* %p4
+  %v5 = load volatile i64, i64 addrspace(1)* %p5
+  call void asm sideeffect "", "v,v,v,v,v"(i64 %v1, i64 %v2, i64 %v3, i64 %v4, i64 %v5)
+  store volatile i64 %v1, i64 addrspace(1)* %p2
+  store volatile i64 %v2, i64 addrspace(1)* %p3
+  store volatile i64 %v3, i64 addrspace(1)* %p4
+  store volatile i64 %v4, i64 addrspace(1)* %p5
+  store volatile i64 %v5, i64 addrspace(1)* %p1
+  ret void
+}
+
+; GCN-LABEL: {{^}}max_10_vgprs_spill_v32:
+; GCN-DAG:    s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0
+; GCN-DAG:    s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1
+; GFX908-DAG: v_accvgpr_write_b32 a0,
+; GFX908-DAG: v_accvgpr_write_b32 a9, v{{[0-9]}}
+; GCN-NOT:    a10
+; GCN:        buffer_store_dword v{{[0-9]}},
+
+; GFX908: NumVgprs: 10
+; GFX900: ScratchSize: 100
+; GFX908: ScratchSize: 68
+; GFX908: VGPRBlocks: 2
+; GFX908: NumVGPRsForWavesPerEU: 10
+define amdgpu_kernel void @max_10_vgprs_spill_v32(<32 x float> addrspace(1)* %p) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p, i32 %tid
+  %v = load volatile <32 x float>, <32 x float> addrspace(1)* %gep
+  store volatile <32 x float> %v, <32 x float> addrspace(1)* undef
+  ret void
+}
+
+; GCN-LABEL: {{^}}max_256_vgprs_spill_9x32:
+; GFX900-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0
+; GFX900-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1
+; GFX908-NOT: SCRATCH_RSRC
+; GFX908-DAG: v_accvgpr_write_b32 a0, v
+; GFX900:     buffer_store_dword v
+; GFX900:     buffer_load_dword v
+; GFX908-NOT: buffer_
+; GFX908-DAG: v_accvgpr_read_b32
+
+; GCN:    NumVgprs: 256
+; GFX900: ScratchSize: 148
+; GFX908: ScratchSize: 0
+; GCN:    VGPRBlocks: 63
+; GCN:    NumVGPRsForWavesPerEU: 256
+define amdgpu_kernel void @max_256_vgprs_spill_9x32(<32 x float> addrspace(1)* %p) {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %p1 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p, i32 %tid
+  %p2 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p1, i32 %tid
+  %p3 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p2, i32 %tid
+  %p4 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p3, i32 %tid
+  %p5 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p4, i32 %tid
+  %p6 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p5, i32 %tid
+  %p7 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p6, i32 %tid
+  %p8 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p7, i32 %tid
+  %p9 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p8, i32 %tid
+  %v1 = load volatile <32 x float>, <32 x float> addrspace(1)* %p1
+  %v2 = load volatile <32 x float>, <32 x float> addrspace(1)* %p2
+  %v3 = load volatile <32 x float>, <32 x float> addrspace(1)* %p3
+  %v4 = load volatile <32 x float>, <32 x float> addrspace(1)* %p4
+  %v5 = load volatile <32 x float>, <32 x float> addrspace(1)* %p5
+  %v6 = load volatile <32 x float>, <32 x float> addrspace(1)* %p6
+  %v7 = load volatile <32 x float>, <32 x float> addrspace(1)* %p7
+  %v8 = load volatile <32 x float>, <32 x float> addrspace(1)* %p8
+  %v9 = load volatile <32 x float>, <32 x float> addrspace(1)* %p9
+  store volatile <32 x float> %v1, <32 x float> addrspace(1)* undef
+  store volatile <32 x float> %v2, <32 x float> addrspace(1)* undef
+  store volatile <32 x float> %v3, <32 x float> addrspace(1)* undef
+  store volatile <32 x float> %v4, <32 x float> addrspace(1)* undef
+  store volatile <32 x float> %v5, <32 x float> addrspace(1)* undef
+  store volatile <32 x float> %v6, <32 x float> addrspace(1)* undef
+  store volatile <32 x float> %v7, <32 x float> addrspace(1)* undef
+  store volatile <32 x float> %v8, <32 x float> addrspace(1)* undef
+  store volatile <32 x float> %v9, <32 x float> addrspace(1)* undef
+  ret void
+}
+
+; GCN-LABEL: {{^}}max_256_vgprs_spill_9x32_2bb:
+; GFX900-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD0
+; GFX900-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1
+; GFX908-NOT: SCRATCH_RSRC
+; GFX908-DAG: v_accvgpr_write_b32 a0, v
+; GFX900:     buffer_store_dword v
+; GFX900:     buffer_load_dword v
+; GFX908-NOT: buffer_
+; GFX908-DAG: v_accvgpr_read_b32
+
+; GCN:    NumVgprs: 256
+; GFX900: ScratchSize: 580
+; GFX908: ScratchSize: 0
+; GCN:    VGPRBlocks: 63
+; GCN:    NumVGPRsForWavesPerEU: 256
+define amdgpu_kernel void @max_256_vgprs_spill_9x32_2bb(<32 x float> addrspace(1)* %p) {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %p1 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p, i32 %tid
+  %p2 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p1, i32 %tid
+  %p3 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p2, i32 %tid
+  %p4 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p3, i32 %tid
+  %p5 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p4, i32 %tid
+  %p6 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p5, i32 %tid
+  %p7 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p6, i32 %tid
+  %p8 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p7, i32 %tid
+  %p9 = getelementptr inbounds <32 x float>, <32 x float> addrspace(1)* %p8, i32 %tid
+  %v1 = load volatile <32 x float>, <32 x float> addrspace(1)* %p1
+  %v2 = load volatile <32 x float>, <32 x float> addrspace(1)* %p2
+  %v3 = load volatile <32 x float>, <32 x float> addrspace(1)* %p3
+  %v4 = load volatile <32 x float>, <32 x float> addrspace(1)* %p4
+  %v5 = load volatile <32 x float>, <32 x float> addrspace(1)* %p5
+  %v6 = load volatile <32 x float>, <32 x float> addrspace(1)* %p6
+  %v7 = load volatile <32 x float>, <32 x float> addrspace(1)* %p7
+  %v8 = load volatile <32 x float>, <32 x float> addrspace(1)* %p8
+  %v9 = load volatile <32 x float>, <32 x float> addrspace(1)* %p9
+  br label %st
+
+st:
+  store volatile <32 x float> %v1, <32 x float> addrspace(1)* undef
+  store volatile <32 x float> %v2, <32 x float> addrspace(1)* undef
+  store volatile <32 x float> %v3, <32 x float> addrspace(1)* undef
+  store volatile <32 x float> %v4, <32 x float> addrspace(1)* undef
+  store volatile <32 x float> %v5, <32 x float> addrspace(1)* undef
+  store volatile <32 x float> %v6, <32 x float> addrspace(1)* undef
+  store volatile <32 x float> %v7, <32 x float> addrspace(1)* undef
+  store volatile <32 x float> %v8, <32 x float> addrspace(1)* undef
+  store volatile <32 x float> %v9, <32 x float> addrspace(1)* undef
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x()
+
+attributes #0 = { nounwind "amdgpu-num-vgpr"="10" }



