[llvm] d9e0a29 - AMDGPU: Disallow spill folding with m0 copies

Matt Arsenault via llvm-commits llvm-commits at lists.llvm.org
Wed Oct 30 14:56:43 PDT 2019


Author: Matt Arsenault
Date: 2019-10-30T14:56:33-07:00
New Revision: d9e0a2942ac71327166a3a597e8383192fd19b17

URL: https://github.com/llvm/llvm-project/commit/d9e0a2942ac71327166a3a597e8383192fd19b17
DIFF: https://github.com/llvm/llvm-project/commit/d9e0a2942ac71327166a3a597e8383192fd19b17.diff

LOG: AMDGPU: Disallow spill folding with m0 copies

readlane and writelane instructions are not allowed to use m0 as the
data operand, so spilling m0 is tricky and would require an
intermediate SGPR. Constrain the virtual register class in this case
to stop the inline spiller from folding the m0 operand directly into
the spill instruction.

I copied this hack from AArch64, which has the same problem for $sp.
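
To illustrate (a hand-written MIR sketch, not taken from the test added
below): without the constraint, the inline spiller may try to fold the
stack slot directly into a full copy of $m0, which would put m0 in the
writelane data operand:

    %0:sreg_32 = COPY $m0
    ; ... %0 is live across a spill point ...
    $m0 = COPY %0

    ; Illegal fold this patch prevents (m0 as the writelane data operand):
    ;   $vgpr0 = V_WRITELANE_B32 $m0, 0, undef $vgpr0
    ; Legal sequence after constraining %0 to SReg_32_XM0, matching what
    ; the new test checks for:
    ;   $sgpr0 = S_MOV_B32 $m0
    ;   $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, undef $vgpr0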

Added: 
    llvm/test/CodeGen/AMDGPU/fold-reload-into-m0.mir

Modified: 
    llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
    llvm/lib/Target/AMDGPU/SIInstrInfo.h

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 272a7fc442d8..fee2d728c7eb 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -1062,6 +1062,7 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
 
   if (RI.isSGPRClass(RC)) {
     MFI->setHasSpilledSGPRs();
+    assert(SrcReg != AMDGPU::M0 && "m0 should not be spilled");
 
     // We are only allowed to create one new instruction when spilling
     // registers, so we need to use pseudo instruction for spilling SGPRs.
@@ -1190,6 +1191,7 @@ void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
 
   if (RI.isSGPRClass(RC)) {
     MFI->setHasSpilledSGPRs();
+    assert(DestReg != AMDGPU::M0 && "m0 should not be reloaded into");
 
     // FIXME: Maybe this should not include a memoperand because it will be
     // lowered to non-memory instructions.
@@ -6558,3 +6560,36 @@ MachineInstr *SIInstrInfo::createPHISourceCopy(
 }
 
 bool llvm::SIInstrInfo::isWave32() const { return ST.isWave32(); }
+
+MachineInstr *SIInstrInfo::foldMemoryOperandImpl(
+    MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops,
+    MachineBasicBlock::iterator InsertPt, int FrameIndex, LiveIntervals *LIS,
+    VirtRegMap *VRM) const {
+  // This is a bit of a hack (copied from AArch64). Consider this instruction:
+  //
+  //   %0:sreg_32 = COPY $m0
+  //
+  // We explicitly chose SReg_32 for the virtual register so such a copy might
+  // be eliminated by RegisterCoalescer. However, that may not be possible, and
+  // %0 may even spill. We can't spill $m0 normally (it would require copying to
+  // a numbered SGPR anyway), and since it is in the SReg_32 register class,
+  // TargetInstrInfo::foldMemoryOperand() is going to try.
+  //
+  // To prevent that, constrain the %0 register class here.
+  if (MI.isFullCopy()) {
+    Register DstReg = MI.getOperand(0).getReg();
+    Register SrcReg = MI.getOperand(1).getReg();
+
+    if (DstReg == AMDGPU::M0 && SrcReg.isVirtual()) {
+      MF.getRegInfo().constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0RegClass);
+      return nullptr;
+    }
+
+    if (SrcReg == AMDGPU::M0 && DstReg.isVirtual()) {
+      MF.getRegInfo().constrainRegClass(DstReg, &AMDGPU::SReg_32_XM0RegClass);
+      return nullptr;
+    }
+  }
+
+  return nullptr;
+}

diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index be463442c888..30c2f74e2a6b 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -1027,6 +1027,13 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
   }
 
   void fixImplicitOperands(MachineInstr &MI) const;
+
+  MachineInstr *foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI,
+                                      ArrayRef<unsigned> Ops,
+                                      MachineBasicBlock::iterator InsertPt,
+                                      int FrameIndex,
+                                      LiveIntervals *LIS = nullptr,
+                                      VirtRegMap *VRM = nullptr) const override;
 };
 
 /// \brief Returns true if a reg:subreg pair P has a TRC class

diff --git a/llvm/test/CodeGen/AMDGPU/fold-reload-into-m0.mir b/llvm/test/CodeGen/AMDGPU/fold-reload-into-m0.mir
new file mode 100644
index 000000000000..a4dab1958dd0
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/fold-reload-into-m0.mir
@@ -0,0 +1,58 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs -stress-regalloc=2 -start-before=greedy -stop-after=virtregmap -o - %s | FileCheck %s
+
+# Test that a spill of a copy of m0 is not folded to be a spill of m0 directly.
+
+---
+
+name:            merge_sgpr_spill_into_copy_from_m0
+tracksRegLiveness: true
+body:             |
+  bb.0:
+
+    ; CHECK-LABEL: name: merge_sgpr_spill_into_copy_from_m0
+    ; CHECK: liveins: $vgpr0
+    ; CHECK: S_WAITCNT 0
+    ; CHECK: S_NOP 0, implicit-def $m0
+    ; CHECK: $sgpr0 = S_MOV_B32 $m0
+    ; CHECK: $vgpr0 = V_WRITELANE_B32_vi killed $sgpr0, 0, undef $vgpr0
+    ; CHECK: $sgpr0 = V_READLANE_B32_vi $vgpr0, 0
+    ; CHECK: S_NOP 0, implicit-def dead renamable $sgpr1, implicit-def dead renamable $sgpr0, implicit killed renamable $sgpr0
+    ; CHECK: $sgpr0 = V_READLANE_B32_vi killed $vgpr0, 0
+    ; CHECK: $m0 = S_MOV_B32 killed $sgpr0
+    ; CHECK: S_NOP 0
+    ; CHECK: S_SENDMSG 0, implicit $m0, implicit $exec
+    S_NOP 0, implicit-def $m0
+    %0:sreg_32 = COPY $m0
+    S_NOP 0, implicit-def %1:sreg_32, implicit-def %2:sreg_32, implicit %0
+    $m0 = COPY %0
+    S_SENDMSG 0, implicit $m0, implicit $exec
+
+...
+
+# Test that a reload into a copy of m0 is not folded to be a reload of m0 directly.
+
+---
+
+name:            reload_sgpr_spill_into_copy_to_m0
+tracksRegLiveness: true
+body:             |
+  bb.0:
+
+    ; CHECK-LABEL: name: reload_sgpr_spill_into_copy_to_m0
+    ; CHECK: liveins: $vgpr0
+    ; CHECK: S_WAITCNT 0
+    ; CHECK: S_NOP 0, implicit-def renamable $sgpr0, implicit-def dead renamable $sgpr1, implicit-def $m0
+    ; CHECK: $vgpr0 = V_WRITELANE_B32_vi killed $sgpr0, 0, undef $vgpr0
+    ; CHECK: $sgpr0 = V_READLANE_B32_vi $vgpr0, 0
+    ; CHECK: S_NOP 0, implicit killed renamable $sgpr0, implicit-def dead renamable $sgpr1, implicit-def dead renamable $sgpr0
+    ; CHECK: $sgpr0 = V_READLANE_B32_vi killed $vgpr0, 0
+    ; CHECK: $m0 = S_MOV_B32 killed $sgpr0
+    ; CHECK: S_NOP 0
+    ; CHECK: S_SENDMSG 0, implicit $m0, implicit $exec
+    S_NOP 0, implicit-def %0:sreg_32, implicit-def %1:sreg_32, implicit-def $m0
+    S_NOP 0, implicit %0, implicit-def %3:sreg_32, implicit-def %4:sreg_32
+    $m0 = COPY %0
+    S_SENDMSG 0, implicit $m0, implicit $exec
+
+...
