[llvm] 4a32cd1 - [AMDGPU] Remove unnecessary v_mov from a register to itself in WQM lowering.

Michael Bedy via llvm-commits llvm-commits at lists.llvm.org
Fri Jan 10 20:02:24 PST 2020


Author: Michael Bedy
Date: 2020-01-10T23:01:19-05:00
New Revision: 4a32cd11acd7c38f5e0b587d724935ab7a9938a6

URL: https://github.com/llvm/llvm-project/commit/4a32cd11acd7c38f5e0b587d724935ab7a9938a6
DIFF: https://github.com/llvm/llvm-project/commit/4a32cd11acd7c38f5e0b587d724935ab7a9938a6.diff

LOG: [AMDGPU] Remove unnecessary v_mov from a register to itself in WQM lowering.

Summary:
- The SI Whole Quad Mode pass replaces WQM pseudo instructions with v_mov
instructions. While this is necessary for the special handling of moving results
out of WWM live ranges, it is not necessary for WQM live ranges. The result is a
v_mov from a register to itself after every WQM operation. This change uses a
COPY pseudo in these cases, which allows the register allocator to coalesce the
moves away.
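
As a minimal MIR sketch of the difference (the virtual register names here
are illustrative, not taken from the patch):

  ; Old lowering: the WQM pseudo became a real move, which survives
  ; register allocation as "v_mov_b32_e32 v0, v0" whenever source and
  ; destination land in the same physical register.
  %dst:vgpr_32 = V_MOV_B32_e32 %src:vgpr_32, implicit $exec

  ; New lowering: a COPY pseudo that the register coalescer can fold
  ; away entirely. WWM results still lower to a real v_mov.
  %dst:vgpr_32 = COPY %src:vgpr_32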

Reviewers: tpr, dstuttard, foad, nhaehnle

Reviewed By: nhaehnle

Subscribers: arsenm, kzhuravl, jvesely, wdng, nhaehnle, yaxunl, dstuttard, tpr, t-tye, hiraditya, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D71386

Added: 
    

Modified: 
    llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
    llvm/test/CodeGen/AMDGPU/wqm.ll
    llvm/test/CodeGen/AMDGPU/wqm.mir

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
index 600b80d83004..39f5df767977 100644
--- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
+++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
@@ -156,6 +156,7 @@ class SIWholeQuadMode : public MachineFunctionPass {
   DenseMap<const MachineInstr *, InstrInfo> Instructions;
   DenseMap<MachineBasicBlock *, BlockInfo> Blocks;
   SmallVector<MachineInstr *, 1> LiveMaskQueries;
+  SmallVector<MachineInstr *, 4> LowerToMovInstrs;
   SmallVector<MachineInstr *, 4> LowerToCopyInstrs;
 
   void printInfo();
@@ -352,7 +353,7 @@ char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
         // inactive lanes.
         markInstructionUses(MI, StateWWM, Worklist);
         GlobalFlags |= StateWWM;
-        LowerToCopyInstrs.push_back(&MI);
+        LowerToMovInstrs.push_back(&MI);
         continue;
       } else if (Opcode == AMDGPU::V_SET_INACTIVE_B32 ||
                  Opcode == AMDGPU::V_SET_INACTIVE_B64) {
@@ -852,9 +853,8 @@ void SIWholeQuadMode::lowerLiveMaskQueries(unsigned LiveMaskReg) {
 }
 
 void SIWholeQuadMode::lowerCopyInstrs() {
-  for (MachineInstr *MI : LowerToCopyInstrs) {
-    for (unsigned i = MI->getNumExplicitOperands() - 1; i > 1; i--)
-      MI->RemoveOperand(i);
+  for (MachineInstr *MI : LowerToMovInstrs) {
+    assert(MI->getNumExplicitOperands() == 2);
 
     const Register Reg = MI->getOperand(0).getReg();
 
@@ -872,6 +872,22 @@ void SIWholeQuadMode::lowerCopyInstrs() {
       MI->setDesc(TII->get(AMDGPU::COPY));
     }
   }
+  for (MachineInstr *MI : LowerToCopyInstrs) {
+    if (MI->getOpcode() == AMDGPU::V_SET_INACTIVE_B32 ||
+        MI->getOpcode() == AMDGPU::V_SET_INACTIVE_B64) {
+      assert(MI->getNumExplicitOperands() == 3);
+      // The only reason we should be here is that V_SET_INACTIVE has
+      // an undef input, so it is being replaced by a simple copy.
+      // There should be a second undef source that we should remove.
+      assert(MI->getOperand(2).isUndef());
+      MI->RemoveOperand(2);
+      MI->untieRegOperand(1);
+    } else {
+      assert(MI->getNumExplicitOperands() == 2);
+    }
+
+    MI->setDesc(TII->get(AMDGPU::COPY));
+  }
 }
 
 bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
@@ -879,6 +895,7 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
   Blocks.clear();
   LiveMaskQueries.clear();
   LowerToCopyInstrs.clear();
+  LowerToMovInstrs.clear();
   CallingConv = MF.getFunction().getCallingConv();
 
   ST = &MF.getSubtarget<GCNSubtarget>();
@@ -893,7 +910,7 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
   unsigned Exec = ST->isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
   if (!(GlobalFlags & StateWQM)) {
     lowerLiveMaskQueries(Exec);
-    if (!(GlobalFlags & StateWWM) && LowerToCopyInstrs.empty())
+    if (!(GlobalFlags & StateWWM) && LowerToCopyInstrs.empty() && LowerToMovInstrs.empty())
       return !LiveMaskQueries.empty();
   } else {
     // Store a copy of the original live mask when required

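A rough MIR sketch of the V_SET_INACTIVE rewrite added above (the operands
mirror the wqm.mir test below):

  ; Before lowering: the inactive lanes would receive an undef value,
  ; so only the active-lane input matters.
  %10:vgpr_32 = V_SET_INACTIVE_B32 %11:vgpr_32, undef %12:sreg_32, implicit $exec

  ; After lowering: operand 2 is removed, the tie on operand 1 is
  ; dropped, and the opcode is rewritten to COPY.
  %10:vgpr_32 = COPY %11:vgpr_32
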
diff --git a/llvm/test/CodeGen/AMDGPU/wqm.ll b/llvm/test/CodeGen/AMDGPU/wqm.ll
index b827668950b2..b799c2b5993d 100644
--- a/llvm/test/CodeGen/AMDGPU/wqm.ll
+++ b/llvm/test/CodeGen/AMDGPU/wqm.ll
@@ -117,6 +117,9 @@ main_body:
 ;CHECK: buffer_load_dword
 ;CHECK: buffer_load_dword
 ;CHECK: v_add_f32_e32
+; WQM was inserting an unnecessary v_mov to self after the v_add. Make sure this
+; does not happen - the v_add should write the return reg directly.
+;CHECK-NOT: v_mov_b32_e32
 define amdgpu_ps float @test5(i32 inreg %idx0, i32 inreg %idx1) {
 main_body:
   %src0 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i1 0, i1 0)

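For reference, the redundant sequence the new CHECK-NOT guards against
looked roughly like this in the old output (registers illustrative):

  v_add_f32_e32 v0, v1, v0
  v_mov_b32_e32 v0, v0      ; self-move left behind by the old lowering
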
diff --git a/llvm/test/CodeGen/AMDGPU/wqm.mir b/llvm/test/CodeGen/AMDGPU/wqm.mir
index a5009cc7924f..ec285db3aaba 100644
--- a/llvm/test/CodeGen/AMDGPU/wqm.mir
+++ b/llvm/test/CodeGen/AMDGPU/wqm.mir
@@ -48,3 +48,62 @@ body:             |
     SI_RETURN_TO_EPILOG $vgpr0
 
 ...
+
+---
+# When its second operand is undef, V_SET_INACTIVE is replaced by a
+# COPY in si-wqm. Ensure the instruction is removed.
+#CHECK-NOT: V_SET_INACTIVE
+name:            no_cfg
+alignment:       1
+exposesReturnsTwice: false
+legalized:       false
+regBankSelected: false
+selected:        false
+failedISel:      false
+tracksRegLiveness: true
+hasWinCFI:       false
+registers:
+  - { id: 0, class: sgpr_32, preferred-register: '' }
+  - { id: 1, class: sgpr_32, preferred-register: '' }
+  - { id: 2, class: sgpr_32, preferred-register: '' }
+  - { id: 3, class: sgpr_32, preferred-register: '' }
+  - { id: 4, class: sgpr_32, preferred-register: '' }
+  - { id: 5, class: sgpr_128, preferred-register: '' }
+  - { id: 6, class: sgpr_128, preferred-register: '' }
+  - { id: 7, class: sreg_32, preferred-register: '' }
+  - { id: 8, class: vreg_64, preferred-register: '' }
+  - { id: 9, class: sreg_32, preferred-register: '' }
+  - { id: 10, class: vgpr_32, preferred-register: '' }
+  - { id: 11, class: vgpr_32, preferred-register: '' }
+  - { id: 12, class: sreg_32, preferred-register: '' }
+  - { id: 13, class: vgpr_32, preferred-register: '' }
+  - { id: 14, class: vgpr_32, preferred-register: '' }
+  - { id: 15, class: vgpr_32, preferred-register: '' }
+  - { id: 16, class: vgpr_32, preferred-register: '' }
+liveins:
+  - { reg: '$sgpr0', virtual-reg: '%0' }
+  - { reg: '$sgpr1', virtual-reg: '%1' }
+  - { reg: '$sgpr2', virtual-reg: '%2' }
+  - { reg: '$sgpr3', virtual-reg: '%3' }
+body:             |
+  bb.0:
+    liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3
+
+    %3:sgpr_32 = COPY $sgpr3
+    %2:sgpr_32 = COPY $sgpr2
+    %1:sgpr_32 = COPY $sgpr1
+    %0:sgpr_32 = COPY $sgpr0
+    %6:sgpr_128 = REG_SEQUENCE %0, %subreg.sub0, %1, %subreg.sub1, %2, %subreg.sub2, %3, %subreg.sub3
+    %5:sgpr_128 = COPY %6
+    %7:sreg_32 = S_MOV_B32 0
+    %8:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %6, %7, 0, 0, 0, 0, 0, 0, implicit $exec
+    %16:vgpr_32 = COPY %8.sub1
+    %11:vgpr_32 = COPY %16
+    %10:vgpr_32 = V_SET_INACTIVE_B32 %11, undef %12:sreg_32, implicit $exec
+    %14:vgpr_32 = COPY %7
+    %13:vgpr_32 = V_MOV_B32_dpp %14, killed %10, 323, 12, 15, 0, implicit $exec
+    early-clobber %15:vgpr_32 = WWM killed %13, implicit $exec
+    BUFFER_STORE_DWORD_OFFSET_exact killed %15, %6, %7, 4, 0, 0, 0, 0, 0, implicit $exec
+    S_ENDPGM 0
+
+...
