[llvm] d0f6641 - [AMDGPU] Fix liveness for loops in si-optimize-exec-masking-pre-ra

Carl Ritson via llvm-commits llvm-commits at lists.llvm.org
Wed Jun 29 23:29:42 PDT 2022


Author: Carl Ritson
Date: 2022-06-30T15:26:50+09:00
New Revision: d0f6641615755e3dd27bc71390e6ae866dfef9ec

URL: https://github.com/llvm/llvm-project/commit/d0f6641615755e3dd27bc71390e6ae866dfef9ec
DIFF: https://github.com/llvm/llvm-project/commit/d0f6641615755e3dd27bc71390e6ae866dfef9ec.diff

LOG: [AMDGPU] Fix liveness for loops in si-optimize-exec-masking-pre-ra

Follow-up to D127894: the new liveness update code needs to handle
the case where the live range of the S_ANDN2 input must be extended
through a loop when the V_CNDMASK_B32 has been hoisted above it.
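
When the select sits above a loop and the compare/and pair sits inside
it, folding the pair into S_ANDN2_B64 makes the condition register live
everywhere the select result used to be live. The liveness half of the
change boils down to the commented sketch below; it reuses the names
from SIOptimizeExecMaskingPreRA.cpp in the diff that follows and is not
standalone code (it assumes the pass's LiveIntervals analysis LIS and
the surrounding locals):

  // If the select result (SelLI) stayed live past the S_AND_B64, e.g.
  // around a loop back edge, its interval ends on a block boundary.
  // Extend the condition register to that point rather than stopping
  // at the S_ANDN2_B64 that replaces the S_AND_B64.
  SlotIndex EndIdx = AndIdx.getRegSlot();
  if (SelLI && SelLI->endIndex() > EndIdx && SelLI->endIndex().isBlock())
    EndIdx = SelLI->endIndex();

  // Reuse the condition value that is live at the select and let it
  // cover [select, EndIdx) so it survives the whole loop.
  LiveInterval &CCLI = LIS->getInterval(CCReg);
  LiveQueryResult CCQ = CCLI.Query(SelIdx.getRegSlot());
  if (CCQ.valueIn())
    CCLI.addSegment(
        LiveRange::Segment(SelIdx.getRegSlot(), EndIdx, CCQ.valueIn()));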

Reviewed By: arsenm

Differential Revision: https://reviews.llvm.org/D128800

Added: 
    

Modified: 
    llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp
    llvm/test/CodeGen/AMDGPU/opt-exec-masking-pre-ra-update-liveness.mir

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp b/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp
index aba262dfa693..e5e65a8dbbf1 100644
--- a/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp
+++ b/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp
@@ -206,40 +206,25 @@ bool SIOptimizeExecMaskingPreRA::optimizeVcndVcmpPair(MachineBasicBlock &MBB) {
 
   LiveInterval *CmpLI =
       CmpReg.isVirtual() ? &LIS->getInterval(CmpReg) : nullptr;
+  LiveInterval *SelLI =
+      SelReg.isVirtual() ? &LIS->getInterval(SelReg) : nullptr;
 
-  // Try to remove compare. Cmp value should not used in between of cmp
-  // and s_and_b64 if VCC or just unused if any other register.
-  if ((CmpReg.isVirtual() && CmpLI->Query(AndIdx.getRegSlot()).isKill()) ||
-      (CmpReg == Register(CondReg) &&
-       std::none_of(std::next(Cmp->getIterator()), Andn2->getIterator(),
-                    [&](const MachineInstr &MI) {
-                      return MI.readsRegister(CondReg, TRI);
-                    }))) {
-    LLVM_DEBUG(dbgs() << "Erasing: " << *Cmp << '\n');
-    if (CmpLI)
-      LIS->removeVRegDefAt(*CmpLI, CmpIdx.getRegSlot());
-    LIS->RemoveMachineInstrFromMaps(*Cmp);
-    Cmp->eraseFromParent();
-
-    LiveInterval *SelLI =
-        SelReg.isVirtual() ? &LIS->getInterval(SelReg) : nullptr;
-    // Try to remove v_cndmask_b32.
-    if (SelLI && SelLI->Query(CmpIdx.getRegSlot()).isKill()) {
-      LLVM_DEBUG(dbgs() << "Erasing: " << *Sel << '\n');
-
-      if (SelLI)
-        LIS->removeVRegDefAt(*SelLI, SelIdx.getRegSlot());
-      LIS->RemoveMachineInstrFromMaps(*Sel);
-      Sel->eraseFromParent();
-    }
-  }
-
+  // Update live intervals for CCReg before potentially removing CmpReg/SelReg,
+  // and their associated liveness information.
   if (CCReg.isVirtual()) {
+    // Note: this ignores that SelLI might have multiple internal values
+    // or splits and simply extends the live range to cover all cases
+    // where the result of the v_cndmask_b32 was live (e.g. loops).
+    // This could yield worse register allocation in rare edge cases.
+    SlotIndex EndIdx = AndIdx.getRegSlot();
+    if (SelLI && SelLI->endIndex() > EndIdx && SelLI->endIndex().isBlock())
+      EndIdx = SelLI->endIndex();
+
     LiveInterval &CCLI = LIS->getInterval(CCReg);
     auto CCQ = CCLI.Query(SelIdx.getRegSlot());
     if (CCQ.valueIn()) {
       CCLI.addSegment(LiveRange::Segment(SelIdx.getRegSlot(),
-                                         AndIdx.getRegSlot(), CCQ.valueIn()));
+                                         EndIdx, CCQ.valueIn()));
     }
 
     if (CC->getSubReg()) {
@@ -251,7 +236,7 @@ bool SIOptimizeExecMaskingPreRA::optimizeVcndVcmpPair(MachineBasicBlock &MBB) {
             auto CCQS = SR.Query(SelIdx.getRegSlot());
             if (CCQS.valueIn()) {
               SR.addSegment(LiveRange::Segment(
-                  SelIdx.getRegSlot(), AndIdx.getRegSlot(), CCQS.valueIn()));
+                  SelIdx.getRegSlot(), EndIdx, CCQS.valueIn()));
             }
           },
           *LIS->getSlotIndexes(), *TRI);
@@ -263,6 +248,38 @@ bool SIOptimizeExecMaskingPreRA::optimizeVcndVcmpPair(MachineBasicBlock &MBB) {
   } else
     LIS->removeAllRegUnitsForPhysReg(CCReg);
 
+  // Try to remove the compare. The Cmp value must not be used between cmp
+  // and s_and_b64 if it is VCC, and must be unused for any other register.
+  if ((CmpReg.isVirtual() && CmpLI && CmpLI->Query(AndIdx.getRegSlot()).isKill()) ||
+      (CmpReg == Register(CondReg) &&
+       std::none_of(std::next(Cmp->getIterator()), Andn2->getIterator(),
+                    [&](const MachineInstr &MI) {
+                      return MI.readsRegister(CondReg, TRI);
+                    }))) {
+    LLVM_DEBUG(dbgs() << "Erasing: " << *Cmp << '\n');
+    if (CmpLI)
+      LIS->removeVRegDefAt(*CmpLI, CmpIdx.getRegSlot());
+    LIS->RemoveMachineInstrFromMaps(*Cmp);
+    Cmp->eraseFromParent();
+
+    // Try to remove v_cndmask_b32.
+    if (SelLI) {
+      bool CanRemoveSel = SelLI->Query(CmpIdx.getRegSlot()).isKill();
+      if (!CanRemoveSel) {
+        // Try to shrink the live interval and check for dead def instead.
+        LIS->shrinkToUses(SelLI, nullptr);
+        CanRemoveSel = SelLI->Query(SelIdx.getRegSlot()).isDeadDef();
+      }
+      if (CanRemoveSel) {
+        LLVM_DEBUG(dbgs() << "Erasing: " << *Sel << '\n');
+
+        LIS->removeVRegDefAt(*SelLI, SelIdx.getRegSlot());
+        LIS->RemoveMachineInstrFromMaps(*Sel);
+        Sel->eraseFromParent();
+      }
+    }
+  }
+
   return true;
 }
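
With the interval update done first, the compare/select removal moves to
the end of the function, and erasing the v_cndmask_b32 no longer requires
that the compare was an exact kill: when the kill query fails (for
instance because the interval now extends through a loop), shrinkToUses()
recomputes SelLI from the remaining uses, and a definition that became
dead can still be deleted. A commented sketch of that path, with the same
caveats as the one above:

  // The compare has just been erased, so the select may have lost its
  // only user. Try the cheap check first: did the compare kill it?
  bool CanRemoveSel = SelLI->Query(CmpIdx.getRegSlot()).isKill();
  if (!CanRemoveSel) {
    // No exact kill; recompute the interval from the remaining uses and
    // look for a dead definition instead.
    LIS->shrinkToUses(SelLI, /*dead=*/nullptr);
    CanRemoveSel = SelLI->Query(SelIdx.getRegSlot()).isDeadDef();
  }
  if (CanRemoveSel) {
    LIS->removeVRegDefAt(*SelLI, SelIdx.getRegSlot()); // drop the value
    LIS->RemoveMachineInstrFromMaps(*Sel); // keep slot indexes consistent
    Sel->eraseFromParent();
  }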
 

diff --git a/llvm/test/CodeGen/AMDGPU/opt-exec-masking-pre-ra-update-liveness.mir b/llvm/test/CodeGen/AMDGPU/opt-exec-masking-pre-ra-update-liveness.mir
index 501f1654d7c8..9df269e66ba6 100644
--- a/llvm/test/CodeGen/AMDGPU/opt-exec-masking-pre-ra-update-liveness.mir
+++ b/llvm/test/CodeGen/AMDGPU/opt-exec-masking-pre-ra-update-liveness.mir
@@ -355,3 +355,121 @@ body:             |
 
   bb.4:
 ...
+
+# Liveness of the V_CNDMASK_B32 source (%0) must be extended through the loop.
+
+---
+name:            cndmask_loop_cndmask
+tracksRegLiveness: true
+body:             |
+  ; CHECK-LABEL: name: cndmask_loop_cndmask
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.2(0x40000000), %bb.1(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_CBRANCH_VCCNZ %bb.2, implicit undef $vcc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.4(0x40000000), %bb.2(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_ADD_I32_]], -1, implicit-def $scc
+  ; CHECK-NEXT:   S_CBRANCH_SCC0 %bb.4, implicit $scc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   successors: %bb.4(0x40000000), %bb.3(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   $vcc = S_ANDN2_B64 $exec, [[DEF1]], implicit-def $scc
+  ; CHECK-NEXT:   S_CBRANCH_VCCNZ %bb.4, implicit $vcc
+  ; CHECK-NEXT:   S_BRANCH %bb.3
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.3:
+  ; CHECK-NEXT:   successors: %bb.1(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   S_BRANCH %bb.1
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.4:
+  bb.0:
+    %0:sreg_64_xexec = IMPLICIT_DEF
+    %1:sreg_32 = IMPLICIT_DEF
+    %2:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, %0, implicit $exec
+    S_CBRANCH_VCCNZ %bb.2, implicit undef $vcc
+
+  bb.1:
+    %1:sreg_32 = S_ADD_I32 %1, -1, implicit-def $scc
+    S_CBRANCH_SCC0 %bb.4, implicit $scc
+
+  bb.2:
+    %4:sreg_64_xexec = V_CMP_NE_U32_e64 1, %2, implicit $exec
+    $vcc = S_AND_B64 $exec, %4, implicit-def $scc
+    S_CBRANCH_VCCNZ %bb.4, implicit $vcc
+    S_BRANCH %bb.3
+
+  bb.3:
+    S_BRANCH %bb.1
+
+  bb.4:
+...
+
+---
+name:            cndmask_loop_cndmask_split
+tracksRegLiveness: true
+body:             |
+  ; CHECK-LABEL: name: cndmask_loop_cndmask_split
+  ; CHECK: bb.0:
+  ; CHECK-NEXT:   successors: %bb.1(0x40000000), %bb.2(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   $vcc = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+  ; CHECK-NEXT:   [[DEF1:%[0-9]+]]:sreg_64_xexec = IMPLICIT_DEF
+  ; CHECK-NEXT:   S_CBRANCH_VCCZ %bb.1, implicit $vcc
+  ; CHECK-NEXT:   S_BRANCH %bb.2
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.5(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   S_BRANCH %bb.5
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.2:
+  ; CHECK-NEXT:   successors: %bb.4(0x40000000), %bb.3(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   $vcc = S_ANDN2_B64 $exec, [[DEF1]], implicit-def $scc
+  ; CHECK-NEXT:   S_CBRANCH_VCCNZ %bb.4, implicit $vcc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.3:
+  ; CHECK-NEXT:   successors: %bb.5(0x40000000), %bb.4(0x40000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   [[S_ADD_I32_:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_ADD_I32_]], -1, implicit-def $scc
+  ; CHECK-NEXT:   S_CBRANCH_SCC0 %bb.5, implicit $scc
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.4:
+  ; CHECK-NEXT:   successors: %bb.2(0x80000000)
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT:   S_BRANCH %bb.2
+  ; CHECK-NEXT: {{  $}}
+  ; CHECK-NEXT: bb.5:
+  bb.0:
+    $vcc = IMPLICIT_DEF
+    %0:sreg_64_xexec = IMPLICIT_DEF
+    %1:sreg_32 = IMPLICIT_DEF
+    %2:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, 1, %0, implicit $exec
+    S_CBRANCH_VCCZ %bb.5, implicit $vcc
+    S_BRANCH %bb.1
+
+  bb.5:
+    S_BRANCH %bb.4
+
+  bb.1:
+    %4:sreg_64_xexec = V_CMP_NE_U32_e64 1, %2, implicit $exec
+    $vcc = S_AND_B64 $exec, %4, implicit-def $scc
+    S_CBRANCH_VCCNZ %bb.3, implicit $vcc
+
+  bb.2:
+    %1:sreg_32 = S_ADD_I32 %1, -1, implicit-def $scc
+    S_CBRANCH_SCC0 %bb.4, implicit $scc
+
+  bb.3:
+    S_BRANCH %bb.1
+
+  bb.4:
+...
