[llvm] [TwoAddressInstruction] Recompute live intervals for partial defs (PR #74431)

Carl Ritson via llvm-commits llvm-commits at lists.llvm.org
Tue Dec 5 00:05:20 PST 2023


https://github.com/perlfu created https://github.com/llvm/llvm-project/pull/74431

Force live interval recomputation for a register if its definition is narrowed to become partial.  The live interval repair process cannot otherwise detect these changes.
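
To illustrate, here is the failure mode distilled from the included MIR test (register numbers are illustrative). A REG_SEQUENCE with an undef operand still fully defines its result register, but once this pass lowers it to subregister COPYs, nothing is emitted for the undef lane, so the remaining definition is only partial:

  ; Before lowering: %14 is fully defined by the REG_SEQUENCE.
  %14:vreg_64_align2 = REG_SEQUENCE %9, %subreg.sub0, undef %13, %subreg.sub1

  ; After lowering: only sub0 is written; sub1 is left without a def.
  undef %14.sub0:vreg_64_align2 = COPY %9

When subregister liveness is tracked, the cached live interval for %14 still describes a full definition, so the fix drops the interval and lets LiveIntervals recompute it on demand.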

From f6f557977d248e18c8585a3d6ac73f2727219c61 Mon Sep 17 00:00:00 2001
From: Carl Ritson <carl.ritson at amd.com>
Date: Thu, 30 Nov 2023 15:20:43 +0900
Subject: [PATCH] [TwoAddressInstruction] Recompute live intervals for partial
 defs

Force live interval recomputation for a register if its definition
is narrowed to become partial.  The live interval repair process
cannot otherwise detect these changes.
---
 .../lib/CodeGen/TwoAddressInstructionPass.cpp | 13 ++++-
 .../early-lis-two-address-partial-def.mir     | 52 +++++++++++++++++++
 2 files changed, 63 insertions(+), 2 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/early-lis-two-address-partial-def.mir

diff --git a/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp b/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
index bf689dbd308f7..9ffcf3f95edac 100644
--- a/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
+++ b/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
@@ -1937,13 +1937,16 @@ eliminateRegSequence(MachineBasicBlock::iterator &MBBI) {
   }
 
   bool DefEmitted = false;
+  bool DefIsPartial = false;
   for (unsigned i = 1, e = MI.getNumOperands(); i < e; i += 2) {
     MachineOperand &UseMO = MI.getOperand(i);
     Register SrcReg = UseMO.getReg();
     unsigned SubIdx = MI.getOperand(i+1).getImm();
     // Nothing needs to be inserted for undef operands.
-    if (UseMO.isUndef())
+    if (UseMO.isUndef()) {
+      DefIsPartial = true;
       continue;
+    }
 
     // Defer any kill flag to the last operand using SrcReg. Otherwise, we
     // might insert a COPY that uses SrcReg after it was killed.
@@ -1988,8 +1991,14 @@ eliminateRegSequence(MachineBasicBlock::iterator &MBBI) {
     for (int j = MI.getNumOperands() - 1, ee = 0; j > ee; --j)
       MI.removeOperand(j);
   } else {
-    if (LIS)
+    if (LIS) {
+      // Force interval recomputation if we moved from a full definition
+      // of the register to a partial one.
+      if (DefIsPartial && LIS->hasInterval(DstReg) &&
+          MRI->shouldTrackSubRegLiveness(DstReg))
+        LIS->removeInterval(DstReg);
       LIS->RemoveMachineInstrFromMaps(MI);
+    }
 
     LLVM_DEBUG(dbgs() << "Eliminated: " << MI);
     MI.eraseFromParent();
diff --git a/llvm/test/CodeGen/AMDGPU/early-lis-two-address-partial-def.mir b/llvm/test/CodeGen/AMDGPU/early-lis-two-address-partial-def.mir
new file mode 100644
index 0000000000000..7a151c4530a03
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/early-lis-two-address-partial-def.mir
@@ -0,0 +1,52 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -run-pass=liveintervals -run-pass=twoaddressinstruction -verify-machineinstrs -o - %s | FileCheck --check-prefix=GFX90A %s
+
+---
+name:            aligned_partial_vgpr_64
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $vgpr0, $vgpr1
+
+    ; GFX90A-LABEL: name: aligned_partial_vgpr_64
+    ; GFX90A: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $sgpr4, $sgpr5, $sgpr6, $sgpr7, $vgpr0, $vgpr1
+    ; GFX90A-NEXT: {{  $}}
+    ; GFX90A-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+    ; GFX90A-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+    ; GFX90A-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr7
+    ; GFX90A-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr6
+    ; GFX90A-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr5
+    ; GFX90A-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+    ; GFX90A-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+    ; GFX90A-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+    ; GFX90A-NEXT: [[COPY8:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+    ; GFX90A-NEXT: [[COPY9:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+    ; GFX90A-NEXT: undef [[COPY10:%[0-9]+]].sub0:sgpr_256 = COPY [[COPY9]]
+    ; GFX90A-NEXT: [[COPY10:%[0-9]+]].sub1:sgpr_256 = COPY [[COPY8]]
+    ; GFX90A-NEXT: [[COPY10:%[0-9]+]].sub2:sgpr_256 = COPY [[COPY7]]
+    ; GFX90A-NEXT: [[COPY10:%[0-9]+]].sub3:sgpr_256 = COPY [[COPY6]]
+    ; GFX90A-NEXT: [[COPY10:%[0-9]+]].sub4:sgpr_256 = COPY [[COPY5]]
+    ; GFX90A-NEXT: [[COPY10:%[0-9]+]].sub5:sgpr_256 = COPY [[COPY4]]
+    ; GFX90A-NEXT: [[COPY10:%[0-9]+]].sub6:sgpr_256 = COPY [[COPY3]]
+    ; GFX90A-NEXT: [[COPY10:%[0-9]+]].sub7:sgpr_256 = COPY [[COPY2]]
+    ; GFX90A-NEXT: undef [[COPY11:%[0-9]+]].sub0:vreg_64_align2 = COPY [[COPY]]
+    ; GFX90A-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[COPY1]]
+    ; GFX90A-NEXT: [[COPY12:%[0-9]+]]:vgpr_32 = IMAGE_ATOMIC_SWAP_V1_V1_gfx90a [[COPY12]], [[COPY11]].sub0, [[COPY10]], 1, -1, 1, 0, 0, 0, implicit $exec, implicit [[COPY11]] :: (volatile dereferenceable load store (s32), addrspace 8)
+    ; GFX90A-NEXT: $vgpr0 = COPY [[COPY12]]
+    ; GFX90A-NEXT: SI_RETURN_TO_EPILOG $vgpr0
+    %9:vgpr_32 = COPY $vgpr1
+    %8:vgpr_32 = COPY $vgpr0
+    %7:sgpr_32 = COPY $sgpr7
+    %6:sgpr_32 = COPY $sgpr6
+    %5:sgpr_32 = COPY $sgpr5
+    %4:sgpr_32 = COPY $sgpr4
+    %3:sgpr_32 = COPY $sgpr3
+    %2:sgpr_32 = COPY $sgpr2
+    %1:sgpr_32 = COPY $sgpr1
+    %0:sgpr_32 = COPY $sgpr0
+    %11:sgpr_256 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3, %4:sgpr_32, %subreg.sub4, %5:sgpr_32, %subreg.sub5, %6:sgpr_32, %subreg.sub6, %7:sgpr_32, %subreg.sub7
+    %14:vreg_64_align2 = REG_SEQUENCE %9:vgpr_32, %subreg.sub0, undef %13:vgpr_32, %subreg.sub1
+    %12:vgpr_32 = IMAGE_ATOMIC_SWAP_V1_V1_gfx90a %8:vgpr_32, %14.sub0:vreg_64_align2, %11:sgpr_256, 1, -1, 1, 0, 0, 0, implicit $exec, implicit %14:vreg_64_align2 :: (volatile dereferenceable load store (s32), addrspace 8)
+    $vgpr0 = COPY %12:vgpr_32
+    SI_RETURN_TO_EPILOG $vgpr0
+...
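
For context on the mechanism: removing the interval does not trigger an eager recomputation; it simply drops the cached data, and the next query rebuilds it from the (now partial) definitions. A minimal sketch of that pattern, assuming a pass with LiveIntervals (LIS) and a virtual register DstReg in scope; this is not part of the patch:

  // Drop the stale interval so its cached ranges no longer claim a full def.
  if (LIS->hasInterval(DstReg))
    LIS->removeInterval(DstReg);
  // ... rewrite the instructions defining DstReg ...
  // getInterval() recomputes the interval from scratch if it is no longer
  // cached, including per-lane subranges when subregister liveness is tracked.
  LiveInterval &LI = LIS->getInterval(DstReg);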


