[llvm] VirtRegRewriter: Add super register defs for live out undef lanes (PR #112679)
Matt Arsenault via llvm-commits
llvm-commits at lists.llvm.org
Mon Oct 21 16:41:33 PDT 2024
https://github.com/arsenm updated https://github.com/llvm/llvm-project/pull/112679
>From 7a2b0b57e4d4c61b160abaae210938c1e44ba438 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Sat, 17 Aug 2024 10:36:09 +0400
Subject: [PATCH 1/4] VirtRegRewriter: Add super register defs for live out
undef lanes
If an undef subregister def is live into another block, we need to
maintain a physreg def to track the liveness of those lanes. This
would manifest as a verifier error after branch folding, when the use
in the cloned tail block no longer had a def.
There is a missing verifier check for this situation. Added an xfailed
test that demonstrates this. We may also be able to revert the changes
in 47d3cbcf842a036c20c3f1c74255cdfc213f41c2.
It might be better to insert an IMPLICIT_DEF before the instruction
rather than using the implicit-def operand.
Fixes #98474
---
llvm/lib/CodeGen/VirtRegMap.cpp | 28 ++
.../branch-folding-implicit-def-subreg.ll | 65 +++--
llvm/test/CodeGen/AMDGPU/indirect-call.ll | 2 -
...nfloop-subrange-spill-inspect-subrange.mir | 4 +-
.../CodeGen/AMDGPU/infloop-subrange-spill.mir | 4 +-
...474-need-live-out-undef-subregister-def.ll | 42 +++
...egrewriter-live-out-undef-subregisters.mir | 251 ++++++++++++++++++
llvm/test/CodeGen/AMDGPU/itofp.i128.ll | 8 +-
.../test/CodeGen/AMDGPU/vni8-across-blocks.ll | 2 +-
...ssing-def-liveout-physical-subregister.mir | 36 +++
10 files changed, 396 insertions(+), 46 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/issue98474-need-live-out-undef-subregister-def.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/issue98474-virtregrewriter-live-out-undef-subregisters.mir
create mode 100644 llvm/test/MachineVerifier/AMDGPU/issue98474-missing-def-liveout-physical-subregister.mir
diff --git a/llvm/lib/CodeGen/VirtRegMap.cpp b/llvm/lib/CodeGen/VirtRegMap.cpp
index a548bf665bcea8..1254c7be182146 100644
--- a/llvm/lib/CodeGen/VirtRegMap.cpp
+++ b/llvm/lib/CodeGen/VirtRegMap.cpp
@@ -199,6 +199,9 @@ class VirtRegRewriter : public MachineFunctionPass {
void handleIdentityCopy(MachineInstr &MI);
void expandCopyBundle(MachineInstr &MI) const;
bool subRegLiveThrough(const MachineInstr &MI, MCRegister SuperPhysReg) const;
+ bool needLiveOutUndefSubregDef(const LiveInterval &LI,
+ const MachineBasicBlock &MBB, unsigned SubReg,
+ MCPhysReg PhysReg) const;
public:
static char ID;
@@ -532,6 +535,26 @@ bool VirtRegRewriter::subRegLiveThrough(const MachineInstr &MI,
return false;
}
+/// Check if we need to maintain liveness for undef subregister lanes that are
+/// live out of a block.
+bool VirtRegRewriter::needLiveOutUndefSubregDef(const LiveInterval &LI,
+ const MachineBasicBlock &MBB,
+ unsigned SubReg,
+ MCPhysReg PhysReg) const {
+ LaneBitmask UndefMask = ~TRI->getSubRegIndexLaneMask(SubReg);
+ for (const LiveInterval::SubRange &SR : LI.subranges()) {
+ LaneBitmask NeedImpDefLanes = UndefMask & SR.LaneMask;
+ if (NeedImpDefLanes.any() && !LIS->isLiveOutOfMBB(SR, &MBB)) {
+ for (const MachineBasicBlock *Succ : MBB.successors()) {
+ if (LIS->isLiveInToMBB(SR, Succ))
+ return true;
+ }
+ }
+ }
+
+ return false;
+}
+
void VirtRegRewriter::rewrite() {
bool NoSubRegLiveness = !MRI->subRegLivenessEnabled();
SmallVector<Register, 8> SuperDeads;
@@ -586,6 +609,11 @@ void VirtRegRewriter::rewrite() {
MO.setIsUndef(true);
} else if (!MO.isDead()) {
assert(MO.isDef());
+ if (MO.isUndef()) {
+ const LiveInterval &LI = LIS->getInterval(VirtReg);
+ if (needLiveOutUndefSubregDef(LI, *MBBI, SubReg, PhysReg))
+ SuperDefs.push_back(PhysReg);
+ }
}
}
diff --git a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll
index 86254329923971..055e9850de3d68 100644
--- a/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll
@@ -38,24 +38,19 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: renamable $sgpr30_sgpr31 = S_MOV_B64 0
; GFX90A-NEXT: renamable $vcc = S_AND_B64 $exec, renamable $sgpr26_sgpr27, implicit-def dead $scc
- ; GFX90A-NEXT: $vgpr22 = IMPLICIT_DEF
- ; GFX90A-NEXT: $vgpr10 = IMPLICIT_DEF
- ; GFX90A-NEXT: $vgpr24 = IMPLICIT_DEF
- ; GFX90A-NEXT: $vgpr18 = IMPLICIT_DEF
- ; GFX90A-NEXT: $vgpr20 = IMPLICIT_DEF
; GFX90A-NEXT: S_CBRANCH_VCCNZ %bb.59, implicit $vcc
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: bb.2:
; GFX90A-NEXT: successors: %bb.3(0x80000000)
- ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $vgpr22, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6, $sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr30_sgpr31, $sgpr42_sgpr43, $sgpr54, $sgpr55, $sgpr16_sgpr17_sgpr18, $sgpr18_sgpr19, $sgpr20_sgpr21_sgpr22, $vgpr2, $vgpr3, $vgpr10, $vgpr24, $vgpr18, $vgpr20
+ ; GFX90A-NEXT: liveins: $sgpr12, $sgpr13, $sgpr14, $sgpr33, $vgpr31, $sgpr4_sgpr5, $sgpr6, $sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr24_sgpr25, $sgpr26_sgpr27, $sgpr30_sgpr31, $sgpr42_sgpr43, $sgpr54, $sgpr55, $sgpr16_sgpr17_sgpr18, $sgpr18_sgpr19, $sgpr20_sgpr21_sgpr22, $vgpr2, $vgpr3
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: renamable $sgpr15 = IMPLICIT_DEF
; GFX90A-NEXT: renamable $sgpr23 = IMPLICIT_DEF
- ; GFX90A-NEXT: renamable $vgpr11 = IMPLICIT_DEF
- ; GFX90A-NEXT: renamable $vgpr19 = IMPLICIT_DEF
- ; GFX90A-NEXT: renamable $vgpr21 = IMPLICIT_DEF
- ; GFX90A-NEXT: renamable $vgpr23 = IMPLICIT_DEF
- ; GFX90A-NEXT: renamable $vgpr25 = IMPLICIT_DEF
+ ; GFX90A-NEXT: renamable $vgpr11 = IMPLICIT_DEF implicit-def $vgpr10
+ ; GFX90A-NEXT: renamable $vgpr19 = IMPLICIT_DEF implicit-def $vgpr18
+ ; GFX90A-NEXT: renamable $vgpr21 = IMPLICIT_DEF implicit-def $vgpr20
+ ; GFX90A-NEXT: renamable $vgpr23 = IMPLICIT_DEF implicit-def $vgpr22
+ ; GFX90A-NEXT: renamable $vgpr25 = IMPLICIT_DEF implicit-def $vgpr24
; GFX90A-NEXT: renamable $sgpr28_sgpr29 = S_MOV_B64 0
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: bb.3.Flow17:
@@ -111,8 +106,8 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
; GFX90A-NEXT: renamable $vgpr52 = IMPLICIT_DEF
; GFX90A-NEXT: renamable $vgpr16 = IMPLICIT_DEF
; GFX90A-NEXT: renamable $vgpr53 = IMPLICIT_DEF
- ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF
- ; GFX90A-NEXT: renamable $vgpr11 = IMPLICIT_DEF
+ ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF implicit-def $vgpr12
+ ; GFX90A-NEXT: renamable $vgpr11 = IMPLICIT_DEF implicit-def $vgpr10
; GFX90A-NEXT: renamable $sgpr15 = IMPLICIT_DEF
; GFX90A-NEXT: {{ $}}
; GFX90A-NEXT: bb.6.Flow20:
@@ -395,8 +390,8 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
; GFX90A-NEXT: renamable $vgpr52 = IMPLICIT_DEF
; GFX90A-NEXT: renamable $vgpr16 = IMPLICIT_DEF
; GFX90A-NEXT: renamable $vgpr53 = IMPLICIT_DEF
- ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF
- ; GFX90A-NEXT: renamable $vgpr11 = IMPLICIT_DEF
+ ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF implicit-def $vgpr12
+ ; GFX90A-NEXT: renamable $vgpr11 = IMPLICIT_DEF implicit-def $vgpr10
; GFX90A-NEXT: renamable $sgpr15 = IMPLICIT_DEF
; GFX90A-NEXT: $sgpr30_sgpr31 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec
; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.37, implicit $exec
@@ -434,8 +429,8 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
; GFX90A-NEXT: renamable $vgpr52 = IMPLICIT_DEF
; GFX90A-NEXT: renamable $vgpr16 = IMPLICIT_DEF
; GFX90A-NEXT: renamable $vgpr53 = IMPLICIT_DEF
- ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF
- ; GFX90A-NEXT: renamable $vgpr11 = IMPLICIT_DEF
+ ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF implicit-def $vgpr12
+ ; GFX90A-NEXT: renamable $vgpr11 = IMPLICIT_DEF implicit-def $vgpr10
; GFX90A-NEXT: renamable $sgpr15 = IMPLICIT_DEF
; GFX90A-NEXT: $sgpr36_sgpr37 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec
; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.39, implicit $exec
@@ -484,8 +479,8 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
; GFX90A-NEXT: renamable $vgpr52 = IMPLICIT_DEF
; GFX90A-NEXT: renamable $vgpr16 = IMPLICIT_DEF
; GFX90A-NEXT: renamable $vgpr53 = IMPLICIT_DEF
- ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF
- ; GFX90A-NEXT: renamable $vgpr11 = IMPLICIT_DEF
+ ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF implicit-def $vgpr12
+ ; GFX90A-NEXT: renamable $vgpr11 = IMPLICIT_DEF implicit-def $vgpr10
; GFX90A-NEXT: renamable $sgpr15 = IMPLICIT_DEF
; GFX90A-NEXT: $sgpr38_sgpr39 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec
; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.41, implicit $exec
@@ -535,8 +530,8 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
; GFX90A-NEXT: renamable $vgpr52 = IMPLICIT_DEF
; GFX90A-NEXT: renamable $vgpr16 = IMPLICIT_DEF
; GFX90A-NEXT: renamable $vgpr53 = IMPLICIT_DEF
- ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF
- ; GFX90A-NEXT: renamable $vgpr11 = IMPLICIT_DEF
+ ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF implicit-def $vgpr12
+ ; GFX90A-NEXT: renamable $vgpr11 = IMPLICIT_DEF implicit-def $vgpr10
; GFX90A-NEXT: renamable $sgpr15 = IMPLICIT_DEF
; GFX90A-NEXT: $sgpr40_sgpr41 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec
; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.47, implicit $exec
@@ -589,8 +584,8 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
; GFX90A-NEXT: renamable $vgpr52 = IMPLICIT_DEF
; GFX90A-NEXT: renamable $vgpr16 = IMPLICIT_DEF
; GFX90A-NEXT: renamable $vgpr53 = IMPLICIT_DEF
- ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF
- ; GFX90A-NEXT: renamable $vgpr11 = IMPLICIT_DEF
+ ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF implicit-def $vgpr12
+ ; GFX90A-NEXT: renamable $vgpr11 = IMPLICIT_DEF implicit-def $vgpr10
; GFX90A-NEXT: renamable $sgpr15 = IMPLICIT_DEF
; GFX90A-NEXT: renamable $sgpr44_sgpr45 = S_MOV_B64 0
; GFX90A-NEXT: {{ $}}
@@ -643,8 +638,8 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
; GFX90A-NEXT: renamable $vgpr52 = IMPLICIT_DEF
; GFX90A-NEXT: renamable $vgpr16 = IMPLICIT_DEF
; GFX90A-NEXT: renamable $vgpr53 = IMPLICIT_DEF
- ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF
- ; GFX90A-NEXT: renamable $vgpr11 = IMPLICIT_DEF
+ ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF implicit-def $vgpr12
+ ; GFX90A-NEXT: renamable $vgpr11 = IMPLICIT_DEF implicit-def $vgpr10
; GFX90A-NEXT: renamable $sgpr15 = IMPLICIT_DEF
; GFX90A-NEXT: $sgpr16_sgpr17 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec
; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.43, implicit $exec
@@ -689,8 +684,8 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
; GFX90A-NEXT: renamable $vgpr52 = IMPLICIT_DEF
; GFX90A-NEXT: renamable $vgpr16 = IMPLICIT_DEF
; GFX90A-NEXT: renamable $vgpr53 = IMPLICIT_DEF
- ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF
- ; GFX90A-NEXT: renamable $vgpr11 = IMPLICIT_DEF
+ ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF implicit-def $vgpr12
+ ; GFX90A-NEXT: renamable $vgpr11 = IMPLICIT_DEF implicit-def $vgpr10
; GFX90A-NEXT: renamable $sgpr15 = IMPLICIT_DEF
; GFX90A-NEXT: S_BRANCH %bb.45
; GFX90A-NEXT: {{ $}}
@@ -719,8 +714,8 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
; GFX90A-NEXT: renamable $vgpr52 = IMPLICIT_DEF
; GFX90A-NEXT: renamable $vgpr16 = IMPLICIT_DEF
; GFX90A-NEXT: renamable $vgpr53 = IMPLICIT_DEF
- ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF
- ; GFX90A-NEXT: renamable $vgpr11 = IMPLICIT_DEF
+ ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF implicit-def $vgpr12
+ ; GFX90A-NEXT: renamable $vgpr11 = IMPLICIT_DEF implicit-def $vgpr10
; GFX90A-NEXT: renamable $sgpr15 = IMPLICIT_DEF
; GFX90A-NEXT: S_BRANCH %bb.46
; GFX90A-NEXT: {{ $}}
@@ -748,8 +743,8 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
; GFX90A-NEXT: renamable $vgpr52 = IMPLICIT_DEF
; GFX90A-NEXT: renamable $vgpr16 = IMPLICIT_DEF
; GFX90A-NEXT: renamable $vgpr53 = IMPLICIT_DEF
- ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF
- ; GFX90A-NEXT: renamable $vgpr11 = IMPLICIT_DEF
+ ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF implicit-def $vgpr12
+ ; GFX90A-NEXT: renamable $vgpr11 = IMPLICIT_DEF implicit-def $vgpr10
; GFX90A-NEXT: renamable $sgpr15 = IMPLICIT_DEF
; GFX90A-NEXT: S_BRANCH %bb.62
; GFX90A-NEXT: {{ $}}
@@ -773,8 +768,8 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
; GFX90A-NEXT: renamable $vgpr52 = IMPLICIT_DEF
; GFX90A-NEXT: renamable $vgpr16 = IMPLICIT_DEF
; GFX90A-NEXT: renamable $vgpr53 = IMPLICIT_DEF
- ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF
- ; GFX90A-NEXT: renamable $vgpr11 = IMPLICIT_DEF
+ ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF implicit-def $vgpr12
+ ; GFX90A-NEXT: renamable $vgpr11 = IMPLICIT_DEF implicit-def $vgpr10
; GFX90A-NEXT: renamable $sgpr15 = IMPLICIT_DEF
; GFX90A-NEXT: $sgpr58_sgpr59 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec
; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.53, implicit $exec
@@ -880,8 +875,8 @@ define amdgpu_kernel void @f1(ptr addrspace(1) %arg, ptr addrspace(1) %arg1, i64
; GFX90A-NEXT: renamable $vgpr52 = IMPLICIT_DEF
; GFX90A-NEXT: renamable $vgpr16 = IMPLICIT_DEF
; GFX90A-NEXT: renamable $vgpr53 = IMPLICIT_DEF
- ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF
- ; GFX90A-NEXT: renamable $vgpr11 = IMPLICIT_DEF
+ ; GFX90A-NEXT: renamable $vgpr13 = IMPLICIT_DEF implicit-def $vgpr12
+ ; GFX90A-NEXT: renamable $vgpr11 = IMPLICIT_DEF implicit-def $vgpr10
; GFX90A-NEXT: $sgpr50_sgpr51 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec
; GFX90A-NEXT: S_CBRANCH_EXECNZ %bb.57, implicit $exec
; GFX90A-NEXT: {{ $}}
diff --git a/llvm/test/CodeGen/AMDGPU/indirect-call.ll b/llvm/test/CodeGen/AMDGPU/indirect-call.ll
index da8aa544698355..e819d5d3b1656e 100644
--- a/llvm/test/CodeGen/AMDGPU/indirect-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/indirect-call.ll
@@ -603,7 +603,6 @@ define i32 @test_indirect_call_vgpr_ptr_ret(ptr %fptr) {
; GISEL-NEXT: s_mov_b32 s14, s43
; GISEL-NEXT: s_mov_b32 s15, s42
; GISEL-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; GISEL-NEXT: v_mov_b32_e32 v1, v0
; GISEL-NEXT: ; implicit-def: $vgpr0
; GISEL-NEXT: ; implicit-def: $vgpr31
; GISEL-NEXT: s_xor_b64 exec, exec, s[48:49]
@@ -1384,7 +1383,6 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_return(i32 %i, ptr %fptr) {
; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[1:2]
; GISEL-NEXT: s_and_saveexec_b64 s[6:7], vcc
; GISEL-NEXT: s_swappc_b64 s[30:31], s[8:9]
-; GISEL-NEXT: v_mov_b32_e32 v2, v0
; GISEL-NEXT: ; implicit-def: $vgpr1
; GISEL-NEXT: ; implicit-def: $vgpr0
; GISEL-NEXT: s_xor_b64 exec, exec, s[6:7]
diff --git a/llvm/test/CodeGen/AMDGPU/infloop-subrange-spill-inspect-subrange.mir b/llvm/test/CodeGen/AMDGPU/infloop-subrange-spill-inspect-subrange.mir
index 7864564d289178..285e7e22264a04 100644
--- a/llvm/test/CodeGen/AMDGPU/infloop-subrange-spill-inspect-subrange.mir
+++ b/llvm/test/CodeGen/AMDGPU/infloop-subrange-spill-inspect-subrange.mir
@@ -30,7 +30,7 @@ body: |
; CHECK-NEXT: dead [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
; CHECK-NEXT: dead undef [[DEF2:%[0-9]+]].sub0:vreg_64 = IMPLICIT_DEF
; CHECK-NEXT: renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX16_IMM renamable $sgpr4_sgpr5, 0, 0 :: (invariant load (s512), align 32, addrspace 4)
- ; CHECK-NEXT: renamable $sgpr24 = IMPLICIT_DEF
+ ; CHECK-NEXT: renamable $sgpr24 = IMPLICIT_DEF implicit-def $sgpr24_sgpr25_sgpr26_sgpr27
; CHECK-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 = S_LOAD_DWORDX16_IMM undef renamable $sgpr4_sgpr5, 0, 0 :: (invariant load (s512), align 32, addrspace 4)
; CHECK-NEXT: $exec = S_MOV_B64_term undef renamable $sgpr4_sgpr5
; CHECK-NEXT: S_CBRANCH_EXECZ %bb.6, implicit $exec
@@ -83,7 +83,7 @@ body: |
; CHECK-NEXT: liveins: $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19:0x000000000000FFFF, $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51:0x00000000FFFFFFFF
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: dead [[IMAGE_SAMPLE_LZ_V1_V2_5:%[0-9]+]]:vgpr_32 = IMAGE_SAMPLE_LZ_V1_V2 undef [[DEF]], renamable $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51, undef renamable $sgpr8_sgpr9_sgpr10_sgpr11, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), addrspace 8)
- ; CHECK-NEXT: renamable $sgpr25 = COPY undef renamable $sgpr24
+ ; CHECK-NEXT: renamable $sgpr25 = COPY undef renamable $sgpr24, implicit-def $sgpr24_sgpr25_sgpr26_sgpr27
; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.7, implicit undef $vcc
; CHECK-NEXT: S_BRANCH %bb.6
; CHECK-NEXT: {{ $}}
diff --git a/llvm/test/CodeGen/AMDGPU/infloop-subrange-spill.mir b/llvm/test/CodeGen/AMDGPU/infloop-subrange-spill.mir
index 1030cdb1b43fc1..995a5d267fbed1 100644
--- a/llvm/test/CodeGen/AMDGPU/infloop-subrange-spill.mir
+++ b/llvm/test/CodeGen/AMDGPU/infloop-subrange-spill.mir
@@ -30,7 +30,7 @@ body: |
; CHECK-NEXT: dead undef [[DEF3:%[0-9]+]].sub1:vreg_64 = IMPLICIT_DEF
; CHECK-NEXT: dead renamable $sgpr5 = IMPLICIT_DEF
; CHECK-NEXT: renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX16_IMM undef renamable $sgpr4_sgpr5, 0, 0 :: (invariant load (s512), align 32, addrspace 4)
- ; CHECK-NEXT: renamable $sgpr24 = IMPLICIT_DEF
+ ; CHECK-NEXT: renamable $sgpr24 = IMPLICIT_DEF implicit-def $sgpr24_sgpr25_sgpr26_sgpr27
; CHECK-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 = S_LOAD_DWORDX16_IMM undef renamable $sgpr4_sgpr5, 0, 0 :: (invariant load (s512), align 32, addrspace 4)
; CHECK-NEXT: $exec = S_MOV_B64_term undef renamable $sgpr4_sgpr5
; CHECK-NEXT: S_CBRANCH_EXECZ %bb.5, implicit $exec
@@ -80,7 +80,7 @@ body: |
; CHECK-NEXT: liveins: $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19:0x000000000000FFFF, $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51:0x00000000FFFFFFFF
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: dead [[IMAGE_SAMPLE_LZ_V1_V2_5:%[0-9]+]]:vgpr_32 = IMAGE_SAMPLE_LZ_V1_V2 undef [[DEF]], renamable $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51, undef renamable $sgpr8_sgpr9_sgpr10_sgpr11, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), addrspace 8)
- ; CHECK-NEXT: renamable $sgpr25 = COPY undef renamable $sgpr24
+ ; CHECK-NEXT: renamable $sgpr25 = COPY undef renamable $sgpr24, implicit-def $sgpr24_sgpr25_sgpr26_sgpr27
; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.6, implicit undef $vcc
; CHECK-NEXT: S_BRANCH %bb.5
; CHECK-NEXT: {{ $}}
diff --git a/llvm/test/CodeGen/AMDGPU/issue98474-need-live-out-undef-subregister-def.ll b/llvm/test/CodeGen/AMDGPU/issue98474-need-live-out-undef-subregister-def.ll
new file mode 100644
index 00000000000000..7caa563d8b2983
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/issue98474-need-live-out-undef-subregister-def.ll
@@ -0,0 +1,42 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -verify-machineinstrs -o - %s | FileCheck %s
+
+; Check for verifier error after tail duplication. An implicit_def of
+; a subregister is needed to maintain liveness after assignment.
+
+define amdgpu_vs void @test(i32 inreg %cmp, i32 %e0) {
+; CHECK-LABEL: test:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_cmp_eq_u32 s0, 0
+; CHECK-NEXT: s_mov_b32 s0, 0
+; CHECK-NEXT: s_cbranch_scc1 .LBB0_2
+; CHECK-NEXT: ; %bb.1: ; %load
+; CHECK-NEXT: s_mov_b32 s1, s0
+; CHECK-NEXT: s_mov_b32 s2, s0
+; CHECK-NEXT: s_mov_b32 s3, s0
+; CHECK-NEXT: v_mov_b32_e32 v1, 0
+; CHECK-NEXT: buffer_load_format_xy v[1:2], v1, s[0:3], 0 idxen
+; CHECK-NEXT: s_waitcnt vmcnt(0)
+; CHECK-NEXT: exp mrt0 v0, v1, v2, v0
+; CHECK-NEXT: s_endpgm
+; CHECK-NEXT: .LBB0_2:
+; CHECK-NEXT: v_mov_b32_e32 v1, 0
+; CHECK-NEXT: exp mrt0 v0, v1, v2, v0
+; CHECK-NEXT: s_endpgm
+entry:
+ %cond = icmp eq i32 %cmp, 0
+ br i1 %cond, label %end, label %load
+
+load:
+ %data1 = call <2 x i32> @llvm.amdgcn.struct.buffer.load.format.v2i32(<4 x i32> zeroinitializer, i32 0, i32 0, i32 0, i32 0)
+ %e1 = extractelement <2 x i32> %data1, i32 0
+ %e2 = extractelement <2 x i32> %data1, i32 1
+ br label %end
+
+end:
+ %out1 = phi i32 [ 0, %entry ], [ %e1, %load ]
+ %out2 = phi i32 [ poison, %entry ], [ %e2, %load ]
+ call void @llvm.amdgcn.exp.i32(i32 0, i32 15, i32 %e0, i32 %out1, i32 %out2, i32 %e0, i1 false, i1 false)
+ ret void
+}
+
diff --git a/llvm/test/CodeGen/AMDGPU/issue98474-virtregrewriter-live-out-undef-subregisters.mir b/llvm/test/CodeGen/AMDGPU/issue98474-virtregrewriter-live-out-undef-subregisters.mir
new file mode 100644
index 00000000000000..a8ed114f8cd783
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/issue98474-virtregrewriter-live-out-undef-subregisters.mir
@@ -0,0 +1,251 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -start-before=greedy,2 -stop-after=tailduplication -verify-machineinstrs -o - %s | FileCheck %s
+
+# The partial def of %0 introduces a live out undef def of %0.sub1
+# into bb.3. We need to maintain this liveness with an explicit def of
+# the physical subregister. Without this, a verifier error would
+# appear after tail duplication.
+
+---
+name: undef_subreg_def_live_out_tailduplicate_vreg64_undef_sub1
+tracksRegLiveness: true
+machineFunctionInfo:
+ isEntryFunction: true
+ scratchRSrcReg: '$sgpr96_sgpr97_sgpr98_sgpr99'
+ stackPtrOffsetReg: '$sgpr32'
+body: |
+ ; CHECK-LABEL: name: undef_subreg_def_live_out_tailduplicate_vreg64_undef_sub1
+ ; CHECK: bb.0:
+ ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
+ ; CHECK-NEXT: liveins: $sgpr0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: S_CMP_EQ_U32 $sgpr0, 0, implicit-def $scc
+ ; CHECK-NEXT: S_CBRANCH_SCC0 %bb.2, implicit killed $scc
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.1:
+ ; CHECK-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec, implicit-def $vgpr0_vgpr1
+ ; CHECK-NEXT: EXP 0, killed renamable $vgpr0, renamable $vgpr1, undef renamable $vgpr0, undef renamable $vgpr0, 0, 0, 0, implicit $exec
+ ; CHECK-NEXT: S_ENDPGM 0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.2:
+ ; CHECK-NEXT: S_NOP 0, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7
+ ; CHECK-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: renamable $vgpr0_vgpr1 = BUFFER_LOAD_FORMAT_XY_IDXEN killed renamable $vgpr0, $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s64), addrspace 8)
+ ; CHECK-NEXT: EXP 0, killed renamable $vgpr0, renamable $vgpr1, undef renamable $vgpr0, undef renamable $vgpr0, 0, 0, 0, implicit $exec
+ ; CHECK-NEXT: S_ENDPGM 0
+ bb.0:
+ liveins: $sgpr0
+
+ S_CMP_EQ_U32 killed $sgpr0, 0, implicit-def $scc
+ S_CBRANCH_SCC0 %bb.2, implicit killed $scc
+
+ bb.1:
+ undef %0.sub0:vreg_64 = V_MOV_B32_e32 0, implicit $exec
+ S_BRANCH %bb.3
+
+ bb.2:
+ S_NOP 0, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7
+ %1:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %0:vreg_64 = BUFFER_LOAD_FORMAT_XY_IDXEN killed %1, $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s64), addrspace 8)
+
+ bb.3:
+ EXP 0, killed %0.sub0, killed %0.sub1, undef %2:vgpr_32, undef %2:vgpr_32, 0, 0, 0, implicit $exec
+ S_ENDPGM 0
+
+...
+
+---
+name: undef_subreg_def_live_out_tailduplicate_vreg96_undef_sub1_sub2
+tracksRegLiveness: true
+machineFunctionInfo:
+ isEntryFunction: true
+ scratchRSrcReg: '$sgpr96_sgpr97_sgpr98_sgpr99'
+ stackPtrOffsetReg: '$sgpr32'
+body: |
+ ; CHECK-LABEL: name: undef_subreg_def_live_out_tailduplicate_vreg96_undef_sub1_sub2
+ ; CHECK: bb.0:
+ ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
+ ; CHECK-NEXT: liveins: $sgpr0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: S_CMP_EQ_U32 $sgpr0, 0, implicit-def $scc
+ ; CHECK-NEXT: S_CBRANCH_SCC0 %bb.2, implicit killed $scc
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.1:
+ ; CHECK-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2
+ ; CHECK-NEXT: EXP 0, killed renamable $vgpr0, renamable $vgpr1, renamable $vgpr2, undef renamable $vgpr0, 0, 0, 0, implicit $exec
+ ; CHECK-NEXT: S_ENDPGM 0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.2:
+ ; CHECK-NEXT: S_NOP 0, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7
+ ; CHECK-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: renamable $vgpr0_vgpr1_vgpr2 = BUFFER_LOAD_FORMAT_XYZ_IDXEN killed renamable $vgpr0, $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s96), align 16, addrspace 8)
+ ; CHECK-NEXT: EXP 0, killed renamable $vgpr0, renamable $vgpr1, renamable $vgpr2, undef renamable $vgpr0, 0, 0, 0, implicit $exec
+ ; CHECK-NEXT: S_ENDPGM 0
+ bb.0:
+ liveins: $sgpr0
+
+ S_CMP_EQ_U32 killed $sgpr0, 0, implicit-def $scc
+ S_CBRANCH_SCC0 %bb.2, implicit killed $scc
+
+ bb.1:
+ undef %0.sub0:vreg_96 = V_MOV_B32_e32 0, implicit $exec
+ S_BRANCH %bb.3
+
+ bb.2:
+ S_NOP 0, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7
+ %1:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %0:vreg_96 = BUFFER_LOAD_FORMAT_XYZ_IDXEN killed %1, $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s96), addrspace 8)
+
+ bb.3:
+ EXP 0, killed %0.sub0, killed %0.sub1, killed %0.sub2, undef %2:vgpr_32, 0, 0, 0, implicit $exec
+ S_ENDPGM 0
+
+...
+
+---
+name: undef_subreg_def_live_out_tailduplicate_vreg96_undef_sub0_sub2
+tracksRegLiveness: true
+machineFunctionInfo:
+ isEntryFunction: true
+ scratchRSrcReg: '$sgpr96_sgpr97_sgpr98_sgpr99'
+ stackPtrOffsetReg: '$sgpr32'
+body: |
+ ; CHECK-LABEL: name: undef_subreg_def_live_out_tailduplicate_vreg96_undef_sub0_sub2
+ ; CHECK: bb.0:
+ ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
+ ; CHECK-NEXT: liveins: $sgpr0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: S_CMP_EQ_U32 $sgpr0, 0, implicit-def $scc
+ ; CHECK-NEXT: S_CBRANCH_SCC0 %bb.2, implicit killed $scc
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.1:
+ ; CHECK-NEXT: renamable $vgpr1 = V_MOV_B32_e32 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2
+ ; CHECK-NEXT: EXP 0, killed renamable $vgpr0, renamable $vgpr1, renamable $vgpr2, undef renamable $vgpr0, 0, 0, 0, implicit $exec
+ ; CHECK-NEXT: S_ENDPGM 0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.2:
+ ; CHECK-NEXT: S_NOP 0, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7
+ ; CHECK-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: renamable $vgpr0_vgpr1_vgpr2 = BUFFER_LOAD_FORMAT_XYZ_IDXEN killed renamable $vgpr0, $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s96), align 16, addrspace 8)
+ ; CHECK-NEXT: EXP 0, killed renamable $vgpr0, renamable $vgpr1, renamable $vgpr2, undef renamable $vgpr0, 0, 0, 0, implicit $exec
+ ; CHECK-NEXT: S_ENDPGM 0
+ bb.0:
+ liveins: $sgpr0
+
+ S_CMP_EQ_U32 killed $sgpr0, 0, implicit-def $scc
+ S_CBRANCH_SCC0 %bb.2, implicit killed $scc
+
+ bb.1:
+ undef %0.sub1:vreg_96 = V_MOV_B32_e32 0, implicit $exec
+ S_BRANCH %bb.3
+
+ bb.2:
+ S_NOP 0, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7
+ %1:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %0:vreg_96 = BUFFER_LOAD_FORMAT_XYZ_IDXEN killed %1, $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s96), addrspace 8)
+
+ bb.3:
+ EXP 0, killed %0.sub0, killed %0.sub1, killed %0.sub2, undef %2:vgpr_32, 0, 0, 0, implicit $exec
+ S_ENDPGM 0
+
+...
+
+# Test another use of the value before the block end.
+---
+name: undef_subreg_def_live_out_tailduplicate_vreg64_undef_sub1_undef_use_in_def_block
+tracksRegLiveness: true
+machineFunctionInfo:
+ isEntryFunction: true
+ scratchRSrcReg: '$sgpr96_sgpr97_sgpr98_sgpr99'
+ stackPtrOffsetReg: '$sgpr32'
+body: |
+ ; CHECK-LABEL: name: undef_subreg_def_live_out_tailduplicate_vreg64_undef_sub1_undef_use_in_def_block
+ ; CHECK: bb.0:
+ ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
+ ; CHECK-NEXT: liveins: $sgpr0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: S_CMP_EQ_U32 $sgpr0, 0, implicit-def $scc
+ ; CHECK-NEXT: S_CBRANCH_SCC0 %bb.2, implicit killed $scc
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.1:
+ ; CHECK-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec, implicit-def $vgpr0_vgpr1
+ ; CHECK-NEXT: S_NOP 0, implicit renamable $vgpr0_vgpr1
+ ; CHECK-NEXT: EXP 0, killed renamable $vgpr0, renamable $vgpr1, undef renamable $vgpr0, undef renamable $vgpr0, 0, 0, 0, implicit $exec
+ ; CHECK-NEXT: S_ENDPGM 0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.2:
+ ; CHECK-NEXT: S_NOP 0, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7
+ ; CHECK-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: renamable $vgpr0_vgpr1 = BUFFER_LOAD_FORMAT_XY_IDXEN killed renamable $vgpr0, $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s64), addrspace 8)
+ ; CHECK-NEXT: EXP 0, killed renamable $vgpr0, renamable $vgpr1, undef renamable $vgpr0, undef renamable $vgpr0, 0, 0, 0, implicit $exec
+ ; CHECK-NEXT: S_ENDPGM 0
+ bb.0:
+ liveins: $sgpr0
+
+ S_CMP_EQ_U32 killed $sgpr0, 0, implicit-def $scc
+ S_CBRANCH_SCC0 %bb.2, implicit killed $scc
+
+ bb.1:
+ undef %0.sub0:vreg_64 = V_MOV_B32_e32 0, implicit $exec
+ S_NOP 0, implicit %0
+ S_BRANCH %bb.3
+
+ bb.2:
+ S_NOP 0, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7
+ %1:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %0:vreg_64 = BUFFER_LOAD_FORMAT_XY_IDXEN killed %1, $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s64), addrspace 8)
+
+ bb.3:
+ EXP 0, killed %0.sub0, killed %0.sub1, undef %2:vgpr_32, undef %2:vgpr_32, 0, 0, 0, implicit $exec
+ S_ENDPGM 0
+
+...
+
+# The undef subregister is not live out, so no implicit def should be added for it
+---
+name: undef_subreg_def_live_out_tailduplicate_vreg64_undef_sub1_no_phi_use
+tracksRegLiveness: true
+machineFunctionInfo:
+ isEntryFunction: true
+ scratchRSrcReg: '$sgpr96_sgpr97_sgpr98_sgpr99'
+ stackPtrOffsetReg: '$sgpr32'
+body: |
+ ; CHECK-LABEL: name: undef_subreg_def_live_out_tailduplicate_vreg64_undef_sub1_no_phi_use
+ ; CHECK: bb.0:
+ ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
+ ; CHECK-NEXT: liveins: $sgpr0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: S_CMP_EQ_U32 $sgpr0, 0, implicit-def $scc
+ ; CHECK-NEXT: S_CBRANCH_SCC0 %bb.2, implicit killed $scc
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.1:
+ ; CHECK-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: EXP 0, killed renamable $vgpr0, renamable $vgpr0, undef renamable $vgpr0, undef renamable $vgpr0, 0, 0, 0, implicit $exec
+ ; CHECK-NEXT: S_ENDPGM 0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.2:
+ ; CHECK-NEXT: S_NOP 0, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7
+ ; CHECK-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: renamable $vgpr0_vgpr1 = BUFFER_LOAD_FORMAT_XY_IDXEN killed renamable $vgpr0, $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s64), addrspace 8)
+ ; CHECK-NEXT: EXP 0, killed renamable $vgpr0, renamable $vgpr0, undef renamable $vgpr0, undef renamable $vgpr0, 0, 0, 0, implicit $exec
+ ; CHECK-NEXT: S_ENDPGM 0
+ bb.0:
+ liveins: $sgpr0
+
+ S_CMP_EQ_U32 killed $sgpr0, 0, implicit-def $scc
+ S_CBRANCH_SCC0 %bb.2, implicit killed $scc
+
+ bb.1:
+ undef %0.sub0:vreg_64 = V_MOV_B32_e32 0, implicit $exec
+ S_BRANCH %bb.3
+
+ bb.2:
+ S_NOP 0, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7
+ %1:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %0:vreg_64 = BUFFER_LOAD_FORMAT_XY_IDXEN killed %1, $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s64), addrspace 8)
+
+ bb.3:
+ EXP 0, killed %0.sub0, killed %0.sub0, undef %2:vgpr_32, undef %2:vgpr_32, 0, 0, 0, implicit $exec
+ S_ENDPGM 0
+
+...
diff --git a/llvm/test/CodeGen/AMDGPU/itofp.i128.ll b/llvm/test/CodeGen/AMDGPU/itofp.i128.ll
index 2999ddb8315883..7e76e2bf9e8949 100644
--- a/llvm/test/CodeGen/AMDGPU/itofp.i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/itofp.i128.ll
@@ -172,8 +172,8 @@ define float @sitofp_i128_to_f32(i128 %x) {
; GISEL-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1]
; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v2
; GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v0, vcc
-; GISEL-NEXT: ; implicit-def: $vgpr8
; GISEL-NEXT: ; implicit-def: $vgpr0
+; GISEL-NEXT: ; implicit-def: $vgpr8
; GISEL-NEXT: ; implicit-def: $vgpr5
; GISEL-NEXT: ; implicit-def: $vgpr2
; GISEL-NEXT: ; %bb.3: ; %Flow3
@@ -417,8 +417,8 @@ define float @uitofp_i128_to_f32(i128 %x) {
; GISEL-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1]
; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v2
; GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v0, vcc
-; GISEL-NEXT: ; implicit-def: $vgpr7
; GISEL-NEXT: ; implicit-def: $vgpr0
+; GISEL-NEXT: ; implicit-def: $vgpr7
; GISEL-NEXT: ; implicit-def: $vgpr5
; GISEL-NEXT: ; implicit-def: $vgpr2
; GISEL-NEXT: ; %bb.3: ; %Flow3
@@ -1263,8 +1263,8 @@ define half @sitofp_i128_to_f16(i128 %x) {
; GISEL-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1]
; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v2
; GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v0, vcc
-; GISEL-NEXT: ; implicit-def: $vgpr8
; GISEL-NEXT: ; implicit-def: $vgpr0
+; GISEL-NEXT: ; implicit-def: $vgpr8
; GISEL-NEXT: ; implicit-def: $vgpr5
; GISEL-NEXT: ; implicit-def: $vgpr2
; GISEL-NEXT: ; %bb.3: ; %Flow3
@@ -1510,8 +1510,8 @@ define half @uitofp_i128_to_f16(i128 %x) {
; GISEL-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1]
; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v2
; GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v0, vcc
-; GISEL-NEXT: ; implicit-def: $vgpr7
; GISEL-NEXT: ; implicit-def: $vgpr0
+; GISEL-NEXT: ; implicit-def: $vgpr7
; GISEL-NEXT: ; implicit-def: $vgpr5
; GISEL-NEXT: ; implicit-def: $vgpr2
; GISEL-NEXT: ; %bb.3: ; %Flow3
diff --git a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
index 2d5e5a9160fdf7..6c56dee76142c1 100644
--- a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
+++ b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
@@ -526,7 +526,6 @@ define amdgpu_kernel void @v8i8_phi_const(ptr addrspace(1) %src1, ptr addrspace(
; GFX906-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX906-NEXT: v_cmp_lt_u32_e64 s[0:1], 14, v0
; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0
-; GFX906-NEXT: ; implicit-def: $vgpr3
; GFX906-NEXT: ; implicit-def: $vgpr13
; GFX906-NEXT: ; implicit-def: $vgpr11
; GFX906-NEXT: ; implicit-def: $vgpr14
@@ -535,6 +534,7 @@ define amdgpu_kernel void @v8i8_phi_const(ptr addrspace(1) %src1, ptr addrspace(
; GFX906-NEXT: ; implicit-def: $vgpr16
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
; GFX906-NEXT: global_load_dwordx2 v[1:2], v4, s[4:5]
+; GFX906-NEXT: ; implicit-def: $vgpr3
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: v_lshrrev_b32_e32 v5, 24, v2
; GFX906-NEXT: v_lshrrev_b32_e32 v6, 16, v2
diff --git a/llvm/test/MachineVerifier/AMDGPU/issue98474-missing-def-liveout-physical-subregister.mir b/llvm/test/MachineVerifier/AMDGPU/issue98474-missing-def-liveout-physical-subregister.mir
new file mode 100644
index 00000000000000..892a4298bbdb51
--- /dev/null
+++ b/llvm/test/MachineVerifier/AMDGPU/issue98474-missing-def-liveout-physical-subregister.mir
@@ -0,0 +1,36 @@
+# XFAIL: *
+# RUN: not --crash llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -run-pass=none -filetype=null %s
+
+# FIXME: This should fail the machine verifier. There is a missing def
+# of $vgpr2 in bb.1, which is needed since it's live into bb.3
+
+---
+name: missing_live_out_subreg_def
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $sgpr0, $vgpr0
+
+ S_CMP_EQ_U32 $sgpr0, 0, implicit-def $scc
+ S_CBRANCH_SCC0 %bb.2, implicit killed $scc
+
+ bb.1:
+ liveins: $vgpr0
+
+ renamable $vgpr1 = V_MOV_B32_e32 0, implicit $exec
+ S_BRANCH %bb.3
+
+ bb.2:
+ liveins: $vgpr0
+
+ renamable $sgpr0_sgpr1_sgpr2_sgpr3 = IMPLICIT_DEF
+ renamable $vgpr1 = V_MOV_B32_e32 0, implicit $exec
+ renamable $vgpr1_vgpr2 = BUFFER_LOAD_FORMAT_XY_IDXEN killed renamable $vgpr1, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s64), addrspace 8)
+
+ bb.3:
+ liveins: $vgpr0, $vgpr1_vgpr2
+
+ EXP 0, killed renamable $vgpr0, killed renamable $vgpr1, renamable $vgpr2, renamable $vgpr0, 0, 0, 0, implicit $exec
+ S_ENDPGM 0
+
+...
>From dce609b406e66c11a912c953447f55a09547f7c7 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Thu, 17 Oct 2024 15:56:18 +0400
Subject: [PATCH 2/4] Try to fix clobbering lanes in other assigned lanes of
physreg
---
llvm/lib/CodeGen/VirtRegMap.cpp | 73 ++++++++++++++++++++++++++++++++-
1 file changed, 72 insertions(+), 1 deletion(-)
diff --git a/llvm/lib/CodeGen/VirtRegMap.cpp b/llvm/lib/CodeGen/VirtRegMap.cpp
index 1254c7be182146..cdb02aa0b45041 100644
--- a/llvm/lib/CodeGen/VirtRegMap.cpp
+++ b/llvm/lib/CodeGen/VirtRegMap.cpp
@@ -202,6 +202,9 @@ class VirtRegRewriter : public MachineFunctionPass {
bool needLiveOutUndefSubregDef(const LiveInterval &LI,
const MachineBasicBlock &MBB, unsigned SubReg,
MCPhysReg PhysReg) const;
+ LaneBitmask liveOutUndefPhiLanesForUndefSubregDef(
+ const LiveInterval &LI, const MachineBasicBlock &MBB, unsigned SubReg,
+ MCPhysReg PhysReg, const MachineInstr &MI) const;
public:
static char ID;
@@ -555,6 +558,41 @@ bool VirtRegRewriter::needLiveOutUndefSubregDef(const LiveInterval &LI,
return false;
}
+/// Compute a lanemask for undef lanes which need to be preserved out of the
+/// defining block for a register assignment.
+LaneBitmask VirtRegRewriter::liveOutUndefPhiLanesForUndefSubregDef(
+ const LiveInterval &LI, const MachineBasicBlock &MBB, unsigned SubReg,
+ MCPhysReg PhysReg, const MachineInstr &MI) const {
+ LaneBitmask UndefMask = ~TRI->getSubRegIndexLaneMask(SubReg);
+ LaneBitmask LiveOutUndefLanes;
+
+ for (const LiveInterval::SubRange &SR : LI.subranges()) {
+ LaneBitmask NeedImpDefLanes = UndefMask & SR.LaneMask;
+ if (NeedImpDefLanes.any() && !LIS->isLiveOutOfMBB(SR, &MBB)) {
+ for (const MachineBasicBlock *Succ : MBB.successors()) {
+ if (LIS->isLiveInToMBB(SR, Succ))
+ LiveOutUndefLanes |= NeedImpDefLanes;
+ }
+ }
+ }
+ if (LiveOutUndefLanes.none())
+ return LiveOutUndefLanes;
+
+ SlotIndex MIIndex = LIS->getInstructionIndex(MI);
+ SlotIndex BeforeMIUses = MIIndex.getBaseIndex();
+ SlotIndex AfterMIDefs = MIIndex.getBoundaryIndex();
+
+ for (MCRegUnitMaskIterator MCRU(PhysReg, TRI); MCRU.isValid(); ++MCRU) {
+ auto [RU, PhysRegMask] = *MCRU;
+
+ const LiveRange &UnitRange = LIS->getRegUnit(RU);
+ if (UnitRange.liveAt(AfterMIDefs) && UnitRange.liveAt(BeforeMIUses))
+ LiveOutUndefLanes &= ~PhysRegMask;
+ }
+
+ return LiveOutUndefLanes;
+}
+
void VirtRegRewriter::rewrite() {
bool NoSubRegLiveness = !MRI->subRegLivenessEnabled();
SmallVector<Register, 8> SuperDeads;
@@ -611,8 +649,41 @@ void VirtRegRewriter::rewrite() {
assert(MO.isDef());
if (MO.isUndef()) {
const LiveInterval &LI = LIS->getInterval(VirtReg);
- if (needLiveOutUndefSubregDef(LI, *MBBI, SubReg, PhysReg))
+
+ LaneBitmask LiveOutUndefLanes =
+ liveOutUndefPhiLanesForUndefSubregDef(LI, *MBBI, SubReg,
+ PhysReg, MI);
+ if (LiveOutUndefLanes.any()) {
+ SmallVector<unsigned, 16> CoveringIndexes;
+
+ // TODO: Just use the super register if
+ if (TRI->getCoveringSubRegIndexes(
+ *MRI, MRI->getRegClass(VirtReg), LiveOutUndefLanes,
+ CoveringIndexes)) {
+ // Try to represent the minimum needed live out def as a
+ // sequence of subregister defs.
+ //
+ // FIXME: It would be better if we could directly represent
+ // liveness with a lanemask instead of spamming operands.
+ for (unsigned SubIdx : CoveringIndexes)
+ SuperDefs.push_back(TRI->getSubReg(PhysReg, SubIdx));
+ } else {
+ // If we could not represent this as a sequence of
+ // subregisters, it's safe to replace all the lanes with a
+ // full def of the super register.
+ SuperDefs.push_back(PhysReg);
+ }
+ }
+
+ if (false &&
+ needLiveOutUndefSubregDef(LI, *MBBI, SubReg, PhysReg)) {
SuperDefs.push_back(PhysReg);
+
+ for (MCRegister AssignedSubReg : TRI->subregs(PhysReg)) {
+ if (subRegLiveThrough(MI, AssignedSubReg))
+ SuperKills.push_back(AssignedSubReg);
+ }
+ }
}
}
}
>From 785a3658196ea9c16d21c69242660f7e5d362d8a Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Fri, 18 Oct 2024 14:59:22 +0400
Subject: [PATCH 3/4] Use LiveRegMatrix and only add necessary impdefs
We apparently need to detect interference with other assigned
intervals to avoid clobbering the undef lanes defined in other
intervals, since the undef def didn't count as interference.
This is pretty ugly and adds a new dependency on LiveRegMatrix,
keeping it live for one more pass. It also adds a lot of implicit
operand spam (we really should have a better representation for this).
---
llvm/include/llvm/CodeGen/LiveRegMatrix.h | 10 ++
llvm/lib/CodeGen/LiveRegMatrix.cpp | 35 +++++
llvm/lib/CodeGen/VirtRegMap.cpp | 85 ++++---------
llvm/test/CodeGen/AMDGPU/indirect-call.ll | 2 +
...nfloop-subrange-spill-inspect-subrange.mir | 4 +-
.../CodeGen/AMDGPU/infloop-subrange-spill.mir | 4 +-
...sue98474-assigned-physreg-interference.mir | 55 ++++++++
...egrewriter-live-out-undef-subregisters.mir | 120 +++++++++++++++++-
llvm/test/CodeGen/AMDGPU/itofp.i128.ll | 8 +-
.../test/CodeGen/AMDGPU/vni8-across-blocks.ll | 2 +-
10 files changed, 253 insertions(+), 72 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/issue98474-assigned-physreg-interference.mir
diff --git a/llvm/include/llvm/CodeGen/LiveRegMatrix.h b/llvm/include/llvm/CodeGen/LiveRegMatrix.h
index 2b32308c7c075e..55e7abd933a10b 100644
--- a/llvm/include/llvm/CodeGen/LiveRegMatrix.h
+++ b/llvm/include/llvm/CodeGen/LiveRegMatrix.h
@@ -114,6 +114,16 @@ class LiveRegMatrix : public MachineFunctionPass {
/// the segment [Start, End).
bool checkInterference(SlotIndex Start, SlotIndex End, MCRegister PhysReg);
+ /// Check for interference in the segment [Start, End) that may prevent
+ /// assignment to PhysReg, like checkInterference. Returns a lane mask of
+ /// assignment to PhysReg, like checkInterference. Returns a lane mask of
+ /// which lanes of the physical register interfere with some other interval
+ ///
+ /// If this function returns LaneBitmask::getNone(), PhysReg is completely
+ /// free at the segment [Start, End).
+ LaneBitmask checkInterferenceLanes(SlotIndex Start, SlotIndex End,
+ MCRegister PhysReg);
+
/// Assign VirtReg to PhysReg.
/// This will mark VirtReg's live range as occupied in the LiveRegMatrix and
/// update VirtRegMap. The live range is expected to be available in PhysReg.
diff --git a/llvm/lib/CodeGen/LiveRegMatrix.cpp b/llvm/lib/CodeGen/LiveRegMatrix.cpp
index c8c722359a4c44..de5e6c42a6b97c 100644
--- a/llvm/lib/CodeGen/LiveRegMatrix.cpp
+++ b/llvm/lib/CodeGen/LiveRegMatrix.cpp
@@ -237,6 +237,41 @@ bool LiveRegMatrix::checkInterference(SlotIndex Start, SlotIndex End,
return false;
}
+LaneBitmask LiveRegMatrix::checkInterferenceLanes(SlotIndex Start,
+ SlotIndex End,
+ MCRegister PhysReg) {
+ // Construct artificial live range containing only one segment [Start, End).
+ VNInfo valno(0, Start);
+ LiveRange::Segment Seg(Start, End, &valno);
+ LiveRange LR;
+ LR.addSegment(Seg);
+
+ LaneBitmask InterferingLanes;
+
+ // Check for interference with that segment
+ for (MCRegUnitMaskIterator MCRU(PhysReg, TRI); MCRU.isValid(); ++MCRU) {
+ auto [Unit, Lanes] = *MCRU;
+ // LR is stack-allocated. LiveRegMatrix caches queries by a key that
+ // includes the address of the live range. If (for the same reg unit)
+ // checkInterferenceLanes is called twice, without any other query()
+ // calls in between (on heap-allocated LiveRanges) - which would invalidate
+ // the cached query - the LR address seen the second time may well be the
+ // same as that seen the first time, while the Start/End/valno may not - yet
+ // the same cached result would be fetched. To avoid that, we don't cache
+ // this query.
+ //
+ // FIXME: the usability of the Query API needs to be improved to avoid
+ // subtle bugs due to query identity. Avoiding caching, for example, would
+ // greatly simplify things.
+ LiveIntervalUnion::Query Q;
+ Q.reset(UserTag, LR, Matrix[Unit]);
+ if (Q.checkInterference())
+ InterferingLanes |= Lanes;
+ }
+
+ return InterferingLanes;
+}
+
Register LiveRegMatrix::getOneVReg(unsigned PhysReg) const {
const LiveInterval *VRegInterval = nullptr;
for (MCRegUnit Unit : TRI->regunits(PhysReg)) {
diff --git a/llvm/lib/CodeGen/VirtRegMap.cpp b/llvm/lib/CodeGen/VirtRegMap.cpp
index cdb02aa0b45041..e38a4633d06174 100644
--- a/llvm/lib/CodeGen/VirtRegMap.cpp
+++ b/llvm/lib/CodeGen/VirtRegMap.cpp
@@ -21,6 +21,7 @@
#include "llvm/CodeGen/LiveDebugVariables.h"
#include "llvm/CodeGen/LiveInterval.h"
#include "llvm/CodeGen/LiveIntervals.h"
+#include "llvm/CodeGen/LiveRegMatrix.h"
#include "llvm/CodeGen/LiveStacks.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
@@ -187,6 +188,7 @@ class VirtRegRewriter : public MachineFunctionPass {
MachineRegisterInfo *MRI = nullptr;
SlotIndexes *Indexes = nullptr;
LiveIntervals *LIS = nullptr;
+ LiveRegMatrix *LRM = nullptr;
VirtRegMap *VRM = nullptr;
LiveDebugVariables *DebugVars = nullptr;
DenseSet<Register> RewriteRegs;
@@ -199,9 +201,6 @@ class VirtRegRewriter : public MachineFunctionPass {
void handleIdentityCopy(MachineInstr &MI);
void expandCopyBundle(MachineInstr &MI) const;
bool subRegLiveThrough(const MachineInstr &MI, MCRegister SuperPhysReg) const;
- bool needLiveOutUndefSubregDef(const LiveInterval &LI,
- const MachineBasicBlock &MBB, unsigned SubReg,
- MCPhysReg PhysReg) const;
LaneBitmask liveOutUndefPhiLanesForUndefSubregDef(
const LiveInterval &LI, const MachineBasicBlock &MBB, unsigned SubReg,
MCPhysReg PhysReg, const MachineInstr &MI) const;
@@ -237,6 +236,7 @@ INITIALIZE_PASS_BEGIN(VirtRegRewriter, "virtregrewriter",
INITIALIZE_PASS_DEPENDENCY(SlotIndexesWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LiveIntervalsWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LiveDebugVariables)
+INITIALIZE_PASS_DEPENDENCY(LiveRegMatrix)
INITIALIZE_PASS_DEPENDENCY(LiveStacks)
INITIALIZE_PASS_DEPENDENCY(VirtRegMap)
INITIALIZE_PASS_END(VirtRegRewriter, "virtregrewriter",
@@ -252,6 +252,7 @@ void VirtRegRewriter::getAnalysisUsage(AnalysisUsage &AU) const {
AU.addRequired<LiveStacks>();
AU.addPreserved<LiveStacks>();
AU.addRequired<VirtRegMap>();
+ AU.addRequired<LiveRegMatrix>();
if (!ClearVirtRegs)
AU.addPreserved<LiveDebugVariables>();
@@ -266,6 +267,7 @@ bool VirtRegRewriter::runOnMachineFunction(MachineFunction &fn) {
MRI = &MF->getRegInfo();
Indexes = &getAnalysis<SlotIndexesWrapperPass>().getSI();
LIS = &getAnalysis<LiveIntervalsWrapperPass>().getLIS();
+ LRM = &getAnalysis<LiveRegMatrix>();
VRM = &getAnalysis<VirtRegMap>();
DebugVars = &getAnalysis<LiveDebugVariables>();
LLVM_DEBUG(dbgs() << "********** REWRITE VIRTUAL REGISTERS **********\n"
@@ -538,26 +540,6 @@ bool VirtRegRewriter::subRegLiveThrough(const MachineInstr &MI,
return false;
}
-/// Check if we need to maintain liveness for undef subregister lanes that are
-/// live out of a block.
-bool VirtRegRewriter::needLiveOutUndefSubregDef(const LiveInterval &LI,
- const MachineBasicBlock &MBB,
- unsigned SubReg,
- MCPhysReg PhysReg) const {
- LaneBitmask UndefMask = ~TRI->getSubRegIndexLaneMask(SubReg);
- for (const LiveInterval::SubRange &SR : LI.subranges()) {
- LaneBitmask NeedImpDefLanes = UndefMask & SR.LaneMask;
- if (NeedImpDefLanes.any() && !LIS->isLiveOutOfMBB(SR, &MBB)) {
- for (const MachineBasicBlock *Succ : MBB.successors()) {
- if (LIS->isLiveInToMBB(SR, Succ))
- return true;
- }
- }
- }
-
- return false;
-}
-
/// Compute a lanemask for undef lanes which need to be preserved out of the
/// defining block for a register assignment.
LaneBitmask VirtRegRewriter::liveOutUndefPhiLanesForUndefSubregDef(
@@ -575,20 +557,17 @@ LaneBitmask VirtRegRewriter::liveOutUndefPhiLanesForUndefSubregDef(
}
}
}
- if (LiveOutUndefLanes.none())
- return LiveOutUndefLanes;
SlotIndex MIIndex = LIS->getInstructionIndex(MI);
SlotIndex BeforeMIUses = MIIndex.getBaseIndex();
- SlotIndex AfterMIDefs = MIIndex.getBoundaryIndex();
-
- for (MCRegUnitMaskIterator MCRU(PhysReg, TRI); MCRU.isValid(); ++MCRU) {
- auto [RU, PhysRegMask] = *MCRU;
+ LaneBitmask InterferingLanes =
+ LRM->checkInterferenceLanes(BeforeMIUses, MIIndex.getRegSlot(), PhysReg);
+ LiveOutUndefLanes &= ~InterferingLanes;
- const LiveRange &UnitRange = LIS->getRegUnit(RU);
- if (UnitRange.liveAt(AfterMIDefs) && UnitRange.liveAt(BeforeMIUses))
- LiveOutUndefLanes &= ~PhysRegMask;
- }
+ LLVM_DEBUG(if (LiveOutUndefLanes.any()) {
+ dbgs() << "Need live out undef defs for " << printReg(PhysReg)
+ << LiveOutUndefLanes << " from " << printMBBReference(MBB) << '\n';
+ });
return LiveOutUndefLanes;
}
@@ -656,33 +635,21 @@ void VirtRegRewriter::rewrite() {
if (LiveOutUndefLanes.any()) {
SmallVector<unsigned, 16> CoveringIndexes;
- // TODO: Just use the super register if
- if (TRI->getCoveringSubRegIndexes(
+ // TODO: Just use one super register def if none of the lanes
+ // are needed?
+ if (!TRI->getCoveringSubRegIndexes(
*MRI, MRI->getRegClass(VirtReg), LiveOutUndefLanes,
- CoveringIndexes)) {
- // Try to represent the minimum needed live out def as a
- // sequence of subregister defs.
- //
- // FIXME: It would be better if we could directly represent
- // liveness with a lanemask instead of spamming operands.
- for (unsigned SubIdx : CoveringIndexes)
- SuperDefs.push_back(TRI->getSubReg(PhysReg, SubIdx));
- } else {
- // If we could not represent this as a sequence of
- // subregisters, it's safe to replace all the lanes with a
- // full def of the super register.
- SuperDefs.push_back(PhysReg);
- }
- }
-
- if (false &&
- needLiveOutUndefSubregDef(LI, *MBBI, SubReg, PhysReg)) {
- SuperDefs.push_back(PhysReg);
-
- for (MCRegister AssignedSubReg : TRI->subregs(PhysReg)) {
- if (subRegLiveThrough(MI, AssignedSubReg))
- SuperKills.push_back(AssignedSubReg);
- }
+ CoveringIndexes))
+ llvm_unreachable(
+ "cannot represent required subregister defs");
+
+ // Try to represent the minimum needed live out def as a
+ // sequence of subregister defs.
+ //
+ // FIXME: It would be better if we could directly represent
+ // liveness with a lanemask instead of spamming operands.
+ for (unsigned SubIdx : CoveringIndexes)
+ SuperDefs.push_back(TRI->getSubReg(PhysReg, SubIdx));
}
}
}
diff --git a/llvm/test/CodeGen/AMDGPU/indirect-call.ll b/llvm/test/CodeGen/AMDGPU/indirect-call.ll
index e819d5d3b1656e..da8aa544698355 100644
--- a/llvm/test/CodeGen/AMDGPU/indirect-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/indirect-call.ll
@@ -603,6 +603,7 @@ define i32 @test_indirect_call_vgpr_ptr_ret(ptr %fptr) {
; GISEL-NEXT: s_mov_b32 s14, s43
; GISEL-NEXT: s_mov_b32 s15, s42
; GISEL-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GISEL-NEXT: v_mov_b32_e32 v1, v0
; GISEL-NEXT: ; implicit-def: $vgpr0
; GISEL-NEXT: ; implicit-def: $vgpr31
; GISEL-NEXT: s_xor_b64 exec, exec, s[48:49]
@@ -1383,6 +1384,7 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_return(i32 %i, ptr %fptr) {
; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, s[8:9], v[1:2]
; GISEL-NEXT: s_and_saveexec_b64 s[6:7], vcc
; GISEL-NEXT: s_swappc_b64 s[30:31], s[8:9]
+; GISEL-NEXT: v_mov_b32_e32 v2, v0
; GISEL-NEXT: ; implicit-def: $vgpr1
; GISEL-NEXT: ; implicit-def: $vgpr0
; GISEL-NEXT: s_xor_b64 exec, exec, s[6:7]
diff --git a/llvm/test/CodeGen/AMDGPU/infloop-subrange-spill-inspect-subrange.mir b/llvm/test/CodeGen/AMDGPU/infloop-subrange-spill-inspect-subrange.mir
index 285e7e22264a04..215200c770245d 100644
--- a/llvm/test/CodeGen/AMDGPU/infloop-subrange-spill-inspect-subrange.mir
+++ b/llvm/test/CodeGen/AMDGPU/infloop-subrange-spill-inspect-subrange.mir
@@ -30,7 +30,7 @@ body: |
; CHECK-NEXT: dead [[DEF1:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
; CHECK-NEXT: dead undef [[DEF2:%[0-9]+]].sub0:vreg_64 = IMPLICIT_DEF
; CHECK-NEXT: renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX16_IMM renamable $sgpr4_sgpr5, 0, 0 :: (invariant load (s512), align 32, addrspace 4)
- ; CHECK-NEXT: renamable $sgpr24 = IMPLICIT_DEF implicit-def $sgpr24_sgpr25_sgpr26_sgpr27
+ ; CHECK-NEXT: renamable $sgpr24 = IMPLICIT_DEF implicit-def $sgpr25
; CHECK-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 = S_LOAD_DWORDX16_IMM undef renamable $sgpr4_sgpr5, 0, 0 :: (invariant load (s512), align 32, addrspace 4)
; CHECK-NEXT: $exec = S_MOV_B64_term undef renamable $sgpr4_sgpr5
; CHECK-NEXT: S_CBRANCH_EXECZ %bb.6, implicit $exec
@@ -83,7 +83,7 @@ body: |
; CHECK-NEXT: liveins: $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19:0x000000000000FFFF, $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51:0x00000000FFFFFFFF
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: dead [[IMAGE_SAMPLE_LZ_V1_V2_5:%[0-9]+]]:vgpr_32 = IMAGE_SAMPLE_LZ_V1_V2 undef [[DEF]], renamable $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51, undef renamable $sgpr8_sgpr9_sgpr10_sgpr11, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), addrspace 8)
- ; CHECK-NEXT: renamable $sgpr25 = COPY undef renamable $sgpr24, implicit-def $sgpr24_sgpr25_sgpr26_sgpr27
+ ; CHECK-NEXT: renamable $sgpr25 = COPY undef renamable $sgpr24, implicit-def $sgpr24
; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.7, implicit undef $vcc
; CHECK-NEXT: S_BRANCH %bb.6
; CHECK-NEXT: {{ $}}
diff --git a/llvm/test/CodeGen/AMDGPU/infloop-subrange-spill.mir b/llvm/test/CodeGen/AMDGPU/infloop-subrange-spill.mir
index 995a5d267fbed1..b8818c5550ad44 100644
--- a/llvm/test/CodeGen/AMDGPU/infloop-subrange-spill.mir
+++ b/llvm/test/CodeGen/AMDGPU/infloop-subrange-spill.mir
@@ -30,7 +30,7 @@ body: |
; CHECK-NEXT: dead undef [[DEF3:%[0-9]+]].sub1:vreg_64 = IMPLICIT_DEF
; CHECK-NEXT: dead renamable $sgpr5 = IMPLICIT_DEF
; CHECK-NEXT: renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX16_IMM undef renamable $sgpr4_sgpr5, 0, 0 :: (invariant load (s512), align 32, addrspace 4)
- ; CHECK-NEXT: renamable $sgpr24 = IMPLICIT_DEF implicit-def $sgpr24_sgpr25_sgpr26_sgpr27
+ ; CHECK-NEXT: renamable $sgpr24 = IMPLICIT_DEF implicit-def $sgpr25
; CHECK-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 = S_LOAD_DWORDX16_IMM undef renamable $sgpr4_sgpr5, 0, 0 :: (invariant load (s512), align 32, addrspace 4)
; CHECK-NEXT: $exec = S_MOV_B64_term undef renamable $sgpr4_sgpr5
; CHECK-NEXT: S_CBRANCH_EXECZ %bb.5, implicit $exec
@@ -80,7 +80,7 @@ body: |
; CHECK-NEXT: liveins: $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19:0x000000000000FFFF, $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51:0x00000000FFFFFFFF
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: dead [[IMAGE_SAMPLE_LZ_V1_V2_5:%[0-9]+]]:vgpr_32 = IMAGE_SAMPLE_LZ_V1_V2 undef [[DEF]], renamable $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51, undef renamable $sgpr8_sgpr9_sgpr10_sgpr11, 1, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), addrspace 8)
- ; CHECK-NEXT: renamable $sgpr25 = COPY undef renamable $sgpr24, implicit-def $sgpr24_sgpr25_sgpr26_sgpr27
+ ; CHECK-NEXT: renamable $sgpr25 = COPY undef renamable $sgpr24, implicit-def $sgpr24
; CHECK-NEXT: S_CBRANCH_VCCNZ %bb.6, implicit undef $vcc
; CHECK-NEXT: S_BRANCH %bb.5
; CHECK-NEXT: {{ $}}
diff --git a/llvm/test/CodeGen/AMDGPU/issue98474-assigned-physreg-interference.mir b/llvm/test/CodeGen/AMDGPU/issue98474-assigned-physreg-interference.mir
new file mode 100644
index 00000000000000..786ce402038369
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/issue98474-assigned-physreg-interference.mir
@@ -0,0 +1,55 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5
+# RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -start-before=greedy,2 -stop-after=tailduplication -verify-machineinstrs -o - %s | FileCheck %s
+
+---
+name: undef_subreg_def_live_out_tailduplicate_vreg96_undef_sub1_sub2_assigned_physreg_interference
+tracksRegLiveness: true
+machineFunctionInfo:
+ isEntryFunction: true
+ scratchRSrcReg: '$sgpr96_sgpr97_sgpr98_sgpr99'
+ stackPtrOffsetReg: '$sgpr32'
+body: |
+ ; CHECK-LABEL: name: undef_subreg_def_live_out_tailduplicate_vreg96_undef_sub1_sub2_assigned_physreg_interference
+ ; CHECK: bb.0:
+ ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
+ ; CHECK-NEXT: liveins: $sgpr0, $vgpr2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: S_CMP_EQ_U32 $sgpr0, 0, implicit-def $scc
+ ; CHECK-NEXT: S_CBRANCH_SCC0 %bb.2, implicit killed $scc
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.1:
+ ; CHECK-NEXT: liveins: $vgpr2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: renamable $vgpr3 = V_MOV_B32_e32 0, implicit $exec, implicit-def $vgpr4_vgpr5
+ ; CHECK-NEXT: EXP 0, killed renamable $vgpr3, renamable $vgpr4, renamable $vgpr5, killed renamable $vgpr2, 0, 0, 0, implicit $exec
+ ; CHECK-NEXT: S_ENDPGM 0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.2:
+ ; CHECK-NEXT: liveins: $vgpr2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: S_NOP 0, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7
+ ; CHECK-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec
+ ; CHECK-NEXT: renamable $vgpr3_vgpr4_vgpr5 = BUFFER_LOAD_FORMAT_XYZ_IDXEN killed renamable $vgpr0, $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s96), align 16, addrspace 8)
+ ; CHECK-NEXT: EXP 0, killed renamable $vgpr3, renamable $vgpr4, renamable $vgpr5, killed renamable $vgpr2, 0, 0, 0, implicit $exec
+ ; CHECK-NEXT: S_ENDPGM 0
+ bb.0:
+ liveins: $sgpr0, $vgpr2
+
+ %2:vgpr_32 = COPY $vgpr2
+ S_CMP_EQ_U32 killed $sgpr0, 0, implicit-def $scc
+ S_CBRANCH_SCC0 %bb.2, implicit killed $scc
+
+ bb.1:
+ undef %0.sub0:vreg_96 = V_MOV_B32_e32 0, implicit $exec
+ S_BRANCH %bb.3
+
+ bb.2:
+ S_NOP 0, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7
+ %1:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
+ %0:vreg_96 = BUFFER_LOAD_FORMAT_XYZ_IDXEN killed %1, $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s96), addrspace 8)
+
+ bb.3:
+ EXP 0, killed %0.sub0, killed %0.sub1, killed %0.sub2, %2:vgpr_32, 0, 0, 0, implicit $exec
+ S_ENDPGM 0
+
+...
diff --git a/llvm/test/CodeGen/AMDGPU/issue98474-virtregrewriter-live-out-undef-subregisters.mir b/llvm/test/CodeGen/AMDGPU/issue98474-virtregrewriter-live-out-undef-subregisters.mir
index a8ed114f8cd783..86b6c5982b4cbd 100644
--- a/llvm/test/CodeGen/AMDGPU/issue98474-virtregrewriter-live-out-undef-subregisters.mir
+++ b/llvm/test/CodeGen/AMDGPU/issue98474-virtregrewriter-live-out-undef-subregisters.mir
@@ -23,7 +23,7 @@ body: |
; CHECK-NEXT: S_CBRANCH_SCC0 %bb.2, implicit killed $scc
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.1:
- ; CHECK-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec, implicit-def $vgpr0_vgpr1
+ ; CHECK-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec, implicit-def $vgpr1
; CHECK-NEXT: EXP 0, killed renamable $vgpr0, renamable $vgpr1, undef renamable $vgpr0, undef renamable $vgpr0, 0, 0, 0, implicit $exec
; CHECK-NEXT: S_ENDPGM 0
; CHECK-NEXT: {{ $}}
@@ -71,7 +71,7 @@ body: |
; CHECK-NEXT: S_CBRANCH_SCC0 %bb.2, implicit killed $scc
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.1:
- ; CHECK-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2
+ ; CHECK-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec, implicit-def $vgpr1_vgpr2
; CHECK-NEXT: EXP 0, killed renamable $vgpr0, renamable $vgpr1, renamable $vgpr2, undef renamable $vgpr0, 0, 0, 0, implicit $exec
; CHECK-NEXT: S_ENDPGM 0
; CHECK-NEXT: {{ $}}
@@ -119,7 +119,7 @@ body: |
; CHECK-NEXT: S_CBRANCH_SCC0 %bb.2, implicit killed $scc
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.1:
- ; CHECK-NEXT: renamable $vgpr1 = V_MOV_B32_e32 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2
+ ; CHECK-NEXT: renamable $vgpr1 = V_MOV_B32_e32 0, implicit $exec, implicit-def $vgpr2, implicit-def $vgpr0
; CHECK-NEXT: EXP 0, killed renamable $vgpr0, renamable $vgpr1, renamable $vgpr2, undef renamable $vgpr0, 0, 0, 0, implicit $exec
; CHECK-NEXT: S_ENDPGM 0
; CHECK-NEXT: {{ $}}
@@ -168,7 +168,7 @@ body: |
; CHECK-NEXT: S_CBRANCH_SCC0 %bb.2, implicit killed $scc
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: bb.1:
- ; CHECK-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec, implicit-def $vgpr0_vgpr1
+ ; CHECK-NEXT: renamable $vgpr0 = V_MOV_B32_e32 0, implicit $exec, implicit-def $vgpr1
; CHECK-NEXT: S_NOP 0, implicit renamable $vgpr0_vgpr1
; CHECK-NEXT: EXP 0, killed renamable $vgpr0, renamable $vgpr1, undef renamable $vgpr0, undef renamable $vgpr0, 0, 0, 0, implicit $exec
; CHECK-NEXT: S_ENDPGM 0
@@ -249,3 +249,115 @@ body: |
S_ENDPGM 0
...
+
+# In bb.2, %0 should be assigned to vgpr0_vgpr1. Make sure the value
+# copied from $vgpr0 into %3 isn't clobbered by the undef phi def for
+# %0.sub1.
+---
+name: assigned_physreg_subregister_interference
+tracksRegLiveness: true
+frameInfo:
+ adjustsStack: true
+machineFunctionInfo:
+ scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3'
+ frameOffsetReg: '$sgpr33'
+ stackPtrOffsetReg: '$sgpr32'
+ wwmReservedRegs:
+ - '$vgpr63'
+body: |
+ ; CHECK-LABEL: name: assigned_physreg_subregister_interference
+ ; CHECK: bb.0:
+ ; CHECK-NEXT: successors: %bb.1(0x80000000)
+ ; CHECK-NEXT: liveins: $sgpr30, $sgpr31, $sgpr34, $sgpr35, $sgpr36, $sgpr37, $vgpr0, $vgpr1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: $sgpr4_sgpr5 = S_OR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
+ ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr40, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5)
+ ; CHECK-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5
+ ; CHECK-NEXT: $vgpr40 = SI_SPILL_S32_TO_VGPR $sgpr30, 0, $vgpr40
+ ; CHECK-NEXT: $vgpr40 = SI_SPILL_S32_TO_VGPR $sgpr31, 1, $vgpr40
+ ; CHECK-NEXT: $vgpr40 = SI_SPILL_S32_TO_VGPR $sgpr34, 2, $vgpr40
+ ; CHECK-NEXT: $vgpr40 = SI_SPILL_S32_TO_VGPR $sgpr35, 3, $vgpr40
+ ; CHECK-NEXT: $vgpr40 = SI_SPILL_S32_TO_VGPR $sgpr36, 4, $vgpr40
+ ; CHECK-NEXT: $vgpr40 = SI_SPILL_S32_TO_VGPR $sgpr37, 5, $vgpr40
+ ; CHECK-NEXT: renamable $sgpr34_sgpr35 = S_MOV_B64 $exec
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.1:
+ ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000)
+ ; CHECK-NEXT: liveins: $sgpr34_sgpr35, $vgpr0_vgpr1:0x000000000000000F
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: renamable $sgpr4 = V_READFIRSTLANE_B32 $vgpr0, implicit $exec
+ ; CHECK-NEXT: renamable $sgpr5 = V_READFIRSTLANE_B32 $vgpr1, implicit $exec
+ ; CHECK-NEXT: renamable $vcc = V_CMP_EQ_U64_e64 $sgpr4_sgpr5, killed $vgpr0_vgpr1, implicit $exec
+ ; CHECK-NEXT: renamable $sgpr36_sgpr37 = S_AND_SAVEEXEC_B64 killed renamable $vcc, implicit-def $exec, implicit-def dead $scc, implicit $exec
+ ; CHECK-NEXT: dead $sgpr30_sgpr31 = noconvergent SI_CALL killed renamable $sgpr4_sgpr5, 0, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit-def $vgpr0
+ ; CHECK-NEXT: renamable $vgpr1 = COPY $vgpr0, implicit $exec
+ ; CHECK-NEXT: renamable $vgpr0 = V_MOV_B32_e32 123, implicit $exec
+ ; CHECK-NEXT: $exec = S_XOR_B64 $exec, renamable $sgpr36_sgpr37, implicit-def dead $scc
+ ; CHECK-NEXT: S_CBRANCH_EXECNZ %bb.1, implicit $exec
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.2:
+ ; CHECK-NEXT: liveins: $vgpr1, $sgpr34_sgpr35
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: $exec = COPY renamable $sgpr34_sgpr35
+ ; CHECK-NEXT: renamable $vgpr0 = V_ADD_U32_e32 1, killed $vgpr1, implicit $exec
+ ; CHECK-NEXT: $sgpr37 = SI_RESTORE_S32_FROM_VGPR $vgpr40, 5
+ ; CHECK-NEXT: $sgpr36 = SI_RESTORE_S32_FROM_VGPR $vgpr40, 4
+ ; CHECK-NEXT: $sgpr35 = SI_RESTORE_S32_FROM_VGPR $vgpr40, 3
+ ; CHECK-NEXT: $sgpr34 = SI_RESTORE_S32_FROM_VGPR $vgpr40, 2
+ ; CHECK-NEXT: $sgpr31 = SI_RESTORE_S32_FROM_VGPR $vgpr40, 1
+ ; CHECK-NEXT: $sgpr30 = SI_RESTORE_S32_FROM_VGPR $vgpr40, 0
+ ; CHECK-NEXT: $sgpr4_sgpr5 = S_OR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec
+ ; CHECK-NEXT: $vgpr40 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (load (s32) from %stack.0, addrspace 5)
+ ; CHECK-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5
+ ; CHECK-NEXT: SI_RETURN implicit $vgpr0
+ bb.0:
+ liveins: $sgpr30, $sgpr31, $sgpr34, $sgpr35, $sgpr36, $sgpr37, $vgpr0, $vgpr1, $vgpr63
+
+ $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr30, 0, $vgpr63
+ $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr31, 1, $vgpr63
+ $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr34, 2, $vgpr63
+ $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr35, 3, $vgpr63
+ $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr36, 4, $vgpr63
+ $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr37, 5, $vgpr63
+ undef %0.sub0:vreg_64 = COPY $vgpr0
+ %0.sub1:vreg_64 = COPY $vgpr1
+ ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+ renamable $sgpr34_sgpr35 = S_MOV_B64 $exec
+
+ bb.1:
+ liveins: $vgpr63, $sgpr34_sgpr35
+
+ renamable $sgpr4 = V_READFIRSTLANE_B32 %0.sub0, implicit $exec
+ renamable $sgpr5 = V_READFIRSTLANE_B32 %0.sub1, implicit $exec
+ renamable $vcc = V_CMP_EQ_U64_e64 $sgpr4_sgpr5, %0, implicit $exec
+ renamable $sgpr36_sgpr37 = S_AND_SAVEEXEC_B64 killed renamable $vcc, implicit-def $exec, implicit-def dead $scc, implicit $exec
+
+ bb.2:
+ liveins: $vgpr63, $sgpr4_sgpr5:0x000000000000000F, $sgpr34_sgpr35, $sgpr36_sgpr37
+
+ dead $sgpr30_sgpr31 = noconvergent SI_CALL killed renamable $sgpr4_sgpr5, 0, csr_amdgpu, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit-def $vgpr0
+ %3:vgpr_32 = COPY $vgpr0
+ undef %0.sub0:vreg_64 = V_MOV_B32_e32 123, implicit $exec
+ $exec = S_XOR_B64_term $exec, killed renamable $sgpr36_sgpr37, implicit-def dead $scc
+ S_CBRANCH_EXECNZ %bb.1, implicit $exec
+
+ bb.3:
+ liveins: $vgpr63, $sgpr34_sgpr35
+
+ $exec = S_MOV_B64_term killed renamable $sgpr34_sgpr35
+
+ bb.4:
+ liveins: $vgpr63
+
+ ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
+ %6:vgpr_32 = V_ADD_U32_e32 1, %3, implicit $exec
+ $vgpr0 = COPY %6
+ $sgpr37 = SI_RESTORE_S32_FROM_VGPR $vgpr63, 5
+ $sgpr36 = SI_RESTORE_S32_FROM_VGPR $vgpr63, 4
+ $sgpr35 = SI_RESTORE_S32_FROM_VGPR $vgpr63, 3
+ $sgpr34 = SI_RESTORE_S32_FROM_VGPR $vgpr63, 2
+ $sgpr31 = SI_RESTORE_S32_FROM_VGPR $vgpr63, 1
+ $sgpr30 = SI_RESTORE_S32_FROM_VGPR $vgpr63, 0
+ SI_RETURN implicit $vgpr0
+
+...
diff --git a/llvm/test/CodeGen/AMDGPU/itofp.i128.ll b/llvm/test/CodeGen/AMDGPU/itofp.i128.ll
index 7e76e2bf9e8949..2999ddb8315883 100644
--- a/llvm/test/CodeGen/AMDGPU/itofp.i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/itofp.i128.ll
@@ -172,8 +172,8 @@ define float @sitofp_i128_to_f32(i128 %x) {
; GISEL-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1]
; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v2
; GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v0, vcc
-; GISEL-NEXT: ; implicit-def: $vgpr0
; GISEL-NEXT: ; implicit-def: $vgpr8
+; GISEL-NEXT: ; implicit-def: $vgpr0
; GISEL-NEXT: ; implicit-def: $vgpr5
; GISEL-NEXT: ; implicit-def: $vgpr2
; GISEL-NEXT: ; %bb.3: ; %Flow3
@@ -417,8 +417,8 @@ define float @uitofp_i128_to_f32(i128 %x) {
; GISEL-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1]
; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v2
; GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v0, vcc
-; GISEL-NEXT: ; implicit-def: $vgpr0
; GISEL-NEXT: ; implicit-def: $vgpr7
+; GISEL-NEXT: ; implicit-def: $vgpr0
; GISEL-NEXT: ; implicit-def: $vgpr5
; GISEL-NEXT: ; implicit-def: $vgpr2
; GISEL-NEXT: ; %bb.3: ; %Flow3
@@ -1263,8 +1263,8 @@ define half @sitofp_i128_to_f16(i128 %x) {
; GISEL-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1]
; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v2
; GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v0, vcc
-; GISEL-NEXT: ; implicit-def: $vgpr0
; GISEL-NEXT: ; implicit-def: $vgpr8
+; GISEL-NEXT: ; implicit-def: $vgpr0
; GISEL-NEXT: ; implicit-def: $vgpr5
; GISEL-NEXT: ; implicit-def: $vgpr2
; GISEL-NEXT: ; %bb.3: ; %Flow3
@@ -1510,8 +1510,8 @@ define half @uitofp_i128_to_f16(i128 %x) {
; GISEL-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1]
; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v2
; GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v0, vcc
-; GISEL-NEXT: ; implicit-def: $vgpr0
; GISEL-NEXT: ; implicit-def: $vgpr7
+; GISEL-NEXT: ; implicit-def: $vgpr0
; GISEL-NEXT: ; implicit-def: $vgpr5
; GISEL-NEXT: ; implicit-def: $vgpr2
; GISEL-NEXT: ; %bb.3: ; %Flow3
diff --git a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
index 6c56dee76142c1..2d5e5a9160fdf7 100644
--- a/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
+++ b/llvm/test/CodeGen/AMDGPU/vni8-across-blocks.ll
@@ -526,6 +526,7 @@ define amdgpu_kernel void @v8i8_phi_const(ptr addrspace(1) %src1, ptr addrspace(
; GFX906-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX906-NEXT: v_cmp_lt_u32_e64 s[0:1], 14, v0
; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0
+; GFX906-NEXT: ; implicit-def: $vgpr3
; GFX906-NEXT: ; implicit-def: $vgpr13
; GFX906-NEXT: ; implicit-def: $vgpr11
; GFX906-NEXT: ; implicit-def: $vgpr14
@@ -534,7 +535,6 @@ define amdgpu_kernel void @v8i8_phi_const(ptr addrspace(1) %src1, ptr addrspace(
; GFX906-NEXT: ; implicit-def: $vgpr16
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
; GFX906-NEXT: global_load_dwordx2 v[1:2], v4, s[4:5]
-; GFX906-NEXT: ; implicit-def: $vgpr3
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: v_lshrrev_b32_e32 v5, 24, v2
; GFX906-NEXT: v_lshrrev_b32_e32 v6, 16, v2
>From b0b1659a7005a7ab4678b3e999048ec3f4103194 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Mon, 21 Oct 2024 16:41:15 -0700
Subject: [PATCH 4/4] More comments
---
llvm/lib/CodeGen/VirtRegMap.cpp | 4 +++-
1 file changed, 3 insertions(+), 1 deletion(-)
diff --git a/llvm/lib/CodeGen/VirtRegMap.cpp b/llvm/lib/CodeGen/VirtRegMap.cpp
index e38a4633d06174..03419903df7b06 100644
--- a/llvm/lib/CodeGen/VirtRegMap.cpp
+++ b/llvm/lib/CodeGen/VirtRegMap.cpp
@@ -541,7 +541,8 @@ bool VirtRegRewriter::subRegLiveThrough(const MachineInstr &MI,
}
/// Compute a lanemask for undef lanes which need to be preserved out of the
-/// defining block for a register assignment.
+/// defining block for a register assignment for a subregister def. \p PhysReg
+/// is assigned to \p LI, which is the main range.
LaneBitmask VirtRegRewriter::liveOutUndefPhiLanesForUndefSubregDef(
const LiveInterval &LI, const MachineBasicBlock &MBB, unsigned SubReg,
MCPhysReg PhysReg, const MachineInstr &MI) const {
@@ -549,6 +550,7 @@ LaneBitmask VirtRegRewriter::liveOutUndefPhiLanesForUndefSubregDef(
LaneBitmask LiveOutUndefLanes;
for (const LiveInterval::SubRange &SR : LI.subranges()) {
+ // Figure out which lanes are undef live into a successor.
LaneBitmask NeedImpDefLanes = UndefMask & SR.LaneMask;
if (NeedImpDefLanes.any() && !LIS->isLiveOutOfMBB(SR, &MBB)) {
for (const MachineBasicBlock *Succ : MBB.successors()) {
More information about the llvm-commits
mailing list