[llvm] AMDGPU: Relax legal register operand constraint (PR #157989)
Matt Arsenault via llvm-commits
llvm-commits at lists.llvm.org
Wed Sep 10 21:18:43 PDT 2025
https://github.com/arsenm created https://github.com/llvm/llvm-project/pull/157989
Find a common subclass instead of directly checking for a subclass
relationship. This fixes folding logic for unaligned register defs
into aligned use contexts. e.g., a vreg_64 def into an av_64_align2
use should be able to find the common subclass vreg_64_align2. This
avoids regressions in future patches.
Checking the subclass was also redundant on the subregister path;
getMatchingSuperRegClass is sufficient.
From e8eeda9f675746825789531249303f605e749135 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Thu, 11 Sep 2025 12:44:23 +0900
Subject: [PATCH] AMDGPU: Relax legal register operand constraint
Find a common subclass instead of directly checking for a subclass
relationship. This fixes folding logic for unaligned register defs
into aligned use contexts. e.g., a vreg_64 def into an av_64_align2
use should be able to find the common subclass vreg_64_align2. This
avoids regressions in future patches.
Checking the subclass was also redundant on the subregister path;
getMatchingSuperRegClass is sufficient.
---
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 8 +--
.../AMDGPU/GlobalISel/vni8-across-blocks.ll | 22 +++----
.../test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll | 63 ++++++++++---------
llvm/test/CodeGen/AMDGPU/fold-sgpr-copy.mir | 10 ++-
4 files changed, 51 insertions(+), 52 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index 398c99b3bd127..cbb68fa85ca80 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -6127,12 +6127,10 @@ bool SIInstrInfo::isLegalRegOperand(const MachineRegisterInfo &MRI,
const TargetRegisterClass *SuperRC = RI.getLargestLegalSuperClass(RC, *MF);
if (!SuperRC)
return false;
-
- DRC = RI.getMatchingSuperRegClass(SuperRC, DRC, MO.getSubReg());
- if (!DRC)
- return false;
+ return RI.getMatchingSuperRegClass(SuperRC, DRC, MO.getSubReg()) != nullptr;
}
- return RC->hasSuperClassEq(DRC);
+
+ return RI.getCommonSubClass(DRC, RC) != nullptr;
}
bool SIInstrInfo::isLegalRegOperand(const MachineInstr &MI, unsigned OpIdx,
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll
index 9c2fabce4bcde..b33b8a7d8cd72 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/vni8-across-blocks.ll
@@ -7,33 +7,33 @@ define amdgpu_kernel void @v3i8_liveout(ptr addrspace(1) %src1, ptr addrspace(1)
; GFX906-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24
; GFX906-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34
; GFX906-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; GFX906-NEXT: v_mov_b32_e32 v4, 8
+; GFX906-NEXT: v_mov_b32_e32 v3, 8
; GFX906-NEXT: v_mov_b32_e32 v5, 16
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
-; GFX906-NEXT: global_load_dword v3, v2, s[0:1]
+; GFX906-NEXT: global_load_dword v4, v2, s[0:1]
; GFX906-NEXT: v_mov_b32_e32 v1, 0xff
; GFX906-NEXT: v_cmp_gt_u32_e32 vcc, 15, v0
; GFX906-NEXT: s_waitcnt vmcnt(0)
-; GFX906-NEXT: v_and_b32_e32 v6, 0xff, v3
-; GFX906-NEXT: v_lshlrev_b32_sdwa v7, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX906-NEXT: v_lshlrev_b32_sdwa v3, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX906-NEXT: v_or3_b32 v3, v6, v7, v3
+; GFX906-NEXT: v_and_b32_e32 v6, 0xff, v4
+; GFX906-NEXT: v_lshlrev_b32_sdwa v7, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX906-NEXT: v_lshlrev_b32_sdwa v4, v5, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX906-NEXT: v_or3_b32 v4, v6, v7, v4
; GFX906-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX906-NEXT: s_cbranch_execz .LBB0_2
; GFX906-NEXT: ; %bb.1: ; %bb.1
; GFX906-NEXT: global_load_dword v0, v2, s[2:3]
; GFX906-NEXT: s_waitcnt vmcnt(0)
; GFX906-NEXT: v_and_b32_e32 v2, 0xff, v0
-; GFX906-NEXT: v_lshlrev_b32_sdwa v3, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX906-NEXT: v_lshlrev_b32_sdwa v3, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
; GFX906-NEXT: v_lshlrev_b32_sdwa v0, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX906-NEXT: v_or3_b32 v3, v2, v3, v0
+; GFX906-NEXT: v_or3_b32 v4, v2, v3, v0
; GFX906-NEXT: .LBB0_2: ; %bb.2
; GFX906-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v3
+; GFX906-NEXT: v_lshrrev_b32_e32 v0, 8, v4
; GFX906-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX906-NEXT: v_lshlrev_b16_e32 v0, 8, v0
-; GFX906-NEXT: v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX906-NEXT: v_and_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX906-NEXT: v_or_b32_sdwa v0, v4, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX906-NEXT: v_and_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; GFX906-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX906-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX906-NEXT: v_lshl_or_b32 v0, v1, 16, v0
diff --git a/llvm/test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll b/llvm/test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll
index 7b33374453010..6b6eb43baf856 100644
--- a/llvm/test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll
+++ b/llvm/test/CodeGen/AMDGPU/a-v-flat-atomicrmw.ll
@@ -969,37 +969,38 @@ define void @flat_atomic_xchg_i64_ret_av_av(ptr %ptr) #0 {
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_mov_b64 s[0:1], 0x50
-; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
+; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[0:1]
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5
+; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; def v[0:1]
+; GFX950-NEXT: ; def v[4:5]
; GFX950-NEXT: ;;#ASMEND
-; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB14_2
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: buffer_wbl2 sc0 sc1
-; GFX950-NEXT: flat_atomic_swap_x2 v[2:3], v[4:5], v[0:1] sc0 sc1
+; GFX950-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[4:5] sc0 sc1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: buffer_inv sc0 sc1
+; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX950-NEXT: .LBB14_2: ; %Flow
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB14_4
; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private
-; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
; GFX950-NEXT: s_nop 1
-; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
-; GFX950-NEXT: scratch_load_dwordx2 v[2:3], v4, off
+; GFX950-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc
+; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v2, off
; GFX950-NEXT: s_nop 0
-; GFX950-NEXT: scratch_store_dwordx2 v4, v[0:1], off
+; GFX950-NEXT: scratch_store_dwordx2 v2, v[4:5], off
; GFX950-NEXT: .LBB14_4: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_waitcnt vmcnt(1)
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; use v[2:3]
+; GFX950-NEXT: ; use v[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
@@ -1058,37 +1059,38 @@ define void @flat_atomic_xchg_i64_ret_av_v(ptr %ptr) #0 {
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_mov_b64 s[0:1], 0x50
-; GFX950-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
+; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[0:1]
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5
+; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; def v[0:1]
+; GFX950-NEXT: ; def v[4:5]
; GFX950-NEXT: ;;#ASMEND
-; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
; GFX950-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB15_2
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: buffer_wbl2 sc0 sc1
-; GFX950-NEXT: flat_atomic_swap_x2 v[2:3], v[4:5], v[0:1] sc0 sc1
+; GFX950-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[4:5] sc0 sc1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: buffer_inv sc0 sc1
+; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX950-NEXT: ; implicit-def: $vgpr4_vgpr5
; GFX950-NEXT: .LBB15_2: ; %Flow
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB15_4
; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private
-; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
; GFX950-NEXT: s_nop 1
-; GFX950-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
-; GFX950-NEXT: scratch_load_dwordx2 v[2:3], v4, off
+; GFX950-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc
+; GFX950-NEXT: scratch_load_dwordx2 v[0:1], v2, off
; GFX950-NEXT: s_nop 0
-; GFX950-NEXT: scratch_store_dwordx2 v4, v[0:1], off
+; GFX950-NEXT: scratch_store_dwordx2 v2, v[4:5], off
; GFX950-NEXT: .LBB15_4: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_waitcnt vmcnt(1)
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; use v[2:3]
+; GFX950-NEXT: ; use v[0:1]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: s_waitcnt vmcnt(0)
; GFX950-NEXT: s_setpc_b64 s[30:31]
@@ -1149,11 +1151,11 @@ define void @flat_atomic_xchg_i64_ret_av_a(ptr %ptr) #0 {
; GFX950: ; %bb.0:
; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX950-NEXT: s_mov_b64 s[0:1], 0x50
-; GFX950-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[0:1]
+; GFX950-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
; GFX950-NEXT: s_mov_b64 s[0:1], src_private_base
-; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3
+; GFX950-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
; GFX950-NEXT: ;;#ASMSTART
-; GFX950-NEXT: ; def v[0:1]
+; GFX950-NEXT: ; def v[2:3]
; GFX950-NEXT: ;;#ASMEND
; GFX950-NEXT: ; implicit-def: $agpr0_agpr1
; GFX950-NEXT: s_and_saveexec_b64 s[0:1], vcc
@@ -1161,22 +1163,23 @@ define void @flat_atomic_xchg_i64_ret_av_a(ptr %ptr) #0 {
; GFX950-NEXT: s_cbranch_execz .LBB16_2
; GFX950-NEXT: ; %bb.1: ; %atomicrmw.global
; GFX950-NEXT: buffer_wbl2 sc0 sc1
-; GFX950-NEXT: flat_atomic_swap_x2 v[2:3], v[2:3], v[0:1] sc0 sc1
+; GFX950-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] sc0 sc1
; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX950-NEXT: buffer_inv sc0 sc1
-; GFX950-NEXT: v_accvgpr_write_b32 a0, v2
-; GFX950-NEXT: v_accvgpr_write_b32 a1, v3
; GFX950-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX950-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX950-NEXT: v_accvgpr_write_b32 a1, v1
+; GFX950-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX950-NEXT: .LBB16_2: ; %Flow
; GFX950-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
; GFX950-NEXT: s_cbranch_execz .LBB16_4
; GFX950-NEXT: ; %bb.3: ; %atomicrmw.private
-; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
+; GFX950-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
; GFX950-NEXT: s_nop 1
-; GFX950-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc
-; GFX950-NEXT: scratch_load_dwordx2 a[0:1], v2, off
+; GFX950-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
+; GFX950-NEXT: scratch_load_dwordx2 a[0:1], v0, off
; GFX950-NEXT: s_nop 0
-; GFX950-NEXT: scratch_store_dwordx2 v2, v[0:1], off
+; GFX950-NEXT: scratch_store_dwordx2 v0, v[2:3], off
; GFX950-NEXT: .LBB16_4: ; %atomicrmw.phi
; GFX950-NEXT: s_or_b64 exec, exec, s[0:1]
; GFX950-NEXT: s_waitcnt vmcnt(1)
diff --git a/llvm/test/CodeGen/AMDGPU/fold-sgpr-copy.mir b/llvm/test/CodeGen/AMDGPU/fold-sgpr-copy.mir
index 103c3e3eb8bc6..e1295d4a09563 100644
--- a/llvm/test/CodeGen/AMDGPU/fold-sgpr-copy.mir
+++ b/llvm/test/CodeGen/AMDGPU/fold-sgpr-copy.mir
@@ -17,9 +17,8 @@ body: |
...
# GCN-LABEL: name: fold_sgpr_to_sgpr_copy_subreg
-# GCN: %0:sreg_64 = IMPLICIT_DEF
-# GCN-NEXT: %2:sgpr_32 = COPY %0.sub0
-# GCN-NEXT: S_STORE_DWORD_IMM %2, undef $sgpr10_sgpr11, 0, 0
+# GCN: %0:sreg_64_xexec = IMPLICIT_DEF
+# GCN-NEXT: S_STORE_DWORD_IMM %0.sub0, undef $sgpr10_sgpr11, 0, 0
name: fold_sgpr_to_sgpr_copy_subreg
body: |
@@ -32,9 +31,8 @@ body: |
...
# GCN-LABEL: name: fold_sgpr_to_sgpr_copy_subreg2
-# GCN: %0:sreg_64 = IMPLICIT_DEF
-# GCN-NEXT: %3:sreg_32_xm0_xexec = COPY %0.sub0
-# GCN-NEXT: S_STORE_DWORD_IMM %3, undef $sgpr10_sgpr11, 0, 0
+# GCN: %0:sreg_64_xexec = IMPLICIT_DEF
+# GCN-NEXT: S_STORE_DWORD_IMM %0.sub0, undef $sgpr10_sgpr11, 0, 0
name: fold_sgpr_to_sgpr_copy_subreg2
body: |
More information about the llvm-commits
mailing list