[llvm] [AMDGPU] Fold copies of constant physical registers into their uses (PR #154410)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Sep 17 10:49:29 PDT 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-amdgpu
Author: Stanislav Mekhanoshin (rampitec)
<details>
<summary>Changes</summary>
Co-authored-by: Jay Foad <Jay.Foad@<!-- -->amd.com>
---
Patch is 131.73 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/154410.diff
5 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/SIFoldOperands.cpp (+11-4)
- (modified) llvm/test/CodeGen/AMDGPU/addrspacecast-gas.ll (+13-14)
- (modified) llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll (+376-378)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll (+69)
- (modified) llvm/test/CodeGen/AMDGPU/scale-offset-flat.ll (+6-7)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index 736744569a3bd..38331b614bceb 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -710,7 +710,8 @@ bool SIFoldOperandsImpl::updateOperand(FoldCandidate &Fold) const {
// Verify the register is compatible with the operand.
if (const TargetRegisterClass *OpRC =
TII->getRegClass(MI->getDesc(), Fold.UseOpNo, TRI)) {
- const TargetRegisterClass *NewRC = MRI->getRegClass(New->getReg());
+ const TargetRegisterClass *NewRC =
+ TRI->getRegClassForReg(*MRI, New->getReg());
const TargetRegisterClass *ConstrainRC =
TRI->findCommonRegClass(OpRC, Old.getSubReg(), NewRC, New->getSubReg());
if (!ConstrainRC)
@@ -727,8 +728,12 @@ bool SIFoldOperandsImpl::updateOperand(FoldCandidate &Fold) const {
// 16-bit SGPRs instead of 32-bit ones.
if (Old.getSubReg() == AMDGPU::lo16 && TRI->isSGPRReg(*MRI, New->getReg()))
Old.setSubReg(AMDGPU::NoSubRegister);
- Old.substVirtReg(New->getReg(), New->getSubReg(), *TRI);
- Old.setIsUndef(New->isUndef());
+ if (New->getReg().isPhysical()) {
+ Old.substPhysReg(New->getReg(), *TRI);
+ } else {
+ Old.substVirtReg(New->getReg(), New->getSubReg(), *TRI);
+ Old.setIsUndef(New->isUndef());
+ }
return true;
}
@@ -1997,7 +2002,9 @@ bool SIFoldOperandsImpl::tryFoldFoldableCopy(
if (!FoldingImm && !OpToFold.isReg())
return false;
- if (OpToFold.isReg() && !OpToFold.getReg().isVirtual())
+ // Fold virtual registers and constant physical registers.
+ if (OpToFold.isReg() && OpToFold.getReg().isPhysical() &&
+ !TRI->isConstantPhysReg(OpToFold.getReg()))
return false;
// Prevent folding operands backwards in the function. For example,
diff --git a/llvm/test/CodeGen/AMDGPU/addrspacecast-gas.ll b/llvm/test/CodeGen/AMDGPU/addrspacecast-gas.ll
index 153898560fc31..aac499f2fc602 100644
--- a/llvm/test/CodeGen/AMDGPU/addrspacecast-gas.ll
+++ b/llvm/test/CodeGen/AMDGPU/addrspacecast-gas.ll
@@ -27,20 +27,20 @@ define amdgpu_kernel void @use_private_to_flat_addrspacecast(ptr addrspace(5) %p
;
; GFX1250-GISEL-LABEL: use_private_to_flat_addrspacecast:
; GFX1250-GISEL: ; %bb.0:
-; GFX1250-GISEL-NEXT: s_load_b32 s2, s[4:5], 0x24
-; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_flat_scratch_base_lo
+; GFX1250-GISEL-NEXT: s_load_b32 s0, s[4:5], 0x24
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_mbcnt_lo_u32_b32 v2, -1, 0
-; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX1250-GISEL-NEXT: s_cmp_lg_u32 s2, -1
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
-; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, s2, v0
+; GFX1250-GISEL-NEXT: s_cmp_lg_u32 s0, -1
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, s0, v0
; GFX1250-GISEL-NEXT: v_lshlrev_b32_e32 v2, 20, v2
-; GFX1250-GISEL-NEXT: s_cselect_b32 s0, 1, 0
-; GFX1250-GISEL-NEXT: s_and_b32 s0, 1, s0
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1250-GISEL-NEXT: s_cselect_b32 s1, 1, 0
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: s_and_b32 s1, 1, s1
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, v2, v1, vcc_lo
-; GFX1250-GISEL-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0
+; GFX1250-GISEL-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s1
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX1250-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_cndmask_b32 v1, 0, v1
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc_lo
; GFX1250-GISEL-NEXT: flat_store_b32 v[0:1], v2 scope:SCOPE_SYS
@@ -69,14 +69,13 @@ define amdgpu_kernel void @use_private_to_flat_addrspacecast_nonnull(ptr addrspa
;
; GFX1250-GISEL-LABEL: use_private_to_flat_addrspacecast_nonnull:
; GFX1250-GISEL: ; %bb.0:
-; GFX1250-GISEL-NEXT: s_load_b32 s2, s[4:5], 0x24
-; GFX1250-GISEL-NEXT: s_mov_b64 s[0:1], src_flat_scratch_base_lo
+; GFX1250-GISEL-NEXT: s_load_b32 s0, s[4:5], 0x24
; GFX1250-GISEL-NEXT: v_mbcnt_lo_u32_b32 v2, -1, 0
-; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
; GFX1250-GISEL-NEXT: v_lshlrev_b32_e32 v2, 20, v2
; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, s2, v0
+; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, s0, v0
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, v2, v1, vcc_lo
; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, 0
diff --git a/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll b/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll
index 082050877e0bb..605026614c614 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll
@@ -586,14 +586,14 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn(ptr inreg %sbase, i32 %vof
;
; GFX1250-GISEL-LABEL: flat_xchg_saddr_i64_rtn:
; GFX1250-GISEL: ; %bb.0:
-; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
-; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, v3
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14
+; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8
; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo
@@ -618,10 +618,10 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn(ptr inreg %sbase, i32 %vof
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB10_2
; GFX1250-GISEL-NEXT: .LBB10_4: ; %atomicrmw.private
-; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
+; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
-; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: s_clause 0x1
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off
@@ -770,9 +770,9 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn_neg128(ptr inreg %sbase, i
;
; GFX1250-GISEL-LABEL: flat_xchg_saddr_i64_rtn_neg128:
; GFX1250-GISEL: ; %bb.0:
-; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
-; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
@@ -780,7 +780,7 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn_neg128(ptr inreg %sbase, i
; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14
+; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8
; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo
@@ -805,10 +805,10 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn_neg128(ptr inreg %sbase, i
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB11_2
; GFX1250-GISEL-NEXT: .LBB11_4: ; %atomicrmw.private
-; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
+; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
-; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: s_clause 0x1
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v2, off
@@ -953,15 +953,15 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
;
; GFX1250-GISEL-LABEL: flat_xchg_saddr_i64_nortn:
; GFX1250-GISEL: ; %bb.0:
-; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1
+; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
-; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3
-; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6
; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB12_3
@@ -982,10 +982,10 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB12_2
; GFX1250-GISEL-NEXT: .LBB12_4: ; %atomicrmw.private
-; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo
+; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_store_b64 v0, v[4:5], off
; GFX1250-GISEL-NEXT: s_endpgm
@@ -1104,18 +1104,18 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v
;
; GFX1250-GISEL-LABEL: flat_xchg_saddr_i64_nortn_neg128:
; GFX1250-GISEL: ; %bb.0:
-; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1
+; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
-; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v1, vcc_lo, v2, v0
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, 0xffffff80, v1
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v3, vcc_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3
-; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6
; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB13_3
@@ -1136,10 +1136,10 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB13_2
; GFX1250-GISEL-NEXT: .LBB13_4: ; %atomicrmw.private
-; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo
+; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[2:3]
-; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s0, v2
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v2, v0
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_store_b64 v0, v[4:5], off
; GFX1250-GISEL-NEXT: s_endpgm
@@ -1445,14 +1445,14 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
;
; GFX1250-GISEL-LABEL: flat_add_saddr_i64_rtn:
; GFX1250-GISEL: ; %bb.0:
-; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
-; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, v0, v3
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14
+; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8
; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo
@@ -1477,10 +1477,10 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB18_2
; GFX1250-GISEL-NEXT: .LBB18_4: ; %atomicrmw.private
-; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
+; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
-; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
@@ -1633,9 +1633,9 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
;
; GFX1250-GISEL-LABEL: flat_add_saddr_i64_rtn_neg128:
; GFX1250-GISEL: ; %bb.0:
-; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v8, src_flat_scratch_base_hi
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
-; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, v0, v3
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
@@ -1643,7 +1643,7 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0xffffff80, v0
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, -1, v1, vcc_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_bitop2_b32 v0, s0, v7 bitop3:0x14
+; GFX1250-GISEL-NEXT: v_xor_b32_e32 v0, v7, v8
; GFX1250-GISEL-NEXT: v_cmp_le_u32_e32 vcc_lo, 0x4000000, v0
; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
; GFX1250-GISEL-NEXT: s_and_saveexec_b32 s0, vcc_lo
@@ -1668,10 +1668,10 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB19_2
; GFX1250-GISEL-NEXT: .LBB19_4: ; %atomicrmw.private
-; GFX1250-GISEL-NEXT: s_mov_b32 s1, src_flat_scratch_base_lo
+; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u64_e32 vcc_lo, 0, v[6:7]
-; GFX1250-GISEL-NEXT: v_subrev_nc_u32_e32 v0, s1, v6
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_sub_nc_u32_e32 v0, v6, v0
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_load_b64 v[0:1], v6, off
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
@@ -1823,15 +1823,15 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
;
; GFX1250-GISEL-LABEL: flat_add_saddr_i64_nortn:
; GFX1250-GISEL: ; %bb.0:
-; GFX1250-GISEL-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v6, src_flat_scratch_base_hi :: v_dual_mov_b32 v4, v1
+; GFX1250-GISEL-NEXT: v_mov_b32_e32 v5, v2
; GFX1250-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
-; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_hi
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX1250-GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, v0
; GFX1250-GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
-; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, s0, v3
-; GFX1250-GISEL-NEXT: s_mov_b32 s0, exec_lo
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v3, v6
; GFX1250-GISEL-NEXT: v_cmpx_le_u32_e32 0x4000000, v1
; GFX1250-GISEL-NEXT: s_xor_b32 s0, exec_lo, s0
; GFX1250-GISEL-NEXT: s_cbranch_execnz .LBB20_3
@@ -1852,10 +1852,10 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-GISEL-NEXT: s_and_not1_saveexec_b32 s0, s0
; GFX1250-GISEL-NEXT: s_cbranch_execz .LBB20_2
; GFX1250-GISEL-NEXT: .LBB20_4: ; %atomicrmw.private
-; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo
+; GFX1250-GISEL-NEXT: v_mov_b32_e32 v0, src_flat_scratch_base_lo
; GFX1250-GISEL-NEXT: v_cmp_ne_u6...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/154410
More information about the llvm-commits
mailing list