[llvm] AMDGPU: Sign extend immediates for 32-bit subregister extracts (PR #154870)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Aug 21 18:50:37 PDT 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-amdgpu
Author: Matt Arsenault (arsenm)
<details>
<summary>Changes</summary>
extractSubregFromImm previously sign-extended the 16-bit subregister
extracts, but not the 32-bit ones (sub0/sub1). We try to consistently store
immediates as sign-extended values, since failing to do so can cause
isInlineImmediate checks to report the wrong result.
---
Patch is 73.90 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/154870.diff
7 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/SIInstrInfo.cpp (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll (+48-53)
- (modified) llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll (+66-65)
- (modified) llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmin.ll (+66-65)
- (modified) llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fsub.ll (+78-87)
- (modified) llvm/test/CodeGen/AMDGPU/fold-imm-copy.mir (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/peephole-fold-imm.mir (+2-2)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index df638bd65bdaa..22569ebba8e4b 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -3475,9 +3475,9 @@ std::optional<int64_t> SIInstrInfo::extractSubregFromImm(int64_t Imm,
case AMDGPU::NoSubRegister:
return Imm;
case AMDGPU::sub0:
- return Lo_32(Imm);
+ return SignExtend64<32>(Imm);
case AMDGPU::sub1:
- return Hi_32(Imm);
+ return SignExtend64<32>(Imm >> 32);
case AMDGPU::lo16:
return SignExtend64<16>(Imm);
case AMDGPU::hi16:
diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll
index 22dd66118837f..ae5da3ad094c7 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fadd.ll
@@ -503,9 +503,10 @@ define float @flat_agent_atomic_fadd_ret_f32__offset12b_neg__amdgpu_no_fine_grai
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
-; GFX908-NEXT: v_addc_co_u32_e32 v4, vcc, -1, v1, vcc
-; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
+; GFX908-NEXT: s_mov_b64 s[4:5], vcc
+; GFX908-NEXT: v_addc_co_u32_e64 v4, s[4:5], -1, v1, s[4:5]
; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX908-NEXT: v_mov_b32_e32 v0, v3
; GFX908-NEXT: flat_load_dword v0, v[0:1]
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: .LBB2_1: ; %atomicrmw.start
@@ -2827,9 +2828,10 @@ define float @flat_agent_atomic_fadd_ret_f32__offset12b_neg__ftz__amdgpu_no_fine
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
-; GFX908-NEXT: v_addc_co_u32_e32 v4, vcc, -1, v1, vcc
-; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
+; GFX908-NEXT: s_mov_b64 s[4:5], vcc
+; GFX908-NEXT: v_addc_co_u32_e64 v4, s[4:5], -1, v1, s[4:5]
; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX908-NEXT: v_mov_b32_e32 v0, v3
; GFX908-NEXT: flat_load_dword v0, v[0:1]
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: .LBB14_1: ; %atomicrmw.start
@@ -16759,14 +16761,11 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_fi
; GFX11-LABEL: flat_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_mov_b32_e32 v3, v0
-; GFX11-NEXT: s_mov_b32 s0, 0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v3
-; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, -1, v1, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v3
-; GFX11-NEXT: flat_load_b32 v0, v[4:5]
+; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo
+; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: flat_load_b32 v0, v[3:4]
; GFX11-NEXT: .LBB58_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -16816,9 +16815,10 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_fi
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
-; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
-; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
+; GFX90A-NEXT: s_mov_b64 s[4:5], vcc
+; GFX90A-NEXT: v_addc_co_u32_e64 v5, s[4:5], -1, v1, s[4:5]
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX90A-NEXT: v_mov_b32_e32 v0, v4
; GFX90A-NEXT: flat_load_dword v0, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: .LBB58_1: ; %atomicrmw.start
@@ -16841,9 +16841,10 @@ define <2 x half> @flat_agent_atomic_fadd_ret_v2f16__offset12b_neg__amdgpu_no_fi
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
-; GFX908-NEXT: v_addc_co_u32_e32 v4, vcc, -1, v1, vcc
-; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
+; GFX908-NEXT: s_mov_b64 s[4:5], vcc
+; GFX908-NEXT: v_addc_co_u32_e64 v4, s[4:5], -1, v1, s[4:5]
; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX908-NEXT: v_mov_b32_e32 v0, v3
; GFX908-NEXT: flat_load_dword v0, v[0:1]
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: .LBB58_1: ; %atomicrmw.start
@@ -17331,13 +17332,11 @@ define void @flat_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine_g
; GFX11-LABEL: flat_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-NEXT: flat_load_b32 v4, v[3:4]
; GFX11-NEXT: s_mov_b32 s0, 0
+; GFX11-NEXT: flat_load_b32 v4, v[0:1]
; GFX11-NEXT: .LBB61_1: ; %atomicrmw.start
; GFX11-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -17386,9 +17385,10 @@ define void @flat_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine_g
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
-; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
-; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
+; GFX90A-NEXT: s_mov_b64 s[4:5], vcc
+; GFX90A-NEXT: v_addc_co_u32_e64 v5, s[4:5], -1, v1, s[4:5]
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX90A-NEXT: v_mov_b32_e32 v0, v4
; GFX90A-NEXT: flat_load_dword v1, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: .LBB61_1: ; %atomicrmw.start
@@ -17411,9 +17411,10 @@ define void @flat_agent_atomic_fadd_noret_v2f16__offset12b_neg__amdgpu_no_fine_g
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
-; GFX908-NEXT: v_addc_co_u32_e32 v4, vcc, -1, v1, vcc
-; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
+; GFX908-NEXT: s_mov_b64 s[4:5], vcc
+; GFX908-NEXT: v_addc_co_u32_e64 v4, s[4:5], -1, v1, s[4:5]
; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX908-NEXT: v_mov_b32_e32 v0, v3
; GFX908-NEXT: flat_load_dword v1, v[0:1]
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: .LBB61_1: ; %atomicrmw.start
@@ -19313,16 +19314,13 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no
; GFX11-TRUE16-LABEL: flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v0
-; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v3
-; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v5, null, -1, v1, vcc_lo
-; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v3
-; GFX11-TRUE16-NEXT: flat_load_b32 v0, v[4:5]
+; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo
; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v2
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: flat_load_b32 v0, v[3:4]
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB70_1: ; %atomicrmw.start
@@ -19367,16 +19365,13 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no
; GFX11-FAKE16-LABEL: flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v0
-; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_add_co_u32 v4, vcc_lo, 0xfffff800, v3
-; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v5, null, -1, v1, vcc_lo
-; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v3
-; GFX11-FAKE16-NEXT: flat_load_b32 v0, v[4:5]
+; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v2
; GFX11-FAKE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
+; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX11-FAKE16-NEXT: flat_load_b32 v0, v[3:4]
; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-FAKE16-NEXT: .p2align 6
; GFX11-FAKE16-NEXT: .LBB70_1: ; %atomicrmw.start
@@ -19463,9 +19458,10 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
-; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
-; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
+; GFX90A-NEXT: s_mov_b64 s[4:5], vcc
+; GFX90A-NEXT: v_addc_co_u32_e64 v5, s[4:5], -1, v1, s[4:5]
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX90A-NEXT: v_mov_b32_e32 v0, v4
; GFX90A-NEXT: flat_load_dword v0, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_lshlrev_b32_e32 v1, 16, v2
@@ -19506,9 +19502,10 @@ define <2 x bfloat> @flat_agent_atomic_fadd_ret_v2bf16__offset12b_neg__amdgpu_no
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
-; GFX908-NEXT: v_addc_co_u32_e32 v4, vcc, -1, v1, vcc
-; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
+; GFX908-NEXT: s_mov_b64 s[4:5], vcc
+; GFX908-NEXT: v_addc_co_u32_e64 v4, s[4:5], -1, v1, s[4:5]
; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX908-NEXT: v_mov_b32_e32 v0, v3
; GFX908-NEXT: flat_load_dword v0, v[0:1]
; GFX908-NEXT: s_mov_b64 s[6:7], 0
; GFX908-NEXT: v_lshlrev_b32_e32 v1, 16, v2
@@ -20299,15 +20296,13 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_
; GFX11-TRUE16-LABEL: flat_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX11-TRUE16: ; %bb.0:
; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-TRUE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
-; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo
; GFX11-TRUE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
+; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-TRUE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[3:4]
; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 0xffff0000, v2
; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v5, 16, v2
; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0
+; GFX11-TRUE16-NEXT: flat_load_b32 v3, v[0:1]
; GFX11-TRUE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-TRUE16-NEXT: .p2align 6
; GFX11-TRUE16-NEXT: .LBB73_1: ; %atomicrmw.start
@@ -20352,15 +20347,13 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_
; GFX11-FAKE16-LABEL: flat_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_grained_memory:
; GFX11-FAKE16: ; %bb.0:
; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-FAKE16-NEXT: v_add_co_u32 v3, vcc_lo, 0xfffff800, v0
-; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v4, null, -1, v1, vcc_lo
; GFX11-FAKE16-NEXT: v_add_co_u32 v0, vcc_lo, 0xfffff800, v0
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-FAKE16-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo
-; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[3:4]
; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v4, 16, v2
; GFX11-FAKE16-NEXT: v_and_b32_e32 v5, 0xffff0000, v2
; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0
+; GFX11-FAKE16-NEXT: flat_load_b32 v3, v[0:1]
; GFX11-FAKE16-NEXT: s_set_inst_prefetch_distance 0x1
; GFX11-FAKE16-NEXT: .p2align 6
; GFX11-FAKE16-NEXT: .LBB73_1: ; %atomicrmw.start
@@ -20446,9 +20439,10 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
-; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
-; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
+; GFX90A-NEXT: s_mov_b64 s[4:5], vcc
+; GFX90A-NEXT: v_addc_co_u32_e64 v5, s[4:5], -1, v1, s[4:5]
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX90A-NEXT: v_mov_b32_e32 v0, v4
; GFX90A-NEXT: flat_load_dword v1, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[6:7], 0
; GFX90A-NEXT: v_lshlrev_b32_e32 v3, 16, v2
@@ -20489,9 +20483,10 @@ define void @flat_agent_atomic_fadd_noret_v2bf16__offset12b_neg__amdgpu_no_fine_
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
-; GFX908-NEXT: v_addc_co_u32_e32 v4, vcc, -1, v1, vcc
-; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
+; GFX908-NEXT: s_mov_b64 s[4:5], vcc
+; GFX908-NEXT: v_addc_co_u32_e64 v4, s[4:5], -1, v1, s[4:5]
; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX908-NEXT: v_mov_b32_e32 v0, v3
; GFX908-NEXT: flat_load_dword v1, v[0:1]
; GFX908-NEXT: s_mov_b64 s[6:7], 0
; GFX908-NEXT: v_lshlrev_b32_e32 v5, 16, v2
diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll
index 1dc45179c74ce..6218a5c82afcd 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-atomicrmw-fmax.ll
@@ -382,9 +382,10 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_grai
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
-; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
-; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
+; GFX90A-NEXT: s_mov_b64 s[4:5], vcc
+; GFX90A-NEXT: v_addc_co_u32_e64 v5, s[4:5], -1, v1, s[4:5]
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX90A-NEXT: v_mov_b32_e32 v0, v4
; GFX90A-NEXT: flat_load_dword v0, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: v_max_f32_e32 v1, v2, v2
@@ -409,9 +410,10 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_neg__amdgpu_no_fine_grai
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
-; GFX908-NEXT: v_addc_co_u32_e32 v4, vcc, -1, v1, vcc
-; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
+; GFX908-NEXT: s_mov_b64 s[4:5], vcc
+; GFX908-NEXT: v_addc_co_u32_e64 v4, s[4:5], -1, v1, s[4:5]
; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX908-NEXT: v_mov_b32_e32 v0, v3
; GFX908-NEXT: flat_load_dword v0, v[0:1]
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: v_max_f32_e32 v1, v2, v2
@@ -836,9 +838,10 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_gra
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
-; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
-; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
+; GFX90A-NEXT: s_mov_b64 s[4:5], vcc
+; GFX90A-NEXT: v_addc_co_u32_e64 v5, s[4:5], -1, v1, s[4:5]
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX90A-NEXT: v_mov_b32_e32 v0, v4
; GFX90A-NEXT: flat_load_dword v1, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2
@@ -863,9 +866,10 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_neg__amdgpu_no_fine_gra
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
-; GFX908-NEXT: v_addc_co_u32_e32 v4, vcc, -1, v1, vcc
-; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
+; GFX908-NEXT: s_mov_b64 s[4:5], vcc
+; GFX908-NEXT: v_addc_co_u32_e64 v4, s[4:5], -1, v1, s[4:5]
; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX908-NEXT: v_mov_b32_e32 v0, v3
; GFX908-NEXT: flat_load_dword v1, v[0:1]
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: v_max_f32_e32 v2, v2, v2
@@ -1936,9 +1940,10 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fine
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
-; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
-; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
+; GFX90A-NEXT: s_mov_b64 s[4:5], vcc
+; GFX90A-NEXT: v_addc_co_u32_e64 v5, s[4:5], -1, v1, s[4:5]
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX90A-NEXT: v_mov_b32_e32 v0, v4
; GFX90A-NEXT: flat_load_dword v0, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: v_max_f32_e32 v1, v2, v2
@@ -1963,9 +1968,10 @@ define float @flat_agent_atomic_fmax_ret_f32__offset12b_neg__ftz__amdgpu_no_fine
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
-; GFX908-NEXT: v_addc_co_u32_e32 v4, vcc, -1, v1, vcc
-; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
+; GFX908-NEXT: s_mov_b64 s[4:5], vcc
+; GFX908-NEXT: v_addc_co_u32_e64 v4, s[4:5], -1, v1, s[4:5]
; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX908-NEXT: v_mov_b32_e32 v0, v3
; GFX908-NEXT: flat_load_dword v0, v[0:1]
; GFX908-NEXT: s_mov_b64 s[4:5], 0
; GFX908-NEXT: v_max_f32_e32 v1, v2, v2
@@ -2390,9 +2396,10 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_fin
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, 0xfffff800, v0
-; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v1, vcc
-; GFX90A-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
+; GFX90A-NEXT: s_mov_b64 s[4:5], vcc
+; GFX90A-NEXT: v_addc_co_u32_e64 v5, s[4:5], -1, v1, s[4:5]
; GFX90A-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX90A-NEXT: v_mov_b32_e32 v0, v4
; GFX90A-NEXT: flat_load_dword v1, v[0:1]
; GFX90A-NEXT: s_mov_b64 s[4:5], 0
; GFX90A-NEXT: v_max_f32_e32 v2, v2, v2
@@ -2417,9 +2424,10 @@ define void @flat_agent_atomic_fmax_noret_f32__offset12b_neg__ftz__amdgpu_no_fin
; GFX908: ; %bb.0:
; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX908-NEXT: v_add_co_u32_e32 v3, vcc, 0xfffff800, v0
-; GFX908-NEXT: v_addc_co_u32_e32 v4, vcc, -1, v1, vcc
-; GFX908-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
+; GFX908-NEXT: s_mov_b64 s[4:5], vcc
+; GFX908-NEXT: v_addc_co_u32_e64 v4, s[4:5], -1, v1, s[4:5]
; GFX908-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX908-NEXT: v_mov_b32_e32 v0, v3
; GFX908-NEXT: flat_l...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/154870
More information about the llvm-commits mailing list