[llvm] Rework i1->i32 zext/anyext translation (PR #114721)
via llvm-commits
llvm-commits at lists.llvm.org
Sun Nov 3 15:10:17 PST 2024
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-amdgpu
Author: None (doraeneko)
<details>
<summary>Changes</summary>
to distinguish uniform and divergent cases (#87938), similar to the sext_inreg handling.
---
Patch is 1.38 MiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/114721.diff
76 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/SIInstructions.td (+22-6)
- (modified) llvm/test/CodeGen/AMDGPU/add.ll (+77-45)
- (modified) llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll (+88-64)
- (modified) llvm/test/CodeGen/AMDGPU/anyext.ll (+11-11)
- (modified) llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll (+56-26)
- (modified) llvm/test/CodeGen/AMDGPU/blender-no-live-segment-at-def-implicit-def.ll (+11-10)
- (modified) llvm/test/CodeGen/AMDGPU/branch-folding-implicit-def-subreg.ll (+12-6)
- (modified) llvm/test/CodeGen/AMDGPU/branch-relaxation.ll (+217-259)
- (modified) llvm/test/CodeGen/AMDGPU/carryout-selection.ll (+242-147)
- (modified) llvm/test/CodeGen/AMDGPU/combine-add-zext-xor.ll (+142-86)
- (modified) llvm/test/CodeGen/AMDGPU/combine_andor_with_cmps.ll (+16-24)
- (modified) llvm/test/CodeGen/AMDGPU/ctpop16.ll (+26-14)
- (modified) llvm/test/CodeGen/AMDGPU/ctpop64.ll (+26-16)
- (modified) llvm/test/CodeGen/AMDGPU/dagcombine-fma-crash.ll (+6-6)
- (modified) llvm/test/CodeGen/AMDGPU/dagcombine-lshr-and-cmp.ll (+13-9)
- (modified) llvm/test/CodeGen/AMDGPU/divergence-driven-trunc-to-i1.ll (+15-9)
- (modified) llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll (+60-49)
- (modified) llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll (+25-21)
- (modified) llvm/test/CodeGen/AMDGPU/extract-subvector-16bit.ll (+246-114)
- (modified) llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll (+5-9)
- (modified) llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll (+90-112)
- (modified) llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll (+3914-2790)
- (modified) llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll (+1086-792)
- (modified) llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll (+16-10)
- (modified) llvm/test/CodeGen/AMDGPU/fp-classify.ll (+171-99)
- (modified) llvm/test/CodeGen/AMDGPU/fp_to_sint.ll (+14-6)
- (modified) llvm/test/CodeGen/AMDGPU/fp_to_uint.ll (+14-6)
- (modified) llvm/test/CodeGen/AMDGPU/fptosi.f16.ll (+16-8)
- (modified) llvm/test/CodeGen/AMDGPU/fptoui.f16.ll (+14-6)
- (modified) llvm/test/CodeGen/AMDGPU/fptrunc.ll (+133-153)
- (modified) llvm/test/CodeGen/AMDGPU/fsqrt.f64.ll (+548-8)
- (modified) llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll (+16-16)
- (modified) llvm/test/CodeGen/AMDGPU/indirect-addressing-si.ll (+260-191)
- (modified) llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll (+36-34)
- (modified) llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll (+23-14)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll (+9-9)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll (+9-9)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.set.inactive.ll (+10-9)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.set.rounding.ll (+40-116)
- (modified) llvm/test/CodeGen/AMDGPU/long-branch-reserve-register.ll (+20-31)
- (modified) llvm/test/CodeGen/AMDGPU/memcpy-crash-issue63986.ll (+26-6)
- (modified) llvm/test/CodeGen/AMDGPU/min.ll (+31-37)
- (modified) llvm/test/CodeGen/AMDGPU/mul.ll (+123-77)
- (modified) llvm/test/CodeGen/AMDGPU/optimize-negated-cond.ll (+59-47)
- (modified) llvm/test/CodeGen/AMDGPU/or.ll (+12-4)
- (modified) llvm/test/CodeGen/AMDGPU/private-memory-atomics.ll (+4-4)
- (modified) llvm/test/CodeGen/AMDGPU/rsq.f64.ll (+40-20)
- (modified) llvm/test/CodeGen/AMDGPU/saddo.ll (+85-48)
- (modified) llvm/test/CodeGen/AMDGPU/sdiv64.ll (+32-20)
- (added) llvm/test/CodeGen/AMDGPU/select-i32-zext.ll (+42)
- (modified) llvm/test/CodeGen/AMDGPU/select-undef.ll (+18-18)
- (modified) llvm/test/CodeGen/AMDGPU/setcc-opt.ll (+20-36)
- (modified) llvm/test/CodeGen/AMDGPU/setcc64.ll (+2-2)
- (modified) llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll (+26-14)
- (modified) llvm/test/CodeGen/AMDGPU/sgpr-copy.ll (+6-7)
- (modified) llvm/test/CodeGen/AMDGPU/si-unify-exit-return-unreachable.ll (+88-86)
- (modified) llvm/test/CodeGen/AMDGPU/sminmax.v2i16.ll (+46-49)
- (modified) llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr-update-regscavenger.ll (+5-6)
- (modified) llvm/test/CodeGen/AMDGPU/srem.ll (+267-141)
- (modified) llvm/test/CodeGen/AMDGPU/srem64.ll (+41-25)
- (modified) llvm/test/CodeGen/AMDGPU/sub-zext-cc-zext-cc.ll (+6-3)
- (modified) llvm/test/CodeGen/AMDGPU/trunc-cmp-constant.ll (+4-4)
- (modified) llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll (+316-284)
- (modified) llvm/test/CodeGen/AMDGPU/uaddo.ll (+121-62)
- (modified) llvm/test/CodeGen/AMDGPU/udiv64.ll (+42-26)
- (modified) llvm/test/CodeGen/AMDGPU/undefined-subreg-liverange.ll (+7-6)
- (modified) llvm/test/CodeGen/AMDGPU/uniform-cfg.ll (+4-4)
- (modified) llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll (+33-27)
- (modified) llvm/test/CodeGen/AMDGPU/urem64.ll (+31-19)
- (modified) llvm/test/CodeGen/AMDGPU/usubo.ll (+121-62)
- (modified) llvm/test/CodeGen/AMDGPU/v_cmp_gfx11.ll (+8-8)
- (modified) llvm/test/CodeGen/AMDGPU/vector-alloca-bitcast.ll (+25-19)
- (modified) llvm/test/CodeGen/AMDGPU/wave32.ll (+57-27)
- (modified) llvm/test/CodeGen/AMDGPU/wqm.ll (+48-20)
- (modified) llvm/test/CodeGen/AMDGPU/xor.ll (+24-16)
- (modified) llvm/test/CodeGen/AMDGPU/zero_extend.ll (+4-4)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index c8a46217190a1d..4d0fdc50a37070 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -2343,14 +2343,30 @@ def : GCNPat <
/*src1mod*/(i32 0), /*src1*/(i32 -1), i1:$src0)
>;
-class Ext32Pat <SDNode ext> : GCNPat <
- (i32 (ext i1:$src0)),
- (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
- /*src1mod*/(i32 0), /*src1*/(i32 1), i1:$src0)
+
+class UniformExt32<SDNode ext> : PatFrag<
+ (ops node:$src),
+ (i32 (ext $src)),
+ [{ return !N->isDivergent(); }]>;
+
+class DivergentExt32<SDNode ext> : PatFrag<
+ (ops node:$src),
+ (i32 (ext $src))>;
+
+class UniformExt32Pat<SDNode ext> : GCNPat<
+ (UniformExt32<ext> SCC),
+ (S_CSELECT_B32 (i32 1), (i32 0))
>;
-def : Ext32Pat <zext>;
-def : Ext32Pat <anyext>;
+class DivergentExt32Pat<SDNode ext> : GCNPat<
+ (DivergentExt32<ext> i1:$src),
+ (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
+ /*src1mod*/(i32 0), /*src1*/(i32 1), i1:$src)>;
+
+def : UniformExt32Pat<zext>;
+def : UniformExt32Pat<anyext>;
+def : DivergentExt32Pat<zext>;
+def : DivergentExt32Pat<anyext>;
// The multiplication scales from [0,1) to the unsigned integer range,
// rounding down a bit to avoid unwanted overflow.
diff --git a/llvm/test/CodeGen/AMDGPU/add.ll b/llvm/test/CodeGen/AMDGPU/add.ll
index 3c9d43a88a0fda..96d16ae968e1a2 100644
--- a/llvm/test/CodeGen/AMDGPU/add.ll
+++ b/llvm/test/CodeGen/AMDGPU/add.ll
@@ -1156,15 +1156,22 @@ define amdgpu_kernel void @add64_in_branch(ptr addrspace(1) %out, ptr addrspace(
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_cmp_ne_u64_e64 s[10:11], s[4:5], 0
; GFX6-NEXT: s_and_b64 vcc, exec, s[10:11]
-; GFX6-NEXT: s_cbranch_vccz .LBB9_4
+; GFX6-NEXT: s_cbranch_vccz .LBB9_2
; GFX6-NEXT: ; %bb.1: ; %else
; GFX6-NEXT: s_add_u32 s4, s4, s6
; GFX6-NEXT: s_addc_u32 s5, s5, s7
-; GFX6-NEXT: s_andn2_b64 vcc, exec, s[8:9]
-; GFX6-NEXT: s_cbranch_vccnz .LBB9_3
-; GFX6-NEXT: .LBB9_2: ; %if
+; GFX6-NEXT: s_branch .LBB9_3
+; GFX6-NEXT: .LBB9_2:
+; GFX6-NEXT: s_mov_b64 s[8:9], -1
+; GFX6-NEXT: ; implicit-def: $sgpr4_sgpr5
+; GFX6-NEXT: .LBB9_3: ; %Flow
+; GFX6-NEXT: s_and_b64 s[6:7], s[8:9], exec
+; GFX6-NEXT: s_cselect_b32 s6, 1, 0
+; GFX6-NEXT: s_cmp_lg_u32 s6, 1
+; GFX6-NEXT: s_cbranch_scc1 .LBB9_5
+; GFX6-NEXT: ; %bb.4: ; %if
; GFX6-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
-; GFX6-NEXT: .LBB9_3: ; %endif
+; GFX6-NEXT: .LBB9_5: ; %endif
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v0, s4
; GFX6-NEXT: s_mov_b32 s3, 0xf000
@@ -1172,9 +1179,6 @@ define amdgpu_kernel void @add64_in_branch(ptr addrspace(1) %out, ptr addrspace(
; GFX6-NEXT: v_mov_b32_e32 v1, s5
; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX6-NEXT: s_endpgm
-; GFX6-NEXT: .LBB9_4:
-; GFX6-NEXT: ; implicit-def: $sgpr4_sgpr5
-; GFX6-NEXT: s_branch .LBB9_2
;
; GFX8-LABEL: add64_in_branch:
; GFX8: ; %bb.0: ; %entry
@@ -1182,15 +1186,22 @@ define amdgpu_kernel void @add64_in_branch(ptr addrspace(1) %out, ptr addrspace(
; GFX8-NEXT: s_mov_b64 s[8:9], 0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_cmp_lg_u64 s[4:5], 0
-; GFX8-NEXT: s_cbranch_scc0 .LBB9_4
+; GFX8-NEXT: s_cbranch_scc0 .LBB9_2
; GFX8-NEXT: ; %bb.1: ; %else
; GFX8-NEXT: s_add_u32 s4, s4, s6
; GFX8-NEXT: s_addc_u32 s5, s5, s7
-; GFX8-NEXT: s_andn2_b64 vcc, exec, s[8:9]
-; GFX8-NEXT: s_cbranch_vccnz .LBB9_3
-; GFX8-NEXT: .LBB9_2: ; %if
+; GFX8-NEXT: s_branch .LBB9_3
+; GFX8-NEXT: .LBB9_2:
+; GFX8-NEXT: s_mov_b64 s[8:9], -1
+; GFX8-NEXT: ; implicit-def: $sgpr4_sgpr5
+; GFX8-NEXT: .LBB9_3: ; %Flow
+; GFX8-NEXT: s_and_b64 s[6:7], s[8:9], exec
+; GFX8-NEXT: s_cselect_b32 s6, 1, 0
+; GFX8-NEXT: s_cmp_lg_u32 s6, 1
+; GFX8-NEXT: s_cbranch_scc1 .LBB9_5
+; GFX8-NEXT: ; %bb.4: ; %if
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0
-; GFX8-NEXT: .LBB9_3: ; %endif
+; GFX8-NEXT: .LBB9_5: ; %endif
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v2, s4
; GFX8-NEXT: v_mov_b32_e32 v0, s0
@@ -1198,9 +1209,6 @@ define amdgpu_kernel void @add64_in_branch(ptr addrspace(1) %out, ptr addrspace(
; GFX8-NEXT: v_mov_b32_e32 v3, s5
; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
; GFX8-NEXT: s_endpgm
-; GFX8-NEXT: .LBB9_4:
-; GFX8-NEXT: ; implicit-def: $sgpr4_sgpr5
-; GFX8-NEXT: s_branch .LBB9_2
;
; GFX9-LABEL: add64_in_branch:
; GFX9: ; %bb.0: ; %entry
@@ -1208,90 +1216,114 @@ define amdgpu_kernel void @add64_in_branch(ptr addrspace(1) %out, ptr addrspace(
; GFX9-NEXT: s_mov_b64 s[2:3], 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_cmp_lg_u64 s[8:9], 0
-; GFX9-NEXT: s_cbranch_scc0 .LBB9_4
+; GFX9-NEXT: s_cbranch_scc0 .LBB9_2
; GFX9-NEXT: ; %bb.1: ; %else
; GFX9-NEXT: s_add_u32 s0, s8, s10
; GFX9-NEXT: s_addc_u32 s1, s9, s11
-; GFX9-NEXT: s_andn2_b64 vcc, exec, s[2:3]
-; GFX9-NEXT: s_cbranch_vccnz .LBB9_3
-; GFX9-NEXT: .LBB9_2: ; %if
+; GFX9-NEXT: s_branch .LBB9_3
+; GFX9-NEXT: .LBB9_2:
+; GFX9-NEXT: s_mov_b64 s[2:3], -1
+; GFX9-NEXT: ; implicit-def: $sgpr0_sgpr1
+; GFX9-NEXT: .LBB9_3: ; %Flow
+; GFX9-NEXT: s_and_b64 s[2:3], s[2:3], exec
+; GFX9-NEXT: s_cselect_b32 s2, 1, 0
+; GFX9-NEXT: s_cmp_lg_u32 s2, 1
+; GFX9-NEXT: s_cbranch_scc1 .LBB9_5
+; GFX9-NEXT: ; %bb.4: ; %if
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX9-NEXT: .LBB9_3: ; %endif
+; GFX9-NEXT: .LBB9_5: ; %endif
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX9-NEXT: s_endpgm
-; GFX9-NEXT: .LBB9_4:
-; GFX9-NEXT: ; implicit-def: $sgpr0_sgpr1
-; GFX9-NEXT: s_branch .LBB9_2
;
; GFX10-LABEL: add64_in_branch:
; GFX10: ; %bb.0: ; %entry
; GFX10-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_cmp_lg_u64 s[8:9], 0
-; GFX10-NEXT: s_cbranch_scc0 .LBB9_4
+; GFX10-NEXT: s_cbranch_scc0 .LBB9_2
; GFX10-NEXT: ; %bb.1: ; %else
; GFX10-NEXT: s_add_u32 s0, s8, s10
; GFX10-NEXT: s_addc_u32 s1, s9, s11
-; GFX10-NEXT: s_cbranch_execnz .LBB9_3
-; GFX10-NEXT: .LBB9_2: ; %if
+; GFX10-NEXT: s_mov_b32 s2, 0
+; GFX10-NEXT: s_branch .LBB9_3
+; GFX10-NEXT: .LBB9_2:
+; GFX10-NEXT: s_mov_b32 s2, -1
+; GFX10-NEXT: ; implicit-def: $sgpr0_sgpr1
+; GFX10-NEXT: .LBB9_3: ; %Flow
+; GFX10-NEXT: s_and_b32 s2, s2, exec_lo
+; GFX10-NEXT: s_cselect_b32 s2, 1, 0
+; GFX10-NEXT: s_cmp_lg_u32 s2, 1
+; GFX10-NEXT: s_cbranch_scc1 .LBB9_5
+; GFX10-NEXT: ; %bb.4: ; %if
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX10-NEXT: .LBB9_3: ; %endif
+; GFX10-NEXT: .LBB9_5: ; %endif
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v0, s0
; GFX10-NEXT: v_mov_b32_e32 v2, 0
; GFX10-NEXT: v_mov_b32_e32 v1, s1
; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5]
; GFX10-NEXT: s_endpgm
-; GFX10-NEXT: .LBB9_4:
-; GFX10-NEXT: ; implicit-def: $sgpr0_sgpr1
-; GFX10-NEXT: s_branch .LBB9_2
;
; GFX11-LABEL: add64_in_branch:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_load_b256 s[0:7], s[2:3], 0x24
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_cmp_lg_u64 s[4:5], 0
-; GFX11-NEXT: s_cbranch_scc0 .LBB9_4
+; GFX11-NEXT: s_cbranch_scc0 .LBB9_2
; GFX11-NEXT: ; %bb.1: ; %else
; GFX11-NEXT: s_add_u32 s4, s4, s6
; GFX11-NEXT: s_addc_u32 s5, s5, s7
-; GFX11-NEXT: s_cbranch_execnz .LBB9_3
-; GFX11-NEXT: .LBB9_2: ; %if
+; GFX11-NEXT: s_mov_b32 s6, 0
+; GFX11-NEXT: s_branch .LBB9_3
+; GFX11-NEXT: .LBB9_2:
+; GFX11-NEXT: s_mov_b32 s6, -1
+; GFX11-NEXT: ; implicit-def: $sgpr4_sgpr5
+; GFX11-NEXT: .LBB9_3: ; %Flow
+; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT: s_and_b32 s6, s6, exec_lo
+; GFX11-NEXT: s_cselect_b32 s6, 1, 0
+; GFX11-NEXT: s_cmp_lg_u32 s6, 1
+; GFX11-NEXT: s_cbranch_scc1 .LBB9_5
+; GFX11-NEXT: ; %bb.4: ; %if
; GFX11-NEXT: s_load_b64 s[4:5], s[2:3], 0x0
-; GFX11-NEXT: .LBB9_3: ; %endif
+; GFX11-NEXT: .LBB9_5: ; %endif
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_mov_b32_e32 v0, s4
; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX11-NEXT: s_endpgm
-; GFX11-NEXT: .LBB9_4:
-; GFX11-NEXT: ; implicit-def: $sgpr4_sgpr5
-; GFX11-NEXT: s_branch .LBB9_2
;
; GFX12-LABEL: add64_in_branch:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_cmp_lg_u64 s[4:5], 0
-; GFX12-NEXT: s_cbranch_scc0 .LBB9_4
+; GFX12-NEXT: s_cbranch_scc0 .LBB9_2
; GFX12-NEXT: ; %bb.1: ; %else
; GFX12-NEXT: s_add_nc_u64 s[4:5], s[4:5], s[6:7]
-; GFX12-NEXT: s_cbranch_execnz .LBB9_3
-; GFX12-NEXT: .LBB9_2: ; %if
+; GFX12-NEXT: s_mov_b32 s6, 0
+; GFX12-NEXT: s_branch .LBB9_3
+; GFX12-NEXT: .LBB9_2:
+; GFX12-NEXT: s_mov_b32 s6, -1
+; GFX12-NEXT: ; implicit-def: $sgpr4_sgpr5
+; GFX12-NEXT: .LBB9_3: ; %Flow
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_and_b32 s6, s6, exec_lo
+; GFX12-NEXT: s_cselect_b32 s6, 1, 0
+; GFX12-NEXT: s_cmp_lg_u32 s6, 1
+; GFX12-NEXT: s_cbranch_scc1 .LBB9_5
+; GFX12-NEXT: ; %bb.4: ; %if
; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x0
-; GFX12-NEXT: .LBB9_3: ; %endif
+; GFX12-NEXT: .LBB9_5: ; %endif
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_mov_b32_e32 v0, s4
; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s5
; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX12-NEXT: s_endpgm
-; GFX12-NEXT: .LBB9_4:
-; GFX12-NEXT: ; implicit-def: $sgpr4_sgpr5
-; GFX12-NEXT: s_branch .LBB9_2
entry:
%0 = icmp eq i64 %a, 0
br i1 %0, label %if, label %else
diff --git a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll
index 4d26453e1a0d6d..4688c7a6879bd5 100644
--- a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll
+++ b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll
@@ -557,31 +557,31 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
; GFX908-NEXT: s_mul_hi_u32 s9, s0, s7
; GFX908-NEXT: s_mul_i32 s0, s0, s7
; GFX908-NEXT: s_add_i32 s1, s9, s1
-; GFX908-NEXT: s_lshl_b64 s[14:15], s[0:1], 5
+; GFX908-NEXT: s_lshl_b64 s[0:1], s[0:1], 5
; GFX908-NEXT: s_branch .LBB3_2
; GFX908-NEXT: .LBB3_1: ; %Flow20
; GFX908-NEXT: ; in Loop: Header=BB3_2 Depth=1
-; GFX908-NEXT: s_andn2_b64 vcc, exec, s[0:1]
-; GFX908-NEXT: s_cbranch_vccz .LBB3_12
+; GFX908-NEXT: s_and_b64 s[14:15], s[14:15], exec
+; GFX908-NEXT: s_cselect_b32 s7, 1, 0
+; GFX908-NEXT: s_cmp_lg_u32 s7, 1
+; GFX908-NEXT: s_cbranch_scc0 .LBB3_14
; GFX908-NEXT: .LBB3_2: ; %bb9
; GFX908-NEXT: ; =>This Loop Header: Depth=1
-; GFX908-NEXT: ; Child Loop BB3_5 Depth 2
+; GFX908-NEXT: ; Child Loop BB3_6 Depth 2
; GFX908-NEXT: s_mov_b64 s[16:17], -1
-; GFX908-NEXT: s_cbranch_scc0 .LBB3_10
+; GFX908-NEXT: s_cbranch_scc0 .LBB3_12
; GFX908-NEXT: ; %bb.3: ; %bb14
; GFX908-NEXT: ; in Loop: Header=BB3_2 Depth=1
; GFX908-NEXT: global_load_dwordx2 v[2:3], v[0:1], off
-; GFX908-NEXT: v_cmp_gt_i64_e64 s[0:1], s[4:5], -1
; GFX908-NEXT: s_mov_b32 s9, s8
-; GFX908-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[0:1]
; GFX908-NEXT: v_mov_b32_e32 v4, s8
-; GFX908-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v6
; GFX908-NEXT: v_mov_b32_e32 v6, s8
; GFX908-NEXT: v_mov_b32_e32 v8, s8
; GFX908-NEXT: v_mov_b32_e32 v5, s9
; GFX908-NEXT: v_mov_b32_e32 v7, s9
; GFX908-NEXT: v_mov_b32_e32 v9, s9
-; GFX908-NEXT: v_cmp_lt_i64_e64 s[16:17], s[4:5], 0
+; GFX908-NEXT: v_cmp_lt_i64_e64 s[14:15], s[4:5], 0
+; GFX908-NEXT: v_cmp_gt_i64_e64 s[16:17], s[4:5], -1
; GFX908-NEXT: v_mov_b32_e32 v11, v5
; GFX908-NEXT: s_mov_b64 s[18:19], s[10:11]
; GFX908-NEXT: v_mov_b32_e32 v10, v4
@@ -596,18 +596,22 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
; GFX908-NEXT: s_add_i32 s9, s20, s9
; GFX908-NEXT: s_mul_i32 s7, s2, s7
; GFX908-NEXT: s_add_i32 s9, s9, s21
-; GFX908-NEXT: s_branch .LBB3_5
+; GFX908-NEXT: s_branch .LBB3_6
; GFX908-NEXT: .LBB3_4: ; %bb58
-; GFX908-NEXT: ; in Loop: Header=BB3_5 Depth=2
+; GFX908-NEXT: ; in Loop: Header=BB3_6 Depth=2
; GFX908-NEXT: v_add_co_u32_sdwa v2, vcc, v2, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GFX908-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
-; GFX908-NEXT: s_add_u32 s18, s18, s14
+; GFX908-NEXT: s_add_u32 s18, s18, s0
; GFX908-NEXT: v_cmp_lt_i64_e64 s[22:23], -1, v[2:3]
-; GFX908-NEXT: s_addc_u32 s19, s19, s15
+; GFX908-NEXT: s_addc_u32 s19, s19, s1
; GFX908-NEXT: s_mov_b64 s[20:21], 0
-; GFX908-NEXT: s_andn2_b64 vcc, exec, s[22:23]
-; GFX908-NEXT: s_cbranch_vccz .LBB3_9
-; GFX908-NEXT: .LBB3_5: ; %bb16
+; GFX908-NEXT: .LBB3_5: ; %Flow18
+; GFX908-NEXT: ; in Loop: Header=BB3_6 Depth=2
+; GFX908-NEXT: s_and_b64 s[22:23], s[22:23], exec
+; GFX908-NEXT: s_cselect_b32 s22, 1, 0
+; GFX908-NEXT: s_cmp_lg_u32 s22, 1
+; GFX908-NEXT: s_cbranch_scc0 .LBB3_11
+; GFX908-NEXT: .LBB3_6: ; %bb16
; GFX908-NEXT: ; Parent Loop BB3_2 Depth=1
; GFX908-NEXT: ; => This Inner Loop Header: Depth=2
; GFX908-NEXT: s_add_u32 s20, s18, s7
@@ -622,11 +626,13 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
; GFX908-NEXT: s_waitcnt vmcnt(0)
; GFX908-NEXT: ds_read_b64 v[12:13], v19
; GFX908-NEXT: ds_read_b64 v[14:15], v0
-; GFX908-NEXT: s_and_b64 vcc, exec, s[0:1]
+; GFX908-NEXT: s_and_b64 s[20:21], s[16:17], exec
+; GFX908-NEXT: s_cselect_b32 s20, 1, 0
+; GFX908-NEXT: s_cmp_lg_u32 s20, 1
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
-; GFX908-NEXT: s_cbranch_vccnz .LBB3_7
-; GFX908-NEXT: ; %bb.6: ; %bb51
-; GFX908-NEXT: ; in Loop: Header=BB3_5 Depth=2
+; GFX908-NEXT: s_cbranch_scc1 .LBB3_8
+; GFX908-NEXT: ; %bb.7: ; %bb51
+; GFX908-NEXT: ; in Loop: Header=BB3_6 Depth=2
; GFX908-NEXT: v_cvt_f32_f16_sdwa v22, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; GFX908-NEXT: v_cvt_f32_f16_e32 v21, v21
; GFX908-NEXT: v_cvt_f32_f16_sdwa v23, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
@@ -648,31 +654,37 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
; GFX908-NEXT: v_add_f32_e32 v10, v10, v12
; GFX908-NEXT: v_add_f32_e32 v11, v11, v13
; GFX908-NEXT: s_mov_b64 s[20:21], -1
-; GFX908-NEXT: s_branch .LBB3_4
-; GFX908-NEXT: .LBB3_7: ; in Loop: Header=BB3_5 Depth=2
-; GFX908-NEXT: s_mov_b64 s[20:21], s[16:17]
-; GFX908-NEXT: s_andn2_b64 vcc, exec, s[20:21]
-; GFX908-NEXT: s_cbranch_vccz .LBB3_4
-; GFX908-NEXT: ; %bb.8: ; in Loop: Header=BB3_2 Depth=1
+; GFX908-NEXT: s_branch .LBB3_9
+; GFX908-NEXT: .LBB3_8: ; in Loop: Header=BB3_6 Depth=2
+; GFX908-NEXT: s_mov_b64 s[20:21], s[14:15]
+; GFX908-NEXT: .LBB3_9: ; %Flow
+; GFX908-NEXT: ; in Loop: Header=BB3_6 Depth=2
+; GFX908-NEXT: s_and_b64 s[22:23], s[20:21], exec
+; GFX908-NEXT: s_cselect_b32 s22, 1, 0
+; GFX908-NEXT: s_cmp_lg_u32 s22, 1
+; GFX908-NEXT: s_cbranch_scc0 .LBB3_4
+; GFX908-NEXT: ; %bb.10: ; in Loop: Header=BB3_6 Depth=2
+; GFX908-NEXT: s_mov_b64 s[22:23], -1
; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3
; GFX908-NEXT: ; implicit-def: $sgpr18_sgpr19
-; GFX908-NEXT: .LBB3_9: ; %loop.exit.guard
+; GFX908-NEXT: s_branch .LBB3_5
+; GFX908-NEXT: .LBB3_11: ; %loop.exit.guard
; GFX908-NEXT: ; in Loop: Header=BB3_2 Depth=1
; GFX908-NEXT: s_xor_b64 s[16:17], s[20:21], -1
-; GFX908-NEXT: .LBB3_10: ; %Flow19
+; GFX908-NEXT: .LBB3_12: ; %Flow19
; GFX908-NEXT: ; in Loop: Header=BB3_2 Depth=1
-; GFX908-NEXT: s_mov_b64 s[0:1], -1
+; GFX908-NEXT: s_mov_b64 s[14:15], -1
; GFX908-NEXT: s_and_b64 vcc, exec, s[16:17]
; GFX908-NEXT: s_cbranch_vccz .LBB3_1
-; GFX908-NEXT: ; %bb.11: ; %bb12
+; GFX908-NEXT: ; %bb.13: ; %bb12
; GFX908-NEXT: ; in Loop: Header=BB3_2 Depth=1
; GFX908-NEXT: s_add_u32 s4, s4, s6
; GFX908-NEXT: s_addc_u32 s5, s5, 0
; GFX908-NEXT: s_add_u32 s10, s10, s12
; GFX908-NEXT: s_addc_u32 s11, s11, s13
-; GFX908-NEXT: s_mov_b64 s[0:1], 0
+; GFX908-NEXT: s_mov_b64 s[14:15], 0
; GFX908-NEXT: s_branch .LBB3_1
-; GFX908-NEXT: .LBB3_12: ; %DummyReturnBlock
+; GFX908-NEXT: .LBB3_14: ; %DummyReturnBlock
; GFX908-NEXT: s_endpgm
;
; GFX90A-LABEL: introduced_copy_to_sgpr:
@@ -720,28 +732,28 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
; GFX90A-NEXT: s_mul_hi_u32 s9, s0, s7
; GFX90A-NEXT: s_mul_i32 s0, s0, s7
; GFX90A-NEXT: s_add_i32 s1, s9, s1
-; GFX90A-NEXT: s_lshl_b64 s[14:15], s[0:1], 5
+; GFX90A-NEXT: s_lshl_b64 s[0:1], s[0:1], 5
; GFX90A-NEXT: s_branch .LBB3_2
; GFX90A-NEXT: .LBB3_1: ; %Flow20
; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1
-; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[0:1]
-; GFX90A-NEXT: s_cbranch_vccz .LBB3_12
+; GFX90A-NEXT: s_and_b64 s[14:15], s[14:15], exec
+; GFX90A-NEXT: s_cselect_b32 s7, 1, 0
+; GFX90A-NEXT: s_cmp_lg_u32 s7, 1
+; GFX90A-NEXT: s_cbranch_scc0 .LBB3_14
; GFX90A-NEXT: .LBB3_2: ; %bb9
; GFX90A-NEXT: ; =>This Loop Header: Depth=1
-; GFX90A-NEXT: ; Child Loop BB3_5 Depth 2
+; GFX90A-NEXT: ; Child Loop BB3_6 Depth 2
; GFX90A-NEXT: s_mov_b64 s[16:17], -1
-; GFX90A-NEXT: s_cbranch_scc0 .LBB3_10
+; GFX90A-NEXT: s_cbranch_scc0 .LBB3_12
; GFX90A-NEXT: ; %bb.3: ; %bb14
; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1
; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[0:1], off
-; GFX90A-NEXT: v_cmp_gt_i64_e64 s[0:1], s[4:5], -1
; GFX90A-NEXT: s_mov_b32 s9, s8
-; GFX90A-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[0:1]
; GFX90A-NEXT: v_pk_mov_b32 v[6:7], s[8:9], s[8:9] op_sel:[0,1]
-; GFX90A-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v8
; GFX90A-NEXT: v_pk_mov_b32 v[8:9], s[8:9], s[8:9] op_sel:[0,1]
; GFX90A-NEXT: v_pk_mov_b32 v[10:11], s[8:9], s[8:9] op_sel:[0,1]
-; GFX90A-NEXT: v_cmp_lt_i64_e64 s[16:17], s[4:5], 0
+; GFX90A-NEXT: v_cmp_lt_i64_e64 s[14:15], s[4:5], 0
+; GFX90A-NEXT: v_cmp_gt_i64_e64 s[16:17], s[4:5], -1
; GFX90A-NEXT: s_mov_b64 s[18:19], s[10:11]
; GFX90A-NEXT: v_pk_mov_b32 v[12:13], v[6:7], v[6:7] op_sel:[0,1]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -755,18 +767,22 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
; GFX90A-NEXT: s_add_i32 s9, s20, s9
; GFX90A-NEXT: s_mul_i32 s7, s2, s7
; GFX90A-NEXT: s_add_i32 s9, s9, s21
-; GFX90A-NEXT: s_branch .LBB3_5
+; GFX90A-NEXT: s_branch .LBB3_6
; GFX90A-NEXT: .LBB3_4: ; %bb58
-; GFX90A-NEXT: ; in Loop: Header=BB3_5 Depth=2
+; GFX90A-NEXT: ; in Loop: Header=BB3_6 Depth=2
; GFX90A-NEXT: v_add_co_u32_sdwa v4, vcc, v4, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
-; GFX90A-NEXT: s_add_u32 s18, s18, s14
-; GFX90A-NEXT: s_addc_u32 s19, s19, s15
+; GFX90A-NEXT: s_add_u32 s18, s18, s0
+; GFX90A-NEXT: s_addc_u32 s19, s19, s1
; GFX90A-NEXT: v_cmp_lt_i64_e64 s[22:23], -1, v[4:5]
; GFX90A-NEXT: s_mov_b64 s[20:21], 0
-; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[22:23]
-; GFX90A-NEXT: s_cbranch_vccz .LBB3_9
-; GFX90A-NEXT: .LBB3_5: ; %bb16
+; GFX90A-NEXT: .LBB3_5: ; %Flow18
+; GFX90A-NEXT: ; in Loop: Header=BB3_6 Depth=2
+; GFX90A-NEXT: s_and_b64 s[22:23], s[22:23], exec
+; GFX90A-NEXT: s_cselect_b32 s22, 1, 0
+; GFX90A-NEXT: s_cmp_lg_u32 s22, 1
+; GFX90A-NEXT: s_cbranch_scc0 .LBB3_11
+; GFX90A-NEXT: .LBB3_6: ; %bb16
; GFX90A-NEXT: ; Parent Loop BB3_2 Depth=1
; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2
; GFX90A-NEXT: s_add_u32 s20, s18, s7
@@ -781,12 +797,14 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/114721
More information about the llvm-commits
mailing list