[llvm-branch-commits] [llvm] AMDGPU: Migrate more tests away from undef (PR #131314)

via llvm-branch-commits llvm-branch-commits at lists.llvm.org
Fri Mar 14 04:31:23 PDT 2025


llvmbot wrote:


<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-llvm-globalisel

Author: Matt Arsenault (arsenm)

<details>
<summary>Changes</summary>

andorbitset.ll is interesting since it directly depends on the
difference between poison and undef. Not sure it's useful to keep
the version using poison, I assume none of this code makes it to
codegen.

si-spill-cf.ll was also a nasty case, which I doubt has been reproducing
its original issue for a very long time. I had to reclaim an older version,
replace some of the poison uses, and run simplify-cfg. There's a very
slight change in the final CFG with this, but final the output is approximately
the same as it used to be.

---

Patch is 119.89 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/131314.diff


26 Files Affected:

- (modified) llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll (+174-169) 
- (modified) llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll (+1-1) 
- (modified) llvm/test/CodeGen/AMDGPU/andorbitset.ll (+31-4) 
- (modified) llvm/test/CodeGen/AMDGPU/cndmask-no-def-vcc.ll (+4-2) 
- (modified) llvm/test/CodeGen/AMDGPU/combine-add-zext-xor.ll (+142-84) 
- (modified) llvm/test/CodeGen/AMDGPU/fold-fabs.ll (+2-1) 
- (modified) llvm/test/CodeGen/AMDGPU/i1-copy-implicit-def.ll (+19-3) 
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-invalid-addrspace.mir (+4-4) 
- (modified) llvm/test/CodeGen/AMDGPU/merge-load-store-vreg.mir (+1-1) 
- (modified) llvm/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll (+2-2) 
- (modified) llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll (+4-4) 
- (modified) llvm/test/CodeGen/AMDGPU/schedule-amdgpu-trackers.ll (+4-4) 
- (modified) llvm/test/CodeGen/AMDGPU/si-annotate-cf-noloop.ll (+3-3) 
- (modified) llvm/test/CodeGen/AMDGPU/si-spill-cf.ll (+128-190) 
- (modified) llvm/test/CodeGen/AMDGPU/skip-if-dead.ll (+15-6) 
- (modified) llvm/test/CodeGen/AMDGPU/soft-clause-exceeds-register-budget.ll (+4-4) 
- (modified) llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll (+22-20) 
- (modified) llvm/test/CodeGen/AMDGPU/undefined-subreg-liverange.ll (+7-4) 
- (modified) llvm/test/CodeGen/AMDGPU/uniform-cfg.ll (+11-1) 
- (modified) llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll (+2-1) 
- (modified) llvm/test/CodeGen/AMDGPU/wave32.ll (+2-2) 
- (modified) llvm/test/CodeGen/MIR/AMDGPU/custom-pseudo-source-values.ll (+1-1) 
- (modified) llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll (+1-1) 
- (modified) llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll (+1-1) 
- (modified) llvm/test/CodeGen/MIR/AMDGPU/mircanon-memoperands.mir (+6-6) 
- (modified) llvm/test/CodeGen/MIR/AMDGPU/syncscopes.mir (+6-6) 


``````````diff
diff --git a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll
index a4eab62f501ce..3160e38df5e3f 100644
--- a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll
+++ b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll
@@ -513,115 +513,117 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
 ; GFX908-LABEL: introduced_copy_to_sgpr:
 ; GFX908:       ; %bb.0: ; %bb
 ; GFX908-NEXT:    global_load_ushort v16, v[0:1], off glc
-; GFX908-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX908-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
-; GFX908-NEXT:    s_load_dword s7, s[8:9], 0x18
-; GFX908-NEXT:    s_mov_b32 s6, 0
-; GFX908-NEXT:    s_mov_b32 s9, s6
+; GFX908-NEXT:    s_load_dwordx4 s[4:7], s[8:9], 0x0
+; GFX908-NEXT:    s_load_dwordx2 s[10:11], s[8:9], 0x10
+; GFX908-NEXT:    s_load_dword s0, s[8:9], 0x18
+; GFX908-NEXT:    s_mov_b32 s12, 0
+; GFX908-NEXT:    s_mov_b32 s9, s12
 ; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX908-NEXT:    v_cvt_f32_u32_e32 v0, s3
-; GFX908-NEXT:    s_sub_i32 s8, 0, s3
-; GFX908-NEXT:    v_cvt_f32_f16_e32 v17, s7
+; GFX908-NEXT:    v_cvt_f32_u32_e32 v0, s7
+; GFX908-NEXT:    s_sub_i32 s1, 0, s7
+; GFX908-NEXT:    v_cvt_f32_f16_e32 v17, s0
 ; GFX908-NEXT:    v_mov_b32_e32 v19, 0
 ; GFX908-NEXT:    v_rcp_iflag_f32_e32 v2, v0
 ; GFX908-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX908-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX908-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
 ; GFX908-NEXT:    v_cvt_u32_f32_e32 v2, v2
-; GFX908-NEXT:    v_readfirstlane_b32 s10, v2
-; GFX908-NEXT:    s_mul_i32 s8, s8, s10
-; GFX908-NEXT:    s_mul_hi_u32 s8, s10, s8
-; GFX908-NEXT:    s_add_i32 s10, s10, s8
-; GFX908-NEXT:    s_mul_hi_u32 s8, s2, s10
-; GFX908-NEXT:    s_mul_i32 s10, s8, s3
-; GFX908-NEXT:    s_sub_i32 s2, s2, s10
-; GFX908-NEXT:    s_add_i32 s11, s8, 1
-; GFX908-NEXT:    s_sub_i32 s10, s2, s3
-; GFX908-NEXT:    s_cmp_ge_u32 s2, s3
-; GFX908-NEXT:    s_cselect_b32 s8, s11, s8
-; GFX908-NEXT:    s_cselect_b32 s2, s10, s2
-; GFX908-NEXT:    s_add_i32 s10, s8, 1
-; GFX908-NEXT:    s_cmp_ge_u32 s2, s3
-; GFX908-NEXT:    s_cselect_b32 s8, s10, s8
-; GFX908-NEXT:    s_lshr_b32 s7, s7, 16
-; GFX908-NEXT:    v_cvt_f32_f16_e32 v18, s7
-; GFX908-NEXT:    s_lshl_b64 s[2:3], s[0:1], 5
-; GFX908-NEXT:    s_lshl_b64 s[12:13], s[8:9], 5
-; GFX908-NEXT:    s_lshl_b64 s[10:11], s[4:5], 5
-; GFX908-NEXT:    s_or_b32 s10, s10, 28
+; GFX908-NEXT:    v_readfirstlane_b32 s2, v2
+; GFX908-NEXT:    s_mul_i32 s1, s1, s2
+; GFX908-NEXT:    s_mul_hi_u32 s1, s2, s1
+; GFX908-NEXT:    s_add_i32 s2, s2, s1
+; GFX908-NEXT:    s_mul_hi_u32 s1, s6, s2
+; GFX908-NEXT:    s_mul_i32 s2, s1, s7
+; GFX908-NEXT:    s_sub_i32 s2, s6, s2
+; GFX908-NEXT:    s_add_i32 s3, s1, 1
+; GFX908-NEXT:    s_sub_i32 s6, s2, s7
+; GFX908-NEXT:    s_cmp_ge_u32 s2, s7
+; GFX908-NEXT:    s_cselect_b32 s1, s3, s1
+; GFX908-NEXT:    s_cselect_b32 s2, s6, s2
+; GFX908-NEXT:    s_add_i32 s3, s1, 1
+; GFX908-NEXT:    s_cmp_ge_u32 s2, s7
+; GFX908-NEXT:    s_cselect_b32 s8, s3, s1
+; GFX908-NEXT:    s_lshr_b32 s2, s0, 16
+; GFX908-NEXT:    v_cvt_f32_f16_e32 v18, s2
+; GFX908-NEXT:    s_lshl_b64 s[6:7], s[4:5], 5
+; GFX908-NEXT:    s_lshl_b64 s[14:15], s[10:11], 5
+; GFX908-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
+; GFX908-NEXT:    s_or_b32 s14, s14, 28
+; GFX908-NEXT:    s_lshl_b64 s[16:17], s[8:9], 5
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
-; GFX908-NEXT:    v_readfirstlane_b32 s7, v16
-; GFX908-NEXT:    s_and_b32 s7, 0xffff, s7
-; GFX908-NEXT:    s_mul_i32 s1, s1, s7
-; GFX908-NEXT:    s_mul_hi_u32 s9, s0, s7
-; GFX908-NEXT:    s_mul_i32 s0, s0, s7
-; GFX908-NEXT:    s_add_i32 s1, s9, s1
-; GFX908-NEXT:    s_lshl_b64 s[14:15], s[0:1], 5
+; GFX908-NEXT:    v_readfirstlane_b32 s2, v16
+; GFX908-NEXT:    s_and_b32 s2, 0xffff, s2
+; GFX908-NEXT:    s_mul_i32 s3, s5, s2
+; GFX908-NEXT:    s_mul_hi_u32 s5, s4, s2
+; GFX908-NEXT:    s_mul_i32 s2, s4, s2
+; GFX908-NEXT:    s_add_i32 s3, s5, s3
+; GFX908-NEXT:    s_lshl_b64 s[4:5], s[2:3], 5
 ; GFX908-NEXT:    s_branch .LBB3_2
 ; GFX908-NEXT:  .LBB3_1: ; %Flow20
 ; GFX908-NEXT:    ; in Loop: Header=BB3_2 Depth=1
-; GFX908-NEXT:    s_andn2_b64 vcc, exec, s[0:1]
+; GFX908-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
 ; GFX908-NEXT:    s_cbranch_vccz .LBB3_12
 ; GFX908-NEXT:  .LBB3_2: ; %bb9
 ; GFX908-NEXT:    ; =>This Loop Header: Depth=1
 ; GFX908-NEXT:    ; Child Loop BB3_5 Depth 2
-; GFX908-NEXT:    s_mov_b64 s[16:17], -1
-; GFX908-NEXT:    s_cbranch_scc0 .LBB3_10
+; GFX908-NEXT:    s_mov_b64 s[18:19], -1
+; GFX908-NEXT:    s_mov_b64 vcc, s[0:1]
+; GFX908-NEXT:    s_cbranch_vccz .LBB3_10
 ; GFX908-NEXT:  ; %bb.3: ; %bb14
 ; GFX908-NEXT:    ; in Loop: Header=BB3_2 Depth=1
 ; GFX908-NEXT:    global_load_dwordx2 v[2:3], v[0:1], off
-; GFX908-NEXT:    v_cmp_gt_i64_e64 s[0:1], s[4:5], -1
-; GFX908-NEXT:    s_mov_b32 s7, s6
-; GFX908-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s[0:1]
-; GFX908-NEXT:    v_mov_b32_e32 v4, s6
-; GFX908-NEXT:    v_cmp_ne_u32_e64 s[0:1], 1, v6
-; GFX908-NEXT:    v_mov_b32_e32 v6, s6
-; GFX908-NEXT:    v_mov_b32_e32 v9, s7
-; GFX908-NEXT:    v_mov_b32_e32 v5, s7
-; GFX908-NEXT:    v_mov_b32_e32 v7, s7
-; GFX908-NEXT:    v_mov_b32_e32 v8, s6
-; GFX908-NEXT:    v_cmp_lt_i64_e64 s[16:17], s[4:5], 0
+; GFX908-NEXT:    v_cmp_gt_i64_e64 s[2:3], s[10:11], -1
+; GFX908-NEXT:    s_mov_b32 s13, s12
+; GFX908-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s[2:3]
+; GFX908-NEXT:    v_mov_b32_e32 v4, s12
+; GFX908-NEXT:    v_cmp_ne_u32_e64 s[2:3], 1, v6
+; GFX908-NEXT:    v_mov_b32_e32 v6, s12
+; GFX908-NEXT:    v_mov_b32_e32 v8, s12
+; GFX908-NEXT:    v_mov_b32_e32 v5, s13
+; GFX908-NEXT:    v_mov_b32_e32 v7, s13
+; GFX908-NEXT:    v_mov_b32_e32 v9, s13
+; GFX908-NEXT:    v_cmp_lt_i64_e64 s[18:19], s[10:11], 0
 ; GFX908-NEXT:    v_mov_b32_e32 v11, v5
-; GFX908-NEXT:    s_mov_b64 s[18:19], s[10:11]
+; GFX908-NEXT:    s_mov_b64 s[20:21], s[14:15]
 ; GFX908-NEXT:    v_mov_b32_e32 v10, v4
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
-; GFX908-NEXT:    v_readfirstlane_b32 s7, v2
-; GFX908-NEXT:    v_readfirstlane_b32 s9, v3
-; GFX908-NEXT:    s_add_u32 s7, s7, 1
-; GFX908-NEXT:    s_addc_u32 s9, s9, 0
-; GFX908-NEXT:    s_mul_hi_u32 s20, s2, s7
-; GFX908-NEXT:    s_mul_i32 s9, s2, s9
-; GFX908-NEXT:    s_mul_i32 s21, s3, s7
-; GFX908-NEXT:    s_add_i32 s9, s20, s9
-; GFX908-NEXT:    s_mul_i32 s7, s2, s7
-; GFX908-NEXT:    s_add_i32 s9, s9, s21
+; GFX908-NEXT:    v_readfirstlane_b32 s9, v2
+; GFX908-NEXT:    v_readfirstlane_b32 s13, v3
+; GFX908-NEXT:    s_add_u32 s9, s9, 1
+; GFX908-NEXT:    s_addc_u32 s13, s13, 0
+; GFX908-NEXT:    s_mul_hi_u32 s22, s6, s9
+; GFX908-NEXT:    s_mul_i32 s13, s6, s13
+; GFX908-NEXT:    s_mul_i32 s23, s7, s9
+; GFX908-NEXT:    s_add_i32 s13, s22, s13
+; GFX908-NEXT:    s_mul_i32 s9, s6, s9
+; GFX908-NEXT:    s_add_i32 s13, s13, s23
 ; GFX908-NEXT:    s_branch .LBB3_5
 ; GFX908-NEXT:  .LBB3_4: ; %bb58
 ; GFX908-NEXT:    ; in Loop: Header=BB3_5 Depth=2
 ; GFX908-NEXT:    v_add_co_u32_sdwa v2, vcc, v2, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; GFX908-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
-; GFX908-NEXT:    s_add_u32 s18, s18, s14
-; GFX908-NEXT:    v_cmp_lt_i64_e64 s[22:23], -1, v[2:3]
-; GFX908-NEXT:    s_addc_u32 s19, s19, s15
-; GFX908-NEXT:    s_mov_b64 s[20:21], 0
-; GFX908-NEXT:    s_andn2_b64 vcc, exec, s[22:23]
+; GFX908-NEXT:    s_add_u32 s20, s20, s4
+; GFX908-NEXT:    v_cmp_lt_i64_e64 s[24:25], -1, v[2:3]
+; GFX908-NEXT:    s_addc_u32 s21, s21, s5
+; GFX908-NEXT:    s_mov_b64 s[22:23], 0
+; GFX908-NEXT:    s_andn2_b64 vcc, exec, s[24:25]
 ; GFX908-NEXT:    s_cbranch_vccz .LBB3_9
 ; GFX908-NEXT:  .LBB3_5: ; %bb16
 ; GFX908-NEXT:    ; Parent Loop BB3_2 Depth=1
 ; GFX908-NEXT:    ; => This Inner Loop Header: Depth=2
-; GFX908-NEXT:    s_add_u32 s20, s18, s7
-; GFX908-NEXT:    s_addc_u32 s21, s19, s9
-; GFX908-NEXT:    global_load_dword v21, v19, s[20:21] offset:-12 glc
+; GFX908-NEXT:    s_add_u32 s22, s20, s9
+; GFX908-NEXT:    s_addc_u32 s23, s21, s13
+; GFX908-NEXT:    global_load_dword v21, v19, s[22:23] offset:-12 glc
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
-; GFX908-NEXT:    global_load_dword v20, v19, s[20:21] offset:-8 glc
+; GFX908-NEXT:    global_load_dword v20, v19, s[22:23] offset:-8 glc
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
-; GFX908-NEXT:    global_load_dword v12, v19, s[20:21] offset:-4 glc
+; GFX908-NEXT:    global_load_dword v12, v19, s[22:23] offset:-4 glc
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
-; GFX908-NEXT:    global_load_dword v12, v19, s[20:21] glc
+; GFX908-NEXT:    global_load_dword v12, v19, s[22:23] glc
 ; GFX908-NEXT:    s_waitcnt vmcnt(0)
 ; GFX908-NEXT:    ds_read_b64 v[12:13], v19
 ; GFX908-NEXT:    ds_read_b64 v[14:15], v0
-; GFX908-NEXT:    s_and_b64 vcc, exec, s[0:1]
+; GFX908-NEXT:    s_and_b64 vcc, exec, s[2:3]
 ; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX908-NEXT:    s_cbranch_vccnz .LBB3_7
 ; GFX908-NEXT:  ; %bb.6: ; %bb51
@@ -648,28 +650,28 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
 ; GFX908-NEXT:    v_add_f32_e32 v11, v11, v13
 ; GFX908-NEXT:    s_branch .LBB3_4
 ; GFX908-NEXT:  .LBB3_7: ; in Loop: Header=BB3_5 Depth=2
-; GFX908-NEXT:    s_mov_b64 s[20:21], s[16:17]
-; GFX908-NEXT:    s_andn2_b64 vcc, exec, s[20:21]
+; GFX908-NEXT:    s_mov_b64 s[22:23], s[18:19]
+; GFX908-NEXT:    s_andn2_b64 vcc, exec, s[22:23]
 ; GFX908-NEXT:    s_cbranch_vccz .LBB3_4
 ; GFX908-NEXT:  ; %bb.8: ; in Loop: Header=BB3_2 Depth=1
-; GFX908-NEXT:    s_mov_b64 s[20:21], -1
+; GFX908-NEXT:    s_mov_b64 s[22:23], -1
 ; GFX908-NEXT:    ; implicit-def: $vgpr2_vgpr3
-; GFX908-NEXT:    ; implicit-def: $sgpr18_sgpr19
+; GFX908-NEXT:    ; implicit-def: $sgpr20_sgpr21
 ; GFX908-NEXT:  .LBB3_9: ; %loop.exit.guard
 ; GFX908-NEXT:    ; in Loop: Header=BB3_2 Depth=1
-; GFX908-NEXT:    s_xor_b64 s[16:17], s[20:21], -1
+; GFX908-NEXT:    s_xor_b64 s[18:19], s[22:23], -1
 ; GFX908-NEXT:  .LBB3_10: ; %Flow19
 ; GFX908-NEXT:    ; in Loop: Header=BB3_2 Depth=1
-; GFX908-NEXT:    s_mov_b64 s[0:1], -1
-; GFX908-NEXT:    s_and_b64 vcc, exec, s[16:17]
+; GFX908-NEXT:    s_mov_b64 s[2:3], -1
+; GFX908-NEXT:    s_and_b64 vcc, exec, s[18:19]
 ; GFX908-NEXT:    s_cbranch_vccz .LBB3_1
 ; GFX908-NEXT:  ; %bb.11: ; %bb12
 ; GFX908-NEXT:    ; in Loop: Header=BB3_2 Depth=1
-; GFX908-NEXT:    s_add_u32 s4, s4, s8
-; GFX908-NEXT:    s_addc_u32 s5, s5, 0
-; GFX908-NEXT:    s_add_u32 s10, s10, s12
-; GFX908-NEXT:    s_addc_u32 s11, s11, s13
-; GFX908-NEXT:    s_mov_b64 s[0:1], 0
+; GFX908-NEXT:    s_add_u32 s10, s10, s8
+; GFX908-NEXT:    s_addc_u32 s11, s11, 0
+; GFX908-NEXT:    s_add_u32 s14, s14, s16
+; GFX908-NEXT:    s_addc_u32 s15, s15, s17
+; GFX908-NEXT:    s_mov_b64 s[2:3], 0
 ; GFX908-NEXT:    s_branch .LBB3_1
 ; GFX908-NEXT:  .LBB3_12: ; %DummyReturnBlock
 ; GFX908-NEXT:    s_endpgm
@@ -677,111 +679,113 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
 ; GFX90A-LABEL: introduced_copy_to_sgpr:
 ; GFX90A:       ; %bb.0: ; %bb
 ; GFX90A-NEXT:    global_load_ushort v18, v[0:1], off glc
-; GFX90A-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX90A-NEXT:    s_load_dwordx2 s[4:5], s[8:9], 0x10
-; GFX90A-NEXT:    s_load_dword s7, s[8:9], 0x18
-; GFX90A-NEXT:    s_mov_b32 s6, 0
-; GFX90A-NEXT:    s_mov_b32 s9, s6
+; GFX90A-NEXT:    s_load_dwordx4 s[4:7], s[8:9], 0x0
+; GFX90A-NEXT:    s_load_dwordx2 s[10:11], s[8:9], 0x10
+; GFX90A-NEXT:    s_load_dword s0, s[8:9], 0x18
+; GFX90A-NEXT:    s_mov_b32 s12, 0
+; GFX90A-NEXT:    s_mov_b32 s9, s12
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT:    v_cvt_f32_u32_e32 v0, s3
-; GFX90A-NEXT:    s_sub_i32 s8, 0, s3
+; GFX90A-NEXT:    v_cvt_f32_u32_e32 v0, s7
+; GFX90A-NEXT:    s_sub_i32 s1, 0, s7
 ; GFX90A-NEXT:    v_mov_b32_e32 v19, 0
 ; GFX90A-NEXT:    v_rcp_iflag_f32_e32 v2, v0
 ; GFX90A-NEXT:    v_pk_mov_b32 v[0:1], 0, 0
 ; GFX90A-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
 ; GFX90A-NEXT:    v_cvt_u32_f32_e32 v3, v2
-; GFX90A-NEXT:    v_cvt_f32_f16_e32 v2, s7
-; GFX90A-NEXT:    v_readfirstlane_b32 s10, v3
-; GFX90A-NEXT:    s_mul_i32 s8, s8, s10
-; GFX90A-NEXT:    s_mul_hi_u32 s8, s10, s8
-; GFX90A-NEXT:    s_add_i32 s10, s10, s8
-; GFX90A-NEXT:    s_mul_hi_u32 s8, s2, s10
-; GFX90A-NEXT:    s_mul_i32 s10, s8, s3
-; GFX90A-NEXT:    s_sub_i32 s2, s2, s10
-; GFX90A-NEXT:    s_add_i32 s11, s8, 1
-; GFX90A-NEXT:    s_sub_i32 s10, s2, s3
-; GFX90A-NEXT:    s_cmp_ge_u32 s2, s3
-; GFX90A-NEXT:    s_cselect_b32 s8, s11, s8
-; GFX90A-NEXT:    s_cselect_b32 s2, s10, s2
-; GFX90A-NEXT:    s_add_i32 s10, s8, 1
-; GFX90A-NEXT:    s_cmp_ge_u32 s2, s3
-; GFX90A-NEXT:    s_cselect_b32 s8, s10, s8
-; GFX90A-NEXT:    s_lshr_b32 s7, s7, 16
-; GFX90A-NEXT:    v_cvt_f32_f16_e32 v3, s7
-; GFX90A-NEXT:    s_lshl_b64 s[2:3], s[0:1], 5
-; GFX90A-NEXT:    s_lshl_b64 s[12:13], s[8:9], 5
-; GFX90A-NEXT:    s_lshl_b64 s[10:11], s[4:5], 5
-; GFX90A-NEXT:    s_or_b32 s10, s10, 28
+; GFX90A-NEXT:    v_cvt_f32_f16_e32 v2, s0
+; GFX90A-NEXT:    v_readfirstlane_b32 s2, v3
+; GFX90A-NEXT:    s_mul_i32 s1, s1, s2
+; GFX90A-NEXT:    s_mul_hi_u32 s1, s2, s1
+; GFX90A-NEXT:    s_add_i32 s2, s2, s1
+; GFX90A-NEXT:    s_mul_hi_u32 s1, s6, s2
+; GFX90A-NEXT:    s_mul_i32 s2, s1, s7
+; GFX90A-NEXT:    s_sub_i32 s2, s6, s2
+; GFX90A-NEXT:    s_add_i32 s3, s1, 1
+; GFX90A-NEXT:    s_sub_i32 s6, s2, s7
+; GFX90A-NEXT:    s_cmp_ge_u32 s2, s7
+; GFX90A-NEXT:    s_cselect_b32 s1, s3, s1
+; GFX90A-NEXT:    s_cselect_b32 s2, s6, s2
+; GFX90A-NEXT:    s_add_i32 s3, s1, 1
+; GFX90A-NEXT:    s_cmp_ge_u32 s2, s7
+; GFX90A-NEXT:    s_cselect_b32 s8, s3, s1
+; GFX90A-NEXT:    s_lshr_b32 s2, s0, 16
+; GFX90A-NEXT:    v_cvt_f32_f16_e32 v3, s2
+; GFX90A-NEXT:    s_lshl_b64 s[6:7], s[4:5], 5
+; GFX90A-NEXT:    s_lshl_b64 s[14:15], s[10:11], 5
+; GFX90A-NEXT:    s_and_b64 s[0:1], exec, s[0:1]
+; GFX90A-NEXT:    s_or_b32 s14, s14, 28
+; GFX90A-NEXT:    s_lshl_b64 s[16:17], s[8:9], 5
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    v_readfirstlane_b32 s7, v18
-; GFX90A-NEXT:    s_and_b32 s7, 0xffff, s7
-; GFX90A-NEXT:    s_mul_i32 s1, s1, s7
-; GFX90A-NEXT:    s_mul_hi_u32 s9, s0, s7
-; GFX90A-NEXT:    s_mul_i32 s0, s0, s7
-; GFX90A-NEXT:    s_add_i32 s1, s9, s1
-; GFX90A-NEXT:    s_lshl_b64 s[14:15], s[0:1], 5
+; GFX90A-NEXT:    v_readfirstlane_b32 s2, v18
+; GFX90A-NEXT:    s_and_b32 s2, 0xffff, s2
+; GFX90A-NEXT:    s_mul_i32 s3, s5, s2
+; GFX90A-NEXT:    s_mul_hi_u32 s5, s4, s2
+; GFX90A-NEXT:    s_mul_i32 s2, s4, s2
+; GFX90A-NEXT:    s_add_i32 s3, s5, s3
+; GFX90A-NEXT:    s_lshl_b64 s[4:5], s[2:3], 5
 ; GFX90A-NEXT:    s_branch .LBB3_2
 ; GFX90A-NEXT:  .LBB3_1: ; %Flow20
 ; GFX90A-NEXT:    ; in Loop: Header=BB3_2 Depth=1
-; GFX90A-NEXT:    s_andn2_b64 vcc, exec, s[0:1]
+; GFX90A-NEXT:    s_andn2_b64 vcc, exec, s[2:3]
 ; GFX90A-NEXT:    s_cbranch_vccz .LBB3_12
 ; GFX90A-NEXT:  .LBB3_2: ; %bb9
 ; GFX90A-NEXT:    ; =>This Loop Header: Depth=1
 ; GFX90A-NEXT:    ; Child Loop BB3_5 Depth 2
-; GFX90A-NEXT:    s_mov_b64 s[16:17], -1
-; GFX90A-NEXT:    s_cbranch_scc0 .LBB3_10
+; GFX90A-NEXT:    s_mov_b64 s[18:19], -1
+; GFX90A-NEXT:    s_mov_b64 vcc, s[0:1]
+; GFX90A-NEXT:    s_cbranch_vccz .LBB3_10
 ; GFX90A-NEXT:  ; %bb.3: ; %bb14
 ; GFX90A-NEXT:    ; in Loop: Header=BB3_2 Depth=1
 ; GFX90A-NEXT:    global_load_dwordx2 v[4:5], v[0:1], off
-; GFX90A-NEXT:    v_cmp_gt_i64_e64 s[0:1], s[4:5], -1
-; GFX90A-NEXT:    s_mov_b32 s7, s6
-; GFX90A-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s[0:1]
-; GFX90A-NEXT:    v_pk_mov_b32 v[6:7], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT:    v_cmp_ne_u32_e64 s[0:1], 1, v8
-; GFX90A-NEXT:    v_pk_mov_b32 v[8:9], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT:    v_pk_mov_b32 v[10:11], s[6:7], s[6:7] op_sel:[0,1]
-; GFX90A-NEXT:    v_cmp_lt_i64_e64 s[16:17], s[4:5], 0
-; GFX90A-NEXT:    s_mov_b64 s[18:19], s[10:11]
+; GFX90A-NEXT:    v_cmp_gt_i64_e64 s[2:3], s[10:11], -1
+; GFX90A-NEXT:    s_mov_b32 s13, s12
+; GFX90A-NEXT:    v_cndmask_b32_e64 v8, 0, 1, s[2:3]
+; GFX90A-NEXT:    v_pk_mov_b32 v[6:7], s[12:13], s[12:13] op_sel:[0,1]
+; GFX90A-NEXT:    v_cmp_ne_u32_e64 s[2:3], 1, v8
+; GFX90A-NEXT:    v_pk_mov_b32 v[8:9], s[12:13], s[12:13] op_sel:[0,1]
+; GFX90A-NEXT:    v_pk_mov_b32 v[10:11], s[12:13], s[12:13] op_sel:[0,1]
+; GFX90A-NEXT:    v_cmp_lt_i64_e64 s[18:19], s[10:11], 0
+; GFX90A-NEXT:    s_mov_b64 s[20:21], s[14:15]
 ; GFX90A-NEXT:    v_pk_mov_b32 v[12:13], v[6:7], v[6:7] op_sel:[0,1]
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    v_readfirstlane_b32 s7, v4
-; GFX90A-NEXT:    v_readfirstlane_b32 s9, v5
-; GFX90A-NEXT:    s_add_u32 s7, s7, 1
-; GFX90A-NEXT:    s_addc_u32 s9, s9, 0
-; GFX90A-NEXT:    s_mul_hi_u32 s20, s2, s7
-; GFX90A-NEXT:    s_mul_i32 s9, s2, s9
-; GFX90A-NEXT:    s_mul_i32 s21, s3, s7
-; GFX90A-NEXT:    s_add_i32 s9, s20, s9
-; GFX90A-NEXT:    s_mul_i32 s7, s2, s7
-; GFX90A-NEXT:    s_add_i32 s9, s9, s21
+; GFX90A-NEXT:    v_readfirstlane_b32 s9, v4
+; GFX90A-NEXT:    v_readfirstlane_b32 s13, v5
+; GFX90A-NEXT:    s_add_u32 s9, s9, 1
+; GFX90A-NEXT:    s_addc_u32 s13, s13, 0
+; GFX90A-NEXT:    s_mul_hi_u32 s22, s6, s9
+; GFX90A-NEXT:    s_mul_i32 s13, s6, s13
+; GFX90A-NEXT:    s_mul_i32 s23, s7, s9
+; GFX90A-NEXT:    s_add_i32 s13, s22, s13
+; GFX90A-NEXT:    s_mul_i32 s9, s6, s9
+; GFX90A-NEXT:    s_add_i32 s13, s13, s23
 ; GFX90A-NEXT:    s_branch .LBB3_5
 ; GFX90A-NEXT:  .LBB3_4: ; %bb58
 ; GFX90A-NEXT:    ; in Loop: Header=BB3_5 Depth=2
 ; GFX90A-NEXT:    v_add_co_u32_sdwa v4, vcc, v4, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; GFX90A-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
-; GFX90A-NEXT:    s_add_u32 s18, s18, s14
-; GFX90A-NEXT:    s_addc_u32 s19, s19, s15
-; GFX90A-NEXT:    v_cmp_lt_i64_e64 s[22:23], -1, v[4:5]
-; GFX90A-NEXT:    s_mov_b64 s[20:21], 0
-; GFX90A-NEXT:    s_andn2_b64 vcc, exec, s[22:23]
+; GFX90A-NEXT:    s_add_u32 s20, s20, s4
+; GFX90A-NEXT:    s_addc_u32 s21, s21, s5
+; GFX90A-NEXT:    v_cmp_lt_i64_e64 s[24:25], -1, v[4:5]
+; GFX90A-NEXT:    s_mov_b64 s[22:23], 0
+; GFX90A-NEXT:    s_andn2_b64 vcc, exec, s[24:25]
 ; GFX90A-NEXT:    s_cbranch_vccz .LBB3_9
 ; GFX90A-NEXT:  .LBB3_5: ; %bb16
 ; GFX90A-NEXT:    ; Parent Loop BB3_2 Depth=1
 ; GFX90A-NEXT:    ; => This Inner Loop Header: Depth=2
-; GFX90A-NEXT:    s_add_u32 s20, s18, s7
-; GFX90A-NEXT:    s_addc_u32 s21, s19, s9
-; GFX90A-NEXT:    global_load_dword v21, v19, s[20:21] offset:-12 glc
+; GFX90A-NEXT:    s_add_u32 s22, s20, s9
+; GFX90A-NEXT:    s_addc_u32 s23, s21, s13
+; GFX90A-NEXT:    global_load_dword v21, v19, s[22:23] offset:-12 glc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    global_load_dword v20, v19, s[20:21] offset:-8 glc
+; GFX90A-NEXT:    global_load_dword v20, v19, s[22:23] offset:-8 glc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    global_load_dword v14, v19, s[20:21] offset:-4 glc
+; GFX90A-NEXT:    global_load_dword v14, v19, s[22:23] offset:-4 glc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
-; GFX90A-NEXT:    global_load_dword v14, v19, s[20:21] glc
+; GFX90A-NEXT:    global_load_dword v14, v19, s[22:23] glc
 ; GFX90A-NEXT:    s_waitcnt vmcnt(0)
 ; GFX90A-NEXT:    ds_read_b64 v[14:15], v19
 ; GFX90A-NEXT:    ds_read_b64 v[16:17], v0
-; GFX90A-NEXT:    s_and_b64 vcc, exec, s[0:1]
-; GFX90A-NEXT:    ; kill: killed $sgpr20 killed $sgpr21
+; GFX90A-NEXT:    s_and_b64 vcc, exec, s[2:3]
+; GFX90A-NEXT:    ; kill: killed $sgpr22 killed $sgpr23
 ; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX90A-NEXT:    s_cbranch_vccnz .LBB3_7
 ; GFX90A-NEXT:  ; %bb.6: ; %bb51
@@ -800,28 +804,28 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
 ; GFX90A-NEXT:    v_pk_add_f32 v[12:13], v[12:13], v[14:15]
 ; GFX90A-NEXT:    s_branch .LBB3_4
 ; GFX90A-NEXT:  .LBB3_7: ; in Loop: Header=BB3_5 Depth=2
-; GFX90A-NEXT:    s_mov_b64 s[20:21], s[16:17]
-; GFX90A-NEXT:    s_andn2_b64 vcc, exec, s[20:21]
+; GFX90A-NEXT:    s_mov_b64 s[22:23], s[18:19]
+; GFX90A-NEXT:    s_andn2_b64 vcc, exec, s[22:23]
 ; GFX90A-NEXT:    s_cbranch_vccz .LBB3_4
 ; GFX90A-NEXT:  ; %bb.8: ; in Loop: Header=BB3_2 Depth=1
-; GFX90A-NEXT:    s_mov_b64 s[20:21], -1
+; GFX90A-NEXT:    s_mov_b64 s[22:23], -1
 ; GFX90A-NEXT:    ; implicit-def: $vgpr4_vgpr5
-; GFX90A-NEXT:    ; implicit-def: $sgpr18_sgpr19
+; GFX90A-NEXT:    ; implicit-def: $sgpr20_sgpr21
 ; GFX90A-NEXT:  .LBB3_9: ; %loop.exit.guard
 ; GFX90A-NEXT:    ; in Loop: Header=BB3_2 Depth=1
-; GFX90A-NEXT:    s_xor_b64 s[16:17], s[20:21], -1
+; GFX90A-N...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/131314


More information about the llvm-branch-commits mailing list