[llvm] [MachineLICM][AArch64] Hoist COPY instructions with other uses in the loop (PR #71403)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Nov 6 07:13:01 PST 2023
llvmbot wrote:
@llvm/pr-subscribers-backend-x86
Author: Rin (Rin18)
Changes
When a loop contains a loop-invariant COPY instruction whose result has other uses inside the loop, we want to hoist the COPY, which in turn allows those users to be hoisted as well.
Co-authored-by: David Green <David.Green@arm.com>
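As a reading aid, the condition the patch adds to `IsProfitableToHoist` can be restated as a small standalone predicate. This is only an illustrative sketch: the helper name `isHoistableCopy` is hypothetical, and the loop-invariance test (`IsLoopInvariantInst`, a member of `MachineLICMBase`) is assumed to have been checked by the caller. The authoritative version is the `MachineLICM.cpp` hunk in the diff below.

```cpp
#include "llvm/ADT/STLExtras.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
using namespace llvm;

// Hypothetical restatement of the new check: a loop-invariant virtual-to-
// virtual COPY is worth hoisting whenever a non-debug user of its result is
// still inside the loop, since that user may become hoistable afterwards.
static bool isHoistableCopy(const MachineInstr &MI, const MachineLoop *CurLoop,
                            const MachineRegisterInfo *MRI) {
  if (!MI.isCopy())
    return false;
  const MachineOperand &Dst = MI.getOperand(0);
  const MachineOperand &Src = MI.getOperand(1);
  // Only consider plain virtual-register-to-virtual-register copies.
  if (!Dst.isReg() || !Dst.getReg().isVirtual() || !Src.isReg() ||
      !Src.getReg().isVirtual())
    return false;
  // Profitable if the copied value still has a non-debug use inside the loop.
  return any_of(MRI->use_nodbg_instructions(Dst.getReg()),
                [&](const MachineInstr &UseMI) {
                  return CurLoop->contains(&UseMI);
                });
}
```

In the AArch64 `tbl-loops.ll` changes below, this is what lets the `fmov` that materializes the constant move out of the loop body and into the preheader, with the test's register assignments reshuffled accordingly.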
---
Patch is 534.12 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/71403.diff
25 Files Affected:
- (modified) llvm/lib/CodeGen/MachineLICM.cpp (+10)
- (modified) llvm/test/CodeGen/AArch64/tbl-loops.ll (+63-63)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll (+28-32)
- (modified) llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll (+771-917)
- (modified) llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll (+964-1064)
- (modified) llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll (+28-28)
- (modified) llvm/test/CodeGen/AMDGPU/global-load-saddr-to-vaddr.ll (+7-8)
- (modified) llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll (+335-365)
- (modified) llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll (+797-817)
- (modified) llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll (+104-116)
- (modified) llvm/test/CodeGen/AMDGPU/memcpy-crash-issue63986.ll (+12-12)
- (modified) llvm/test/CodeGen/AMDGPU/optimize-negated-cond.ll (+2-1)
- (modified) llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll (+4-2)
- (modified) llvm/test/CodeGen/AMDGPU/remat-fp64-constants.ll (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/sdiv64.ll (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/sdwa-peephole.ll (+13-13)
- (modified) llvm/test/CodeGen/AMDGPU/srem64.ll (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/udiv64.ll (+1-1)
- (modified) llvm/test/CodeGen/AMDGPU/urem64.ll (+1-1)
- (modified) llvm/test/CodeGen/X86/2007-01-13-StackPtrIndex.ll (+22-23)
- (modified) llvm/test/CodeGen/X86/2009-02-26-MachineLICMBug.ll (+1-1)
- (modified) llvm/test/CodeGen/X86/avx512-i1test.ll (+1-5)
- (modified) llvm/test/CodeGen/X86/pr38795.ll (-1)
- (modified) llvm/test/CodeGen/X86/pr63108.ll (+4-4)
- (modified) llvm/test/CodeGen/X86/vector-shift-by-select-loop.ll (+70-68)
``````````diff
diff --git a/llvm/lib/CodeGen/MachineLICM.cpp b/llvm/lib/CodeGen/MachineLICM.cpp
index e29f28ecaea0dce..5216662bb9d69db 100644
--- a/llvm/lib/CodeGen/MachineLICM.cpp
+++ b/llvm/lib/CodeGen/MachineLICM.cpp
@@ -1249,6 +1249,16 @@ bool MachineLICMBase::IsProfitableToHoist(MachineInstr &MI,
     return false;
   }
 
+  // If we have a COPY with other uses in the loop, hoist to allow the users to
+  // also be hoisted.
+  if (MI.isCopy() && IsLoopInvariantInst(MI, CurLoop) &&
+      MI.getOperand(0).isReg() && MI.getOperand(0).getReg().isVirtual() &&
+      MI.getOperand(1).isReg() && MI.getOperand(1).getReg().isVirtual() &&
+      any_of(MRI->use_nodbg_instructions(MI.getOperand(0).getReg()),
+             [&](MachineInstr &UseMI) { return CurLoop->contains(&UseMI); }))
+    return true;
+
   // High register pressure situation, only hoist if the instruction is going
   // to be remat'ed.
   if (!isTriviallyReMaterializable(MI) &&
diff --git a/llvm/test/CodeGen/AArch64/tbl-loops.ll b/llvm/test/CodeGen/AArch64/tbl-loops.ll
index b63d540fb8e0291..365fe03ab0b0844 100644
--- a/llvm/test/CodeGen/AArch64/tbl-loops.ll
+++ b/llvm/test/CodeGen/AArch64/tbl-loops.ll
@@ -52,19 +52,19 @@ define void @loop1(ptr noalias nocapture noundef writeonly %dst, ptr nocapture n
; CHECK-NEXT: b.eq .LBB0_8
; CHECK-NEXT: .LBB0_6: // %for.body.preheader1
; CHECK-NEXT: movi d0, #0000000000000000
-; CHECK-NEXT: sub w10, w2, w10
; CHECK-NEXT: mov w11, #1132396544 // =0x437f0000
+; CHECK-NEXT: sub w10, w2, w10
+; CHECK-NEXT: fmov s1, w11
; CHECK-NEXT: .LBB0_7: // %for.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: fmov s2, w11
-; CHECK-NEXT: ldr s1, [x8], #4
-; CHECK-NEXT: fcmp s1, s2
-; CHECK-NEXT: fcsel s2, s2, s1, gt
-; CHECK-NEXT: fcmp s1, #0.0
-; CHECK-NEXT: fcsel s1, s0, s2, mi
+; CHECK-NEXT: ldr s2, [x8], #4
+; CHECK-NEXT: fcmp s2, s1
+; CHECK-NEXT: fcsel s3, s1, s2, gt
+; CHECK-NEXT: fcmp s2, #0.0
+; CHECK-NEXT: fcsel s2, s0, s3, mi
; CHECK-NEXT: subs w10, w10, #1
-; CHECK-NEXT: fcvtzs w12, s1
-; CHECK-NEXT: strb w12, [x9], #1
+; CHECK-NEXT: fcvtzs w11, s2
+; CHECK-NEXT: strb w11, [x9], #1
; CHECK-NEXT: b.ne .LBB0_7
; CHECK-NEXT: .LBB0_8: // %for.cond.cleanup
; CHECK-NEXT: ret
@@ -165,25 +165,25 @@ define void @loop2(ptr noalias nocapture noundef writeonly %dst, ptr nocapture n
; CHECK-NEXT: mov x9, x0
; CHECK-NEXT: .LBB1_5: // %for.body.preheader1
; CHECK-NEXT: movi d0, #0000000000000000
-; CHECK-NEXT: sub w10, w2, w10
; CHECK-NEXT: mov w11, #1132396544 // =0x437f0000
+; CHECK-NEXT: sub w10, w2, w10
+; CHECK-NEXT: fmov s1, w11
; CHECK-NEXT: .LBB1_6: // %for.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: ldp s1, s3, [x8], #8
-; CHECK-NEXT: fmov s2, w11
-; CHECK-NEXT: fcmp s1, s2
-; CHECK-NEXT: fcsel s4, s2, s1, gt
-; CHECK-NEXT: fcmp s1, #0.0
-; CHECK-NEXT: fcsel s1, s0, s4, mi
-; CHECK-NEXT: fcmp s3, s2
-; CHECK-NEXT: fcsel s2, s2, s3, gt
+; CHECK-NEXT: ldp s2, s3, [x8], #8
+; CHECK-NEXT: fcmp s2, s1
+; CHECK-NEXT: fcsel s4, s1, s2, gt
+; CHECK-NEXT: fcmp s2, #0.0
+; CHECK-NEXT: fcsel s2, s0, s4, mi
+; CHECK-NEXT: fcmp s3, s1
+; CHECK-NEXT: fcsel s4, s1, s3, gt
; CHECK-NEXT: fcmp s3, #0.0
-; CHECK-NEXT: fcvtzs w12, s1
-; CHECK-NEXT: fcsel s2, s0, s2, mi
+; CHECK-NEXT: fcvtzs w11, s2
+; CHECK-NEXT: fcsel s3, s0, s4, mi
; CHECK-NEXT: subs w10, w10, #1
-; CHECK-NEXT: strb w12, [x9]
-; CHECK-NEXT: fcvtzs w13, s2
-; CHECK-NEXT: strb w13, [x9, #1]
+; CHECK-NEXT: strb w11, [x9]
+; CHECK-NEXT: fcvtzs w12, s3
+; CHECK-NEXT: strb w12, [x9, #1]
; CHECK-NEXT: add x9, x9, #2
; CHECK-NEXT: b.ne .LBB1_6
; CHECK-NEXT: .LBB1_7: // %for.cond.cleanup
@@ -380,33 +380,33 @@ define void @loop3(ptr noalias nocapture noundef writeonly %dst, ptr nocapture n
; CHECK-NEXT: mov x9, x0
; CHECK-NEXT: .LBB2_7: // %for.body.preheader1
; CHECK-NEXT: movi d0, #0000000000000000
-; CHECK-NEXT: sub w10, w2, w10
; CHECK-NEXT: mov w11, #1132396544 // =0x437f0000
+; CHECK-NEXT: sub w10, w2, w10
+; CHECK-NEXT: fmov s1, w11
; CHECK-NEXT: .LBB2_8: // %for.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: ldp s1, s3, [x8]
-; CHECK-NEXT: fmov s2, w11
-; CHECK-NEXT: fcmp s1, s2
-; CHECK-NEXT: fcsel s4, s2, s1, gt
-; CHECK-NEXT: fcmp s1, #0.0
-; CHECK-NEXT: fcsel s1, s0, s4, mi
-; CHECK-NEXT: fcmp s3, s2
-; CHECK-NEXT: fcsel s4, s2, s3, gt
+; CHECK-NEXT: ldp s2, s3, [x8]
+; CHECK-NEXT: fcmp s2, s1
+; CHECK-NEXT: fcsel s4, s1, s2, gt
+; CHECK-NEXT: fcmp s2, #0.0
+; CHECK-NEXT: fcsel s2, s0, s4, mi
+; CHECK-NEXT: fcmp s3, s1
+; CHECK-NEXT: fcsel s4, s1, s3, gt
; CHECK-NEXT: fcmp s3, #0.0
; CHECK-NEXT: ldr s3, [x8, #8]
-; CHECK-NEXT: fcvtzs w12, s1
+; CHECK-NEXT: fcvtzs w11, s2
; CHECK-NEXT: add x8, x8, #12
; CHECK-NEXT: fcsel s4, s0, s4, mi
-; CHECK-NEXT: fcmp s3, s2
-; CHECK-NEXT: strb w12, [x9]
-; CHECK-NEXT: fcsel s2, s2, s3, gt
+; CHECK-NEXT: fcmp s3, s1
+; CHECK-NEXT: strb w11, [x9]
+; CHECK-NEXT: fcsel s5, s1, s3, gt
; CHECK-NEXT: fcmp s3, #0.0
-; CHECK-NEXT: fcvtzs w13, s4
-; CHECK-NEXT: fcsel s2, s0, s2, mi
+; CHECK-NEXT: fcvtzs w12, s4
+; CHECK-NEXT: fcsel s3, s0, s5, mi
; CHECK-NEXT: subs w10, w10, #1
-; CHECK-NEXT: strb w13, [x9, #1]
-; CHECK-NEXT: fcvtzs w14, s2
-; CHECK-NEXT: strb w14, [x9, #2]
+; CHECK-NEXT: strb w12, [x9, #1]
+; CHECK-NEXT: fcvtzs w13, s3
+; CHECK-NEXT: strb w13, [x9, #2]
; CHECK-NEXT: add x9, x9, #3
; CHECK-NEXT: b.ne .LBB2_8
; CHECK-NEXT: .LBB2_9: // %for.cond.cleanup
@@ -549,39 +549,39 @@ define void @loop4(ptr noalias nocapture noundef writeonly %dst, ptr nocapture n
; CHECK-NEXT: mov x9, x0
; CHECK-NEXT: .LBB3_5: // %for.body.preheader1
; CHECK-NEXT: movi d0, #0000000000000000
-; CHECK-NEXT: sub w10, w2, w10
; CHECK-NEXT: mov w11, #1132396544 // =0x437f0000
+; CHECK-NEXT: sub w10, w2, w10
+; CHECK-NEXT: fmov s1, w11
; CHECK-NEXT: .LBB3_6: // %for.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: ldp s1, s3, [x8]
-; CHECK-NEXT: fmov s2, w11
-; CHECK-NEXT: fcmp s1, s2
-; CHECK-NEXT: fcsel s4, s2, s1, gt
-; CHECK-NEXT: fcmp s1, #0.0
-; CHECK-NEXT: fcsel s1, s0, s4, mi
-; CHECK-NEXT: fcmp s3, s2
-; CHECK-NEXT: fcsel s4, s2, s3, gt
+; CHECK-NEXT: ldp s2, s3, [x8]
+; CHECK-NEXT: fcmp s2, s1
+; CHECK-NEXT: fcsel s4, s1, s2, gt
+; CHECK-NEXT: fcmp s2, #0.0
+; CHECK-NEXT: fcsel s2, s0, s4, mi
+; CHECK-NEXT: fcmp s3, s1
+; CHECK-NEXT: fcsel s4, s1, s3, gt
; CHECK-NEXT: fcmp s3, #0.0
; CHECK-NEXT: ldp s3, s5, [x8, #8]
-; CHECK-NEXT: fcvtzs w12, s1
+; CHECK-NEXT: fcvtzs w11, s2
; CHECK-NEXT: add x8, x8, #16
; CHECK-NEXT: fcsel s4, s0, s4, mi
-; CHECK-NEXT: fcmp s3, s2
-; CHECK-NEXT: strb w12, [x9]
-; CHECK-NEXT: fcsel s6, s2, s3, gt
+; CHECK-NEXT: fcmp s3, s1
+; CHECK-NEXT: strb w11, [x9]
+; CHECK-NEXT: fcsel s6, s1, s3, gt
; CHECK-NEXT: fcmp s3, #0.0
-; CHECK-NEXT: fcvtzs w13, s4
+; CHECK-NEXT: fcvtzs w12, s4
; CHECK-NEXT: fcsel s3, s0, s6, mi
-; CHECK-NEXT: fcmp s5, s2
-; CHECK-NEXT: strb w13, [x9, #1]
-; CHECK-NEXT: fcsel s2, s2, s5, gt
+; CHECK-NEXT: fcmp s5, s1
+; CHECK-NEXT: strb w12, [x9, #1]
+; CHECK-NEXT: fcsel s6, s1, s5, gt
; CHECK-NEXT: fcmp s5, #0.0
-; CHECK-NEXT: fcvtzs w14, s3
-; CHECK-NEXT: fcsel s2, s0, s2, mi
+; CHECK-NEXT: fcvtzs w13, s3
+; CHECK-NEXT: fcsel s5, s0, s6, mi
; CHECK-NEXT: subs w10, w10, #1
-; CHECK-NEXT: strb w14, [x9, #2]
-; CHECK-NEXT: fcvtzs w15, s2
-; CHECK-NEXT: strb w15, [x9, #3]
+; CHECK-NEXT: strb w13, [x9, #2]
+; CHECK-NEXT: fcvtzs w14, s5
+; CHECK-NEXT: strb w14, [x9, #3]
; CHECK-NEXT: add x9, x9, #4
; CHECK-NEXT: b.ne .LBB3_6
; CHECK-NEXT: .LBB3_7: // %for.cond.cleanup
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll
index c6ea046f95a9199..53b2336180c6617 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll
@@ -1447,15 +1447,14 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat(ptr %ptr) #1 {
; GFX90A-LABEL: flat_atomic_fadd_f64_noret_pat:
; GFX90A: ; %bb.0: ; %main_body
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX90A-NEXT: s_mov_b64 s[2:3], 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
+; GFX90A-NEXT: s_mov_b64 s[0:1], 0
; GFX90A-NEXT: .LBB50_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], 4.0
-; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-NEXT: buffer_wbl2
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
@@ -1463,9 +1462,9 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat(ptr %ptr) #1 {
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX90A-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX90A-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX90A-NEXT: s_cbranch_execnz .LBB50_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_endpgm
@@ -1522,15 +1521,14 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_system(ptr %ptr) #1 {
; GFX90A-LABEL: flat_atomic_fadd_f64_noret_pat_system:
; GFX90A: ; %bb.0: ; %main_body
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX90A-NEXT: s_mov_b64 s[2:3], 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
+; GFX90A-NEXT: s_mov_b64 s[0:1], 0
; GFX90A-NEXT: .LBB52_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], 4.0
-; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-NEXT: buffer_wbl2
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
@@ -1539,9 +1537,9 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_system(ptr %ptr) #1 {
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX90A-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX90A-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX90A-NEXT: s_cbranch_execnz .LBB52_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_endpgm
@@ -1724,23 +1722,22 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent_safe(ptr %ptr) {
; GFX90A-LABEL: flat_atomic_fadd_f64_noret_pat_agent_safe:
; GFX90A: ; %bb.0: ; %main_body
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX90A-NEXT: s_mov_b64 s[2:3], 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[4:5]
+; GFX90A-NEXT: s_mov_b64 s[0:1], 0
; GFX90A-NEXT: .LBB58_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], 4.0
-; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX90A-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
-; GFX90A-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX90A-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX90A-NEXT: s_cbranch_execnz .LBB58_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_endpgm
@@ -1957,22 +1954,21 @@ main_body:
define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush_safe(ptr addrspace(3) %ptr) #4 {
; GFX90A-LABEL: local_atomic_fadd_f64_noret_pat_flush_safe:
; GFX90A: ; %bb.0: ; %main_body
-; GFX90A-NEXT: s_load_dword s2, s[0:1], 0x24
-; GFX90A-NEXT: s_mov_b64 s[0:1], 0
+; GFX90A-NEXT: s_load_dword s0, s[0:1], 0x24
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v0, s2
-; GFX90A-NEXT: ds_read_b64 v[0:1], v0
+; GFX90A-NEXT: v_mov_b32_e32 v2, s0
+; GFX90A-NEXT: ds_read_b64 v[0:1], v2
+; GFX90A-NEXT: s_mov_b64 s[0:1], 0
; GFX90A-NEXT: .LBB67_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_add_f64 v[2:3], v[0:1], 4.0
-; GFX90A-NEXT: v_mov_b32_e32 v4, s2
+; GFX90A-NEXT: v_add_f64 v[4:5], v[0:1], 4.0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: ds_cmpst_rtn_b64 v[2:3], v4, v[0:1], v[2:3]
+; GFX90A-NEXT: ds_cmpst_rtn_b64 v[4:5], v2, v[0:1], v[4:5]
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[0:1]
+; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[0:1]
; GFX90A-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX90A-NEXT: s_cbranch_execnz .LBB67_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1985,17 +1981,17 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush_safe(ptr addrsp
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v0, s2
; GFX940-NEXT: ds_read_b64 v[0:1], v0
+; GFX940-NEXT: v_mov_b32_e32 v2, s2
; GFX940-NEXT: .LBB67_1: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_add_f64 v[2:3], v[0:1], 4.0
-; GFX940-NEXT: v_mov_b32_e32 v4, s2
+; GFX940-NEXT: v_add_f64 v[4:5], v[0:1], 4.0
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: ds_cmpst_rtn_b64 v[2:3], v4, v[0:1], v[2:3]
+; GFX940-NEXT: ds_cmpst_rtn_b64 v[4:5], v2, v[0:1], v[4:5]
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[0:1]
+; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[0:1]
; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[2:3]
+; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[4:5]
; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
; GFX940-NEXT: s_cbranch_execnz .LBB67_1
; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll
index 6bdeeddc951ff18..c4a88ebf4897294 100644
--- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll
@@ -1819,22 +1819,20 @@ define amdgpu_gfx void @flat_atomic_nand_i32_noret_scalar(ptr inreg %ptr, i32 in
; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN1-NEXT: v_mov_b32_e32 v0, s4
; GCN1-NEXT: v_mov_b32_e32 v1, s5
-; GCN1-NEXT: flat_load_dword v1, v[0:1]
+; GCN1-NEXT: flat_load_dword v3, v[0:1]
; GCN1-NEXT: s_mov_b64 s[34:35], 0
; GCN1-NEXT: .LBB44_1: ; %atomicrmw.start
; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT: v_and_b32_e32 v0, s6, v1
-; GCN1-NEXT: v_mov_b32_e32 v2, s4
-; GCN1-NEXT: v_mov_b32_e32 v3, s5
-; GCN1-NEXT: v_not_b32_e32 v0, v0
+; GCN1-NEXT: v_and_b32_e32 v2, s6, v3
+; GCN1-NEXT: v_not_b32_e32 v2, v2
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
+; GCN1-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN1-NEXT: buffer_wbinvl1_vol
-; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
-; GCN1-NEXT: v_mov_b32_e32 v1, v0
+; GCN1-NEXT: v_mov_b32_e32 v3, v2
; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GCN1-NEXT: s_cbranch_execnz .LBB44_1
; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1846,22 +1844,20 @@ define amdgpu_gfx void @flat_atomic_nand_i32_noret_scalar(ptr inreg %ptr, i32 in
; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN2-NEXT: v_mov_b32_e32 v0, s4
; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: flat_load_dword v1, v[0:1]
+; GCN2-NEXT: flat_load_dword v3, v[0:1]
; GCN2-NEXT: s_mov_b64 s[34:35], 0
; GCN2-NEXT: .LBB44_1: ; %atomicrmw.start
; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT: v_and_b32_e32 v0, s6, v1
-; GCN2-NEXT: v_mov_b32_e32 v2, s4
-; GCN2-NEXT: v_mov_b32_e32 v3, s5
-; GCN2-NEXT: v_not_b32_e32 v0, v0
+; GCN2-NEXT: v_and_b32_e32 v2, s6, v3
+; GCN2-NEXT: v_not_b32_e32 v2, v2
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
+; GCN2-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
-; GCN2-NEXT: v_mov_b32_e32 v1, v0
+; GCN2-NEXT: v_mov_b32_e32 v3, v2
; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GCN2-NEXT: s_cbranch_execnz .LBB44_1
; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1873,22 +1869,20 @@ define amdgpu_gfx void @flat_atomic_nand_i32_noret_scalar(ptr inreg %ptr, i32 in
; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN3-NEXT: v_mov_b32_e32 v0, s4
; GCN3-NEXT: v_mov_b32_e32 v1, s5
-; GCN3-NEXT: flat_load_dword v1, v[0:1]
+; GCN3-NEXT: flat_load_dword v3, v[0:1]
; GCN3-NEXT: s_mov_b64 s[34:35], 0
; GCN3-NEXT: .LBB44_1: ; %atomicrmw.start
; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT: v_and_b32_e32 v0, s6, v1
-; GCN3-NEXT: v_mov_b32_e32 v2, s4
-; GCN3-NEXT: v_mov_b32_e32 v3, s5
-; GCN3-NEXT: v_not_b32_e32 v0, v0
+; GCN3-NEXT: v_and_b32_e32 v2, s6, v3
+; GCN3-NEXT: v_not_b32_e32 v2, v2
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] glc
+; GCN3-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc
; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
-; GCN3-NEXT: v_mov_b32_e32 v1, v0
+; GCN3-NEXT: v_mov_b32_e32 v3, v2
; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
; GCN3-NEXT: s_cbranch_execnz .LBB44_1
; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
@@ -1906,26 +1900,24 @@ define amdgpu_gfx void @flat_atomic_nand_i32_noret_offset_scalar(ptr inreg %out,
; GCN1-NEXT: s_addc_u32 s35, s5, 0
; GCN1-NEXT: v_mov_b32_e32 v0, s34
; GCN1-NEXT: v_mov_b32_e32 v1, s35
-; GCN1-NEXT: flat_load_dword v1, v[0:1]
-; GCN1-NEXT: s_mov_b64 s[36:37], 0
+; GCN1-NEXT: ...
[truncated]
``````````
https://github.com/llvm/llvm-project/pull/71403
More information about the llvm-commits mailing list