[llvm] 94def1b - [AMDGPU] Do not exapnd fp atomics on gfx940
Stanislav Mekhanoshin via llvm-commits
llvm-commits at lists.llvm.org
Wed Feb 8 13:22:27 PST 2023
Author: Stanislav Mekhanoshin
Date: 2023-02-08T13:22:04-08:00
New Revision: 94def1b44eef56c8593599818ead34883cddd8d4
URL: https://github.com/llvm/llvm-project/commit/94def1b44eef56c8593599818ead34883cddd8d4
DIFF: https://github.com/llvm/llvm-project/commit/94def1b44eef56c8593599818ead34883cddd8d4.diff
LOG: [AMDGPU] Do not exapnd fp atomics on gfx940
FP atomics are safe on gfx940. This fixes regression after D131560.
Fixes: SWDEV-380468
Differential Revision: https://reviews.llvm.org/D143603
Added:
Modified:
llvm/lib/Target/AMDGPU/SIISelLowering.cpp
llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll
llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll
llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll
llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll
llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd-flat-specialization.ll
llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index b3d5f11978814..2ef194a66bafd 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -12942,6 +12942,9 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
if (AMDGPU::isFlatGlobalAddrSpace(AS) &&
Subtarget->hasAtomicFaddNoRtnInsts()) {
+ if (Subtarget->hasGFX940Insts())
+ return AtomicExpansionKind::None;
+
if (unsafeFPAtomicsDisabled(RMW->getFunction()))
return AtomicExpansionKind::CmpXChg;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll
index 31d35df6d0990..dd5b07ee28bb6 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx940.ll
@@ -28,26 +28,14 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat(ptr %ptr) {
; GFX940-LABEL: flat_atomic_fadd_f32_noret_pat:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX940-NEXT: s_mov_b64 s[2:3], 0
+; GFX940-NEXT: v_mov_b32_e32 v2, 4.0
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NEXT: flat_load_dword v1, v[0:1]
-; GFX940-NEXT: .LBB1_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_add_f32_e32 v0, 4.0, v1
-; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
; GFX940-NEXT: buffer_wbl2 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] sc0 sc1
+; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc0 sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX940-NEXT: v_mov_b32_e32 v1, v0
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX940-NEXT: s_cbranch_execnz .LBB1_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX940-NEXT: s_endpgm
%ret = atomicrmw fadd ptr %ptr, float 4.0 seq_cst
ret void
@@ -57,26 +45,14 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat_ieee(ptr %ptr) #0 {
; GFX940-LABEL: flat_atomic_fadd_f32_noret_pat_ieee:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX940-NEXT: s_mov_b64 s[2:3], 0
+; GFX940-NEXT: v_mov_b32_e32 v2, 4.0
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NEXT: flat_load_dword v1, v[0:1]
-; GFX940-NEXT: .LBB2_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_add_f32_e32 v0, 4.0, v1
-; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
; GFX940-NEXT: buffer_wbl2 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] sc0 sc1
+; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc0 sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX940-NEXT: v_mov_b32_e32 v1, v0
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX940-NEXT: s_cbranch_execnz .LBB2_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX940-NEXT: s_endpgm
%ret = atomicrmw fadd ptr %ptr, float 4.0 seq_cst
ret void
@@ -97,25 +73,12 @@ define float @flat_atomic_fadd_f32_rtn_pat(ptr %ptr, float %data) {
; GFX940-LABEL: flat_atomic_fadd_f32_rtn_pat:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: flat_load_dword v2, v[0:1]
-; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: .LBB4_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v3, v2
-; GFX940-NEXT: v_add_f32_e32 v2, 4.0, v3
+; GFX940-NEXT: v_mov_b32_e32 v2, 4.0
; GFX940-NEXT: buffer_wbl2 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 sc1
+; GFX940-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc0 sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB4_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v0, v2
; GFX940-NEXT: s_setpc_b64 s[30:31]
%ret = atomicrmw fadd ptr %ptr, float 4.0 seq_cst
ret float %ret
diff --git a/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll b/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll
index acad79d009a48..eb300bb7baff6 100644
--- a/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomicrmw-expand.ll
@@ -56,25 +56,11 @@ define float @syncscope_system(ptr %addr, float %val) #0 {
; GFX940-LABEL: syncscope_system:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: flat_load_dword v3, v[0:1]
-; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: .LBB0_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
-; GFX940-NEXT: v_add_f32_e32 v4, v5, v2
; GFX940-NEXT: buffer_wbl2 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0 sc1
+; GFX940-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc0 sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
-; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB0_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v0, v3
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX1100-LABEL: syncscope_system:
@@ -373,23 +359,8 @@ define float @no_unsafe(ptr %addr, float %val) {
; GFX940-LABEL: no_unsafe:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: flat_load_dword v3, v[0:1]
-; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: .LBB3_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v5, v3
-; GFX940-NEXT: v_add_f32_e32 v4, v5, v2
-; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: flat_atomic_cmpswap v3, v[0:1], v[4:5] sc0
+; GFX940-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 sc0
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5
-; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB3_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v0, v3
; GFX940-NEXT: s_setpc_b64 s[30:31]
;
; GFX1100-LABEL: no_unsafe:
diff --git a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll
index 0872c61601999..4bd9991912977 100644
--- a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx940.ll
@@ -28,26 +28,14 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat(ptr %ptr) {
; GFX940-LABEL: flat_atomic_fadd_f32_noret_pat:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX940-NEXT: s_mov_b64 s[2:3], 0
+; GFX940-NEXT: v_mov_b32_e32 v2, 4.0
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NEXT: flat_load_dword v1, v[0:1]
-; GFX940-NEXT: .LBB1_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_add_f32_e32 v0, 4.0, v1
-; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
; GFX940-NEXT: buffer_wbl2 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] sc0 sc1
+; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc0 sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX940-NEXT: v_mov_b32_e32 v1, v0
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX940-NEXT: s_cbranch_execnz .LBB1_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX940-NEXT: s_endpgm
%ret = atomicrmw fadd ptr %ptr, float 4.0 seq_cst
ret void
@@ -57,26 +45,14 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat_ieee(ptr %ptr) #0 {
; GFX940-LABEL: flat_atomic_fadd_f32_noret_pat_ieee:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX940-NEXT: s_mov_b64 s[2:3], 0
+; GFX940-NEXT: v_mov_b32_e32 v2, 4.0
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NEXT: flat_load_dword v1, v[0:1]
-; GFX940-NEXT: .LBB2_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_add_f32_e32 v0, 4.0, v1
-; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
; GFX940-NEXT: buffer_wbl2 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] sc0 sc1
+; GFX940-NEXT: flat_atomic_add_f32 v[0:1], v2 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc0 sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX940-NEXT: v_mov_b32_e32 v1, v0
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX940-NEXT: s_cbranch_execnz .LBB2_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX940-NEXT: s_endpgm
%ret = atomicrmw fadd ptr %ptr, float 4.0 seq_cst
ret void
@@ -97,25 +73,12 @@ define float @flat_atomic_fadd_f32_rtn_pat(ptr %ptr, float %data) {
; GFX940-LABEL: flat_atomic_fadd_f32_rtn_pat:
; GFX940: ; %bb.0:
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: flat_load_dword v2, v[0:1]
-; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: .LBB4_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v3, v2
-; GFX940-NEXT: v_add_f32_e32 v2, 4.0, v3
+; GFX940-NEXT: v_mov_b32_e32 v2, 4.0
; GFX940-NEXT: buffer_wbl2 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] sc0 sc1
+; GFX940-NEXT: flat_atomic_add_f32 v0, v[0:1], v2 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc0 sc1
-; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
-; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB4_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v0, v2
; GFX940-NEXT: s_setpc_b64 s[30:31]
%ret = atomicrmw fadd ptr %ptr, float 4.0 seq_cst
ret float %ret
diff --git a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll
index ae62666a6878e..a9200419569ed 100644
--- a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll
@@ -704,26 +704,13 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(ptr addrspace(1) %pt
; GFX940-LABEL: global_atomic_fadd_f64_noret_pat:
; GFX940: ; %bb.0: ; %main_body
; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX940-NEXT: s_mov_b64 s[2:3], 0
-; GFX940-NEXT: v_mov_b32_e32 v4, 0
-; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
-; GFX940-NEXT: .LBB24_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: v_add_f64 v[0:1], v[2:3], 4.0
+; GFX940-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NEXT: v_mov_b64_e32 v[0:1], 4.0
; GFX940-NEXT: buffer_wbl2 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] sc0 sc1
+; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc0 sc1
-; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX940-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX940-NEXT: s_cbranch_execnz .LBB24_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX940-NEXT: s_endpgm
main_body:
%ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 seq_cst
@@ -789,26 +776,13 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_system(ptr addrspace
; GFX940-LABEL: global_atomic_fadd_f64_noret_pat_system:
; GFX940: ; %bb.0: ; %main_body
; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX940-NEXT: s_mov_b64 s[2:3], 0
-; GFX940-NEXT: v_mov_b32_e32 v4, 0
-; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
-; GFX940-NEXT: .LBB26_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: v_add_f64 v[0:1], v[2:3], 4.0
+; GFX940-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NEXT: v_mov_b64_e32 v[0:1], 4.0
; GFX940-NEXT: buffer_wbl2 sc0 sc1
-; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] sc0 sc1
+; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc0 sc1
-; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX940-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX940-NEXT: s_cbranch_execnz .LBB26_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX940-NEXT: s_endpgm
main_body:
%ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("one-as") seq_cst
@@ -893,26 +867,12 @@ define double @global_atomic_fadd_f64_rtn_pat(ptr addrspace(1) %ptr, double %dat
; GFX940-LABEL: global_atomic_fadd_f64_rtn_pat:
; GFX940: ; %bb.0: ; %main_body
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: global_load_dwordx2 v[2:3], v[0:1], off
-; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: .LBB29_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
-; GFX940-NEXT: v_add_f64 v[2:3], v[4:5], 4.0
+; GFX940-NEXT: v_mov_b64_e32 v[2:3], 4.0
; GFX940-NEXT: buffer_wbl2 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off sc0 sc1
+; GFX940-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc0 sc1
-; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB29_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v0, v2
-; GFX940-NEXT: v_mov_b32_e32 v1, v3
; GFX940-NEXT: s_setpc_b64 s[30:31]
main_body:
%ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 seq_cst
@@ -976,26 +936,12 @@ define double @global_atomic_fadd_f64_rtn_pat_system(ptr addrspace(1) %ptr, doub
; GFX940-LABEL: global_atomic_fadd_f64_rtn_pat_system:
; GFX940: ; %bb.0: ; %main_body
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: global_load_dwordx2 v[2:3], v[0:1], off
-; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: .LBB31_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
-; GFX940-NEXT: v_add_f64 v[2:3], v[4:5], 4.0
+; GFX940-NEXT: v_mov_b64_e32 v[2:3], 4.0
; GFX940-NEXT: buffer_wbl2 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off sc0 sc1
+; GFX940-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc0 sc1
-; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB31_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v0, v2
-; GFX940-NEXT: v_mov_b32_e32 v1, v3
; GFX940-NEXT: s_setpc_b64 s[30:31]
main_body:
%ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("one-as") seq_cst
@@ -1068,26 +1014,13 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent_safe(ptr addrs
; GFX940-LABEL: global_atomic_fadd_f64_noret_pat_agent_safe:
; GFX940: ; %bb.0: ; %main_body
; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX940-NEXT: s_mov_b64 s[2:3], 0
-; GFX940-NEXT: v_mov_b32_e32 v4, 0
-; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
-; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_mov_b64_e32 v[2:3], s[4:5]
-; GFX940-NEXT: .LBB34_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: v_add_f64 v[0:1], v[2:3], 4.0
+; GFX940-NEXT: v_mov_b32_e32 v2, 0
+; GFX940-NEXT: v_mov_b64_e32 v[0:1], 4.0
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] sc0
+; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1]
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX940-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX940-NEXT: s_cbranch_execnz .LBB34_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX940-NEXT: s_endpgm
main_body:
%ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("agent") seq_cst
@@ -1124,26 +1057,14 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat(ptr %ptr) #1 {
; GFX940-LABEL: flat_atomic_fadd_f64_noret_pat:
; GFX940: ; %bb.0: ; %main_body
; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX940-NEXT: s_mov_b64 s[2:3], 0
+; GFX940-NEXT: v_mov_b64_e32 v[2:3], 4.0
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
-; GFX940-NEXT: .LBB35_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_add_f64 v[0:1], v[2:3], 4.0
-; GFX940-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
; GFX940-NEXT: buffer_wbl2 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] sc0 sc1
+; GFX940-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] sc1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc0 sc1
-; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX940-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX940-NEXT: s_cbranch_execnz .LBB35_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX940-NEXT: s_endpgm
main_body:
%ret = atomicrmw fadd ptr %ptr, double 4.0 seq_cst
@@ -1212,27 +1133,14 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_system(ptr %ptr) #1 {
; GFX940-LABEL: flat_atomic_fadd_f64_noret_pat_system:
; GFX940: ; %bb.0: ; %main_body
; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX940-NEXT: s_mov_b64 s[2:3], 0
+; GFX940-NEXT: v_mov_b64_e32 v[2:3], 4.0
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
-; GFX940-NEXT: .LBB37_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_add_f64 v[0:1], v[2:3], 4.0
-; GFX940-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
; GFX940-NEXT: buffer_wbl2 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] sc0 sc1
+; GFX940-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc0 sc1
-; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX940-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX940-NEXT: s_cbranch_execnz .LBB37_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX940-NEXT: s_endpgm
main_body:
%ret = atomicrmw fadd ptr %ptr, double 4.0 syncscope("one-as") seq_cst
@@ -1269,26 +1177,12 @@ define double @flat_atomic_fadd_f64_rtn_pat(ptr %ptr) #1 {
; GFX940-LABEL: flat_atomic_fadd_f64_rtn_pat:
; GFX940: ; %bb.0: ; %main_body
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
-; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: .LBB38_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
-; GFX940-NEXT: v_add_f64 v[2:3], v[4:5], 4.0
+; GFX940-NEXT: v_mov_b64_e32 v[2:3], 4.0
; GFX940-NEXT: buffer_wbl2 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] sc0 sc1
+; GFX940-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc0 sc1
-; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB38_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v0, v2
-; GFX940-NEXT: v_mov_b32_e32 v1, v3
; GFX940-NEXT: s_setpc_b64 s[30:31]
main_body:
%ret = atomicrmw fadd ptr %ptr, double 4.0 seq_cst
@@ -1353,27 +1247,13 @@ define double @flat_atomic_fadd_f64_rtn_pat_system(ptr %ptr) #1 {
; GFX940-LABEL: flat_atomic_fadd_f64_rtn_pat_system:
; GFX940: ; %bb.0: ; %main_body
; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX940-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
-; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: .LBB40_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_mov_b64_e32 v[4:5], v[2:3]
-; GFX940-NEXT: v_add_f64 v[2:3], v[4:5], 4.0
+; GFX940-NEXT: v_mov_b64_e32 v[2:3], 4.0
; GFX940-NEXT: buffer_wbl2 sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
-; GFX940-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] sc0 sc1
+; GFX940-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] sc0 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc0 sc1
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
-; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB40_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
-; GFX940-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX940-NEXT: v_mov_b32_e32 v0, v2
-; GFX940-NEXT: v_mov_b32_e32 v1, v3
; GFX940-NEXT: s_setpc_b64 s[30:31]
main_body:
%ret = atomicrmw fadd ptr %ptr, double 4.0 syncscope("one-as") seq_cst
@@ -1454,26 +1334,14 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent_safe(ptr %ptr) {
; GFX940-LABEL: flat_atomic_fadd_f64_noret_pat_agent_safe:
; GFX940: ; %bb.0: ; %main_body
; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX940-NEXT: s_mov_b64 s[2:3], 0
+; GFX940-NEXT: v_mov_b64_e32 v[2:3], 4.0
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
-; GFX940-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
-; GFX940-NEXT: .LBB43_1: ; %atomicrmw.start
-; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: v_add_f64 v[0:1], v[2:3], 4.0
-; GFX940-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX940-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] sc0
+; GFX940-NEXT: flat_atomic_add_f64 v[0:1], v[2:3]
; GFX940-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX940-NEXT: buffer_inv sc1
-; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
-; GFX940-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
-; GFX940-NEXT: v_mov_b64_e32 v[2:3], v[0:1]
-; GFX940-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX940-NEXT: s_cbranch_execnz .LBB43_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX940-NEXT: s_endpgm
main_body:
%ret = atomicrmw fadd ptr %ptr, double 4.0 syncscope("agent") seq_cst
diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd-flat-specialization.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd-flat-specialization.ll
index 22eec1d98072f..5d7825bb37887 100644
--- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd-flat-specialization.ll
+++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd-flat-specialization.ll
@@ -38,20 +38,8 @@ define float @syncscope_system(ptr %addr, float %val) #0 {
; GFX90A-NEXT: ret float [[TMP5]]
;
; GFX940-LABEL: @syncscope_system(
-; GFX940-NEXT: [[TMP1:%.*]] = load float, ptr [[ADDR:%.*]], align 4
-; GFX940-NEXT: br label [[ATOMICRMW_START:%.*]]
-; GFX940: atomicrmw.start:
-; GFX940-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
-; GFX940-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VAL:%.*]]
-; GFX940-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
-; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX940-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[ADDR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
-; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
-; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
-; GFX940-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX940-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; GFX940: atomicrmw.end:
-; GFX940-NEXT: ret float [[TMP5]]
+; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr [[ADDR:%.*]], float [[VAL:%.*]] seq_cst, align 4
+; GFX940-NEXT: ret float [[RES]]
;
; GFX1100-LABEL: @syncscope_system(
; GFX1100-NEXT: [[TMP1:%.*]] = load float, ptr [[ADDR:%.*]], align 4
@@ -278,20 +266,8 @@ define float @no_unsafe(ptr %addr, float %val) {
; GFX90A-NEXT: ret float [[TMP5]]
;
; GFX940-LABEL: @no_unsafe(
-; GFX940-NEXT: [[TMP1:%.*]] = load float, ptr [[ADDR:%.*]], align 4
-; GFX940-NEXT: br label [[ATOMICRMW_START:%.*]]
-; GFX940: atomicrmw.start:
-; GFX940-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
-; GFX940-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VAL:%.*]]
-; GFX940-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
-; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX940-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[ADDR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("workgroup") seq_cst seq_cst, align 4
-; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
-; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
-; GFX940-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX940-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; GFX940: atomicrmw.end:
-; GFX940-NEXT: ret float [[TMP5]]
+; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr [[ADDR:%.*]], float [[VAL:%.*]] syncscope("workgroup") seq_cst, align 4
+; GFX940-NEXT: ret float [[RES]]
;
; GFX1100-LABEL: @no_unsafe(
; GFX1100-NEXT: [[TMP1:%.*]] = load float, ptr [[ADDR:%.*]], align 4
diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll
index 87c4e92684200..6287b5f7ffdbc 100644
--- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll
+++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-rmw-fadd.ll
@@ -433,20 +433,8 @@ define float @test_atomicrmw_fadd_f32_flat(ptr %ptr, float %value) {
; GFX90A-NEXT: ret float [[TMP5]]
;
; GFX940-LABEL: @test_atomicrmw_fadd_f32_flat(
-; GFX940-NEXT: [[TMP1:%.*]] = load float, ptr [[PTR:%.*]], align 4
-; GFX940-NEXT: br label [[ATOMICRMW_START:%.*]]
-; GFX940: atomicrmw.start:
-; GFX940-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
-; GFX940-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]]
-; GFX940-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
-; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX940-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
-; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
-; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
-; GFX940-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX940-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; GFX940: atomicrmw.end:
-; GFX940-NEXT: ret float [[TMP5]]
+; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr [[PTR:%.*]], float [[VALUE:%.*]] seq_cst, align 4
+; GFX940-NEXT: ret float [[RES]]
;
; GFX11-LABEL: @test_atomicrmw_fadd_f32_flat(
; GFX11-NEXT: [[TMP1:%.*]] = load float, ptr [[PTR:%.*]], align 4
@@ -534,20 +522,8 @@ define float @test_atomicrmw_fadd_f32_global(ptr addrspace(1) %ptr, float %value
; GFX90A-NEXT: ret float [[TMP5]]
;
; GFX940-LABEL: @test_atomicrmw_fadd_f32_global(
-; GFX940-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR:%.*]], align 4
-; GFX940-NEXT: br label [[ATOMICRMW_START:%.*]]
-; GFX940: atomicrmw.start:
-; GFX940-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
-; GFX940-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]]
-; GFX940-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
-; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX940-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
-; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
-; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
-; GFX940-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX940-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; GFX940: atomicrmw.end:
-; GFX940-NEXT: ret float [[TMP5]]
+; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]] seq_cst, align 4
+; GFX940-NEXT: ret float [[RES]]
;
; GFX11-LABEL: @test_atomicrmw_fadd_f32_global(
; GFX11-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR:%.*]], align 4
@@ -635,19 +611,7 @@ define void @test_atomicrmw_fadd_f32_global_no_use_ieee(ptr addrspace(1) %ptr, f
; GFX90A-NEXT: ret void
;
; GFX940-LABEL: @test_atomicrmw_fadd_f32_global_no_use_ieee(
-; GFX940-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR:%.*]], align 4
-; GFX940-NEXT: br label [[ATOMICRMW_START:%.*]]
-; GFX940: atomicrmw.start:
-; GFX940-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
-; GFX940-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]]
-; GFX940-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
-; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX940-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
-; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
-; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
-; GFX940-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX940-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; GFX940: atomicrmw.end:
+; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]] seq_cst, align 4
; GFX940-NEXT: ret void
;
; GFX11-LABEL: @test_atomicrmw_fadd_f32_global_no_use_ieee(
@@ -736,19 +700,7 @@ define void @test_atomicrmw_fadd_f32_global_no_use_denorm_flush(ptr addrspace(1)
; GFX90A-NEXT: ret void
;
; GFX940-LABEL: @test_atomicrmw_fadd_f32_global_no_use_denorm_flush(
-; GFX940-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR:%.*]], align 4
-; GFX940-NEXT: br label [[ATOMICRMW_START:%.*]]
-; GFX940: atomicrmw.start:
-; GFX940-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
-; GFX940-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]]
-; GFX940-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
-; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX940-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] seq_cst seq_cst, align 4
-; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
-; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
-; GFX940-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX940-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; GFX940: atomicrmw.end:
+; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]] seq_cst, align 4
; GFX940-NEXT: ret void
;
; GFX11-LABEL: @test_atomicrmw_fadd_f32_global_no_use_denorm_flush(
@@ -1582,20 +1534,8 @@ define double @test_atomicrmw_fadd_f64_flat(ptr %ptr, double %value) {
; GFX90A-NEXT: ret double [[TMP5]]
;
; GFX940-LABEL: @test_atomicrmw_fadd_f64_flat(
-; GFX940-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR:%.*]], align 8
-; GFX940-NEXT: br label [[ATOMICRMW_START:%.*]]
-; GFX940: atomicrmw.start:
-; GFX940-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
-; GFX940-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]]
-; GFX940-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
-; GFX940-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX940-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8
-; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
-; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
-; GFX940-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; GFX940-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; GFX940: atomicrmw.end:
-; GFX940-NEXT: ret double [[TMP5]]
+; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr [[PTR:%.*]], double [[VALUE:%.*]] seq_cst, align 8
+; GFX940-NEXT: ret double [[RES]]
;
; GFX11-LABEL: @test_atomicrmw_fadd_f64_flat(
; GFX11-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR:%.*]], align 8
@@ -1683,20 +1623,8 @@ define double @test_atomicrmw_fadd_f64_global(ptr addrspace(1) %ptr, double %val
; GFX90A-NEXT: ret double [[TMP5]]
;
; GFX940-LABEL: @test_atomicrmw_fadd_f64_global(
-; GFX940-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR:%.*]], align 8
-; GFX940-NEXT: br label [[ATOMICRMW_START:%.*]]
-; GFX940: atomicrmw.start:
-; GFX940-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
-; GFX940-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE:%.*]]
-; GFX940-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64
-; GFX940-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64
-; GFX940-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[TMP3]], i64 [[TMP2]] seq_cst seq_cst, align 8
-; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
-; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0
-; GFX940-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double
-; GFX940-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; GFX940: atomicrmw.end:
-; GFX940-NEXT: ret double [[TMP5]]
+; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[VALUE:%.*]] seq_cst, align 8
+; GFX940-NEXT: ret double [[RES]]
;
; GFX11-LABEL: @test_atomicrmw_fadd_f64_global(
; GFX11-NEXT: [[TMP1:%.*]] = load double, ptr addrspace(1) [[PTR:%.*]], align 8
@@ -1861,20 +1789,8 @@ define float @test_atomicrmw_fadd_f32_global_agent(ptr addrspace(1) %ptr, float
; GFX90A-NEXT: ret float [[TMP5]]
;
; GFX940-LABEL: @test_atomicrmw_fadd_f32_global_agent(
-; GFX940-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR:%.*]], align 4
-; GFX940-NEXT: br label [[ATOMICRMW_START:%.*]]
-; GFX940: atomicrmw.start:
-; GFX940-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
-; GFX940-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]]
-; GFX940-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
-; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX940-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") monotonic monotonic, align 4
-; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
-; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
-; GFX940-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX940-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; GFX940: atomicrmw.end:
-; GFX940-NEXT: ret float [[TMP5]]
+; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]] syncscope("agent") monotonic, align 4
+; GFX940-NEXT: ret float [[RES]]
;
; GFX11-LABEL: @test_atomicrmw_fadd_f32_global_agent(
; GFX11-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR:%.*]], align 4
@@ -1962,20 +1878,8 @@ define float @test_atomicrmw_fadd_f32_global_one_as(ptr addrspace(1) %ptr, float
; GFX90A-NEXT: ret float [[TMP5]]
;
; GFX940-LABEL: @test_atomicrmw_fadd_f32_global_one_as(
-; GFX940-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR:%.*]], align 4
-; GFX940-NEXT: br label [[ATOMICRMW_START:%.*]]
-; GFX940: atomicrmw.start:
-; GFX940-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], [[ATOMICRMW_START]] ]
-; GFX940-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE:%.*]]
-; GFX940-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32
-; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32
-; GFX940-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("one-as") monotonic monotonic, align 4
-; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
-; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
-; GFX940-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float
-; GFX940-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
-; GFX940: atomicrmw.end:
-; GFX940-NEXT: ret float [[TMP5]]
+; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VALUE:%.*]] syncscope("one-as") monotonic, align 4
+; GFX940-NEXT: ret float [[RES]]
;
; GFX11-LABEL: @test_atomicrmw_fadd_f32_global_one_as(
; GFX11-NEXT: [[TMP1:%.*]] = load float, ptr addrspace(1) [[PTR:%.*]], align 4
More information about the llvm-commits
mailing list