[llvm] 574a9da - [AMDGPU] Always expand system scope fp atomics on gfx90a
Stanislav Mekhanoshin via llvm-commits
llvm-commits at lists.llvm.org
Wed Mar 10 12:35:33 PST 2021
Author: Stanislav Mekhanoshin
Date: 2021-03-10T12:35:23-08:00
New Revision: 574a9dabc63ba1e7a04c08d4bde2eacd61b44ce1
URL: https://github.com/llvm/llvm-project/commit/574a9dabc63ba1e7a04c08d4bde2eacd61b44ce1
DIFF: https://github.com/llvm/llvm-project/commit/574a9dabc63ba1e7a04c08d4bde2eacd61b44ce1.diff
LOG: [AMDGPU] Always expand system scope fp atomics on gfx90a
FP atomic instructions cannot be used at system scope and must always
be expanded into a CAS loop.
Differential Revision: https://reviews.llvm.org/D98085
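
For illustration only (not part of the commit): a minimal IR sketch of the two cases the new check separates, assuming the function carries the "amdgpu-unsafe-fp-atomics"="true" attribute that the earlier check in shouldExpandAtomicRMWInIR requires (the attribute name is assumed from that check, not shown in the diff). On gfx90a the default (system) scope and the "one-as" scope are now always expanded into a cmpxchg loop, while the agent-scope form can still select the hardware FP atomic, as the updated tests show.

  ; illustrative sketch only, not taken from the commit's tests
  define amdgpu_kernel void @fp_atomic_scope_example(double addrspace(1)* %p) #0 {
    ; default (system) scope: always expanded into a CAS loop on gfx90a
    %sys = atomicrmw fadd double addrspace(1)* %p, double 4.0 seq_cst
    ; agent scope: may still select global_atomic_add_f64
    %agent = atomicrmw fadd double addrspace(1)* %p, double 4.0 syncscope("agent") seq_cst
    ret void
  }
  ; attribute name assumed from the check preceding the changed code
  attributes #0 = { "amdgpu-unsafe-fp-atomics"="true" }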
Added:
Modified:
llvm/lib/Target/AMDGPU/SIISelLowering.cpp
llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll
llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 051f6d5a4139..516d272deab3 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -11949,9 +11949,15 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
.getValueAsString() != "true")
return AtomicExpansionKind::CmpXChg;
- if (Subtarget->hasGFX90AInsts())
+ if (Subtarget->hasGFX90AInsts()) {
+ auto SSID = RMW->getSyncScopeID();
+ if (SSID == SyncScope::System ||
+ SSID == RMW->getContext().getOrInsertSyncScopeID("one-as"))
+ return AtomicExpansionKind::CmpXChg;
+
return (Ty->isFloatTy() && AS == AMDGPUAS::FLAT_ADDRESS) ?
AtomicExpansionKind::CmpXChg : AtomicExpansionKind::None;
+ }
if (!Subtarget->hasGFX90AInsts() && AS != AMDGPUAS::GLOBAL_ADDRESS)
return AtomicExpansionKind::CmpXChg;
diff --git a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll
index 920fee4edb50..d2547adfeda0 100644
--- a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll
@@ -415,19 +415,80 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(double addrspace(1)*
; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat:
; GFX90A: ; %bb.0: ; %main_body
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX90A-NEXT: s_mov_b64 s[2:3], 0
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NEXT: BB24_1: ; %atomicrmw.start
+; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
+; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], 4.0
+; GFX90A-NEXT: buffer_wbl2
+; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc scc
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: buffer_invl2
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: buffer_wbinvl1_vol
+; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
+; GFX90A-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX90A-NEXT: s_cbranch_execnz BB24_1
+; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT: s_endpgm
+main_body:
+ %ret = atomicrmw fadd double addrspace(1)* %ptr, double 4.0 seq_cst
+ ret void
+}
+
+define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent(double addrspace(1)* %ptr) #1 {
+; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat_agent:
+; GFX90A: ; %bb.0: ; %main_body
+; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NEXT: v_mov_b32_e32 v1, 0x40100000
+; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1]
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: buffer_wbinvl1_vol
+; GFX90A-NEXT: s_endpgm
+main_body:
+ %ret = atomicrmw fadd double addrspace(1)* %ptr, double 4.0 syncscope("agent") seq_cst
+ ret void
+}
+
+define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_system(double addrspace(1)* %ptr) #1 {
+; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat_system:
+; GFX90A: ; %bb.0: ; %main_body
+; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX90A-NEXT: s_mov_b64 s[2:3], 0
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
+; GFX90A-NEXT: BB26_1: ; %atomicrmw.start
+; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT: v_mov_b32_e32 v4, 0
+; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], 4.0
; GFX90A-NEXT: buffer_wbl2
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] scc
+; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc scc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1_vol
+; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
+; GFX90A-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX90A-NEXT: s_cbranch_execnz BB26_1
+; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_endpgm
main_body:
- %ret = atomicrmw fadd double addrspace(1)* %ptr, double 4.0 seq_cst
+ %ret = atomicrmw fadd double addrspace(1)* %ptr, double 4.0 syncscope("one-as") seq_cst
ret void
}
@@ -440,7 +501,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_flush(double addrspa
; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
-; GFX90A-NEXT: BB25_1: ; %atomicrmw.start
+; GFX90A-NEXT: BB27_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], 4.0
@@ -455,11 +516,11 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_flush(double addrspa
; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX90A-NEXT: s_cbranch_execnz BB25_1
+; GFX90A-NEXT: s_cbranch_execnz BB27_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_endpgm
main_body:
- %ret = atomicrmw fadd double addrspace(1)* %ptr, double 4.0 seq_cst
+ %ret = atomicrmw fadd double addrspace(1)* %ptr, double 4.0 syncscope("agent") seq_cst
ret void
}
@@ -479,18 +540,79 @@ define double @global_atomic_fadd_f64_rtn_pat(double addrspace(1)* %ptr, double
; GFX90A-LABEL: global_atomic_fadd_f64_rtn_pat:
; GFX90A: ; %bb.0: ; %main_body
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: global_load_dwordx2 v[2:3], v[0:1], off
+; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: BB29_1: ; %atomicrmw.start
+; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
+; GFX90A-NEXT: v_add_f64 v[2:3], v[4:5], 4.0
+; GFX90A-NEXT: buffer_wbl2
+; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off glc scc
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: buffer_invl2
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: buffer_wbinvl1_vol
+; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: s_cbranch_execnz BB29_1
+; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v2
+; GFX90A-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+main_body:
+ %ret = atomicrmw fadd double addrspace(1)* %ptr, double 4.0 seq_cst
+ ret double %ret
+}
+
+define double @global_atomic_fadd_f64_rtn_pat_agent(double addrspace(1)* %ptr, double %data) #1 {
+; GFX90A-LABEL: global_atomic_fadd_f64_rtn_pat_agent:
+; GFX90A: ; %bb.0: ; %main_body
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NEXT: v_mov_b32_e32 v3, 0x40100000
+; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off glc
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: buffer_wbinvl1_vol
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+main_body:
+ %ret = atomicrmw fadd double addrspace(1)* %ptr, double 4.0 syncscope("agent") seq_cst
+ ret double %ret
+}
+
+define double @global_atomic_fadd_f64_rtn_pat_system(double addrspace(1)* %ptr, double %data) #1 {
+; GFX90A-LABEL: global_atomic_fadd_f64_rtn_pat_system:
+; GFX90A: ; %bb.0: ; %main_body
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: global_load_dwordx2 v[2:3], v[0:1], off
+; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: BB31_1: ; %atomicrmw.start
+; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
+; GFX90A-NEXT: v_add_f64 v[2:3], v[4:5], 4.0
; GFX90A-NEXT: buffer_wbl2
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_atomic_add_f64 v[0:1], v[0:1], v[2:3], off glc scc
+; GFX90A-NEXT: global_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5], off glc scc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1_vol
+; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: s_cbranch_execnz BB31_1
+; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v2
+; GFX90A-NEXT: v_mov_b32_e32 v1, v3
; GFX90A-NEXT: s_setpc_b64 s[30:31]
main_body:
- %ret = atomicrmw fadd double addrspace(1)* %ptr, double 4.0 seq_cst
+ %ret = atomicrmw fadd double addrspace(1)* %ptr, double 4.0 syncscope("one-as") seq_cst
ret double %ret
}
@@ -522,20 +644,81 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat(double* %ptr) #1 {
; GFX90A-LABEL: flat_atomic_fadd_f64_noret_pat:
; GFX90A: ; %bb.0: ; %main_body
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX90A-NEXT: s_mov_b64 s[2:3], 0
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX90A-NEXT: BB34_1: ; %atomicrmw.start
+; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], 4.0
+; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NEXT: buffer_wbl2
+; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc scc
+; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: buffer_invl2
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: buffer_wbinvl1_vol
+; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
+; GFX90A-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX90A-NEXT: s_cbranch_execnz BB34_1
+; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT: s_endpgm
+main_body:
+ %ret = atomicrmw fadd double* %ptr, double 4.0 seq_cst
+ ret void
+}
+
+define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent(double* %ptr) #1 {
+; GFX90A-LABEL: flat_atomic_fadd_f64_noret_pat_agent:
+; GFX90A: ; %bb.0: ; %main_body
+; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX90A-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NEXT: v_mov_b32_e32 v3, 0x40100000
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: flat_atomic_add_f64 v[0:1], v[2:3]
+; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: buffer_wbinvl1_vol
+; GFX90A-NEXT: s_endpgm
+main_body:
+ %ret = atomicrmw fadd double* %ptr, double 4.0 syncscope("agent") seq_cst
+ ret void
+}
+
+define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_system(double* %ptr) #1 {
+; GFX90A-LABEL: flat_atomic_fadd_f64_noret_pat_system:
+; GFX90A: ; %bb.0: ; %main_body
+; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX90A-NEXT: s_mov_b64 s[2:3], 0
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX90A-NEXT: BB36_1: ; %atomicrmw.start
+; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], 4.0
+; GFX90A-NEXT: v_pk_mov_b32 v[4:5], s[0:1], s[0:1] op_sel:[0,1]
; GFX90A-NEXT: buffer_wbl2
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] scc
+; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc scc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1_vol
+; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
+; GFX90A-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX90A-NEXT: s_cbranch_execnz BB36_1
+; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_endpgm
main_body:
- %ret = atomicrmw fadd double* %ptr, double 4.0 seq_cst
+ %ret = atomicrmw fadd double* %ptr, double 4.0 syncscope("one-as") seq_cst
ret void
}
@@ -543,18 +726,80 @@ define double @flat_atomic_fadd_f64_rtn_pat(double* %ptr) #1 {
; GFX90A-LABEL: flat_atomic_fadd_f64_rtn_pat:
; GFX90A: ; %bb.0: ; %main_body
; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: BB37_1: ; %atomicrmw.start
+; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
+; GFX90A-NEXT: v_add_f64 v[2:3], v[4:5], 4.0
+; GFX90A-NEXT: buffer_wbl2
+; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc scc
+; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: buffer_invl2
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: buffer_wbinvl1_vol
+; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: s_cbranch_execnz BB37_1
+; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v2
+; GFX90A-NEXT: v_mov_b32_e32 v1, v3
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+main_body:
+ %ret = atomicrmw fadd double* %ptr, double 4.0 seq_cst
+ ret double %ret
+}
+
+define double @flat_atomic_fadd_f64_rtn_pat_agent(double* %ptr) #1 {
+; GFX90A-LABEL: flat_atomic_fadd_f64_rtn_pat_agent:
+; GFX90A: ; %bb.0: ; %main_body
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NEXT: v_mov_b32_e32 v3, 0x40100000
+; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] glc
+; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: buffer_wbinvl1_vol
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: s_setpc_b64 s[30:31]
+main_body:
+ %ret = atomicrmw fadd double* %ptr, double 4.0 syncscope("agent") seq_cst
+ ret double %ret
+}
+
+define double @flat_atomic_fadd_f64_rtn_pat_system(double* %ptr) #1 {
+; GFX90A-LABEL: flat_atomic_fadd_f64_rtn_pat_system:
+; GFX90A: ; %bb.0: ; %main_body
+; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: flat_load_dwordx2 v[2:3], v[0:1]
+; GFX90A-NEXT: s_mov_b64 s[4:5], 0
+; GFX90A-NEXT: BB39_1: ; %atomicrmw.start
+; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: v_pk_mov_b32 v[4:5], v[2:3], v[2:3] op_sel:[0,1]
+; GFX90A-NEXT: v_add_f64 v[2:3], v[4:5], 4.0
; GFX90A-NEXT: buffer_wbl2
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: flat_atomic_add_f64 v[0:1], v[0:1], v[2:3] glc scc
+; GFX90A-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[2:5] glc scc
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1_vol
+; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX90A-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX90A-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: s_cbranch_execnz BB39_1
+; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX90A-NEXT: v_mov_b32_e32 v0, v2
+; GFX90A-NEXT: v_mov_b32_e32 v1, v3
; GFX90A-NEXT: s_setpc_b64 s[30:31]
main_body:
- %ret = atomicrmw fadd double* %ptr, double 4.0 seq_cst
+ %ret = atomicrmw fadd double* %ptr, double 4.0 syncscope("one-as") seq_cst
ret double %ret
}
@@ -696,7 +941,7 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush(double addrspac
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v0, s0
; GFX90A-NEXT: ds_read_b64 v[0:1], v0
-; GFX90A-NEXT: BB41_1: ; %atomicrmw.start
+; GFX90A-NEXT: BB49_1: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: v_add_f64 v[2:3], v[0:1], 4.0
@@ -708,7 +953,7 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush(double addrspac
; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[2:3], v[2:3] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX90A-NEXT: s_cbranch_execnz BB41_1
+; GFX90A-NEXT: s_cbranch_execnz BB49_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_endpgm
main_body:
diff --git a/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll b/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll
index e5df5c50a4ae..09437b33db57 100644
--- a/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-atomics-fp.ll
@@ -59,15 +59,29 @@ define amdgpu_kernel void @global_atomic_fadd_ret_f32(float addrspace(1)* %ptr)
; GFX90A-LABEL: global_atomic_fadd_ret_f32:
; GFX90A: ; %bb.0:
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX90A-NEXT: v_mov_b32_e32 v0, 0
-; GFX90A-NEXT: v_mov_b32_e32 v1, 4.0
+; GFX90A-NEXT: s_mov_b64 s[2:3], 0
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NEXT: BB0_1: ; %atomicrmw.start
+; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT: v_mov_b32_e32 v1, v0
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_add_f32_e32 v0, 4.0, v1
; GFX90A-NEXT: buffer_wbl2
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_atomic_add_f32 v0, v0, v1, s[0:1] glc scc
+; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc scc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1_vol
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX90A-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX90A-NEXT: s_cbranch_execnz BB0_1
+; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX90A-NEXT: global_store_dword v[0:1], v0, off
; GFX90A-NEXT: s_endpgm
%result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 seq_cst
@@ -156,7 +170,7 @@ define amdgpu_kernel void @global_atomic_fadd_ret_f32_ieee(float addrspace(1)* %
; GFX90A-NEXT: s_or_b64 exec, exec, s[2:3]
; GFX90A-NEXT: global_store_dword v[0:1], v0, off
; GFX90A-NEXT: s_endpgm
- %result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 seq_cst
+ %result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 syncscope("agent") seq_cst
store float %result, float addrspace(1)* undef
ret void
}
@@ -202,15 +216,12 @@ define amdgpu_kernel void @global_atomic_fadd_noret_f32(float addrspace(1)* %ptr
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
; GFX90A-NEXT: v_mov_b32_e32 v0, 0
; GFX90A-NEXT: v_mov_b32_e32 v1, 4.0
-; GFX90A-NEXT: buffer_wbl2
; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GFX90A-NEXT: global_atomic_add_f32 v0, v1, s[0:1] scc
-; GFX90A-NEXT: s_waitcnt vmcnt(0)
-; GFX90A-NEXT: buffer_invl2
+; GFX90A-NEXT: global_atomic_add_f32 v0, v1, s[0:1]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: s_endpgm
- %result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 seq_cst
+ %result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 syncscope("agent") seq_cst
ret void
}
@@ -289,7 +300,162 @@ define amdgpu_kernel void @global_atomic_fadd_noret_f32_ieee(float addrspace(1)*
; GFX90A-NEXT: s_cbranch_execnz BB3_1
; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
; GFX90A-NEXT: s_endpgm
- %result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 seq_cst
+ %result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 syncscope("agent") seq_cst
+ ret void
+}
+
+define amdgpu_kernel void @global_atomic_fadd_ret_f32_agent(float addrspace(1)* %ptr) #0 {
+; GFX900-LABEL: global_atomic_fadd_ret_f32_agent:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX900-NEXT: s_mov_b64 s[2:3], 0
+; GFX900-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX900-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v0, s4
+; GFX900-NEXT: BB4_1: ; %atomicrmw.start
+; GFX900-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX900-NEXT: v_mov_b32_e32 v1, v0
+; GFX900-NEXT: v_mov_b32_e32 v2, 0
+; GFX900-NEXT: v_add_f32_e32 v0, 4.0, v1
+; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX900-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: buffer_wbinvl1_vol
+; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX900-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX900-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX900-NEXT: s_cbranch_execnz BB4_1
+; GFX900-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX900-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX900-NEXT: global_store_dword v[0:1], v0, off
+; GFX900-NEXT: s_endpgm
+;
+; GFX908-LABEL: global_atomic_fadd_ret_f32_agent:
+; GFX908: ; %bb.0:
+; GFX908-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX908-NEXT: s_mov_b64 s[2:3], 0
+; GFX908-NEXT: s_waitcnt lgkmcnt(0)
+; GFX908-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX908-NEXT: s_waitcnt lgkmcnt(0)
+; GFX908-NEXT: v_mov_b32_e32 v0, s4
+; GFX908-NEXT: BB4_1: ; %atomicrmw.start
+; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT: v_mov_b32_e32 v1, v0
+; GFX908-NEXT: v_mov_b32_e32 v2, 0
+; GFX908-NEXT: v_add_f32_e32 v0, 4.0, v1
+; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX908-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: buffer_wbinvl1_vol
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX908-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX908-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX908-NEXT: s_cbranch_execnz BB4_1
+; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX908-NEXT: global_store_dword v[0:1], v0, off
+; GFX908-NEXT: s_endpgm
+;
+; GFX90A-LABEL: global_atomic_fadd_ret_f32_agent:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NEXT: v_mov_b32_e32 v1, 4.0
+; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: global_atomic_add_f32 v0, v0, v1, s[0:1] glc
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: buffer_wbinvl1_vol
+; GFX90A-NEXT: global_store_dword v[0:1], v0, off
+; GFX90A-NEXT: s_endpgm
+ %result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 syncscope("agent") seq_cst
+ store float %result, float addrspace(1)* undef
+ ret void
+}
+
+define amdgpu_kernel void @global_atomic_fadd_ret_f32_system(float addrspace(1)* %ptr) #0 {
+; GFX900-LABEL: global_atomic_fadd_ret_f32_system:
+; GFX900: ; %bb.0:
+; GFX900-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX900-NEXT: s_mov_b64 s[2:3], 0
+; GFX900-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX900-NEXT: s_waitcnt lgkmcnt(0)
+; GFX900-NEXT: v_mov_b32_e32 v0, s4
+; GFX900-NEXT: BB5_1: ; %atomicrmw.start
+; GFX900-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX900-NEXT: v_mov_b32_e32 v1, v0
+; GFX900-NEXT: v_mov_b32_e32 v2, 0
+; GFX900-NEXT: v_add_f32_e32 v0, 4.0, v1
+; GFX900-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX900-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX900-NEXT: s_waitcnt vmcnt(0)
+; GFX900-NEXT: buffer_wbinvl1_vol
+; GFX900-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX900-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX900-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX900-NEXT: s_cbranch_execnz BB5_1
+; GFX900-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX900-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX900-NEXT: global_store_dword v[0:1], v0, off
+; GFX900-NEXT: s_endpgm
+;
+; GFX908-LABEL: global_atomic_fadd_ret_f32_system:
+; GFX908: ; %bb.0:
+; GFX908-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX908-NEXT: s_mov_b64 s[2:3], 0
+; GFX908-NEXT: s_waitcnt lgkmcnt(0)
+; GFX908-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX908-NEXT: s_waitcnt lgkmcnt(0)
+; GFX908-NEXT: v_mov_b32_e32 v0, s4
+; GFX908-NEXT: BB5_1: ; %atomicrmw.start
+; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT: v_mov_b32_e32 v1, v0
+; GFX908-NEXT: v_mov_b32_e32 v2, 0
+; GFX908-NEXT: v_add_f32_e32 v0, 4.0, v1
+; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX908-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX908-NEXT: s_waitcnt vmcnt(0)
+; GFX908-NEXT: buffer_wbinvl1_vol
+; GFX908-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX908-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX908-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX908-NEXT: s_cbranch_execnz BB5_1
+; GFX908-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX908-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX908-NEXT: global_store_dword v[0:1], v0, off
+; GFX908-NEXT: s_endpgm
+;
+; GFX90A-LABEL: global_atomic_fadd_ret_f32_system:
+; GFX90A: ; %bb.0:
+; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX90A-NEXT: s_mov_b64 s[2:3], 0
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: s_load_dword s4, s[0:1], 0x0
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NEXT: BB5_1: ; %atomicrmw.start
+; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT: v_mov_b32_e32 v1, v0
+; GFX90A-NEXT: v_mov_b32_e32 v2, 0
+; GFX90A-NEXT: v_add_f32_e32 v0, 4.0, v1
+; GFX90A-NEXT: buffer_wbl2
+; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX90A-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc scc
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: buffer_invl2
+; GFX90A-NEXT: s_waitcnt vmcnt(0)
+; GFX90A-NEXT: buffer_wbinvl1_vol
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX90A-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX90A-NEXT: s_cbranch_execnz BB5_1
+; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX90A-NEXT: global_store_dword v[0:1], v0, off
+; GFX90A-NEXT: s_endpgm
+ %result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 syncscope("one-as") seq_cst
+ store float %result, float addrspace(1)* undef
ret void
}
@@ -302,7 +468,7 @@ define amdgpu_kernel void @global_atomic_fadd_ret_f32_wrong_subtarget(float addr
; GCN-NEXT: s_load_dword s4, s[0:1], 0x0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s4
-; GCN-NEXT: BB4_1: ; %atomicrmw.start
+; GCN-NEXT: BB6_1: ; %atomicrmw.start
; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
; GCN-NEXT: v_mov_b32_e32 v1, v0
; GCN-NEXT: v_mov_b32_e32 v2, 0
@@ -314,12 +480,12 @@ define amdgpu_kernel void @global_atomic_fadd_ret_f32_wrong_subtarget(float addr
; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
; GCN-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GCN-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GCN-NEXT: s_cbranch_execnz BB4_1
+; GCN-NEXT: s_cbranch_execnz BB6_1
; GCN-NEXT: ; %bb.2: ; %atomicrmw.end
; GCN-NEXT: s_or_b64 exec, exec, s[2:3]
; GCN-NEXT: global_store_dword v[0:1], v0, off
; GCN-NEXT: s_endpgm
- %result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 seq_cst
+ %result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 syncscope("agent") seq_cst
store float %result, float addrspace(1)* undef
ret void
}
@@ -335,7 +501,7 @@ define amdgpu_kernel void @global_atomic_fadd_noret_f32_wrong_subtarget(float ad
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: buffer_wbinvl1_vol
; GCN-NEXT: s_endpgm
- %result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 seq_cst
+ %result = atomicrmw fadd float addrspace(1)* %ptr, float 4.0 syncscope("agent") seq_cst
ret void
}