[llvm-branch-commits] [llvm] AMDGPU: Add baseline tests for flat-may-alias private atomic expansions (PR #109406)
via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Fri Sep 20 04:55:34 PDT 2024
llvmbot wrote:
@llvm/pr-subscribers-backend-amdgpu
Author: Matt Arsenault (arsenm)
---
Patch is 759.08 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/109406.diff
4 Files Affected:
- (modified) llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.f64.ll (+51)
- (added) llvm/test/CodeGen/AMDGPU/flat_atomics_i64_noprivate.ll (+6911)
- (added) llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system_noprivate.ll (+9196)
- (added) llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomicrmw-flat-noalias-addrspace.ll (+1523)
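For context on what these baseline tests exercise: the new `atomicrmw` sites carry `!noalias.addrspace` metadata asserting that the flat pointer never points into the private (scratch) address space, which is the precondition for skipping the flat-may-alias-private expansion. A minimal sketch of the pattern, reduced from the diff below (the function name is hypothetical; semantics of the metadata as used in this patch):

```llvm
; Sketch, assuming the semantics this patch relies on: the operands
; !{i32 5, i32 6} give a half-open range [5, 6) of address spaces the
; pointer is known not to access, here AMDGPU private (scratch), AS 5.
define i64 @flat_atomic_add_no_private(ptr %ptr, i64 %in) {
  %old = atomicrmw add ptr %ptr, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
  ret i64 %old
}

!0 = !{i32 5, i32 6}
```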
``````````diff
diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.f64.ll b/llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.f64.ll
index e5dcf9ce309cd8..32cb1056022de2 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.f64.ll
@@ -77,6 +77,29 @@ define amdgpu_ps void @flat_atomic_fadd_f64_no_rtn_atomicrmw(ptr %ptr, double %d
ret void
}
+define amdgpu_ps void @flat_atomic_fadd_f64_no_rtn_atomicrmw_noprivate(ptr %ptr, double %data) #0 {
+ ; GFX90A_GFX940-LABEL: name: flat_atomic_fadd_f64_no_rtn_atomicrmw_noprivate
+ ; GFX90A_GFX940: bb.0 (%ir-block.0):
+ ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX90A_GFX940-NEXT: {{ $}}
+ ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX90A_GFX940-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX90A_GFX940-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
+ ; GFX90A_GFX940-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX90A_GFX940-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
+ ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
+ ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]]
+ ; GFX90A_GFX940-NEXT: FLAT_ATOMIC_ADD_F64 killed [[COPY4]], killed [[COPY5]], 0, 0, implicit $exec, implicit $flat_scr :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr)
+ ; GFX90A_GFX940-NEXT: S_ENDPGM 0
+ %ret = atomicrmw fadd ptr %ptr, double %data syncscope("wavefront") monotonic, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0
+ ret void
+}
+
define amdgpu_ps double @flat_atomic_fadd_f64_rtn_atomicrmw(ptr %ptr, double %data) #0 {
; GFX90A_GFX940-LABEL: name: flat_atomic_fadd_f64_rtn_atomicrmw
; GFX90A_GFX940: bb.0 (%ir-block.0):
@@ -104,8 +127,36 @@ define amdgpu_ps double @flat_atomic_fadd_f64_rtn_atomicrmw(ptr %ptr, double %da
ret double %ret
}
+define amdgpu_ps double @flat_atomic_fadd_f64_rtn_atomicrmw__noprivate(ptr %ptr, double %data) #0 {
+ ; GFX90A_GFX940-LABEL: name: flat_atomic_fadd_f64_rtn_atomicrmw__noprivate
+ ; GFX90A_GFX940: bb.0 (%ir-block.0):
+ ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX90A_GFX940-NEXT: {{ $}}
+ ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX90A_GFX940-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX90A_GFX940-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
+ ; GFX90A_GFX940-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX90A_GFX940-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF
+ ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
+ ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]]
+ ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]]
+ ; GFX90A_GFX940-NEXT: [[FLAT_ATOMIC_ADD_F64_RTN:%[0-9]+]]:vreg_64_align2 = FLAT_ATOMIC_ADD_F64_RTN killed [[COPY4]], killed [[COPY5]], 0, 1, implicit $exec, implicit $flat_scr :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr)
+ ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[FLAT_ATOMIC_ADD_F64_RTN]].sub0
+ ; GFX90A_GFX940-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[FLAT_ATOMIC_ADD_F64_RTN]].sub1
+ ; GFX90A_GFX940-NEXT: $sgpr0 = COPY [[COPY6]]
+ ; GFX90A_GFX940-NEXT: $sgpr1 = COPY [[COPY7]]
+ ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1
+ %ret = atomicrmw fadd ptr %ptr, double %data syncscope("wavefront") monotonic, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0
+ ret double %ret
+}
+
declare double @llvm.amdgcn.flat.atomic.fadd.f64.p1.f64(ptr, double)
attributes #0 = { nounwind }
!0 = !{}
+!1 = !{i32 5, i32 6}
diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_noprivate.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_noprivate.ll
new file mode 100644
index 00000000000000..c0b3adce81342d
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_noprivate.ll
@@ -0,0 +1,6911 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=amdgcn -mcpu=bonaire < %s | FileCheck -check-prefix=GCN1 %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=GCN2 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s
+
+define amdgpu_kernel void @atomic_add_i64_offset(ptr %out, i64 %in) {
+; GCN1-LABEL: atomic_add_i64_offset:
+; GCN1: ; %bb.0: ; %entry
+; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
+; GCN1-NEXT: s_waitcnt lgkmcnt(0)
+; GCN1-NEXT: s_add_u32 s0, s0, 32
+; GCN1-NEXT: s_addc_u32 s1, s1, 0
+; GCN1-NEXT: v_mov_b32_e32 v3, s1
+; GCN1-NEXT: v_mov_b32_e32 v0, s2
+; GCN1-NEXT: v_mov_b32_e32 v1, s3
+; GCN1-NEXT: v_mov_b32_e32 v2, s0
+; GCN1-NEXT: flat_atomic_add_x2 v[2:3], v[0:1]
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: s_endpgm
+;
+; GCN2-LABEL: atomic_add_i64_offset:
+; GCN2: ; %bb.0: ; %entry
+; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GCN2-NEXT: s_waitcnt lgkmcnt(0)
+; GCN2-NEXT: s_add_u32 s0, s0, 32
+; GCN2-NEXT: s_addc_u32 s1, s1, 0
+; GCN2-NEXT: v_mov_b32_e32 v3, s1
+; GCN2-NEXT: v_mov_b32_e32 v0, s2
+; GCN2-NEXT: v_mov_b32_e32 v1, s3
+; GCN2-NEXT: v_mov_b32_e32 v2, s0
+; GCN2-NEXT: flat_atomic_add_x2 v[2:3], v[0:1]
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: s_endpgm
+;
+; GFX12-LABEL: atomic_add_i64_offset:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: global_wb scope:SCOPE_DEV
+; GFX12-NEXT: flat_atomic_add_u64 v[0:1], v[2:3] offset:32 scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_endpgm
+entry:
+ %gep = getelementptr i64, ptr %out, i64 4
+ %tmp0 = atomicrmw volatile add ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
+ ret void
+}
+
+define amdgpu_kernel void @atomic_add_i64_ret_offset(ptr %out, ptr %out2, i64 %in) {
+; GCN1-LABEL: atomic_add_i64_ret_offset:
+; GCN1: ; %bb.0: ; %entry
+; GCN1-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd
+; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
+; GCN1-NEXT: s_waitcnt lgkmcnt(0)
+; GCN1-NEXT: v_mov_b32_e32 v0, s4
+; GCN1-NEXT: s_add_u32 s0, s0, 32
+; GCN1-NEXT: s_addc_u32 s1, s1, 0
+; GCN1-NEXT: v_mov_b32_e32 v3, s1
+; GCN1-NEXT: v_mov_b32_e32 v1, s5
+; GCN1-NEXT: v_mov_b32_e32 v2, s0
+; GCN1-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] glc
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: v_mov_b32_e32 v2, s2
+; GCN1-NEXT: v_mov_b32_e32 v3, s3
+; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GCN1-NEXT: s_endpgm
+;
+; GCN2-LABEL: atomic_add_i64_ret_offset:
+; GCN2: ; %bb.0: ; %entry
+; GCN2-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34
+; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GCN2-NEXT: s_waitcnt lgkmcnt(0)
+; GCN2-NEXT: v_mov_b32_e32 v0, s4
+; GCN2-NEXT: s_add_u32 s0, s0, 32
+; GCN2-NEXT: s_addc_u32 s1, s1, 0
+; GCN2-NEXT: v_mov_b32_e32 v3, s1
+; GCN2-NEXT: v_mov_b32_e32 v1, s5
+; GCN2-NEXT: v_mov_b32_e32 v2, s0
+; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] glc
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: v_mov_b32_e32 v2, s2
+; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GCN2-NEXT: s_endpgm
+;
+; GFX12-LABEL: atomic_add_i64_ret_offset:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24
+; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX12-NEXT: global_wb scope:SCOPE_DEV
+; GFX12-NEXT: flat_atomic_add_u64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
+; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
+; GFX12-NEXT: s_endpgm
+entry:
+ %gep = getelementptr i64, ptr %out, i64 4
+ %tmp0 = atomicrmw volatile add ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
+ store i64 %tmp0, ptr %out2
+ ret void
+}
+
+define amdgpu_kernel void @atomic_add_i64_addr64_offset(ptr %out, i64 %in, i64 %index) {
+; GCN1-LABEL: atomic_add_i64_addr64_offset:
+; GCN1: ; %bb.0: ; %entry
+; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
+; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd
+; GCN1-NEXT: s_waitcnt lgkmcnt(0)
+; GCN1-NEXT: v_mov_b32_e32 v0, s6
+; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GCN1-NEXT: s_add_u32 s0, s4, s0
+; GCN1-NEXT: s_addc_u32 s1, s5, s1
+; GCN1-NEXT: s_add_u32 s0, s0, 32
+; GCN1-NEXT: s_addc_u32 s1, s1, 0
+; GCN1-NEXT: v_mov_b32_e32 v3, s1
+; GCN1-NEXT: v_mov_b32_e32 v1, s7
+; GCN1-NEXT: v_mov_b32_e32 v2, s0
+; GCN1-NEXT: flat_atomic_add_x2 v[2:3], v[0:1]
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: s_endpgm
+;
+; GCN2-LABEL: atomic_add_i64_addr64_offset:
+; GCN2: ; %bb.0: ; %entry
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
+; GCN2-NEXT: s_waitcnt lgkmcnt(0)
+; GCN2-NEXT: v_mov_b32_e32 v0, s6
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
+; GCN2-NEXT: s_add_u32 s0, s0, 32
+; GCN2-NEXT: s_addc_u32 s1, s1, 0
+; GCN2-NEXT: v_mov_b32_e32 v3, s1
+; GCN2-NEXT: v_mov_b32_e32 v1, s7
+; GCN2-NEXT: v_mov_b32_e32 v2, s0
+; GCN2-NEXT: flat_atomic_add_x2 v[2:3], v[0:1]
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: s_endpgm
+;
+; GFX12-LABEL: atomic_add_i64_addr64_offset:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34
+; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5]
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-NEXT: global_wb scope:SCOPE_DEV
+; GFX12-NEXT: flat_atomic_add_u64 v[2:3], v[0:1] offset:32 scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_endpgm
+entry:
+ %ptr = getelementptr i64, ptr %out, i64 %index
+ %gep = getelementptr i64, ptr %ptr, i64 4
+ %tmp0 = atomicrmw volatile add ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
+ ret void
+}
+
+define amdgpu_kernel void @atomic_add_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) {
+; GCN1-LABEL: atomic_add_i64_ret_addr64_offset:
+; GCN1: ; %bb.0: ; %entry
+; GCN1-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9
+; GCN1-NEXT: s_waitcnt lgkmcnt(0)
+; GCN1-NEXT: v_mov_b32_e32 v0, s4
+; GCN1-NEXT: v_mov_b32_e32 v1, s5
+; GCN1-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
+; GCN1-NEXT: s_add_u32 s0, s0, s4
+; GCN1-NEXT: s_addc_u32 s1, s1, s5
+; GCN1-NEXT: s_add_u32 s0, s0, 32
+; GCN1-NEXT: s_addc_u32 s1, s1, 0
+; GCN1-NEXT: v_mov_b32_e32 v3, s1
+; GCN1-NEXT: v_mov_b32_e32 v2, s0
+; GCN1-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] glc
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: v_mov_b32_e32 v2, s2
+; GCN1-NEXT: v_mov_b32_e32 v3, s3
+; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GCN1-NEXT: s_endpgm
+;
+; GCN2-LABEL: atomic_add_i64_ret_addr64_offset:
+; GCN2: ; %bb.0: ; %entry
+; GCN2-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24
+; GCN2-NEXT: s_waitcnt lgkmcnt(0)
+; GCN2-NEXT: v_mov_b32_e32 v0, s4
+; GCN2-NEXT: v_mov_b32_e32 v1, s5
+; GCN2-NEXT: s_lshl_b64 s[4:5], s[6:7], 3
+; GCN2-NEXT: s_add_u32 s0, s0, s4
+; GCN2-NEXT: s_addc_u32 s1, s1, s5
+; GCN2-NEXT: s_add_u32 s0, s0, 32
+; GCN2-NEXT: s_addc_u32 s1, s1, 0
+; GCN2-NEXT: v_mov_b32_e32 v3, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s0
+; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] glc
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: v_mov_b32_e32 v2, s2
+; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GCN2-NEXT: s_endpgm
+;
+; GFX12-LABEL: atomic_add_i64_ret_addr64_offset:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3
+; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7]
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX12-NEXT: global_wb scope:SCOPE_DEV
+; GFX12-NEXT: flat_atomic_add_u64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
+; GFX12-NEXT: s_endpgm
+entry:
+ %ptr = getelementptr i64, ptr %out, i64 %index
+ %gep = getelementptr i64, ptr %ptr, i64 4
+ %tmp0 = atomicrmw volatile add ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
+ store i64 %tmp0, ptr %out2
+ ret void
+}
+
+define amdgpu_kernel void @atomic_add_i64(ptr %out, i64 %in) {
+; GCN1-LABEL: atomic_add_i64:
+; GCN1: ; %bb.0: ; %entry
+; GCN1-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9
+; GCN1-NEXT: s_waitcnt lgkmcnt(0)
+; GCN1-NEXT: v_mov_b32_e32 v0, s0
+; GCN1-NEXT: v_mov_b32_e32 v1, s1
+; GCN1-NEXT: v_mov_b32_e32 v2, s2
+; GCN1-NEXT: v_mov_b32_e32 v3, s3
+; GCN1-NEXT: flat_atomic_add_x2 v[0:1], v[2:3]
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: s_endpgm
+;
+; GCN2-LABEL: atomic_add_i64:
+; GCN2: ; %bb.0: ; %entry
+; GCN2-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24
+; GCN2-NEXT: s_waitcnt lgkmcnt(0)
+; GCN2-NEXT: v_mov_b32_e32 v0, s0
+; GCN2-NEXT: v_mov_b32_e32 v1, s1
+; GCN2-NEXT: v_mov_b32_e32 v2, s2
+; GCN2-NEXT: v_mov_b32_e32 v3, s3
+; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[2:3]
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: s_endpgm
+;
+; GFX12-LABEL: atomic_add_i64:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX12-NEXT: global_wb scope:SCOPE_DEV
+; GFX12-NEXT: flat_atomic_add_u64 v[0:1], v[2:3] scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_endpgm
+entry:
+ %tmp0 = atomicrmw volatile add ptr %out, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
+ ret void
+}
+
+define amdgpu_kernel void @atomic_add_i64_ret(ptr %out, ptr %out2, i64 %in) {
+; GCN1-LABEL: atomic_add_i64_ret:
+; GCN1: ; %bb.0: ; %entry
+; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
+; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd
+; GCN1-NEXT: s_waitcnt lgkmcnt(0)
+; GCN1-NEXT: v_mov_b32_e32 v0, s4
+; GCN1-NEXT: v_mov_b32_e32 v1, s5
+; GCN1-NEXT: v_mov_b32_e32 v2, s0
+; GCN1-NEXT: v_mov_b32_e32 v3, s1
+; GCN1-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: v_mov_b32_e32 v2, s6
+; GCN1-NEXT: v_mov_b32_e32 v3, s7
+; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GCN1-NEXT: s_endpgm
+;
+; GCN2-LABEL: atomic_add_i64_ret:
+; GCN2: ; %bb.0: ; %entry
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
+; GCN2-NEXT: s_waitcnt lgkmcnt(0)
+; GCN2-NEXT: v_mov_b32_e32 v0, s4
+; GCN2-NEXT: v_mov_b32_e32 v1, s5
+; GCN2-NEXT: v_mov_b32_e32 v2, s0
+; GCN2-NEXT: v_mov_b32_e32 v3, s1
+; GCN2-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: v_mov_b32_e32 v2, s6
+; GCN2-NEXT: v_mov_b32_e32 v3, s7
+; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
+; GCN2-NEXT: s_endpgm
+;
+; GFX12-LABEL: atomic_add_i64_ret:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24
+; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
+; GFX12-NEXT: global_wb scope:SCOPE_DEV
+; GFX12-NEXT: flat_atomic_add_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
+; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1]
+; GFX12-NEXT: s_endpgm
+entry:
+ %tmp0 = atomicrmw volatile add ptr %out, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0
+ store i64 %tmp0, ptr %out2
+ ret void
+}
+
+define amdgpu_kernel void @atomic_add_i64_addr64(ptr %out, i64 %in, i64 %index) {
+; GCN1-LABEL: atomic_add_i64_addr64:
+; GCN1: ; %bb.0: ; %entry
+; GCN1-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9
+; GCN1-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd
+; GCN1-NEXT: s_waitcnt lgkmcnt(0)
+; GCN1-NEXT: v_mov_b32_e32 v0, s6
+; GCN1-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GCN1-NEXT: s_add_u32 s0, s4, s0
+; GCN1-NEXT: s_addc_u32 s1, s5, s1
+; GCN1-NEXT: v_mov_b32_e32 v3, s1
+; GCN1-NEXT: v_mov_b32_e32 v1, s7
+; GCN1-NEXT: v_mov_b32_e32 v2, s0
+; GCN1-NEXT: flat_atomic_add_x2 v[2:3], v[0:1]
+; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN1-NEXT: buffer_wbinvl1_vol
+; GCN1-NEXT: s_endpgm
+;
+; GCN2-LABEL: atomic_add_i64_addr64:
+; GCN2: ; %bb.0: ; %entry
+; GCN2-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24
+; GCN2-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34
+; GCN2-NEXT: s_waitcnt lgkmcnt(0)
+; GCN2-NEXT: v_mov_b32_e32 v0, s6
+; GCN2-NEXT: s_lshl_b64 s[0:1], s[0:1], 3
+; GCN2-NEXT: s_add_u32 s0, s4, s0
+; GCN2-NEXT: s_addc_u32 s1, s5, s1
+; GCN2-NEXT: v_mov_b32_e32 v3, s1
+; GCN2-NEXT: v_mov_b32_e32 v1, s7
+; GCN2-NEXT: v_mov_b32_e32 v2, s0
+; GCN2-NEXT: flat_atomic_add_x2 v[2:3], v[0:1]
+; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN2-NEXT: buffer_wbinvl1_vol
+; GCN2-NEXT: s_endpgm
+;
+; GFX12-LABEL: atomic_add_i64_addr64:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34
+; GFX12-NEXT: s_load_b128 s[0:3...
[truncated]
``````````
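As the `NOTE:` headers in the new files say, the CHECK lines are autogenerated, so they can be regenerated rather than hand-edited if codegen changes. A typical invocation (the build directory path is an assumption):

```
python llvm/utils/update_llc_test_checks.py --llc-binary=build/bin/llc \
    llvm/test/CodeGen/AMDGPU/flat_atomics_i64_noprivate.ll
```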
https://github.com/llvm/llvm-project/pull/109406