[llvm] [WIP][AMDGPU] Make `getAssumedAddrSpace` return AS1 for pointer kernel arguments (PR #137488)
via llvm-commits
llvm-commits at lists.llvm.org
Sat Apr 26 20:09:50 PDT 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-amdgpu
Author: Shilei Tian (shiltian)
<details>
<summary>Changes</summary>
---
Patch is 1.04 MiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/137488.diff
37 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp (+4)
- (modified) llvm/lib/Transforms/IPO/AttributorAttributes.cpp (+6-17)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll (+207-492)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll (+293-647)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/fp-atomics-gfx942.ll (+32-12)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll (+120-44)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll (+8-52)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-amdgpu_kernel.ll (+4-2)
- (modified) llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll (+120-232)
- (modified) llvm/test/CodeGen/AMDGPU/aa-as-infer.ll (+2-4)
- (modified) llvm/test/CodeGen/AMDGPU/amd.endpgm.ll (+8-11)
- (modified) llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll (+6-6)
- (modified) llvm/test/CodeGen/AMDGPU/cluster_stores.ll (+86-138)
- (modified) llvm/test/CodeGen/AMDGPU/combine-vload-extract.ll (+18-16)
- (modified) llvm/test/CodeGen/AMDGPU/dagcomb-extract-vec-elt-different-sizes.ll (+9-25)
- (modified) llvm/test/CodeGen/AMDGPU/flat_atomics.ll (+967-960)
- (modified) llvm/test/CodeGen/AMDGPU/flat_atomics_i32_system.ll (+1009-617)
- (modified) llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll (+724-616)
- (modified) llvm/test/CodeGen/AMDGPU/flat_atomics_i64_noprivate.ll (+574-466)
- (modified) llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll (+1200-1760)
- (modified) llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system_noprivate.ll (+1249-680)
- (modified) llvm/test/CodeGen/AMDGPU/fp-atomics-gfx942.ll (+66-24)
- (modified) llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll (+35-35)
- (modified) llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll (+8-48)
- (modified) llvm/test/CodeGen/AMDGPU/invalid-cast-load-i1.ll (+2-6)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll (+141-224)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll (+11-103)
- (modified) llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll (+11-103)
- (modified) llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll (+8-12)
- (modified) llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll (+160-176)
- (modified) llvm/test/CodeGen/AMDGPU/memory-legalizer-store-infinite-loop.ll (+15)
- (modified) llvm/test/CodeGen/AMDGPU/offset-split-flat.ll (+160-57)
- (modified) llvm/test/CodeGen/AMDGPU/preload-kernarg-header.ll (+5-4)
- (modified) llvm/test/CodeGen/AMDGPU/scalar_to_vector.v8i16.ll (+32-72)
- (modified) llvm/test/Transforms/InferAddressSpaces/AMDGPU/flat_atomic.ll (+47-28)
- (modified) llvm/test/Transforms/OpenMP/barrier_removal.ll (+16-7)
- (modified) llvm/test/Transforms/OpenMP/spmdization_guarding.ll (+40-28)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index eb50617b281a1..150060e1b266c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -951,6 +951,10 @@ bool AMDGPUTargetMachine::isNoopAddrSpaceCast(unsigned SrcAS,
}
unsigned AMDGPUTargetMachine::getAssumedAddrSpace(const Value *V) const {
+ if (auto *Arg = dyn_cast<Argument>(V);
+ Arg && AMDGPU::isKernelCC(Arg->getParent()))
+ return AMDGPUAS::GLOBAL_ADDRESS;
+
const auto *LD = dyn_cast<LoadInst>(V);
if (!LD) // TODO: Handle invariant load like constant.
return AMDGPUAS::UNKNOWN_ADDRESS_SPACE;
diff --git a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
index 1032dede7cb36..b6ad73a6acc94 100644
--- a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
+++ b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
@@ -12597,29 +12597,18 @@ struct AAAddressSpaceImpl : public AAAddressSpace {
}
ChangeStatus updateImpl(Attributor &A) override {
- unsigned FlatAS = A.getInfoCache().getFlatAddressSpace().value();
uint32_t OldAddressSpace = AssumedAddressSpace;
auto CheckAddressSpace = [&](Value &Obj) {
if (isa<UndefValue>(&Obj))
return true;
- // If an argument in flat address space only has addrspace cast uses, and
- // those casts are same, then we take the dst addrspace.
if (auto *Arg = dyn_cast<Argument>(&Obj)) {
- if (Arg->getType()->getPointerAddressSpace() == FlatAS) {
- unsigned CastAddrSpace = FlatAS;
- for (auto *U : Arg->users()) {
- auto *ASCI = dyn_cast<AddrSpaceCastInst>(U);
- if (!ASCI)
- return takeAddressSpace(Obj.getType()->getPointerAddressSpace());
- if (CastAddrSpace != FlatAS &&
- CastAddrSpace != ASCI->getDestAddressSpace())
- return false;
- CastAddrSpace = ASCI->getDestAddressSpace();
- }
- if (CastAddrSpace != FlatAS)
- return takeAddressSpace(CastAddrSpace);
- }
+ auto *TTI =
+ A.getInfoCache().getAnalysisResultForFunction<TargetIRAnalysis>(
+ *Arg->getParent());
+ unsigned AssumedAS = TTI->getAssumedAddrSpace(Arg);
+ if (AssumedAS != ~0U)
+ return takeAddressSpace(AssumedAS);
}
return takeAddressSpace(Obj.getType()->getPointerAddressSpace());
};
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll
index aeb301939e986..0347d9e5c3110 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll
@@ -946,7 +946,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32(ptr %out, ptr %ptr) #1 {
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: flat_atomic_dec v2, v[0:1], v2 glc
-; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: buffer_wbinvl1_vol
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
@@ -964,7 +964,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32(ptr %out, ptr %ptr) #1 {
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_atomic_dec v2, v[0:1], v2 glc
-; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
@@ -974,52 +974,38 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32(ptr %out, ptr %ptr) #1 {
; GFX9-LABEL: flat_atomic_dec_ret_i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
-; GFX9-NEXT: v_mov_b32_e32 v2, 42
+; GFX9-NEXT: v_mov_b32_e32 v0, 42
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: flat_atomic_dec v2, v[0:1], v2 glc
-; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_atomic_dec v0, v1, v0, s[2:3] glc
+; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: flat_store_dword v[0:1], v2
+; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: flat_atomic_dec_ret_i32:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_add_u32 s12, s12, s17
-; GFX10-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX10-NEXT: v_mov_b32_e32 v2, 42
+; GFX10-NEXT: v_mov_b32_e32 v0, 42
+; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v0, s2
-; GFX10-NEXT: v_mov_b32_e32 v1, s3
-; GFX10-NEXT: flat_atomic_dec v2, v[0:1], v2 glc
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_atomic_dec v0, v1, v0, s[2:3] glc
+; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_mov_b32_e32 v0, s0
-; GFX10-NEXT: v_mov_b32_e32 v1, s1
-; GFX10-NEXT: flat_store_dword v[0:1], v2
+; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: flat_atomic_dec_ret_i32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
-; GFX11-NEXT: v_mov_b32_e32 v2, 42
+; GFX11-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-NEXT: flat_atomic_dec_u32 v2, v[0:1], v2 glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_atomic_dec_u32 v0, v1, v0, s[2:3] glc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-NEXT: s_endpgm
%result = atomicrmw udec_wrap ptr %ptr, i32 42 syncscope("agent") seq_cst, align 4
store i32 %result, ptr %out, align 4
@@ -1040,7 +1026,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset(ptr %out, ptr %ptr) #1
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: flat_atomic_dec v2, v[0:1], v2 glc
-; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: buffer_wbinvl1_vol
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
@@ -1060,7 +1046,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset(ptr %out, ptr %ptr) #1
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_atomic_dec v2, v[0:1], v2 glc
-; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
@@ -1070,54 +1056,38 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset(ptr %out, ptr %ptr) #1
; GFX9-LABEL: flat_atomic_dec_ret_i32_offset:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
-; GFX9-NEXT: v_mov_b32_e32 v2, 42
+; GFX9-NEXT: v_mov_b32_e32 v0, 42
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: flat_atomic_dec v2, v[0:1], v2 offset:16 glc
-; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_atomic_dec v0, v1, v0, s[2:3] offset:16 glc
+; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: flat_store_dword v[0:1], v2
+; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: flat_atomic_dec_ret_i32_offset:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_add_u32 s12, s12, s17
-; GFX10-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX10-NEXT: v_mov_b32_e32 v2, 42
+; GFX10-NEXT: v_mov_b32_e32 v0, 42
+; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_add_u32 s2, s2, 16
-; GFX10-NEXT: s_addc_u32 s3, s3, 0
-; GFX10-NEXT: v_mov_b32_e32 v0, s2
-; GFX10-NEXT: v_mov_b32_e32 v1, s3
-; GFX10-NEXT: flat_atomic_dec v2, v[0:1], v2 glc
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_atomic_dec v0, v1, v0, s[2:3] offset:16 glc
+; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_mov_b32_e32 v0, s0
-; GFX10-NEXT: v_mov_b32_e32 v1, s1
-; GFX10-NEXT: flat_store_dword v[0:1], v2
+; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: flat_atomic_dec_ret_i32_offset:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
-; GFX11-NEXT: v_mov_b32_e32 v2, 42
+; GFX11-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-NEXT: flat_atomic_dec_u32 v2, v[0:1], v2 offset:16 glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_atomic_dec_u32 v0, v1, v0, s[2:3] offset:16 glc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-NEXT: s_endpgm
%gep = getelementptr i32, ptr %ptr, i32 4
%result = atomicrmw udec_wrap ptr %gep, i32 42 syncscope("agent") seq_cst, align 4
@@ -1139,7 +1109,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_system(ptr %out, ptr %
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: flat_atomic_dec v2, v[0:1], v2 glc
-; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: buffer_wbinvl1_vol
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
@@ -1159,7 +1129,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_system(ptr %out, ptr %
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_atomic_dec v2, v[0:1], v2 glc
-; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
@@ -1169,54 +1139,38 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_system(ptr %out, ptr %
; GFX9-LABEL: flat_atomic_dec_ret_i32_offset_system:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
-; GFX9-NEXT: v_mov_b32_e32 v2, 42
+; GFX9-NEXT: v_mov_b32_e32 v0, 42
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s2
-; GFX9-NEXT: v_mov_b32_e32 v1, s3
-; GFX9-NEXT: flat_atomic_dec v2, v[0:1], v2 offset:16 glc
-; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_atomic_dec v0, v1, v0, s[2:3] offset:16 glc
+; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: flat_store_dword v[0:1], v2
+; GFX9-NEXT: global_store_dword v1, v0, s[0:1]
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: flat_atomic_dec_ret_i32_offset_system:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_add_u32 s12, s12, s17
-; GFX10-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
-; GFX10-NEXT: v_mov_b32_e32 v2, 42
+; GFX10-NEXT: v_mov_b32_e32 v0, 42
+; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_add_u32 s2, s2, 16
-; GFX10-NEXT: s_addc_u32 s3, s3, 0
-; GFX10-NEXT: v_mov_b32_e32 v0, s2
-; GFX10-NEXT: v_mov_b32_e32 v1, s3
-; GFX10-NEXT: flat_atomic_dec v2, v[0:1], v2 glc
-; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX10-NEXT: global_atomic_dec v0, v1, v0, s[2:3] offset:16 glc
+; GFX10-NEXT: s_waitcnt vmcnt(0)
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: v_mov_b32_e32 v0, s0
-; GFX10-NEXT: v_mov_b32_e32 v1, s1
-; GFX10-NEXT: flat_store_dword v[0:1], v2
+; GFX10-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: flat_atomic_dec_ret_i32_offset_system:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
-; GFX11-NEXT: v_mov_b32_e32 v2, 42
+; GFX11-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
-; GFX11-NEXT: flat_atomic_dec_u32 v2, v[0:1], v2 offset:16 glc
-; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX11-NEXT: global_atomic_dec_u32 v0, v1, v0, s[2:3] offset:16 glc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-NEXT: s_endpgm
%gep = getelementptr i32, ptr %ptr, i32 4
%result = atomicrmw udec_wrap ptr %gep, i32 42 seq_cst, align 4
@@ -1236,7 +1190,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32(ptr %ptr) #1 {
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: flat_atomic_dec v[0:1], v2
-; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: buffer_wbinvl1_vol
; CI-NEXT: s_endpgm
;
@@ -1251,37 +1205,28 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32(ptr %ptr) #1 {
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_atomic_dec v[0:1], v2
-; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: flat_atomic_dec_noret_i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
-; GFX9-NEXT: v_mov_b32_e32 v2, 42
+; GFX9-NEXT: v_mov_b32_e32 v0, 42
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: flat_atomic_dec v[0:1], v2
-; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_atomic_dec v1, v0, s[0:1]
+; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: flat_atomic_dec_noret_i32:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_add_u32 s12, s12, s17
-; GFX10-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GFX10-NEXT: v_mov_b32_e32 v2, 42
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v0, s0
-; GFX10-NEXT: v_mov_b32_e32 v1, s1
-; GFX10-NEXT: flat_atomic_dec v[0:1], v2
+; GFX10-NEXT: v_mov_b32_e32 v0, 42
+; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: global_atomic_dec v1, v0, s[0:1]
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
@@ -1290,11 +1235,9 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32(ptr %ptr) #1 {
; GFX11-LABEL: flat_atomic_dec_noret_i32:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX11-NEXT: v_mov_b32_e32 v2, 42
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-NEXT: flat_atomic_dec_u32 v[0:1], v2
+; GFX11-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_atomic_dec_u32 v1, v0, s[0:1]
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
@@ -1317,7 +1260,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset(ptr %ptr) #1 {
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: flat_atomic_dec v[0:1], v2
-; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: buffer_wbinvl1_vol
; CI-NEXT: s_endpgm
;
@@ -1334,39 +1277,28 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset(ptr %ptr) #1 {
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_atomic_dec v[0:1], v2
-; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
; VI-NEXT: s_endpgm
;
; GFX9-LABEL: flat_atomic_dec_noret_i32_offset:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
-; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
-; GFX9-NEXT: v_mov_b32_e32 v2, 42
+; GFX9-NEXT: v_mov_b32_e32 v0, 42
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: v_mov_b32_e32 v0, s0
-; GFX9-NEXT: v_mov_b32_e32 v1, s1
-; GFX9-NEXT: flat_atomic_dec v[0:1], v2 offset:16
-; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: global_atomic_dec v1, v0, s[0:1] offset:16
+; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: buffer_wbinvl1_vol
; GFX9-NEXT: s_endpgm
;
; GFX10-LABEL: flat_atomic_dec_noret_i32_offset:
; GFX10: ; %bb.0:
-; GFX10-NEXT: s_add_u32 s12, s12, s17
-; GFX10-NEXT: s_addc_u32 s13, s13, 0
-; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
-; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GFX10-NEXT: v_mov_b32_e32 v2, 42
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: s_add_u32 s0, s0, 16
-; GFX10-NEXT: s_addc_u32 s1, s1, 0
-; GFX10-NEXT: v_mov_b32_e32 v0, s0
-; GFX10-NEXT: v_mov_b32_e32 v1, s1
-; GFX10-NEXT: flat_atomic_dec v[0:1], v2
+; GFX10-NEXT: v_mov_b32_e32 v0, 42
+; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-NEXT: global_atomic_dec v1, v0, s[0:1] offset:16
; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: buffer_gl1_inv
; GFX10-NEXT: buffer_gl0_inv
@@ -1375,11 +1307,9 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset(ptr %ptr) #1 {
; GFX11-LABEL: flat_atomic_dec_noret_i32_offset:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX11-NEXT: v_mov_b32_e32 v2, 42
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX11-NEXT: flat_atomic_dec_u32 v[0:1], v2 offset:16
+; GFX11-NEXT: v_dual_mov_b32 v0, 42 :: v_dual_mov_b32 v1, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: global_atomic_dec_u32 v1, v0, s[0:1] offset:16
; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: buffer_gl1_inv
; GFX11-NEXT: buffer_gl0_inv
@@ -1403,7 +1333,7 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_system(ptr %ptr) #1
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: flat_atomic_dec v[0:1], v2
-; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; CI-NEXT:...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/137488
More information about the llvm-commits
mailing list