[llvm] AMDGPU: Drop and upgrade llvm.amdgcn.atomic.csub/cond.sub to atomicrmw (PR #105553)
via llvm-commits
llvm-commits at lists.llvm.org
Tue Dec 9 08:08:29 PST 2025
https://github.com/anjenner updated https://github.com/llvm/llvm-project/pull/105553
>From 38218ed0c21061357bde16989c4148ee949f9cd8 Mon Sep 17 00:00:00 2001
From: Andrew Jenner <Andrew.Jenner at amd.com>
Date: Thu, 15 Aug 2024 10:00:29 -0400
Subject: [PATCH 1/2] AMDGPU: Drop and upgrade llvm.amdgcn.atomic.csub/cond.sub
to atomicrmw
---
llvm/docs/AMDGPUUsage.rst | 5 -
llvm/docs/ReleaseNotes.md | 4 +
llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 12 -
llvm/lib/IR/AutoUpgrade.cpp | 11 +-
llvm/lib/Target/AMDGPU/AMDGPUInstructions.td | 4 -
.../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 10 -
.../AMDGPU/AMDGPULowerBufferFatPointers.cpp | 10 +
.../Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 2 -
.../Target/AMDGPU/AMDGPUSearchableTables.td | 6 -
llvm/lib/Target/AMDGPU/BUFInstructions.td | 2 +-
llvm/lib/Target/AMDGPU/DSInstructions.td | 9 -
llvm/lib/Target/AMDGPU/FLATInstructions.td | 11 -
llvm/lib/Target/AMDGPU/R600ISelLowering.cpp | 8 +
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 30 +-
.../UniformityAnalysis/AMDGPU/atomics.ll | 57 --
llvm/test/Bitcode/amdgcn-atomic.ll | 147 ++++++
.../llvm.amdgcn.global.atomic.csub.ll | 270 ----------
.../test/CodeGen/AMDGPU/atomicrmw_cond_sub.ll | 197 +++++++
.../CodeGen/AMDGPU/atomicrmw_sub_clamp.ll | 495 ++++++++++++++++++
llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll | 193 ++++---
.../AMDGPU/llvm.amdgcn.atomic.cond.sub.ll | 219 --------
21 files changed, 979 insertions(+), 723 deletions(-)
delete mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/atomicrmw_cond_sub.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/atomicrmw_sub_clamp.ll
delete mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.cond.sub.ll
diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index 3e7a5dfc504ae..7ecf1c1124894 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -1527,11 +1527,6 @@ The AMDGPU backend implements the following LLVM IR intrinsics.
The iglp_opt strategy implementations are subject to change.
- llvm.amdgcn.atomic.cond.sub.u32 Provides direct access to flat_atomic_cond_sub_u32, global_atomic_cond_sub_u32
- and ds_cond_sub_u32 based on address space on gfx12 targets. This
- performs a subtraction only if the memory value is greater than or
- equal to the data value.
-
llvm.amdgcn.s.barrier.signal.isfirst Provides access to the s_barrier_signal_first instruction;
additionally ensures that the result value is valid even when the
intrinsic is used from a wave that is not running in a workgroup.
diff --git a/llvm/docs/ReleaseNotes.md b/llvm/docs/ReleaseNotes.md
index dc0cec6122cdf..7fcd0fcc5ec91 100644
--- a/llvm/docs/ReleaseNotes.md
+++ b/llvm/docs/ReleaseNotes.md
@@ -114,6 +114,10 @@ Changes to the AArch64 Backend
Changes to the AMDGPU Backend
-----------------------------
+* Removed `llvm.amdgcn.atomic.cond.sub.u32` and
+ `llvm.amdgcn.atomic.csub.u32` intrinsics. Users should use the
+ `atomicrmw` instruction with `usub_cond` and `usub_sat` instead.
+
Changes to the ARM Backend
--------------------------
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 91d72d5ef9dfc..d57b15b745d79 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -1702,8 +1702,6 @@ def int_amdgcn_raw_buffer_atomic_or : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_xor : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_inc : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_dec : AMDGPURawBufferAtomic;
-def int_amdgcn_raw_buffer_atomic_cond_sub_u32 : AMDGPURawBufferAtomic;
-def int_amdgcn_raw_buffer_atomic_sub_clamp_u32 : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_cmpswap : Intrinsic<
[llvm_anyint_ty],
[LLVMMatchType<0>, // src(VGPR)
@@ -1740,8 +1738,6 @@ def int_amdgcn_raw_ptr_buffer_atomic_or : AMDGPURawPtrBufferAtomic;
def int_amdgcn_raw_ptr_buffer_atomic_xor : AMDGPURawPtrBufferAtomic;
def int_amdgcn_raw_ptr_buffer_atomic_inc : AMDGPURawPtrBufferAtomic;
def int_amdgcn_raw_ptr_buffer_atomic_dec : AMDGPURawPtrBufferAtomic;
-def int_amdgcn_raw_ptr_buffer_atomic_cond_sub_u32 : AMDGPURawPtrBufferAtomic;
-def int_amdgcn_raw_ptr_buffer_atomic_sub_clamp_u32 : AMDGPURawPtrBufferAtomic;
def int_amdgcn_raw_ptr_buffer_atomic_cmpswap : Intrinsic<
[llvm_anyint_ty],
[LLVMMatchType<0>, // src(VGPR)
@@ -1782,8 +1778,6 @@ def int_amdgcn_struct_buffer_atomic_or : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_xor : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_inc : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_dec : AMDGPUStructBufferAtomic;
-def int_amdgcn_struct_buffer_atomic_cond_sub_u32 : AMDGPUStructBufferAtomic;
-def int_amdgcn_struct_buffer_atomic_sub_clamp_u32 : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_cmpswap : Intrinsic<
[llvm_anyint_ty],
[LLVMMatchType<0>, // src(VGPR)
@@ -1819,8 +1813,6 @@ def int_amdgcn_struct_ptr_buffer_atomic_or : AMDGPUStructPtrBufferAtomic;
def int_amdgcn_struct_ptr_buffer_atomic_xor : AMDGPUStructPtrBufferAtomic;
def int_amdgcn_struct_ptr_buffer_atomic_inc : AMDGPUStructPtrBufferAtomic;
def int_amdgcn_struct_ptr_buffer_atomic_dec : AMDGPUStructPtrBufferAtomic;
-def int_amdgcn_struct_ptr_buffer_atomic_cond_sub_u32 : AMDGPUStructPtrBufferAtomic;
-def int_amdgcn_struct_ptr_buffer_atomic_sub_clamp_u32 : AMDGPUStructPtrBufferAtomic;
def int_amdgcn_struct_ptr_buffer_atomic_cmpswap : Intrinsic<
[llvm_anyint_ty],
[LLVMMatchType<0>, // src(VGPR)
@@ -2886,8 +2878,6 @@ class AMDGPUAtomicRtn<LLVMType vt, LLVMType pt = llvm_anyptr_ty> : Intrinsic <
[IntrArgMemOnly, IntrWillReturn, NoCapture<ArgIndex<0>>, IntrNoCallback, IntrNoFree], "",
[SDNPMemOperand]>;
-def int_amdgcn_global_atomic_csub : AMDGPUAtomicRtn<llvm_i32_ty>;
-
// uint4 llvm.amdgcn.image.bvh.intersect.ray <node_ptr>, <ray_extent>, <ray_origin>,
// <ray_dir>, <ray_inv_dir>, <texture_descr>
// <node_ptr> is i32 or i64.
@@ -3135,8 +3125,6 @@ def int_amdgcn_flat_atomic_fmax_num : AMDGPUAtomicRtn<llvm_anyfloat_ty>;
def int_amdgcn_global_atomic_fmin_num : AMDGPUAtomicRtn<llvm_anyfloat_ty>;
def int_amdgcn_global_atomic_fmax_num : AMDGPUAtomicRtn<llvm_anyfloat_ty>;
-def int_amdgcn_atomic_cond_sub_u32 : AMDGPUAtomicRtn<llvm_i32_ty>;
-
class AMDGPULoadIntrinsic<LLVMType ptr_ty>:
Intrinsic<
[llvm_any_ty],
diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp
index 487db134b0df3..b757d9d5107f9 100644
--- a/llvm/lib/IR/AutoUpgrade.cpp
+++ b/llvm/lib/IR/AutoUpgrade.cpp
@@ -1210,9 +1210,10 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn,
}
if (Name.consume_front("atomic.")) {
- if (Name.starts_with("inc") || Name.starts_with("dec")) {
- // These were replaced with atomicrmw uinc_wrap and udec_wrap, so
- // there's no new declaration.
+ if (Name.starts_with("inc") || Name.starts_with("dec") ||
+ Name.starts_with("cond.sub") || Name.starts_with("csub")) {
+ // These were replaced with atomicrmw uinc_wrap, udec_wrap, usub_cond
+ // and usub_sat so there's no new declaration.
NewFn = nullptr;
return true;
}
@@ -4516,7 +4517,9 @@ static Value *upgradeAMDGCNIntrinsicCall(StringRef Name, CallBase *CI,
.StartsWith("global.atomic.fmin", AtomicRMWInst::FMin)
.StartsWith("flat.atomic.fmin", AtomicRMWInst::FMin)
.StartsWith("global.atomic.fmax", AtomicRMWInst::FMax)
- .StartsWith("flat.atomic.fmax", AtomicRMWInst::FMax);
+ .StartsWith("flat.atomic.fmax", AtomicRMWInst::FMax)
+ .StartsWith("atomic.cond.sub", AtomicRMWInst::USubCond)
+ .StartsWith("atomic.csub", AtomicRMWInst::USubSat);
unsigned NumOperands = CI->getNumOperands();
if (NumOperands < 3) // Malformed bitcode.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
index dd86cb5d3d5a6..2a99dacba52a4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
@@ -636,15 +636,11 @@ multiclass local_addr_space_atomic_op {
}
}
-defm int_amdgcn_global_atomic_csub : noret_op;
defm int_amdgcn_global_atomic_ordered_add_b64 : noret_op;
defm int_amdgcn_flat_atomic_fmin_num : noret_op;
defm int_amdgcn_flat_atomic_fmax_num : noret_op;
defm int_amdgcn_global_atomic_fmin_num : noret_op;
defm int_amdgcn_global_atomic_fmax_num : noret_op;
-defm int_amdgcn_atomic_cond_sub_u32 : local_addr_space_atomic_op;
-defm int_amdgcn_atomic_cond_sub_u32 : flat_addr_space_atomic_op;
-defm int_amdgcn_atomic_cond_sub_u32 : global_addr_space_atomic_op;
multiclass noret_binary_atomic_op<SDNode atomic_op> {
let HasNoUse = true in
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 6e3a1b6a5563f..a9749cf2e33e4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -6568,16 +6568,6 @@ static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
- case Intrinsic::amdgcn_raw_buffer_atomic_sub_clamp_u32:
- case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub_clamp_u32:
- case Intrinsic::amdgcn_struct_buffer_atomic_sub_clamp_u32:
- case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub_clamp_u32:
- return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB_CLAMP_U32;
- case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
- case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cond_sub_u32:
- case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
- case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cond_sub_u32:
- return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_COND_SUB_U32;
default:
llvm_unreachable("unhandled atomic opcode");
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
index 97e7a239e1099..3205b038112ef 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
@@ -1783,6 +1783,16 @@ Value *SplitPtrStructs::handleMemoryInst(Instruction *I, Value *Arg, Value *Ptr,
"wrapping increment/decrement not supported for "
"buffer resources and should've been expanded away");
break;
+ case AtomicRMWInst::USubCond:
+ reportFatalUsageError(
+ "conditional subtract not supported for buffer "
+ "resources and should've been expanded away");
+ break;
+ case AtomicRMWInst::USubSat:
+ reportFatalUsageError(
+ "subtract with clamp not supported for buffer "
+ "resources and should've been expanded away");
+ break;
case AtomicRMWInst::BAD_BINOP:
llvm_unreachable("Not sure how we got a bad binop");
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index a7955ee2dac40..ce4cc799543f7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -5309,12 +5309,10 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
break;
}
- case Intrinsic::amdgcn_global_atomic_csub:
case Intrinsic::amdgcn_global_atomic_fmin_num:
case Intrinsic::amdgcn_global_atomic_fmax_num:
case Intrinsic::amdgcn_flat_atomic_fmin_num:
case Intrinsic::amdgcn_flat_atomic_fmax_num:
- case Intrinsic::amdgcn_atomic_cond_sub_u32:
case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
case Intrinsic::amdgcn_global_load_tr_b64:
case Intrinsic::amdgcn_global_load_tr_b128:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
index 2393346839707..1fb8b9f6238e5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
@@ -237,8 +237,6 @@ def : SourceOfDivergence<int_amdgcn_mbcnt_lo>;
def : SourceOfDivergence<int_r600_read_tidig_x>;
def : SourceOfDivergence<int_r600_read_tidig_y>;
def : SourceOfDivergence<int_r600_read_tidig_z>;
-def : SourceOfDivergence<int_amdgcn_atomic_cond_sub_u32>;
-def : SourceOfDivergence<int_amdgcn_global_atomic_csub>;
def : SourceOfDivergence<int_amdgcn_global_atomic_fmin_num>;
def : SourceOfDivergence<int_amdgcn_global_atomic_fmax_num>;
def : SourceOfDivergence<int_amdgcn_global_atomic_ordered_add_b64>;
@@ -260,7 +258,6 @@ def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_fadd>;
def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_fmin>;
def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_fmax>;
def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_cmpswap>;
-def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_cond_sub_u32>;
def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_swap>;
def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_add>;
def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_sub>;
@@ -277,7 +274,6 @@ def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_fadd>;
def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_fmin>;
def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_fmax>;
def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_cmpswap>;
-def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_cond_sub_u32>;
def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_swap>;
def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_add>;
def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_sub>;
@@ -294,7 +290,6 @@ def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_fadd>;
def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_fmin>;
def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_fmax>;
def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_cmpswap>;
-def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_cond_sub_u32>;
def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_swap>;
def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_add>;
def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_sub>;
@@ -311,7 +306,6 @@ def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_fadd>;
def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_fmin>;
def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_fmax>;
def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_cmpswap>;
-def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_cond_sub_u32>;
def : SourceOfDivergence<int_amdgcn_ps_live>;
def : SourceOfDivergence<int_amdgcn_live_mask>;
def : SourceOfDivergence<int_amdgcn_ds_swizzle>;
diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td
index b97b7385dc1ff..82a5769705d54 100644
--- a/llvm/lib/Target/AMDGPU/BUFInstructions.td
+++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td
@@ -1096,7 +1096,7 @@ defm BUFFER_ATOMIC_DEC_X2 : MUBUF_Pseudo_Atomics <
let OtherPredicates = [HasGFX10_BEncoding] in {
defm BUFFER_ATOMIC_CSUB : MUBUF_Pseudo_Atomics <
- "buffer_atomic_csub", VGPROp_32, i32, int_amdgcn_global_atomic_csub
+ "buffer_atomic_csub", VGPROp_32, i32
>;
}
diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td
index 4b3bd0c09e076..3a53cef96473c 100644
--- a/llvm/lib/Target/AMDGPU/DSInstructions.td
+++ b/llvm/lib/Target/AMDGPU/DSInstructions.td
@@ -886,15 +886,6 @@ defm DS_SUB_CLAMP_RTN_U32 : DS_1A1D_RET_mc_gfx9<"ds_sub_clamp_rtn_u32", VGPROp_3
def DS_BPERMUTE_FI_B32 : DS_1A1D_PERMUTE <"ds_bpermute_fi_b32",
int_amdgcn_ds_bpermute_fi_b32>;
-multiclass DSAtomicRetNoRetPatIntrinsic_mc<DS_Pseudo inst, DS_Pseudo noRetInst,
- ValueType vt, string frag> {
- def : DSAtomicRetPat<inst, vt,
- !cast<PatFrag>(frag#"_local_addrspace")>;
- def : DSAtomicRetPat<noRetInst, vt,
- !cast<PatFrag>(frag#"_noret_local_addrspace"), /* complexity */ 1>;
-}
-
-defm : DSAtomicRetNoRetPatIntrinsic_mc<DS_COND_SUB_RTN_U32, DS_COND_SUB_U32, i32, "int_amdgcn_atomic_cond_sub_u32">;
} // let SubtargetPredicate = isGFX12Plus
let SubtargetPredicate = isGFX1250Plus in {
diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td
index 0e60c73aa5db7..9e38af91c7ccf 100644
--- a/llvm/lib/Target/AMDGPU/FLATInstructions.td
+++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -1562,10 +1562,6 @@ multiclass FlatAtomicNoRtnPatBase <string base_inst_name, string node, ValueType
}
}
-multiclass FlatAtomicNoRtnPatWithAddrSpace<string inst, string node, string addrSpaceSuffix,
- ValueType vt> :
- FlatAtomicNoRtnPatBase<inst, node # "_noret_" # addrSpaceSuffix, vt, vt>;
-
multiclass FlatAtomicNoRtnPat <string inst, string node, ValueType vt,
ValueType data_vt = vt, bit isIntr = 0> :
FlatAtomicNoRtnPatBase<inst, node # "_noret" # !if(isIntr, "", "_"#vt), vt, data_vt>;
@@ -1590,10 +1586,6 @@ multiclass FlatAtomicRtnPatBase <string inst_name, string node, ValueType vt,
}
}
-multiclass FlatAtomicRtnPatWithAddrSpace<string inst, string intr, string addrSpaceSuffix,
- ValueType vt> :
- FlatAtomicRtnPatBase<inst, intr # "_" # addrSpaceSuffix, vt, vt>;
-
multiclass FlatAtomicRtnPat <string inst, string node, ValueType vt,
ValueType data_vt = vt, bit isIntr = 0> :
FlatAtomicRtnPatBase<inst, node # !if(isIntr, "", "_"#vt), vt, data_vt>;
@@ -2189,9 +2181,6 @@ let SubtargetPredicate = HasAtomicCondSubClampFlatInsts in {
defm : FlatStorePats <FLAT_STORE_BYTE, truncstorei8_flat, i16>;
defm : FlatStorePats <FLAT_STORE_SHORT, store_flat, i16>;
-defm : FlatAtomicRtnPatWithAddrSpace<"FLAT_ATOMIC_COND_SUB_U32", "int_amdgcn_atomic_cond_sub_u32", "flat_addrspace", i32>;
-defm : FlatAtomicNoRtnPatWithAddrSpace<"FLAT_ATOMIC_COND_SUB_U32", "int_amdgcn_atomic_cond_sub_u32", "flat_addrspace", i32>;
-
let OtherPredicates = [HasD16LoadStore] in {
defm : FlatStorePats <FLAT_STORE_SHORT_D16_HI, truncstorei16_hi16_flat, i32>;
defm : FlatStorePats <FLAT_STORE_BYTE_D16_HI, truncstorei8_hi16_flat, i32>;
diff --git a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
index 3c4f115d7251a..6bb9c412d3697 100644
--- a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
@@ -2193,6 +2193,14 @@ R600TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
// FIXME: Cayman at least appears to have instructions for this, but the
// instruction defintions appear to be missing.
return AtomicExpansionKind::CmpXChg;
+ case AtomicRMWInst::USubCond:
+ case AtomicRMWInst::USubSat:
+ if (auto *IntTy = dyn_cast<IntegerType>(RMW->getType())) {
+ unsigned Size = IntTy->getBitWidth();
+ if (Size == 32)
+ return AtomicExpansionKind::None;
+ }
+ return AtomicExpansionKind::CmpXChg;
case AtomicRMWInst::Xchg: {
const DataLayout &DL = RMW->getFunction()->getDataLayout();
unsigned ValSize = DL.getTypeSizeInBits(RMW->getType());
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index ff67fd63ea75e..1557bc57fda5c 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1521,15 +1521,6 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
return true;
}
- case Intrinsic::amdgcn_global_atomic_csub: {
- Info.opc = ISD::INTRINSIC_W_CHAIN;
- Info.memVT = MVT::getVT(CI.getType());
- Info.ptrVal = CI.getOperand(0);
- Info.align.reset();
- Info.flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore |
- MachineMemOperand::MOVolatile;
- return true;
- }
case Intrinsic::amdgcn_image_bvh_dual_intersect_ray:
case Intrinsic::amdgcn_image_bvh_intersect_ray:
case Intrinsic::amdgcn_image_bvh8_intersect_ray: {
@@ -1550,8 +1541,7 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
case Intrinsic::amdgcn_global_atomic_fmax_num:
case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
case Intrinsic::amdgcn_flat_atomic_fmin_num:
- case Intrinsic::amdgcn_flat_atomic_fmax_num:
- case Intrinsic::amdgcn_atomic_cond_sub_u32: {
+ case Intrinsic::amdgcn_flat_atomic_fmax_num: {
Info.opc = ISD::INTRINSIC_W_CHAIN;
Info.memVT = MVT::getVT(CI.getType());
Info.ptrVal = CI.getOperand(0);
@@ -1726,7 +1716,6 @@ bool SITargetLowering::getAddrModeArguments(const IntrinsicInst *II,
Type *&AccessTy) const {
Value *Ptr = nullptr;
switch (II->getIntrinsicID()) {
- case Intrinsic::amdgcn_atomic_cond_sub_u32:
case Intrinsic::amdgcn_cluster_load_b128:
case Intrinsic::amdgcn_cluster_load_b64:
case Intrinsic::amdgcn_cluster_load_b32:
@@ -1749,7 +1738,6 @@ bool SITargetLowering::getAddrModeArguments(const IntrinsicInst *II,
case Intrinsic::amdgcn_flat_load_monitor_b128:
case Intrinsic::amdgcn_flat_load_monitor_b32:
case Intrinsic::amdgcn_flat_load_monitor_b64:
- case Intrinsic::amdgcn_global_atomic_csub:
case Intrinsic::amdgcn_global_atomic_fmax_num:
case Intrinsic::amdgcn_global_atomic_fmin_num:
case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
@@ -10622,21 +10610,6 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
case Intrinsic::amdgcn_struct_buffer_atomic_dec:
case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
- case Intrinsic::amdgcn_raw_buffer_atomic_sub_clamp_u32:
- case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub_clamp_u32:
- return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_CSUB);
- case Intrinsic::amdgcn_struct_buffer_atomic_sub_clamp_u32:
- case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub_clamp_u32:
- return lowerStructBufferAtomicIntrin(Op, DAG,
- AMDGPUISD::BUFFER_ATOMIC_CSUB);
- case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
- case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cond_sub_u32:
- return lowerRawBufferAtomicIntrin(Op, DAG,
- AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
- case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
- case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cond_sub_u32:
- return lowerStructBufferAtomicIntrin(Op, DAG,
- AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap: {
SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(4), DAG);
@@ -18372,7 +18345,6 @@ bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode *N,
case AMDGPUISD::BUFFER_ATOMIC_INC:
case AMDGPUISD::BUFFER_ATOMIC_DEC:
case AMDGPUISD::BUFFER_ATOMIC_CMPSWAP:
- case AMDGPUISD::BUFFER_ATOMIC_CSUB:
case AMDGPUISD::BUFFER_ATOMIC_FADD:
case AMDGPUISD::BUFFER_ATOMIC_FMIN:
case AMDGPUISD::BUFFER_ATOMIC_FMAX:
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/atomics.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/atomics.ll
index 15355ea139205..d9e51c39c2042 100644
--- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/atomics.ll
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/atomics.ll
@@ -15,62 +15,5 @@ define amdgpu_kernel void @test2(ptr %ptr, i32 %cmp, i32 %new) {
ret void
}
-; CHECK: DIVERGENT: %ret = call i32 @llvm.amdgcn.global.atomic.csub.p1(ptr addrspace(1) %ptr, i32 %val)
-define amdgpu_kernel void @test_atomic_csub_i32(ptr addrspace(1) %ptr, i32 %val) #0 {
- %ret = call i32 @llvm.amdgcn.global.atomic.csub.p1(ptr addrspace(1) %ptr, i32 %val)
- store i32 %ret, ptr addrspace(1) %ptr, align 4
- ret void
-}
-
-; CHECK: DIVERGENT: %val = call i32 @llvm.amdgcn.atomic.cond.sub.u32.p3(ptr addrspace(3) %gep, i32 %in)
-define amdgpu_kernel void @test_ds_atomic_cond_sub_rtn_u32(ptr addrspace(3) %addr, i32 %in, ptr addrspace(3) %use) #0 {
-entry:
- %gep = getelementptr i32, ptr addrspace(3) %addr, i32 4
- %val = call i32 @llvm.amdgcn.atomic.cond.sub.u32.p3(ptr addrspace(3) %gep, i32 %in)
- store i32 %val, ptr addrspace(3) %use
- ret void
-}
-
-; CHECK: DIVERGENT: %val = call i32 @llvm.amdgcn.atomic.cond.sub.u32.p0(ptr %gep, i32 %in)
-define amdgpu_kernel void @test_flat_atomic_cond_sub_u32(ptr %addr, i32 %in, ptr %use) #0 {
-entry:
- %gep = getelementptr i32, ptr %addr, i32 4
- %val = call i32 @llvm.amdgcn.atomic.cond.sub.u32.p0(ptr %gep, i32 %in)
- store i32 %val, ptr %use
- ret void
-}
-
-; CHECK: DIVERGENT: %val = call i32 @llvm.amdgcn.atomic.cond.sub.u32.p1(ptr addrspace(1) %gep, i32 %in)
-define amdgpu_kernel void @test_global_atomic_cond_u32(ptr addrspace(1) %addr, i32 %in, ptr addrspace(1) %use) #0 {
-entry:
- %gep = getelementptr i32, ptr addrspace(1) %addr, i32 4
- %val = call i32 @llvm.amdgcn.atomic.cond.sub.u32.p1(ptr addrspace(1) %gep, i32 %in)
- store i32 %val, ptr addrspace(1) %use
- ret void
-}
-
-; CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.raw.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 0)
-define float @test_raw_buffer_atomic_cond_sub_u32(<4 x i32> inreg %rsrc, i32 inreg %data) #0 {
-entry:
- %orig = call i32 @llvm.amdgcn.raw.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 0)
- %r = bitcast i32 %orig to float
- ret float %r
-}
-
-; CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.struct.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0)
-define float @test_struct_buffer_atomic_cond_sub_u32(<4 x i32> inreg %rsrc, i32 inreg %data) #0 {
-entry:
- %orig = call i32 @llvm.amdgcn.struct.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0)
- %r = bitcast i32 %orig to float
- ret float %r
-}
-
-declare i32 @llvm.amdgcn.global.atomic.csub.p1(ptr addrspace(1) nocapture, i32) #1
-declare i32 @llvm.amdgcn.atomic.cond.sub.u32.p3(ptr addrspace(3), i32) #1
-declare i32 @llvm.amdgcn.atomic.cond.sub.u32.p0(ptr, i32) #1
-declare i32 @llvm.amdgcn.atomic.cond.sub.u32.p1(ptr addrspace(1), i32) #1
-declare i32 @llvm.amdgcn.raw.buffer.atomic.cond.sub.u32.i32(i32, <4 x i32>, i32, i32, i32) #1
-declare i32 @llvm.amdgcn.struct.buffer.atomic.cond.sub.u32.i32(i32, <4 x i32>, i32, i32, i32, i32) #1
-
attributes #0 = { nounwind }
attributes #1 = { argmemonly nounwind willreturn }
diff --git a/llvm/test/Bitcode/amdgcn-atomic.ll b/llvm/test/Bitcode/amdgcn-atomic.ll
index 3e28cd050fc88..e9194ea1e14fe 100644
--- a/llvm/test/Bitcode/amdgcn-atomic.ll
+++ b/llvm/test/Bitcode/amdgcn-atomic.ll
@@ -420,5 +420,152 @@ define double @upgrade_amdgcn_global_atomic_fmax_f64_p1_f64(ptr addrspace(1) %pt
attributes #0 = { argmemonly nounwind willreturn }
+define void @atomic_usub_cond(ptr %ptr0, ptr addrspace(1) %ptr1, ptr addrspace(3) %ptr3) {
+ ; CHECK: atomicrmw usub_cond ptr %ptr0, i32 42 syncscope("agent") seq_cst, align 4
+ %result0 = call i32 @llvm.amdgcn.atomic.cond.sub.i32.p0(ptr %ptr0, i32 42, i32 0, i32 0, i1 false)
+
+ ; CHECK: atomicrmw usub_cond ptr addrspace(1) %ptr1, i32 43 syncscope("agent") seq_cst, align 4
+ %result1 = call i32 @llvm.amdgcn.atomic.cond.sub.i32.p1(ptr addrspace(1) %ptr1, i32 43, i32 0, i32 0, i1 false)
+
+ ; CHECK: atomicrmw usub_cond ptr addrspace(3) %ptr3, i32 46 syncscope("agent") seq_cst, align 4
+ %result2 = call i32 @llvm.amdgcn.atomic.cond.sub.i32.p3(ptr addrspace(3) %ptr3, i32 46, i32 0, i32 0, i1 false)
+
+ ; CHECK: atomicrmw usub_cond ptr %ptr0, i64 42 syncscope("agent") seq_cst, align 8
+ %result3 = call i64 @llvm.amdgcn.atomic.cond.sub.i64.p0(ptr %ptr0, i64 42, i64 0, i64 0, i1 false)
+
+ ; CHECK: atomicrmw usub_cond ptr addrspace(1) %ptr1, i64 43 syncscope("agent") seq_cst, align 8
+ %result4 = call i64 @llvm.amdgcn.atomic.cond.sub.i64.p1(ptr addrspace(1) %ptr1, i64 43, i64 0, i64 0, i1 false)
+
+ ; CHECK: atomicrmw usub_cond ptr addrspace(3) %ptr3, i64 46 syncscope("agent") seq_cst, align 8
+ %result5 = call i64 @llvm.amdgcn.atomic.cond.sub.i64.p3(ptr addrspace(3) %ptr3, i64 46, i64 0, i64 0, i1 false)
+ ret void
+}
+
+define void @atomic_usub_sat(ptr %ptr0, ptr addrspace(1) %ptr1, ptr addrspace(3) %ptr3) {
+ ; CHECK: atomicrmw usub_sat ptr %ptr0, i32 42 syncscope("agent") seq_cst, align 4
+ %result0 = call i32 @llvm.amdgcn.atomic.csub.i32.p0(ptr %ptr0, i32 42, i32 0, i32 0, i1 false)
+
+ ; CHECK: atomicrmw usub_sat ptr addrspace(1) %ptr1, i32 43 syncscope("agent") seq_cst, align 4
+ %result1 = call i32 @llvm.amdgcn.atomic.csub.i32.p1(ptr addrspace(1) %ptr1, i32 43, i32 0, i32 0, i1 false)
+
+ ; CHECK: atomicrmw usub_sat ptr addrspace(3) %ptr3, i32 46 syncscope("agent") seq_cst, align 4
+ %result2 = call i32 @llvm.amdgcn.atomic.csub.i32.p3(ptr addrspace(3) %ptr3, i32 46, i32 0, i32 0, i1 false)
+
+ ; CHECK: atomicrmw usub_sat ptr %ptr0, i64 42 syncscope("agent") seq_cst, align 8
+ %result3 = call i64 @llvm.amdgcn.atomic.csub.i64.p0(ptr %ptr0, i64 42, i64 0, i64 0, i1 false)
+
+ ; CHECK: atomicrmw usub_sat ptr addrspace(1) %ptr1, i64 43 syncscope("agent") seq_cst, align 8
+ %result4 = call i64 @llvm.amdgcn.atomic.csub.i64.p1(ptr addrspace(1) %ptr1, i64 43, i64 0, i64 0, i1 false)
+
+ ; CHECK: atomicrmw usub_sat ptr addrspace(3) %ptr3, i64 46 syncscope("agent") seq_cst, align 8
+ %result5 = call i64 @llvm.amdgcn.atomic.csub.i64.p3(ptr addrspace(3) %ptr3, i64 46, i64 0, i64 0, i1 false)
+ ret void
+}
+
+; Test some invalid ordering handling
+define void @ordering_usub_cond_usub_sat(ptr %ptr0, ptr addrspace(1) %ptr1, ptr addrspace(3) %ptr3) {
+ ; CHECK: atomicrmw volatile usub_cond ptr %ptr0, i32 42 syncscope("agent") seq_cst, align 4
+ %result0 = call i32 @llvm.amdgcn.atomic.cond.sub.i32.p0(ptr %ptr0, i32 42, i32 -1, i32 0, i1 true)
+
+ ; CHECK: atomicrmw volatile usub_cond ptr addrspace(1) %ptr1, i32 43 syncscope("agent") seq_cst, align 4
+ %result1 = call i32 @llvm.amdgcn.atomic.cond.sub.i32.p1(ptr addrspace(1) %ptr1, i32 43, i32 0, i32 0, i1 true)
+
+ ; CHECK: atomicrmw usub_cond ptr addrspace(1) %ptr1, i32 43 syncscope("agent") seq_cst, align 4
+ %result2 = call i32 @llvm.amdgcn.atomic.cond.sub.i32.p1(ptr addrspace(1) %ptr1, i32 43, i32 1, i32 0, i1 false)
+
+ ; CHECK: atomicrmw volatile usub_cond ptr addrspace(1) %ptr1, i32 43 syncscope("agent") monotonic, align 4
+ %result3 = call i32 @llvm.amdgcn.atomic.cond.sub.i32.p1(ptr addrspace(1) %ptr1, i32 43, i32 2, i32 0, i1 true)
+
+ ; CHECK: atomicrmw usub_cond ptr addrspace(1) %ptr1, i32 43 syncscope("agent") seq_cst, align 4
+ %result4 = call i32 @llvm.amdgcn.atomic.cond.sub.i32.p1(ptr addrspace(1) %ptr1, i32 43, i32 3, i32 0, i1 false)
+
+ ; CHECK: atomicrmw volatile usub_sat ptr %ptr0, i32 42 syncscope("agent") seq_cst, align 4
+ %result5 = call i32 @llvm.amdgcn.atomic.csub.i32.p0(ptr %ptr0, i32 42, i32 0, i32 4, i1 true)
+
+ ; CHECK: atomicrmw usub_sat ptr %ptr0, i32 42 syncscope("agent") seq_cst, align 4
+ %result6 = call i32 @llvm.amdgcn.atomic.csub.i32.p0(ptr %ptr0, i32 42, i32 0, i32 5, i1 false)
+
+ ; CHECK: atomicrmw volatile usub_sat ptr %ptr0, i32 42 syncscope("agent") seq_cst, align 4
+ %result7 = call i32 @llvm.amdgcn.atomic.csub.i32.p0(ptr %ptr0, i32 42, i32 0, i32 6, i1 true)
+
+ ; CHECK: atomicrmw usub_sat ptr %ptr0, i32 42 syncscope("agent") seq_cst, align 4
+ %result8 = call i32 @llvm.amdgcn.atomic.csub.i32.p0(ptr %ptr0, i32 42, i32 0, i32 7, i1 false)
+
+ ; CHECK:= atomicrmw volatile usub_sat ptr %ptr0, i32 42 syncscope("agent") seq_cst, align 4
+ %result9 = call i32 @llvm.amdgcn.atomic.csub.i32.p0(ptr %ptr0, i32 42, i32 0, i32 8, i1 true)
+
+ ; CHECK:= atomicrmw volatile usub_sat ptr addrspace(1) %ptr1, i32 43 syncscope("agent") seq_cst, align 4
+ %result10 = call i32 @llvm.amdgcn.atomic.csub.i32.p1(ptr addrspace(1) %ptr1, i32 43, i32 3, i32 0, i1 true)
+
+ ; CHECK: atomicrmw volatile usub_cond ptr %ptr0, i64 42 syncscope("agent") seq_cst, align 8
+ %result11 = call i64 @llvm.amdgcn.atomic.cond.sub.i64.p0(ptr %ptr0, i64 42, i64 -1, i64 0, i1 true)
+
+ ; CHECK: atomicrmw volatile usub_cond ptr addrspace(1) %ptr1, i64 43 syncscope("agent") seq_cst, align 8
+ %result12 = call i64 @llvm.amdgcn.atomic.cond.sub.i64.p1(ptr addrspace(1) %ptr1, i64 43, i64 0, i64 0, i1 true)
+
+ ; CHECK: atomicrmw usub_cond ptr addrspace(1) %ptr1, i64 43 syncscope("agent") seq_cst, align 8
+ %result13 = call i64 @llvm.amdgcn.atomic.cond.sub.i64.p1(ptr addrspace(1) %ptr1, i64 43, i64 1, i64 0, i1 false)
+
+ ; CHECK: atomicrmw volatile usub_cond ptr addrspace(1) %ptr1, i64 43 syncscope("agent") monotonic, align 8
+ %result14 = call i64 @llvm.amdgcn.atomic.cond.sub.i64.p1(ptr addrspace(1) %ptr1, i64 43, i64 2, i64 0, i1 true)
+
+ ; CHECK: atomicrmw usub_cond ptr addrspace(1) %ptr1, i64 43 syncscope("agent") seq_cst, align 8
+ %result15 = call i64 @llvm.amdgcn.atomic.cond.sub.i64.p1(ptr addrspace(1) %ptr1, i64 43, i64 3, i64 0, i1 false)
+
+ ; CHECK: atomicrmw volatile usub_sat ptr %ptr0, i64 42 syncscope("agent") seq_cst, align 8
+ %result16 = call i64 @llvm.amdgcn.atomic.csub.i64.p0(ptr %ptr0, i64 42, i64 0, i64 4, i1 true)
+
+ ; CHECK: atomicrmw usub_sat ptr %ptr0, i64 42 syncscope("agent") seq_cst, align 8
+ %result17 = call i64 @llvm.amdgcn.atomic.csub.i64.p0(ptr %ptr0, i64 42, i64 0, i64 5, i1 false)
+
+ ; CHECK: atomicrmw volatile usub_sat ptr %ptr0, i64 42 syncscope("agent") seq_cst, align 8
+ %result18 = call i64 @llvm.amdgcn.atomic.csub.i64.p0(ptr %ptr0, i64 42, i64 0, i64 6, i1 true)
+
+ ; CHECK: atomicrmw usub_sat ptr %ptr0, i64 42 syncscope("agent") seq_cst, align 8
+ %result19 = call i64 @llvm.amdgcn.atomic.csub.i64.p0(ptr %ptr0, i64 42, i64 0, i64 7, i1 false)
+
+ ; CHECK:= atomicrmw volatile usub_sat ptr %ptr0, i64 42 syncscope("agent") seq_cst, align 8
+ %result20 = call i64 @llvm.amdgcn.atomic.csub.i64.p0(ptr %ptr0, i64 42, i64 0, i64 8, i1 true)
+
+ ; CHECK:= atomicrmw volatile usub_sat ptr addrspace(1) %ptr1, i64 43 syncscope("agent") seq_cst, align 8
+ %result21 = call i64 @llvm.amdgcn.atomic.csub.i64.p1(ptr addrspace(1) %ptr1, i64 43, i64 3, i64 0, i1 true)
+ ret void
+}
+
+define void @immarg_violations_usub_sat(ptr %ptr0, i32 %val32, i1 %val1, i64 %val64) {
+ ; CHECK: atomicrmw usub_sat ptr %ptr0, i32 42 syncscope("agent") seq_cst, align 4
+ %result0 = call i32 @llvm.amdgcn.atomic.csub.i32.p0(ptr %ptr0, i32 42, i32 %val32, i32 0, i1 false)
+
+ ; CHECK: atomicrmw usub_sat ptr %ptr0, i32 42 syncscope("agent") monotonic, align 4
+ %result1 = call i32 @llvm.amdgcn.atomic.csub.i32.p0(ptr %ptr0, i32 42, i32 2, i32 %val32, i1 false)
+
+ ; CHECK: atomicrmw volatile usub_sat ptr %ptr0, i32 42 syncscope("agent") monotonic, align 4
+ %result2 = call i32 @llvm.amdgcn.atomic.csub.i32.p0(ptr %ptr0, i32 42, i32 2, i32 0, i1 %val1)
+
+ ; CHECK: atomicrmw usub_sat ptr %ptr0, i64 42 syncscope("agent") seq_cst, align 8
+ %result3 = call i64 @llvm.amdgcn.atomic.csub.i64.p0(ptr %ptr0, i64 42, i64 %val64, i64 0, i1 false)
+
+ ; CHECK: atomicrmw usub_sat ptr %ptr0, i64 42 syncscope("agent") monotonic, align 8
+ %result4 = call i64 @llvm.amdgcn.atomic.csub.i64.p0(ptr %ptr0, i64 42, i64 2, i64 %val64, i1 false)
+
+ ; CHECK: atomicrmw volatile usub_sat ptr %ptr0, i64 42 syncscope("agent") monotonic, align 8
+ %result5 = call i64 @llvm.amdgcn.atomic.csub.i64.p0(ptr %ptr0, i64 42, i64 2, i64 0, i1 %val1)
+ ret void
+}
+
+declare i32 @llvm.amdgcn.atomic.cond.sub.i32.p1(ptr addrspace(1) nocapture, i32, i32 immarg, i32 immarg, i1 immarg) #0
+declare i32 @llvm.amdgcn.atomic.cond.sub.i32.p3(ptr addrspace(3) nocapture, i32, i32 immarg, i32 immarg, i1 immarg) #0
+declare i32 @llvm.amdgcn.atomic.cond.sub.i32.p0(ptr nocapture, i32, i32 immarg, i32 immarg, i1 immarg) #0
+declare i64 @llvm.amdgcn.atomic.cond.sub.i64.p1(ptr addrspace(1) nocapture, i64, i64 immarg, i64 immarg, i1 immarg) #0
+declare i64 @llvm.amdgcn.atomic.cond.sub.i64.p3(ptr addrspace(3) nocapture, i64, i64 immarg, i64 immarg, i1 immarg) #0
+declare i64 @llvm.amdgcn.atomic.cond.sub.i64.p0(ptr nocapture, i64, i64 immarg, i64 immarg, i1 immarg) #0
+
+declare i32 @llvm.amdgcn.atomic.csub.i32.p1(ptr addrspace(1) nocapture, i32, i32 immarg, i32 immarg, i1 immarg) #0
+declare i32 @llvm.amdgcn.atomic.csub.i32.p3(ptr addrspace(3) nocapture, i32, i32 immarg, i32 immarg, i1 immarg) #0
+declare i32 @llvm.amdgcn.atomic.csub.i32.p0(ptr nocapture, i32, i32 immarg, i32 immarg, i1 immarg) #0
+declare i64 @llvm.amdgcn.atomic.csub.i64.p1(ptr addrspace(1) nocapture, i64, i64 immarg, i64 immarg, i1 immarg) #0
+declare i64 @llvm.amdgcn.atomic.csub.i64.p3(ptr addrspace(3) nocapture, i64, i64 immarg, i64 immarg, i1 immarg) #0
+declare i64 @llvm.amdgcn.atomic.csub.i64.p0(ptr nocapture, i64, i64 immarg, i64 immarg, i1 immarg) #0
+
; CHECK: !0 = !{i32 5, i32 6}
; CHECK: !1 = !{}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll
deleted file mode 100644
index bff4771319454..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll
+++ /dev/null
@@ -1,270 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 < %s | FileCheck %s -check-prefix=GFX10
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1031 < %s | FileCheck %s -check-prefix=GFX10
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck %s -check-prefix=GFX11
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck %s -check-prefix=GFX12
-
-define i32 @global_atomic_csub(ptr addrspace(1) %ptr, i32 %data) {
-; GFX10-LABEL: global_atomic_csub:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_csub v0, v[0:1], v2, off glc
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_gl1_inv
-; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: global_atomic_csub:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_csub_u32 v0, v[0:1], v2, off glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-LABEL: global_atomic_csub:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_SYS
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_sub_clamp_u32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
- %ret = atomicrmw usub_sat ptr addrspace(1) %ptr, i32 %data seq_cst, !amdgpu.no.remote.memory !0
- ret i32 %ret
-}
-
-define i32 @global_atomic_csub_offset(ptr addrspace(1) %ptr, i32 %data) {
-; GFX10-LABEL: global_atomic_csub_offset:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0
-; GFX10-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_csub v0, v[0:1], v2, off glc
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_gl1_inv
-; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: global_atomic_csub_offset:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_csub_u32 v0, v[0:1], v2, off glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-LABEL: global_atomic_csub_offset:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_SYS
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_sub_clamp_u32 v0, v[0:1], v2, off offset:4096 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
- %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 1024
- %ret = atomicrmw usub_sat ptr addrspace(1) %gep, i32 %data seq_cst, !amdgpu.no.remote.memory !0
- ret i32 %ret
-}
-
-define void @global_atomic_csub_nortn(ptr addrspace(1) %ptr, i32 %data) {
-; GFX10-LABEL: global_atomic_csub_nortn:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_csub v0, v[0:1], v2, off glc
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_gl1_inv
-; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: global_atomic_csub_nortn:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_csub_u32 v0, v[0:1], v2, off glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-LABEL: global_atomic_csub_nortn:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_SYS
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_sub_clamp_u32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
- %ret = atomicrmw usub_sat ptr addrspace(1) %ptr, i32 %data seq_cst, !amdgpu.no.remote.memory !0
- ret void
-}
-
-define void @global_atomic_csub_offset_nortn(ptr addrspace(1) %ptr, i32 %data) {
-; GFX10-LABEL: global_atomic_csub_offset_nortn:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0
-; GFX10-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-NEXT: global_atomic_csub v0, v[0:1], v2, off glc
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_gl1_inv
-; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: global_atomic_csub_offset_nortn:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
-; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-NEXT: global_atomic_csub_u32 v0, v[0:1], v2, off glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-LABEL: global_atomic_csub_offset_nortn:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_wb scope:SCOPE_SYS
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_sub_clamp_u32 v0, v[0:1], v2, off offset:4096 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
- %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 1024
- %ret = atomicrmw usub_sat ptr addrspace(1) %gep, i32 %data seq_cst, !amdgpu.no.remote.memory !0
- ret void
-}
-
-define amdgpu_kernel void @global_atomic_csub_sgpr_base_offset(ptr addrspace(1) %ptr, i32 %data) {
-; GFX10-LABEL: global_atomic_csub_sgpr_base_offset:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_clause 0x1
-; GFX10-NEXT: s_load_dword s2, s[8:9], 0x8
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GFX10-NEXT: v_mov_b32_e32 v1, 0x1000
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v0, s2
-; GFX10-NEXT: global_atomic_csub v0, v1, v0, s[0:1] glc
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_gl1_inv
-; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: global_store_dword v[0:1], v0, off
-; GFX10-NEXT: s_endpgm
-;
-; GFX11-LABEL: global_atomic_csub_sgpr_base_offset:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v1, 0x1000 :: v_dual_mov_b32 v0, s2
-; GFX11-NEXT: global_atomic_csub_u32 v0, v1, v0, s[0:1] glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: global_store_b32 v[0:1], v0, off
-; GFX11-NEXT: s_endpgm
-;
-; GFX12-LABEL: global_atomic_csub_sgpr_base_offset:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
-; GFX12-NEXT: global_wb scope:SCOPE_SYS
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_sub_clamp_u32 v0, v1, v0, s[0:1] offset:4096 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-NEXT: global_store_b32 v[0:1], v0, off
-; GFX12-NEXT: s_endpgm
- %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 1024
- %ret = atomicrmw usub_sat ptr addrspace(1) %gep, i32 %data seq_cst, !amdgpu.no.remote.memory !0
- store i32 %ret, ptr addrspace(1) poison
- ret void
-}
-
-define amdgpu_kernel void @global_atomic_csub_sgpr_base_offset_nortn(ptr addrspace(1) %ptr, i32 %data) {
-; GFX10-LABEL: global_atomic_csub_sgpr_base_offset_nortn:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_clause 0x1
-; GFX10-NEXT: s_load_dword s2, s[8:9], 0x8
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GFX10-NEXT: v_mov_b32_e32 v1, 0x1000
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v0, s2
-; GFX10-NEXT: global_atomic_csub v0, v1, v0, s[0:1] glc
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: buffer_gl1_inv
-; GFX10-NEXT: buffer_gl0_inv
-; GFX10-NEXT: s_endpgm
-;
-; GFX11-LABEL: global_atomic_csub_sgpr_base_offset_nortn:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x8
-; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v1, 0x1000 :: v_dual_mov_b32 v0, s2
-; GFX11-NEXT: global_atomic_csub_u32 v0, v1, v0, s[0:1] glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: buffer_gl1_inv
-; GFX11-NEXT: buffer_gl0_inv
-; GFX11-NEXT: s_endpgm
-;
-; GFX12-LABEL: global_atomic_csub_sgpr_base_offset_nortn:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
-; GFX12-NEXT: global_wb scope:SCOPE_SYS
-; GFX12-NEXT: s_wait_storecnt 0x0
-; GFX12-NEXT: global_atomic_sub_clamp_u32 v0, v1, v0, s[0:1] offset:4096 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-NEXT: s_endpgm
- %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 1024
- %ret = atomicrmw usub_sat ptr addrspace(1) %gep, i32 %data seq_cst, !amdgpu.no.remote.memory !0
- ret void
-}
-
-attributes #0 = { nounwind willreturn }
-attributes #1 = { argmemonly nounwind }
-
-!0 = !{}
diff --git a/llvm/test/CodeGen/AMDGPU/atomicrmw_cond_sub.ll b/llvm/test/CodeGen/AMDGPU/atomicrmw_cond_sub.ll
new file mode 100644
index 0000000000000..1aca8fbe7323c
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/atomicrmw_cond_sub.ll
@@ -0,0 +1,197 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck %s -check-prefix=GFX12-GISEL
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck %s -check-prefix=GFX12-SDAG
+
+define i32 @global_atomic_usub_cond(ptr addrspace(1) %ptr, i32 %data) {
+; GFX12-GISEL-LABEL: global_atomic_usub_cond:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT: s_wait_expcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
+; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: global_wb scope:SCOPE_DEV
+; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
+; GFX12-GISEL-NEXT: global_atomic_cond_sub_u32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-LABEL: global_atomic_usub_cond:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: global_wb scope:SCOPE_DEV
+; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
+; GFX12-SDAG-NEXT: global_atomic_cond_sub_u32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
+ %ret = atomicrmw usub_cond ptr addrspace(1) %ptr, i32 %data syncscope("agent") seq_cst, align 4
+ ret i32 %ret
+}
+
+define i32 @global_atomic_usub_cond_offset(ptr addrspace(1) %ptr, i32 %data) {
+; GFX12-GISEL-LABEL: global_atomic_usub_cond_offset:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT: s_wait_expcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
+; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: global_wb scope:SCOPE_DEV
+; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
+; GFX12-GISEL-NEXT: global_atomic_cond_sub_u32 v0, v[0:1], v2, off offset:4096 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-LABEL: global_atomic_usub_cond_offset:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: global_wb scope:SCOPE_DEV
+; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
+; GFX12-SDAG-NEXT: global_atomic_cond_sub_u32 v0, v[0:1], v2, off offset:4096 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
+ %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 1024
+ %ret = atomicrmw usub_cond ptr addrspace(1) %gep, i32 %data syncscope("agent") seq_cst, align 4
+ ret i32 %ret
+}
+
+define void @global_atomic_usub_cond_nortn(ptr addrspace(1) %ptr, i32 %data) {
+; GFX12-GISEL-LABEL: global_atomic_usub_cond_nortn:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT: s_wait_expcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
+; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: global_wb scope:SCOPE_DEV
+; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
+; GFX12-GISEL-NEXT: global_atomic_cond_sub_u32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-LABEL: global_atomic_usub_cond_nortn:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: global_wb scope:SCOPE_DEV
+; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
+; GFX12-SDAG-NEXT: global_atomic_cond_sub_u32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
+ %ret = atomicrmw usub_cond ptr addrspace(1) %ptr, i32 %data syncscope("agent") seq_cst, align 4
+ ret void
+}
+
+define void @global_atomic_usub_cond_offset_nortn(ptr addrspace(1) %ptr, i32 %data) {
+; GFX12-GISEL-LABEL: global_atomic_usub_cond_offset_nortn:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT: s_wait_expcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
+; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: global_wb scope:SCOPE_DEV
+; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
+; GFX12-GISEL-NEXT: global_atomic_cond_sub_u32 v0, v[0:1], v2, off offset:4096 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-LABEL: global_atomic_usub_cond_offset_nortn:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: global_wb scope:SCOPE_DEV
+; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
+; GFX12-SDAG-NEXT: global_atomic_cond_sub_u32 v0, v[0:1], v2, off offset:4096 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
+ %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 1024
+ %ret = atomicrmw usub_cond ptr addrspace(1) %gep, i32 %data syncscope("agent") seq_cst, align 4
+ ret void
+}
+
+define amdgpu_kernel void @global_atomic_usub_cond_sgpr_base_offset(ptr addrspace(1) %ptr, i32 %data) {
+; GFX12-GISEL-LABEL: global_atomic_usub_cond_sgpr_base_offset:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
+; GFX12-GISEL-NEXT: global_wb scope:SCOPE_DEV
+; GFX12-GISEL-NEXT: global_atomic_cond_sub_u32 v0, v1, v0, s[0:1] offset:4096 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-GISEL-NEXT: global_store_b32 v[0:1], v0, off
+; GFX12-GISEL-NEXT: s_nop 0
+; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-GISEL-NEXT: s_endpgm
+;
+; GFX12-SDAG-LABEL: global_atomic_usub_cond_sgpr_base_offset:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-SDAG-NEXT: global_wb scope:SCOPE_DEV
+; GFX12-SDAG-NEXT: global_atomic_cond_sub_u32 v0, v0, v1, s[0:1] offset:4096 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-SDAG-NEXT: global_store_b32 v[0:1], v0, off
+; GFX12-SDAG-NEXT: s_nop 0
+; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-SDAG-NEXT: s_endpgm
+ %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 1024
+ %ret = atomicrmw usub_cond ptr addrspace(1) %gep, i32 %data syncscope("agent") seq_cst, align 4
+ store i32 %ret, ptr addrspace(1) undef
+ ret void
+}
+
+define amdgpu_kernel void @global_atomic_usub_cond_sgpr_base_offset_nortn(ptr addrspace(1) %ptr, i32 %data) {
+; GFX12-GISEL-LABEL: global_atomic_usub_cond_sgpr_base_offset_nortn:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
+; GFX12-GISEL-NEXT: global_wb scope:SCOPE_DEV
+; GFX12-GISEL-NEXT: global_atomic_cond_sub_u32 v0, v1, v0, s[0:1] offset:4096 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-GISEL-NEXT: s_endpgm
+;
+; GFX12-SDAG-LABEL: global_atomic_usub_cond_sgpr_base_offset_nortn:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-SDAG-NEXT: global_wb scope:SCOPE_DEV
+; GFX12-SDAG-NEXT: global_atomic_cond_sub_u32 v0, v0, v1, s[0:1] offset:4096 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-SDAG-NEXT: s_endpgm
+ %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 1024
+ %ret = atomicrmw usub_cond ptr addrspace(1) %gep, i32 %data syncscope("agent") seq_cst, align 4
+ ret void
+}
+
+attributes #0 = { nounwind willreturn }
+attributes #1 = { argmemonly nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/atomicrmw_sub_clamp.ll b/llvm/test/CodeGen/AMDGPU/atomicrmw_sub_clamp.ll
new file mode 100644
index 0000000000000..7ee3d0f870729
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/atomicrmw_sub_clamp.ll
@@ -0,0 +1,495 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 < %s | FileCheck %s -check-prefix=GFX10-GISEL
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1031 < %s | FileCheck %s -check-prefix=GFX10-GISEL
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck %s -check-prefix=GFX11-GISEL
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck %s -check-prefix=GFX12-GISEL
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 < %s | FileCheck %s -check-prefix=GFX10-SDAG
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1031 < %s | FileCheck %s -check-prefix=GFX10-SDAG
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck %s -check-prefix=GFX11-SDAG
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck %s -check-prefix=GFX12-SDAG
+
+define i32 @global_atomic_usub_sat(ptr addrspace(1) %ptr, i32 %data) {
+; GFX10-GISEL-LABEL: global_atomic_usub_sat:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-GISEL-NEXT: global_atomic_csub v0, v[0:1], v2, off glc
+; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX10-GISEL-NEXT: buffer_gl1_inv
+; GFX10-GISEL-NEXT: buffer_gl0_inv
+; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: global_atomic_usub_sat:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-GISEL-NEXT: global_atomic_csub_u32 v0, v[0:1], v2, off glc
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT: buffer_gl1_inv
+; GFX11-GISEL-NEXT: buffer_gl0_inv
+; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-LABEL: global_atomic_usub_sat:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT: s_wait_expcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
+; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: global_wb scope:SCOPE_DEV
+; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
+; GFX12-GISEL-NEXT: global_atomic_sub_clamp_u32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: global_atomic_usub_sat:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-SDAG-NEXT: global_atomic_csub v0, v[0:1], v2, off glc
+; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX10-SDAG-NEXT: buffer_gl1_inv
+; GFX10-SDAG-NEXT: buffer_gl0_inv
+; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: global_atomic_usub_sat:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-NEXT: global_atomic_csub_u32 v0, v[0:1], v2, off glc
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT: buffer_gl1_inv
+; GFX11-SDAG-NEXT: buffer_gl0_inv
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-LABEL: global_atomic_usub_sat:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: global_wb scope:SCOPE_DEV
+; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
+; GFX12-SDAG-NEXT: global_atomic_sub_clamp_u32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
+ %ret = atomicrmw usub_sat ptr addrspace(1) %ptr, i32 %data syncscope("agent") seq_cst, align 4
+ ret i32 %ret
+}
+
+define i32 @global_atomic_usub_sat_offset(ptr addrspace(1) %ptr, i32 %data) {
+; GFX10-GISEL-LABEL: global_atomic_usub_sat_offset:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0
+; GFX10-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX10-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-GISEL-NEXT: global_atomic_csub v0, v[0:1], v2, off glc
+; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX10-GISEL-NEXT: buffer_gl1_inv
+; GFX10-GISEL-NEXT: buffer_gl0_inv
+; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: global_atomic_usub_sat_offset:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0
+; GFX11-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-GISEL-NEXT: global_atomic_csub_u32 v0, v[0:1], v2, off glc
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT: buffer_gl1_inv
+; GFX11-GISEL-NEXT: buffer_gl0_inv
+; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-LABEL: global_atomic_usub_sat_offset:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT: s_wait_expcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
+; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: global_wb scope:SCOPE_DEV
+; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
+; GFX12-GISEL-NEXT: global_atomic_sub_clamp_u32 v0, v[0:1], v2, off offset:4096 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: global_atomic_usub_sat_offset:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0
+; GFX10-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX10-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-SDAG-NEXT: global_atomic_csub v0, v[0:1], v2, off glc
+; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX10-SDAG-NEXT: buffer_gl1_inv
+; GFX10-SDAG-NEXT: buffer_gl0_inv
+; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: global_atomic_usub_sat_offset:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-NEXT: global_atomic_csub_u32 v0, v[0:1], v2, off glc
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT: buffer_gl1_inv
+; GFX11-SDAG-NEXT: buffer_gl0_inv
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-LABEL: global_atomic_usub_sat_offset:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: global_wb scope:SCOPE_DEV
+; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
+; GFX12-SDAG-NEXT: global_atomic_sub_clamp_u32 v0, v[0:1], v2, off offset:4096 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
+ %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 1024
+ %ret = atomicrmw usub_sat ptr addrspace(1) %gep, i32 %data syncscope("agent") seq_cst, align 4
+ ret i32 %ret
+}
+
+define void @global_atomic_usub_sat_nortn(ptr addrspace(1) %ptr, i32 %data) {
+; GFX10-GISEL-LABEL: global_atomic_usub_sat_nortn:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-GISEL-NEXT: global_atomic_csub v0, v[0:1], v2, off glc
+; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX10-GISEL-NEXT: buffer_gl1_inv
+; GFX10-GISEL-NEXT: buffer_gl0_inv
+; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: global_atomic_usub_sat_nortn:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-GISEL-NEXT: global_atomic_csub_u32 v0, v[0:1], v2, off glc
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT: buffer_gl1_inv
+; GFX11-GISEL-NEXT: buffer_gl0_inv
+; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-LABEL: global_atomic_usub_sat_nortn:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT: s_wait_expcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
+; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: global_wb scope:SCOPE_DEV
+; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
+; GFX12-GISEL-NEXT: global_atomic_sub_clamp_u32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: global_atomic_usub_sat_nortn:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-SDAG-NEXT: global_atomic_csub v0, v[0:1], v2, off glc
+; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX10-SDAG-NEXT: buffer_gl1_inv
+; GFX10-SDAG-NEXT: buffer_gl0_inv
+; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: global_atomic_usub_sat_nortn:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-NEXT: global_atomic_csub_u32 v0, v[0:1], v2, off glc
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT: buffer_gl1_inv
+; GFX11-SDAG-NEXT: buffer_gl0_inv
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-LABEL: global_atomic_usub_sat_nortn:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: global_wb scope:SCOPE_DEV
+; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
+; GFX12-SDAG-NEXT: global_atomic_sub_clamp_u32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
+ %ret = atomicrmw usub_sat ptr addrspace(1) %ptr, i32 %data syncscope("agent") seq_cst, align 4
+ ret void
+}
+
+define void @global_atomic_usub_sat_offset_nortn(ptr addrspace(1) %ptr, i32 %data) {
+; GFX10-GISEL-LABEL: global_atomic_usub_sat_offset_nortn:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0
+; GFX10-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX10-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-GISEL-NEXT: global_atomic_csub v0, v[0:1], v2, off glc
+; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX10-GISEL-NEXT: buffer_gl1_inv
+; GFX10-GISEL-NEXT: buffer_gl0_inv
+; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: global_atomic_usub_sat_offset_nortn:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0
+; GFX11-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-GISEL-NEXT: global_atomic_csub_u32 v0, v[0:1], v2, off glc
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT: buffer_gl1_inv
+; GFX11-GISEL-NEXT: buffer_gl0_inv
+; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-LABEL: global_atomic_usub_sat_offset_nortn:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT: s_wait_expcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
+; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: global_wb scope:SCOPE_DEV
+; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
+; GFX12-GISEL-NEXT: global_atomic_sub_clamp_u32 v0, v[0:1], v2, off offset:4096 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: global_atomic_usub_sat_offset_nortn:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0
+; GFX10-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX10-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-SDAG-NEXT: global_atomic_csub v0, v[0:1], v2, off glc
+; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX10-SDAG-NEXT: buffer_gl1_inv
+; GFX10-SDAG-NEXT: buffer_gl0_inv
+; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: global_atomic_usub_sat_offset_nortn:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-NEXT: global_atomic_csub_u32 v0, v[0:1], v2, off glc
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT: buffer_gl1_inv
+; GFX11-SDAG-NEXT: buffer_gl0_inv
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-LABEL: global_atomic_usub_sat_offset_nortn:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: global_wb scope:SCOPE_DEV
+; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
+; GFX12-SDAG-NEXT: global_atomic_sub_clamp_u32 v0, v[0:1], v2, off offset:4096 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
+ %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 1024
+ %ret = atomicrmw usub_sat ptr addrspace(1) %gep, i32 %data syncscope("agent") seq_cst, align 4
+ ret void
+}
+
+define amdgpu_kernel void @global_atomic_usub_sat_sgpr_base_offset(ptr addrspace(1) %ptr, i32 %data) {
+; GFX10-GISEL-LABEL: global_atomic_usub_sat_sgpr_base_offset:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_clause 0x1
+; GFX10-GISEL-NEXT: s_load_dword s2, s[6:7], 0x8
+; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0x1000
+; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX10-GISEL-NEXT: global_atomic_csub v0, v1, v0, s[0:1] glc
+; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX10-GISEL-NEXT: buffer_gl1_inv
+; GFX10-GISEL-NEXT: buffer_gl0_inv
+; GFX10-GISEL-NEXT: global_store_dword v[0:1], v0, off
+; GFX10-GISEL-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: global_atomic_usub_sat_sgpr_base_offset:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_clause 0x1
+; GFX11-GISEL-NEXT: s_load_b32 s4, s[2:3], 0x8
+; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0x1000 :: v_dual_mov_b32 v0, s4
+; GFX11-GISEL-NEXT: global_atomic_csub_u32 v0, v1, v0, s[0:1] glc
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT: buffer_gl1_inv
+; GFX11-GISEL-NEXT: buffer_gl0_inv
+; GFX11-GISEL-NEXT: global_store_b32 v[0:1], v0, off
+; GFX11-GISEL-NEXT: s_nop 0
+; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-GISEL-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: global_atomic_usub_sat_sgpr_base_offset:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
+; GFX12-GISEL-NEXT: global_wb scope:SCOPE_DEV
+; GFX12-GISEL-NEXT: global_atomic_sub_clamp_u32 v0, v1, v0, s[0:1] offset:4096 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-GISEL-NEXT: global_store_b32 v[0:1], v0, off
+; GFX12-GISEL-NEXT: s_nop 0
+; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-GISEL-NEXT: s_endpgm
+;
+; GFX10-SDAG-LABEL: global_atomic_usub_sat_sgpr_base_offset:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_clause 0x1
+; GFX10-SDAG-NEXT: s_load_dword s2, s[6:7], 0x8
+; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, 0x1000
+; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s2
+; GFX10-SDAG-NEXT: global_atomic_csub v0, v0, v1, s[0:1] glc
+; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX10-SDAG-NEXT: buffer_gl1_inv
+; GFX10-SDAG-NEXT: buffer_gl0_inv
+; GFX10-SDAG-NEXT: global_store_dword v[0:1], v0, off
+; GFX10-SDAG-NEXT: s_endpgm
+;
+; GFX11-SDAG-LABEL: global_atomic_usub_sat_sgpr_base_offset:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_clause 0x1
+; GFX11-SDAG-NEXT: s_load_b32 s4, s[2:3], 0x8
+; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, 0x1000 :: v_dual_mov_b32 v1, s4
+; GFX11-SDAG-NEXT: global_atomic_csub_u32 v0, v0, v1, s[0:1] glc
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT: buffer_gl1_inv
+; GFX11-SDAG-NEXT: buffer_gl0_inv
+; GFX11-SDAG-NEXT: global_store_b32 v[0:1], v0, off
+; GFX11-SDAG-NEXT: s_nop 0
+; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX12-SDAG-LABEL: global_atomic_usub_sat_sgpr_base_offset:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-SDAG-NEXT: global_wb scope:SCOPE_DEV
+; GFX12-SDAG-NEXT: global_atomic_sub_clamp_u32 v0, v0, v1, s[0:1] offset:4096 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-SDAG-NEXT: global_store_b32 v[0:1], v0, off
+; GFX12-SDAG-NEXT: s_nop 0
+; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-SDAG-NEXT: s_endpgm
+ %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 1024
+ %ret = atomicrmw usub_sat ptr addrspace(1) %gep, i32 %data syncscope("agent") seq_cst, align 4
+ store i32 %ret, ptr addrspace(1) undef
+ ret void
+}
+
+define amdgpu_kernel void @global_atomic_usub_sat_sgpr_base_offset_nortn(ptr addrspace(1) %ptr, i32 %data) {
+; GFX10-GISEL-LABEL: global_atomic_usub_sat_sgpr_base_offset_nortn:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_clause 0x1
+; GFX10-GISEL-NEXT: s_load_dword s2, s[6:7], 0x8
+; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0x1000
+; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX10-GISEL-NEXT: global_atomic_csub v0, v1, v0, s[0:1] glc
+; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX10-GISEL-NEXT: buffer_gl1_inv
+; GFX10-GISEL-NEXT: buffer_gl0_inv
+; GFX10-GISEL-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: global_atomic_usub_sat_sgpr_base_offset_nortn:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_clause 0x1
+; GFX11-GISEL-NEXT: s_load_b32 s4, s[2:3], 0x8
+; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0x1000 :: v_dual_mov_b32 v0, s4
+; GFX11-GISEL-NEXT: global_atomic_csub_u32 v0, v1, v0, s[0:1] glc
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT: buffer_gl1_inv
+; GFX11-GISEL-NEXT: buffer_gl0_inv
+; GFX11-GISEL-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: global_atomic_usub_sat_sgpr_base_offset_nortn:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
+; GFX12-GISEL-NEXT: global_wb scope:SCOPE_DEV
+; GFX12-GISEL-NEXT: global_atomic_sub_clamp_u32 v0, v1, v0, s[0:1] offset:4096 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-GISEL-NEXT: s_endpgm
+;
+; GFX10-SDAG-LABEL: global_atomic_usub_sat_sgpr_base_offset_nortn:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_clause 0x1
+; GFX10-SDAG-NEXT: s_load_dword s2, s[6:7], 0x8
+; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, 0x1000
+; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s2
+; GFX10-SDAG-NEXT: global_atomic_csub v0, v0, v1, s[0:1] glc
+; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX10-SDAG-NEXT: buffer_gl1_inv
+; GFX10-SDAG-NEXT: buffer_gl0_inv
+; GFX10-SDAG-NEXT: s_endpgm
+;
+; GFX11-SDAG-LABEL: global_atomic_usub_sat_sgpr_base_offset_nortn:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_clause 0x1
+; GFX11-SDAG-NEXT: s_load_b32 s4, s[2:3], 0x8
+; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, 0x1000 :: v_dual_mov_b32 v1, s4
+; GFX11-SDAG-NEXT: global_atomic_csub_u32 v0, v0, v1, s[0:1] glc
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT: buffer_gl1_inv
+; GFX11-SDAG-NEXT: buffer_gl0_inv
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX12-SDAG-LABEL: global_atomic_usub_sat_sgpr_base_offset_nortn:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-SDAG-NEXT: global_wb scope:SCOPE_DEV
+; GFX12-SDAG-NEXT: global_atomic_sub_clamp_u32 v0, v0, v1, s[0:1] offset:4096 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-SDAG-NEXT: s_endpgm
+ %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 1024
+ %ret = atomicrmw usub_sat ptr addrspace(1) %gep, i32 %data syncscope("agent") seq_cst, align 4
+ ret void
+}
+
+attributes #0 = { nounwind willreturn }
+attributes #1 = { argmemonly nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll b/llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll
index 4af2d58b01518..863bac80243c4 100644
--- a/llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll
@@ -2,60 +2,68 @@
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12-SDAG %s
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12-GISEL %s
-declare i32 @llvm.amdgcn.atomic.cond.sub.u32.p3(ptr addrspace(3), i32)
-declare i32 @llvm.amdgcn.atomic.cond.sub.u32.p1(ptr addrspace(1), i32)
-declare i32 @llvm.amdgcn.atomic.cond.sub.u32.p0(ptr, i32)
-
-define amdgpu_kernel void @flat_atomic_cond_sub_no_rtn_u32(ptr %addr, i32 %in) {
-; GFX12-SDAG-LABEL: flat_atomic_cond_sub_no_rtn_u32:
+define amdgpu_kernel void @flat_atomic_usub_cond_no_rtn_u32(ptr %addr, i32 %in) {
+; GFX12-SDAG-LABEL: flat_atomic_usub_cond_no_rtn_u32:
; GFX12-SDAG: ; %bb.0: ; %entry
; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; GFX12-SDAG-NEXT: flat_atomic_cond_sub_u32 v0, v[0:1], v2 offset:-16 th:TH_ATOMIC_RETURN
+; GFX12-SDAG-NEXT: global_wb scope:SCOPE_SYS
+; GFX12-SDAG-NEXT: flat_atomic_cond_sub_u32 v0, v[0:1], v2 offset:-16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: global_inv scope:SCOPE_SYS
; GFX12-SDAG-NEXT: s_endpgm
;
-; GFX12-GISEL-LABEL: flat_atomic_cond_sub_no_rtn_u32:
+; GFX12-GISEL-LABEL: flat_atomic_usub_cond_no_rtn_u32:
; GFX12-GISEL: ; %bb.0: ; %entry
; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v1, s1
-; GFX12-GISEL-NEXT: flat_atomic_cond_sub_u32 v0, v[0:1], v2 offset:-16 th:TH_ATOMIC_RETURN
+; GFX12-GISEL-NEXT: global_wb scope:SCOPE_SYS
+; GFX12-GISEL-NEXT: flat_atomic_cond_sub_u32 v0, v[0:1], v2 offset:-16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT: global_inv scope:SCOPE_SYS
; GFX12-GISEL-NEXT: s_endpgm
entry:
- %gep = getelementptr inbounds i32, ptr %addr, i32 -4
- %unused = call i32 @llvm.amdgcn.atomic.cond.sub.u32.p0(ptr %gep, i32 %in)
+ %gep = getelementptr i32, ptr %addr, i32 -4
+ %unused = atomicrmw usub_cond ptr %gep, i32 %in seq_cst
ret void
}
-define amdgpu_kernel void @flat_atomic_cond_sub_no_rtn_u32_forced(ptr %addr, i32 %in) "target-features"="+atomic-csub-no-rtn-insts" {
-; GFX12-SDAG-LABEL: flat_atomic_cond_sub_no_rtn_u32_forced:
+define amdgpu_kernel void @flat_atomic_usub_cond_no_rtn_u32_forced(ptr %addr, i32 %in) "target-features"="+atomic-csub-no-rtn-insts" {
+; GFX12-SDAG-LABEL: flat_atomic_usub_cond_no_rtn_u32_forced:
; GFX12-SDAG: ; %bb.0: ; %entry
; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; GFX12-SDAG-NEXT: flat_atomic_cond_sub_u32 v[0:1], v2 offset:-16
+; GFX12-SDAG-NEXT: global_wb scope:SCOPE_SYS
+; GFX12-SDAG-NEXT: flat_atomic_cond_sub_u32 v[0:1], v2 offset:-16 scope:SCOPE_SYS
+; GFX12-SDAG-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX12-SDAG-NEXT: global_inv scope:SCOPE_SYS
; GFX12-SDAG-NEXT: s_endpgm
;
-; GFX12-GISEL-LABEL: flat_atomic_cond_sub_no_rtn_u32_forced:
+; GFX12-GISEL-LABEL: flat_atomic_usub_cond_no_rtn_u32_forced:
; GFX12-GISEL: ; %bb.0: ; %entry
; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v1, s1
-; GFX12-GISEL-NEXT: flat_atomic_cond_sub_u32 v[0:1], v2 offset:-16
+; GFX12-GISEL-NEXT: global_wb scope:SCOPE_SYS
+; GFX12-GISEL-NEXT: flat_atomic_cond_sub_u32 v[0:1], v2 offset:-16 scope:SCOPE_SYS
+; GFX12-GISEL-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX12-GISEL-NEXT: global_inv scope:SCOPE_SYS
; GFX12-GISEL-NEXT: s_endpgm
entry:
- %gep = getelementptr inbounds i32, ptr %addr, i32 -4
- %unused = call i32 @llvm.amdgcn.atomic.cond.sub.u32.p0(ptr %gep, i32 %in)
+ %gep = getelementptr i32, ptr %addr, i32 -4
+ %unused = atomicrmw usub_cond ptr %gep, i32 %in seq_cst
ret void
}
-define amdgpu_kernel void @flat_atomic_cond_sub_rtn_u32(ptr %addr, i32 %in, ptr %use) {
-; GFX12-SDAG-LABEL: flat_atomic_cond_sub_rtn_u32:
+define amdgpu_kernel void @flat_atomic_usub_cond_rtn_u32(ptr %addr, i32 %in, ptr %use) {
+; GFX12-SDAG-LABEL: flat_atomic_usub_cond_rtn_u32:
; GFX12-SDAG: ; %bb.0: ; %entry
; GFX12-SDAG-NEXT: s_clause 0x1
; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
@@ -66,10 +74,12 @@ define amdgpu_kernel void @flat_atomic_cond_sub_rtn_u32(ptr %addr, i32 %in, ptr
; GFX12-SDAG-NEXT: flat_atomic_cond_sub_u32 v2, v[0:1], v2 offset:16 th:TH_ATOMIC_RETURN
; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-SDAG-NEXT: flat_store_b32 v[0:1], v2
; GFX12-SDAG-NEXT: s_endpgm
;
-; GFX12-GISEL-LABEL: flat_atomic_cond_sub_rtn_u32:
+; GFX12-GISEL-LABEL: flat_atomic_usub_cond_rtn_u32:
; GFX12-GISEL: ; %bb.0: ; %entry
; GFX12-GISEL-NEXT: s_clause 0x1
; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
@@ -80,122 +90,127 @@ define amdgpu_kernel void @flat_atomic_cond_sub_rtn_u32(ptr %addr, i32 %in, ptr
; GFX12-GISEL-NEXT: flat_atomic_cond_sub_u32 v2, v[0:1], v2 offset:16 th:TH_ATOMIC_RETURN
; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-GISEL-NEXT: flat_store_b32 v[0:1], v2
; GFX12-GISEL-NEXT: s_endpgm
entry:
- %gep = getelementptr inbounds i32, ptr %addr, i32 4
- %val = call i32 @llvm.amdgcn.atomic.cond.sub.u32.p0(ptr %gep, i32 %in)
+ %gep = getelementptr i32, ptr %addr, i32 4
+ %val = atomicrmw usub_cond ptr %gep, i32 %in seq_cst
store i32 %val, ptr %use
ret void
}
-define amdgpu_kernel void @global_atomic_cond_sub_no_rtn_u32(ptr addrspace(1) %addr, i32 %in) {
-; GFX12-SDAG-LABEL: global_atomic_cond_sub_no_rtn_u32:
+define amdgpu_kernel void @global_atomic_usub_cond_no_rtn_u32(ptr addrspace(1) %addr, i32 %in) {
+; GFX12-SDAG-LABEL: global_atomic_usub_cond_no_rtn_u32:
; GFX12-SDAG: ; %bb.0: ; %entry
; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], -16
-; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-SDAG-NEXT: flat_atomic_cond_sub_u32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-SDAG-NEXT: global_wb scope:SCOPE_SYS
+; GFX12-SDAG-NEXT: global_atomic_cond_sub_u32 v0, v0, v1, s[0:1] offset:-16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT: global_inv scope:SCOPE_SYS
; GFX12-SDAG-NEXT: s_endpgm
;
-; GFX12-GISEL-LABEL: global_atomic_cond_sub_no_rtn_u32:
+; GFX12-GISEL-LABEL: global_atomic_usub_cond_no_rtn_u32:
; GFX12-GISEL: ; %bb.0: ; %entry
; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, -16
-; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, -1
-; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s2
-; GFX12-GISEL-NEXT: flat_atomic_cond_sub_u32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
+; GFX12-GISEL-NEXT: global_wb scope:SCOPE_SYS
+; GFX12-GISEL-NEXT: global_atomic_cond_sub_u32 v0, v1, v0, s[0:1] offset:-16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT: global_inv scope:SCOPE_SYS
; GFX12-GISEL-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr addrspace(1) %addr, i32 -4
- %unused = call i32 @llvm.amdgcn.atomic.cond.sub.u32.p1(ptr addrspace(1) %gep, i32 %in)
+ %unused = atomicrmw usub_cond ptr addrspace(1) %gep, i32 %in seq_cst
ret void
}
-define amdgpu_kernel void @global_atomic_cond_sub_no_rtn_u32_forced(ptr addrspace(1) %addr, i32 %in) "target-features"="+atomic-csub-no-rtn-insts" {
-; GFX12-SDAG-LABEL: global_atomic_cond_sub_no_rtn_u32_forced:
+define amdgpu_kernel void @global_atomic_usub_cond_no_rtn_u32_forced(ptr addrspace(1) %addr, i32 %in) "target-features"="+atomic-csub-no-rtn-insts" {
+; GFX12-SDAG-LABEL: global_atomic_usub_cond_no_rtn_u32_forced:
; GFX12-SDAG: ; %bb.0: ; %entry
; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], -16
-; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-SDAG-NEXT: flat_atomic_cond_sub_u32 v[0:1], v2
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-SDAG-NEXT: global_wb scope:SCOPE_SYS
+; GFX12-SDAG-NEXT: global_atomic_cond_sub_u32 v0, v1, s[0:1] offset:-16 scope:SCOPE_SYS
+; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
+; GFX12-SDAG-NEXT: global_inv scope:SCOPE_SYS
; GFX12-SDAG-NEXT: s_endpgm
;
-; GFX12-GISEL-LABEL: global_atomic_cond_sub_no_rtn_u32_forced:
+; GFX12-GISEL-LABEL: global_atomic_usub_cond_no_rtn_u32_forced:
; GFX12-GISEL: ; %bb.0: ; %entry
; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, -16
-; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, -1
-; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s2
-; GFX12-GISEL-NEXT: flat_atomic_cond_sub_u32 v[0:1], v2
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
+; GFX12-GISEL-NEXT: global_wb scope:SCOPE_SYS
+; GFX12-GISEL-NEXT: global_atomic_cond_sub_u32 v1, v0, s[0:1] offset:-16 scope:SCOPE_SYS
+; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
+; GFX12-GISEL-NEXT: global_inv scope:SCOPE_SYS
; GFX12-GISEL-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr addrspace(1) %addr, i32 -4
- %unused = call i32 @llvm.amdgcn.atomic.cond.sub.u32.p1(ptr addrspace(1) %gep, i32 %in)
+ %unused = atomicrmw usub_cond ptr addrspace(1) %gep, i32 %in seq_cst
ret void
}
-define amdgpu_kernel void @global_atomic_cond_sub_rtn_u32(ptr addrspace(1) %addr, i32 %in, ptr addrspace(1) %use) {
-; GFX12-SDAG-LABEL: global_atomic_cond_sub_rtn_u32:
+define amdgpu_kernel void @global_atomic_usub_cond_rtn_u32(ptr addrspace(1) %addr, i32 %in, ptr addrspace(1) %use) {
+; GFX12-SDAG-LABEL: global_atomic_usub_cond_rtn_u32:
; GFX12-SDAG: ; %bb.0: ; %entry
; GFX12-SDAG-NEXT: s_clause 0x1
; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
; GFX12-SDAG-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], 16
-; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-SDAG-NEXT: flat_atomic_cond_sub_u32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
-; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s6
+; GFX12-SDAG-NEXT: global_wb scope:SCOPE_SYS
+; GFX12-SDAG-NEXT: global_atomic_cond_sub_u32 v1, v0, v1, s[4:5] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-NEXT: global_store_b32 v1, v0, s[4:5]
+; GFX12-SDAG-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX12-SDAG-NEXT: s_nop 0
+; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-SDAG-NEXT: s_endpgm
;
-; GFX12-GISEL-LABEL: global_atomic_cond_sub_rtn_u32:
+; GFX12-GISEL-LABEL: global_atomic_usub_cond_rtn_u32:
; GFX12-GISEL: ; %bb.0: ; %entry
; GFX12-GISEL-NEXT: s_clause 0x1
; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
; GFX12-GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 16
-; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 0
-; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s2
-; GFX12-GISEL-NEXT: flat_atomic_cond_sub_u32 v0, v[0:1], v2 th:TH_ATOMIC_RETURN
-; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6
+; GFX12-GISEL-NEXT: global_wb scope:SCOPE_SYS
+; GFX12-GISEL-NEXT: global_atomic_cond_sub_u32 v0, v1, v0, s[4:5] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[4:5]
+; GFX12-GISEL-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GFX12-GISEL-NEXT: s_nop 0
+; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-GISEL-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr addrspace(1) %addr, i32 4
- %val = call i32 @llvm.amdgcn.atomic.cond.sub.u32.p1(ptr addrspace(1) %gep, i32 %in)
+ %val = atomicrmw usub_cond ptr addrspace(1) %gep, i32 %in seq_cst
store i32 %val, ptr addrspace(1) %use
ret void
}
-define amdgpu_kernel void @ds_cond_sub_no_rtn_u32(ptr addrspace(3) %addr, i32 %in) {
-; GFX12-SDAG-LABEL: ds_cond_sub_no_rtn_u32:
+define amdgpu_kernel void @ds_usub_cond_no_rtn_u32(ptr addrspace(3) %addr, i32 %in) {
+; GFX12-SDAG-LABEL: ds_usub_cond_no_rtn_u32:
; GFX12-SDAG: ; %bb.0: ; %entry
; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: s_add_co_i32 s0, s0, -16
; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v0, s0
-; GFX12-SDAG-NEXT: ds_cond_sub_u32 v0, v1
+; GFX12-SDAG-NEXT: global_wb scope:SCOPE_SE
+; GFX12-SDAG-NEXT: ds_cond_sub_rtn_u32 v0, v0, v1
+; GFX12-SDAG-NEXT: s_wait_dscnt 0x0
+; GFX12-SDAG-NEXT: global_inv scope:SCOPE_SE
; GFX12-SDAG-NEXT: s_endpgm
;
-; GFX12-GISEL-LABEL: ds_cond_sub_no_rtn_u32:
+; GFX12-GISEL-LABEL: ds_usub_cond_no_rtn_u32:
; GFX12-GISEL: ; %bb.0: ; %entry
; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
@@ -206,61 +221,71 @@ define amdgpu_kernel void @ds_cond_sub_no_rtn_u32(ptr addrspace(3) %addr, i32 %i
; GFX12-GISEL-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr addrspace(3) %addr, i32 -4
- %unused = call i32 @llvm.amdgcn.atomic.cond.sub.u32.p3(ptr addrspace(3) %gep, i32 %in)
+ %unused = atomicrmw usub_cond ptr addrspace(3) %gep, i32 %in seq_cst
ret void
}
-define amdgpu_kernel void @ds_cond_sub_no_rtn_u32_forced(ptr addrspace(3) %addr, i32 %in) "target-features"="+atomic-csub-no-rtn-insts" {
-; GFX12-SDAG-LABEL: ds_cond_sub_no_rtn_u32_forced:
+define amdgpu_kernel void @ds_usub_cond_no_rtn_u32_forced(ptr addrspace(3) %addr, i32 %in) "target-features"="+atomic-csub-no-rtn-insts" {
+; GFX12-SDAG-LABEL: ds_usub_cond_no_rtn_u32_forced:
; GFX12-SDAG: ; %bb.0: ; %entry
; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: s_add_co_i32 s0, s0, -16
; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v0, s0
+; GFX12-SDAG-NEXT: global_wb scope:SCOPE_SE
; GFX12-SDAG-NEXT: ds_cond_sub_u32 v0, v1
+; GFX12-SDAG-NEXT: s_wait_dscnt 0x0
+; GFX12-SDAG-NEXT: global_inv scope:SCOPE_SE
; GFX12-SDAG-NEXT: s_endpgm
;
-; GFX12-GISEL-LABEL: ds_cond_sub_no_rtn_u32_forced:
+; GFX12-GISEL-LABEL: ds_usub_cond_no_rtn_u32_forced:
; GFX12-GISEL: ; %bb.0: ; %entry
; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, -16
; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v0, s0
+; GFX12-GISEL-NEXT: global_wb scope:SCOPE_SE
; GFX12-GISEL-NEXT: ds_cond_sub_u32 v0, v1
+; GFX12-GISEL-NEXT: s_wait_dscnt 0x0
+; GFX12-GISEL-NEXT: global_inv scope:SCOPE_SE
; GFX12-GISEL-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr addrspace(3) %addr, i32 -4
- %unused = call i32 @llvm.amdgcn.atomic.cond.sub.u32.p3(ptr addrspace(3) %gep, i32 %in)
+ %unused = atomicrmw usub_cond ptr addrspace(3) %gep, i32 %in seq_cst
ret void
}
-define amdgpu_kernel void @ds_cond_sub_rtn_u32(ptr addrspace(3) %addr, i32 %in, ptr addrspace(3) %use) {
-; GFX12-SDAG-LABEL: ds_cond_sub_rtn_u32:
+define amdgpu_kernel void @ds_usub_cond_rtn_u32(ptr addrspace(3) %addr, i32 %in, ptr addrspace(3) %use) {
+; GFX12-SDAG-LABEL: ds_usub_cond_rtn_u32:
; GFX12-SDAG: ; %bb.0: ; %entry
; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-SDAG-NEXT: global_wb scope:SCOPE_SE
; GFX12-SDAG-NEXT: ds_cond_sub_rtn_u32 v0, v0, v1 offset:16
-; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2
; GFX12-SDAG-NEXT: s_wait_dscnt 0x0
+; GFX12-SDAG-NEXT: global_inv scope:SCOPE_SE
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2
; GFX12-SDAG-NEXT: ds_store_b32 v1, v0
; GFX12-SDAG-NEXT: s_endpgm
;
-; GFX12-GISEL-LABEL: ds_cond_sub_rtn_u32:
+; GFX12-GISEL-LABEL: ds_usub_cond_rtn_u32:
; GFX12-GISEL: ; %bb.0: ; %entry
; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX12-GISEL-NEXT: global_wb scope:SCOPE_SE
; GFX12-GISEL-NEXT: ds_cond_sub_rtn_u32 v0, v1, v0 offset:16
-; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s2
; GFX12-GISEL-NEXT: s_wait_dscnt 0x0
+; GFX12-GISEL-NEXT: global_inv scope:SCOPE_SE
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s2
; GFX12-GISEL-NEXT: ds_store_b32 v1, v0
; GFX12-GISEL-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr addrspace(3) %addr, i32 4
- %val = call i32 @llvm.amdgcn.atomic.cond.sub.u32.p3(ptr addrspace(3) %gep, i32 %in)
+ %val = atomicrmw usub_cond ptr addrspace(3) %gep, i32 %in seq_cst
store i32 %val, ptr addrspace(3) %use
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.cond.sub.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.cond.sub.ll
deleted file mode 100644
index 243cd59c6a821..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.cond.sub.ll
+++ /dev/null
@@ -1,219 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck %s -check-prefix=GFX12
-
-define float @raw_buffer_atomic_cond_sub_return(<4 x i32> inreg %rsrc, i32 inreg %data) #0 {
-; GFX12-LABEL: raw_buffer_atomic_cond_sub_return:
-; GFX12: ; %bb.0: ; %main_body
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v0, s16
-; GFX12-NEXT: buffer_atomic_cond_sub_u32 v0, off, s[0:3], null th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
-main_body:
- %orig = call i32 @llvm.amdgcn.raw.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 0)
- %r = bitcast i32 %orig to float
- ret float %r
-}
-
-define void @raw_buffer_atomic_cond_sub_no_return(<4 x i32> inreg %rsrc, i32 inreg %data) #0 {
-; GFX12-LABEL: raw_buffer_atomic_cond_sub_no_return:
-; GFX12: ; %bb.0: ; %main_body
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v0, s16
-; GFX12-NEXT: buffer_atomic_cond_sub_u32 v0, off, s[0:3], null th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
-main_body:
- %unused = call i32 @llvm.amdgcn.raw.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 0)
- ret void
-}
-
-define void @raw_buffer_atomic_cond_sub_no_return_forced(<4 x i32> inreg %rsrc, i32 inreg %data) #1 {
-; GFX12-LABEL: raw_buffer_atomic_cond_sub_no_return_forced:
-; GFX12: ; %bb.0: ; %main_body
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v0, s16
-; GFX12-NEXT: buffer_atomic_cond_sub_u32 v0, off, s[0:3], null
-; GFX12-NEXT: s_setpc_b64 s[30:31]
-main_body:
- %unused = call i32 @llvm.amdgcn.raw.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 0)
- ret void
-}
-
-define float @raw_buffer_atomic_cond_sub_imm_soff_return(<4 x i32> inreg %rsrc, i32 inreg %data) #0 {
-; GFX12-LABEL: raw_buffer_atomic_cond_sub_imm_soff_return:
-; GFX12: ; %bb.0: ; %main_body
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v0, s16
-; GFX12-NEXT: s_mov_b32 s4, 4
-; GFX12-NEXT: buffer_atomic_cond_sub_u32 v0, off, s[0:3], s4 th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
-main_body:
- %orig = call i32 @llvm.amdgcn.raw.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 4, i32 0)
- %r = bitcast i32 %orig to float
- ret float %r
-}
-
-define void @raw_buffer_atomic_cond_sub_imm_soff_no_return(<4 x i32> inreg %rsrc, i32 inreg %data) #0 {
-; GFX12-LABEL: raw_buffer_atomic_cond_sub_imm_soff_no_return:
-; GFX12: ; %bb.0: ; %main_body
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v0, s16
-; GFX12-NEXT: s_mov_b32 s4, 4
-; GFX12-NEXT: buffer_atomic_cond_sub_u32 v0, off, s[0:3], s4 th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
-main_body:
- %unused = call i32 @llvm.amdgcn.raw.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 4, i32 0)
- ret void
-}
-
-define void @raw_buffer_atomic_cond_sub_imm_soff_no_return_forced(<4 x i32> inreg %rsrc, i32 inreg %data) #1 {
-; GFX12-LABEL: raw_buffer_atomic_cond_sub_imm_soff_no_return_forced:
-; GFX12: ; %bb.0: ; %main_body
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v0, s16
-; GFX12-NEXT: s_mov_b32 s4, 4
-; GFX12-NEXT: buffer_atomic_cond_sub_u32 v0, off, s[0:3], s4
-; GFX12-NEXT: s_setpc_b64 s[30:31]
-main_body:
- %unused = call i32 @llvm.amdgcn.raw.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 4, i32 0)
- ret void
-}
-
-define float @struct_buffer_atomic_cond_sub_return(<4 x i32> inreg %rsrc, i32 inreg %data) #0 {
-; GFX12-LABEL: struct_buffer_atomic_cond_sub_return:
-; GFX12: ; %bb.0: ; %main_body
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s16
-; GFX12-NEXT: buffer_atomic_cond_sub_u32 v0, v1, s[0:3], null idxen th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
-main_body:
- %orig = call i32 @llvm.amdgcn.struct.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0)
- %r = bitcast i32 %orig to float
- ret float %r
-}
-
-define void @struct_buffer_atomic_cond_sub_no_return(<4 x i32> inreg %rsrc, i32 inreg %data) #0 {
-; GFX12-LABEL: struct_buffer_atomic_cond_sub_no_return:
-; GFX12: ; %bb.0: ; %main_body
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s16
-; GFX12-NEXT: buffer_atomic_cond_sub_u32 v1, v0, s[0:3], null idxen th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
-main_body:
- %unused = call i32 @llvm.amdgcn.struct.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0)
- ret void
-}
-
-define void @struct_buffer_atomic_cond_sub_no_return_forced(<4 x i32> inreg %rsrc, i32 inreg %data) #1 {
-; GFX12-LABEL: struct_buffer_atomic_cond_sub_no_return_forced:
-; GFX12: ; %bb.0: ; %main_body
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s16
-; GFX12-NEXT: buffer_atomic_cond_sub_u32 v1, v0, s[0:3], null idxen
-; GFX12-NEXT: s_setpc_b64 s[30:31]
-main_body:
- %unused = call i32 @llvm.amdgcn.struct.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0)
- ret void
-}
-
-define float @struct_buffer_atomic_cond_sub_imm_soff_return(<4 x i32> inreg %rsrc, i32 inreg %data) #0 {
-; GFX12-LABEL: struct_buffer_atomic_cond_sub_imm_soff_return:
-; GFX12: ; %bb.0: ; %main_body
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s16
-; GFX12-NEXT: s_mov_b32 s4, 4
-; GFX12-NEXT: buffer_atomic_cond_sub_u32 v0, v1, s[0:3], s4 idxen th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
-main_body:
- %orig = call i32 @llvm.amdgcn.struct.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 4, i32 0)
- %r = bitcast i32 %orig to float
- ret float %r
-}
-
-define void @struct_buffer_atomic_cond_sub_imm_soff_no_return(<4 x i32> inreg %rsrc, i32 inreg %data) #0 {
-; GFX12-LABEL: struct_buffer_atomic_cond_sub_imm_soff_no_return:
-; GFX12: ; %bb.0: ; %main_body
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s16
-; GFX12-NEXT: s_mov_b32 s4, 4
-; GFX12-NEXT: buffer_atomic_cond_sub_u32 v1, v0, s[0:3], s4 idxen th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
-main_body:
- %unused = call i32 @llvm.amdgcn.struct.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 4, i32 0)
- ret void
-}
-
-define void @struct_buffer_atomic_cond_sub_imm_soff_no_return_forced(<4 x i32> inreg %rsrc, i32 inreg %data) #1 {
-; GFX12-LABEL: struct_buffer_atomic_cond_sub_imm_soff_no_return_forced:
-; GFX12: ; %bb.0: ; %main_body
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s16
-; GFX12-NEXT: s_mov_b32 s4, 4
-; GFX12-NEXT: buffer_atomic_cond_sub_u32 v1, v0, s[0:3], s4 idxen
-; GFX12-NEXT: s_setpc_b64 s[30:31]
-main_body:
- %unused = call i32 @llvm.amdgcn.struct.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 4, i32 0)
- ret void
-}
-
-declare i32 @llvm.amdgcn.raw.buffer.atomic.cond.sub.u32.i32(i32, <4 x i32>, i32, i32, i32) #0
-declare i32 @llvm.amdgcn.struct.buffer.atomic.cond.sub.u32.i32(i32, <4 x i32>, i32, i32, i32, i32) #0
-
-attributes #0 = { nounwind }
-attributes #1 = { nounwind "target-features"="+atomic-csub-no-rtn-insts" }
-
>From df8fbdb4cba4de2806604b91861427abc1cd213d Mon Sep 17 00:00:00 2001
From: Andrew Jenner <Andrew.Jenner at amd.com>
Date: Tue, 9 Dec 2025 11:18:42 -0500
Subject: [PATCH 2/2] Fix fallout from rebasing.
---
llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 8 +
.../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 10 +
.../AMDGPU/AMDGPULowerBufferFatPointers.cpp | 10 -
.../Target/AMDGPU/AMDGPUSearchableTables.td | 4 +
llvm/lib/Target/AMDGPU/BUFInstructions.td | 52 +-
llvm/lib/Target/AMDGPU/R600ISelLowering.cpp | 8 -
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 15 +
.../test/CodeGen/AMDGPU/atomicrmw_cond_sub.ll | 197 -------
.../CodeGen/AMDGPU/atomicrmw_sub_clamp.ll | 495 ------------------
llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll | 121 +++--
10 files changed, 124 insertions(+), 796 deletions(-)
delete mode 100644 llvm/test/CodeGen/AMDGPU/atomicrmw_cond_sub.ll
delete mode 100644 llvm/test/CodeGen/AMDGPU/atomicrmw_sub_clamp.ll
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index d57b15b745d79..ea0c811f986b3 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -1702,6 +1702,8 @@ def int_amdgcn_raw_buffer_atomic_or : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_xor : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_inc : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_dec : AMDGPURawBufferAtomic;
+def int_amdgcn_raw_buffer_atomic_cond_sub_u32 : AMDGPURawBufferAtomic;
+def int_amdgcn_raw_buffer_atomic_sub_clamp_u32 : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_cmpswap : Intrinsic<
[llvm_anyint_ty],
[LLVMMatchType<0>, // src(VGPR)
@@ -1738,6 +1740,8 @@ def int_amdgcn_raw_ptr_buffer_atomic_or : AMDGPURawPtrBufferAtomic;
def int_amdgcn_raw_ptr_buffer_atomic_xor : AMDGPURawPtrBufferAtomic;
def int_amdgcn_raw_ptr_buffer_atomic_inc : AMDGPURawPtrBufferAtomic;
def int_amdgcn_raw_ptr_buffer_atomic_dec : AMDGPURawPtrBufferAtomic;
+def int_amdgcn_raw_ptr_buffer_atomic_cond_sub_u32 : AMDGPURawPtrBufferAtomic;
+def int_amdgcn_raw_ptr_buffer_atomic_sub_clamp_u32 : AMDGPURawPtrBufferAtomic;
def int_amdgcn_raw_ptr_buffer_atomic_cmpswap : Intrinsic<
[llvm_anyint_ty],
[LLVMMatchType<0>, // src(VGPR)
@@ -1778,6 +1782,8 @@ def int_amdgcn_struct_buffer_atomic_or : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_xor : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_inc : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_dec : AMDGPUStructBufferAtomic;
+def int_amdgcn_struct_buffer_atomic_cond_sub_u32 : AMDGPUStructBufferAtomic;
+def int_amdgcn_struct_buffer_atomic_sub_clamp_u32 : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_cmpswap : Intrinsic<
[llvm_anyint_ty],
[LLVMMatchType<0>, // src(VGPR)
@@ -1813,6 +1819,8 @@ def int_amdgcn_struct_ptr_buffer_atomic_or : AMDGPUStructPtrBufferAtomic;
def int_amdgcn_struct_ptr_buffer_atomic_xor : AMDGPUStructPtrBufferAtomic;
def int_amdgcn_struct_ptr_buffer_atomic_inc : AMDGPUStructPtrBufferAtomic;
def int_amdgcn_struct_ptr_buffer_atomic_dec : AMDGPUStructPtrBufferAtomic;
+def int_amdgcn_struct_ptr_buffer_atomic_cond_sub_u32 : AMDGPUStructPtrBufferAtomic;
+def int_amdgcn_struct_ptr_buffer_atomic_sub_clamp_u32 : AMDGPUStructPtrBufferAtomic;
def int_amdgcn_struct_ptr_buffer_atomic_cmpswap : Intrinsic<
[llvm_anyint_ty],
[LLVMMatchType<0>, // src(VGPR)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index a9749cf2e33e4..6e3a1b6a5563f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -6568,6 +6568,16 @@ static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
+ case Intrinsic::amdgcn_raw_buffer_atomic_sub_clamp_u32:
+ case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub_clamp_u32:
+ case Intrinsic::amdgcn_struct_buffer_atomic_sub_clamp_u32:
+ case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub_clamp_u32:
+ return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB_CLAMP_U32;
+ case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
+ case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cond_sub_u32:
+ case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
+ case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cond_sub_u32:
+ return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_COND_SUB_U32;
default:
llvm_unreachable("unhandled atomic opcode");
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
index 3205b038112ef..97e7a239e1099 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
@@ -1783,16 +1783,6 @@ Value *SplitPtrStructs::handleMemoryInst(Instruction *I, Value *Arg, Value *Ptr,
"wrapping increment/decrement not supported for "
"buffer resources and should've been expanded away");
break;
- case AtomicRMWInst::USubCond:
- reportFatalUsageError(
- "conditional subtract not supported for buffer "
- "resources and should've been expanded away");
- break;
- case AtomicRMWInst::USubSat:
- reportFatalUsageError(
- "subtract with clamp not supported for buffer "
- "resources and should've been expanded away");
- break;
case AtomicRMWInst::BAD_BINOP:
llvm_unreachable("Not sure how we got a bad binop");
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
index 1fb8b9f6238e5..14984456861cd 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
@@ -258,6 +258,7 @@ def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_fadd>;
def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_fmin>;
def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_fmax>;
def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_cmpswap>;
+def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_cond_sub_u32>;
def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_swap>;
def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_add>;
def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_sub>;
@@ -274,6 +275,7 @@ def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_fadd>;
def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_fmin>;
def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_fmax>;
def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_cmpswap>;
+def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_cond_sub_u32>;
def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_swap>;
def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_add>;
def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_sub>;
@@ -290,6 +292,7 @@ def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_fadd>;
def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_fmin>;
def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_fmax>;
def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_cmpswap>;
+def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_cond_sub_u32>;
def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_swap>;
def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_add>;
def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_sub>;
@@ -306,6 +309,7 @@ def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_fadd>;
def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_fmin>;
def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_fmax>;
def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_cmpswap>;
+def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_cond_sub_u32>;
def : SourceOfDivergence<int_amdgcn_ps_live>;
def : SourceOfDivergence<int_amdgcn_live_mask>;
def : SourceOfDivergence<int_amdgcn_ds_swizzle>;
diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td
index 82a5769705d54..bb0e9380e956d 100644
--- a/llvm/lib/Target/AMDGPU/BUFInstructions.td
+++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td
@@ -783,37 +783,20 @@ multiclass MUBUF_Pseudo_Atomics_NO_RTN <string opName,
multiclass MUBUF_Pseudo_Atomics_RTN <string opName,
RegisterOperand vdataClass,
- ValueType vdataType,
- SDPatternOperator atomic> {
+ ValueType vdataType> {
let FPAtomic = vdataType.isFP in {
- def _OFFSET_RTN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.Offset, vdataClass, 0,
- [(set vdataType:$vdata,
- (atomic (MUBUFOffset v4i32:$srsrc, i32:$soffset, i32:$offset),
- vdataType:$vdata_in))]>,
- MUBUFAddr64Table <0, NAME # "_RTN">;
-
- def _ADDR64_RTN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.Addr64, vdataClass, 0,
- [(set vdataType:$vdata,
- (atomic (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i32:$offset),
- vdataType:$vdata_in))]>,
- MUBUFAddr64Table <1, NAME # "_RTN">;
-
+ def _OFFSET_RTN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.Offset, vdataClass, 0>,
+ MUBUFAddr64Table <0, NAME # "_RTN">;
+ def _ADDR64_RTN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.Addr64, vdataClass, 0>,
+ MUBUFAddr64Table <1, NAME # "_RTN">;
def _OFFEN_RTN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.OffEn, vdataClass, 0>;
def _IDXEN_RTN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.IdxEn, vdataClass, 0>;
def _BOTHEN_RTN : MUBUF_AtomicRet_Pseudo <opName, BUFAddrKind.BothEn, vdataClass, 0>;
- def _VBUFFER_OFFSET_RTN : MUBUF_AtomicRet_Pseudo <opName #_vbuffer, BUFAddrKind.Offset, vdataClass, 1,
- [(set vdataType:$vdata,
- (atomic (MUBUFOffset v4i32:$srsrc, i32:$soffset, i32:$offset),
- vdataType:$vdata_in))]>,
- MUBUFAddr64Table <0, NAME # "_VBUFFER_RTN">;
-
- def _VBUFFER_ADDR64_RTN : MUBUF_AtomicRet_Pseudo <opName #_vbuffer, BUFAddrKind.Addr64, vdataClass, 1,
- [(set vdataType:$vdata,
- (atomic (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i32:$offset),
- vdataType:$vdata_in))]>,
- MUBUFAddr64Table <1, NAME # "_VBUFFER_RTN">;
-
+ def _VBUFFER_OFFSET_RTN : MUBUF_AtomicRet_Pseudo <opName #_vbuffer, BUFAddrKind.Offset, vdataClass, 1>,
+ MUBUFAddr64Table <0, NAME # "_VBUFFER_RTN">;
+ def _VBUFFER_ADDR64_RTN : MUBUF_AtomicRet_Pseudo <opName #_vbuffer, BUFAddrKind.Addr64, vdataClass, 1>,
+ MUBUFAddr64Table <1, NAME # "_VBUFFER_RTN">;
def _VBUFFER_OFFEN_RTN : MUBUF_AtomicRet_Pseudo <opName #_vbuffer, BUFAddrKind.OffEn, vdataClass, 1>;
def _VBUFFER_IDXEN_RTN : MUBUF_AtomicRet_Pseudo <opName #_vbuffer, BUFAddrKind.IdxEn, vdataClass, 1>;
def _VBUFFER_BOTHEN_RTN : MUBUF_AtomicRet_Pseudo <opName #_vbuffer, BUFAddrKind.BothEn, vdataClass, 1>;
@@ -822,10 +805,9 @@ multiclass MUBUF_Pseudo_Atomics_RTN <string opName,
multiclass MUBUF_Pseudo_Atomics <string opName,
RegisterOperand vdataClass,
- ValueType vdataType,
- SDPatternOperator atomic = null_frag> :
+ ValueType vdataType> :
MUBUF_Pseudo_Atomics_NO_RTN<opName, vdataClass, vdataType>,
- MUBUF_Pseudo_Atomics_RTN<opName, vdataClass, vdataType, atomic>;
+ MUBUF_Pseudo_Atomics_RTN<opName, vdataClass, vdataType>;
//===----------------------------------------------------------------------===//
@@ -1117,22 +1099,22 @@ def BUFFER_WBINVL1_SC : MUBUF_Invalidate <"buffer_wbinvl1_sc",
let SubtargetPredicate = isGFX6GFX7GFX10Plus in {
defm BUFFER_ATOMIC_FCMPSWAP : MUBUF_Pseudo_Atomics <
- "buffer_atomic_fcmpswap", AVLdSt_64, v2f32, null_frag
+ "buffer_atomic_fcmpswap", AVLdSt_64, v2f32
>;
}
let SubtargetPredicate = HasAtomicFMinFMaxF32GlobalInsts in {
defm BUFFER_ATOMIC_FMIN : MUBUF_Pseudo_Atomics <
- "buffer_atomic_fmin", AVLdSt_32, f32, null_frag
+ "buffer_atomic_fmin", AVLdSt_32, f32
>;
defm BUFFER_ATOMIC_FMAX : MUBUF_Pseudo_Atomics <
- "buffer_atomic_fmax", AVLdSt_32, f32, null_frag
+ "buffer_atomic_fmax", AVLdSt_32, f32
>;
}
let SubtargetPredicate = isGFX6GFX7GFX10 in {
defm BUFFER_ATOMIC_FCMPSWAP_X2 : MUBUF_Pseudo_Atomics <
- "buffer_atomic_fcmpswap_x2", VGPROp_128, v2f64, null_frag
+ "buffer_atomic_fcmpswap_x2", VGPROp_128, v2f64
>;
}
@@ -1201,12 +1183,12 @@ defm BUFFER_ATOMIC_PK_ADD_F16 : MUBUF_Pseudo_Atomics_NO_RTN <
let SubtargetPredicate = HasAtomicFaddRtnInsts in
defm BUFFER_ATOMIC_ADD_F32 : MUBUF_Pseudo_Atomics_RTN<
- "buffer_atomic_add_f32", AVLdSt_32, f32, null_frag
+ "buffer_atomic_add_f32", AVLdSt_32, f32
>;
let SubtargetPredicate = HasAtomicBufferGlobalPkAddF16Insts in
defm BUFFER_ATOMIC_PK_ADD_F16 : MUBUF_Pseudo_Atomics_RTN <
- "buffer_atomic_pk_add_f16", AVLdSt_32, v2f16, null_frag
+ "buffer_atomic_pk_add_f16", AVLdSt_32, v2f16
>;
let SubtargetPredicate = isGFX12Plus in {
diff --git a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
index 6bb9c412d3697..3c4f115d7251a 100644
--- a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
@@ -2193,14 +2193,6 @@ R600TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
// FIXME: Cayman at least appears to have instructions for this, but the
// instruction defintions appear to be missing.
return AtomicExpansionKind::CmpXChg;
- case AtomicRMWInst::USubCond:
- case AtomicRMWInst::USubSat:
- if (auto *IntTy = dyn_cast<IntegerType>(RMW->getType())) {
- unsigned Size = IntTy->getBitWidth();
- if (Size == 32)
- return AtomicExpansionKind::None;
- }
- return AtomicExpansionKind::CmpXChg;
case AtomicRMWInst::Xchg: {
const DataLayout &DL = RMW->getFunction()->getDataLayout();
unsigned ValSize = DL.getTypeSizeInBits(RMW->getType());
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 1557bc57fda5c..df953894e47be 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -10610,6 +10610,21 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
case Intrinsic::amdgcn_struct_buffer_atomic_dec:
case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
+ case Intrinsic::amdgcn_raw_buffer_atomic_sub_clamp_u32:
+ case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub_clamp_u32:
+ return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_CSUB);
+ case Intrinsic::amdgcn_struct_buffer_atomic_sub_clamp_u32:
+ case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub_clamp_u32:
+ return lowerStructBufferAtomicIntrin(Op, DAG,
+ AMDGPUISD::BUFFER_ATOMIC_CSUB);
+ case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
+ case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cond_sub_u32:
+ return lowerRawBufferAtomicIntrin(Op, DAG,
+ AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
+ case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
+ case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cond_sub_u32:
+ return lowerStructBufferAtomicIntrin(Op, DAG,
+ AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap: {
SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(4), DAG);
diff --git a/llvm/test/CodeGen/AMDGPU/atomicrmw_cond_sub.ll b/llvm/test/CodeGen/AMDGPU/atomicrmw_cond_sub.ll
deleted file mode 100644
index 1aca8fbe7323c..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/atomicrmw_cond_sub.ll
+++ /dev/null
@@ -1,197 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck %s -check-prefix=GFX12-GISEL
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck %s -check-prefix=GFX12-SDAG
-
-define i32 @global_atomic_usub_cond(ptr addrspace(1) %ptr, i32 %data) {
-; GFX12-GISEL-LABEL: global_atomic_usub_cond:
-; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-GISEL-NEXT: s_wait_expcnt 0x0
-; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
-; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
-; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: global_wb scope:SCOPE_DEV
-; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
-; GFX12-GISEL-NEXT: global_atomic_cond_sub_u32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX12-GISEL-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-LABEL: global_atomic_usub_cond:
-; GFX12-SDAG: ; %bb.0:
-; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
-; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
-; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
-; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: global_wb scope:SCOPE_DEV
-; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
-; GFX12-SDAG-NEXT: global_atomic_cond_sub_u32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
- %ret = atomicrmw usub_cond ptr addrspace(1) %ptr, i32 %data syncscope("agent") seq_cst, align 4
- ret i32 %ret
-}
-
-define i32 @global_atomic_usub_cond_offset(ptr addrspace(1) %ptr, i32 %data) {
-; GFX12-GISEL-LABEL: global_atomic_usub_cond_offset:
-; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-GISEL-NEXT: s_wait_expcnt 0x0
-; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
-; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
-; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: global_wb scope:SCOPE_DEV
-; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
-; GFX12-GISEL-NEXT: global_atomic_cond_sub_u32 v0, v[0:1], v2, off offset:4096 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX12-GISEL-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-LABEL: global_atomic_usub_cond_offset:
-; GFX12-SDAG: ; %bb.0:
-; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
-; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
-; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
-; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: global_wb scope:SCOPE_DEV
-; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
-; GFX12-SDAG-NEXT: global_atomic_cond_sub_u32 v0, v[0:1], v2, off offset:4096 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
- %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 1024
- %ret = atomicrmw usub_cond ptr addrspace(1) %gep, i32 %data syncscope("agent") seq_cst, align 4
- ret i32 %ret
-}
-
-define void @global_atomic_usub_cond_nortn(ptr addrspace(1) %ptr, i32 %data) {
-; GFX12-GISEL-LABEL: global_atomic_usub_cond_nortn:
-; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-GISEL-NEXT: s_wait_expcnt 0x0
-; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
-; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
-; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: global_wb scope:SCOPE_DEV
-; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
-; GFX12-GISEL-NEXT: global_atomic_cond_sub_u32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX12-GISEL-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-LABEL: global_atomic_usub_cond_nortn:
-; GFX12-SDAG: ; %bb.0:
-; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
-; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
-; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
-; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: global_wb scope:SCOPE_DEV
-; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
-; GFX12-SDAG-NEXT: global_atomic_cond_sub_u32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
- %ret = atomicrmw usub_cond ptr addrspace(1) %ptr, i32 %data syncscope("agent") seq_cst, align 4
- ret void
-}
-
-define void @global_atomic_usub_cond_offset_nortn(ptr addrspace(1) %ptr, i32 %data) {
-; GFX12-GISEL-LABEL: global_atomic_usub_cond_offset_nortn:
-; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-GISEL-NEXT: s_wait_expcnt 0x0
-; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
-; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
-; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: global_wb scope:SCOPE_DEV
-; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
-; GFX12-GISEL-NEXT: global_atomic_cond_sub_u32 v0, v[0:1], v2, off offset:4096 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX12-GISEL-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-LABEL: global_atomic_usub_cond_offset_nortn:
-; GFX12-SDAG: ; %bb.0:
-; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
-; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
-; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
-; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: global_wb scope:SCOPE_DEV
-; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
-; GFX12-SDAG-NEXT: global_atomic_cond_sub_u32 v0, v[0:1], v2, off offset:4096 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
- %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 1024
- %ret = atomicrmw usub_cond ptr addrspace(1) %gep, i32 %data syncscope("agent") seq_cst, align 4
- ret void
-}
-
-define amdgpu_kernel void @global_atomic_usub_cond_sgpr_base_offset(ptr addrspace(1) %ptr, i32 %data) {
-; GFX12-GISEL-LABEL: global_atomic_usub_cond_sgpr_base_offset:
-; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x0
-; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
-; GFX12-GISEL-NEXT: global_wb scope:SCOPE_DEV
-; GFX12-GISEL-NEXT: global_atomic_cond_sub_u32 v0, v1, v0, s[0:1] offset:4096 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX12-GISEL-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-GISEL-NEXT: global_store_b32 v[0:1], v0, off
-; GFX12-GISEL-NEXT: s_nop 0
-; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX12-GISEL-NEXT: s_endpgm
-;
-; GFX12-SDAG-LABEL: global_atomic_usub_cond_sgpr_base_offset:
-; GFX12-SDAG: ; %bb.0:
-; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x0
-; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX12-SDAG-NEXT: global_wb scope:SCOPE_DEV
-; GFX12-SDAG-NEXT: global_atomic_cond_sub_u32 v0, v0, v1, s[0:1] offset:4096 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-SDAG-NEXT: global_store_b32 v[0:1], v0, off
-; GFX12-SDAG-NEXT: s_nop 0
-; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX12-SDAG-NEXT: s_endpgm
- %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 1024
- %ret = atomicrmw usub_cond ptr addrspace(1) %gep, i32 %data syncscope("agent") seq_cst, align 4
- store i32 %ret, ptr addrspace(1) undef
- ret void
-}
-
-define amdgpu_kernel void @global_atomic_usub_cond_sgpr_base_offset_nortn(ptr addrspace(1) %ptr, i32 %data) {
-; GFX12-GISEL-LABEL: global_atomic_usub_cond_sgpr_base_offset_nortn:
-; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x0
-; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
-; GFX12-GISEL-NEXT: global_wb scope:SCOPE_DEV
-; GFX12-GISEL-NEXT: global_atomic_cond_sub_u32 v0, v1, v0, s[0:1] offset:4096 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX12-GISEL-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-GISEL-NEXT: s_endpgm
-;
-; GFX12-SDAG-LABEL: global_atomic_usub_cond_sgpr_base_offset_nortn:
-; GFX12-SDAG: ; %bb.0:
-; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x0
-; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX12-SDAG-NEXT: global_wb scope:SCOPE_DEV
-; GFX12-SDAG-NEXT: global_atomic_cond_sub_u32 v0, v0, v1, s[0:1] offset:4096 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-SDAG-NEXT: s_endpgm
- %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 1024
- %ret = atomicrmw usub_cond ptr addrspace(1) %gep, i32 %data syncscope("agent") seq_cst, align 4
- ret void
-}
-
-attributes #0 = { nounwind willreturn }
-attributes #1 = { argmemonly nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/atomicrmw_sub_clamp.ll b/llvm/test/CodeGen/AMDGPU/atomicrmw_sub_clamp.ll
deleted file mode 100644
index 7ee3d0f870729..0000000000000
--- a/llvm/test/CodeGen/AMDGPU/atomicrmw_sub_clamp.ll
+++ /dev/null
@@ -1,495 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 < %s | FileCheck %s -check-prefix=GFX10-GISEL
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1031 < %s | FileCheck %s -check-prefix=GFX10-GISEL
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck %s -check-prefix=GFX11-GISEL
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck %s -check-prefix=GFX12-GISEL
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 < %s | FileCheck %s -check-prefix=GFX10-SDAG
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1031 < %s | FileCheck %s -check-prefix=GFX10-SDAG
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck %s -check-prefix=GFX11-SDAG
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck %s -check-prefix=GFX12-SDAG
-
-define i32 @global_atomic_usub_sat(ptr addrspace(1) %ptr, i32 %data) {
-; GFX10-GISEL-LABEL: global_atomic_usub_sat:
-; GFX10-GISEL: ; %bb.0:
-; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-GISEL-NEXT: global_atomic_csub v0, v[0:1], v2, off glc
-; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX10-GISEL-NEXT: buffer_gl1_inv
-; GFX10-GISEL-NEXT: buffer_gl0_inv
-; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-GISEL-LABEL: global_atomic_usub_sat:
-; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-GISEL-NEXT: global_atomic_csub_u32 v0, v[0:1], v2, off glc
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT: buffer_gl1_inv
-; GFX11-GISEL-NEXT: buffer_gl0_inv
-; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-GISEL-LABEL: global_atomic_usub_sat:
-; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-GISEL-NEXT: s_wait_expcnt 0x0
-; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
-; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
-; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: global_wb scope:SCOPE_DEV
-; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
-; GFX12-GISEL-NEXT: global_atomic_sub_clamp_u32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX12-GISEL-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-SDAG-LABEL: global_atomic_usub_sat:
-; GFX10-SDAG: ; %bb.0:
-; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-SDAG-NEXT: global_atomic_csub v0, v[0:1], v2, off glc
-; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX10-SDAG-NEXT: buffer_gl1_inv
-; GFX10-SDAG-NEXT: buffer_gl0_inv
-; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-LABEL: global_atomic_usub_sat:
-; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT: global_atomic_csub_u32 v0, v[0:1], v2, off glc
-; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-NEXT: buffer_gl1_inv
-; GFX11-SDAG-NEXT: buffer_gl0_inv
-; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-LABEL: global_atomic_usub_sat:
-; GFX12-SDAG: ; %bb.0:
-; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
-; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
-; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
-; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: global_wb scope:SCOPE_DEV
-; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
-; GFX12-SDAG-NEXT: global_atomic_sub_clamp_u32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
- %ret = atomicrmw usub_sat ptr addrspace(1) %ptr, i32 %data syncscope("agent") seq_cst, align 4
- ret i32 %ret
-}
-
-define i32 @global_atomic_usub_sat_offset(ptr addrspace(1) %ptr, i32 %data) {
-; GFX10-GISEL-LABEL: global_atomic_usub_sat_offset:
-; GFX10-GISEL: ; %bb.0:
-; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0
-; GFX10-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX10-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-GISEL-NEXT: global_atomic_csub v0, v[0:1], v2, off glc
-; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX10-GISEL-NEXT: buffer_gl1_inv
-; GFX10-GISEL-NEXT: buffer_gl0_inv
-; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-GISEL-LABEL: global_atomic_usub_sat_offset:
-; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0
-; GFX11-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-GISEL-NEXT: global_atomic_csub_u32 v0, v[0:1], v2, off glc
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT: buffer_gl1_inv
-; GFX11-GISEL-NEXT: buffer_gl0_inv
-; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-GISEL-LABEL: global_atomic_usub_sat_offset:
-; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-GISEL-NEXT: s_wait_expcnt 0x0
-; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
-; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
-; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: global_wb scope:SCOPE_DEV
-; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
-; GFX12-GISEL-NEXT: global_atomic_sub_clamp_u32 v0, v[0:1], v2, off offset:4096 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX12-GISEL-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-SDAG-LABEL: global_atomic_usub_sat_offset:
-; GFX10-SDAG: ; %bb.0:
-; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0
-; GFX10-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX10-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-SDAG-NEXT: global_atomic_csub v0, v[0:1], v2, off glc
-; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX10-SDAG-NEXT: buffer_gl1_inv
-; GFX10-SDAG-NEXT: buffer_gl0_inv
-; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-LABEL: global_atomic_usub_sat_offset:
-; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0
-; GFX11-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT: global_atomic_csub_u32 v0, v[0:1], v2, off glc
-; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-NEXT: buffer_gl1_inv
-; GFX11-SDAG-NEXT: buffer_gl0_inv
-; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-LABEL: global_atomic_usub_sat_offset:
-; GFX12-SDAG: ; %bb.0:
-; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
-; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
-; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
-; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: global_wb scope:SCOPE_DEV
-; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
-; GFX12-SDAG-NEXT: global_atomic_sub_clamp_u32 v0, v[0:1], v2, off offset:4096 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
- %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 1024
- %ret = atomicrmw usub_sat ptr addrspace(1) %gep, i32 %data syncscope("agent") seq_cst, align 4
- ret i32 %ret
-}
-
-define void @global_atomic_usub_sat_nortn(ptr addrspace(1) %ptr, i32 %data) {
-; GFX10-GISEL-LABEL: global_atomic_usub_sat_nortn:
-; GFX10-GISEL: ; %bb.0:
-; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-GISEL-NEXT: global_atomic_csub v0, v[0:1], v2, off glc
-; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX10-GISEL-NEXT: buffer_gl1_inv
-; GFX10-GISEL-NEXT: buffer_gl0_inv
-; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-GISEL-LABEL: global_atomic_usub_sat_nortn:
-; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-GISEL-NEXT: global_atomic_csub_u32 v0, v[0:1], v2, off glc
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT: buffer_gl1_inv
-; GFX11-GISEL-NEXT: buffer_gl0_inv
-; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-GISEL-LABEL: global_atomic_usub_sat_nortn:
-; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-GISEL-NEXT: s_wait_expcnt 0x0
-; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
-; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
-; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: global_wb scope:SCOPE_DEV
-; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
-; GFX12-GISEL-NEXT: global_atomic_sub_clamp_u32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX12-GISEL-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-SDAG-LABEL: global_atomic_usub_sat_nortn:
-; GFX10-SDAG: ; %bb.0:
-; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-SDAG-NEXT: global_atomic_csub v0, v[0:1], v2, off glc
-; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX10-SDAG-NEXT: buffer_gl1_inv
-; GFX10-SDAG-NEXT: buffer_gl0_inv
-; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-LABEL: global_atomic_usub_sat_nortn:
-; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT: global_atomic_csub_u32 v0, v[0:1], v2, off glc
-; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-NEXT: buffer_gl1_inv
-; GFX11-SDAG-NEXT: buffer_gl0_inv
-; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-LABEL: global_atomic_usub_sat_nortn:
-; GFX12-SDAG: ; %bb.0:
-; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
-; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
-; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
-; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: global_wb scope:SCOPE_DEV
-; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
-; GFX12-SDAG-NEXT: global_atomic_sub_clamp_u32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
- %ret = atomicrmw usub_sat ptr addrspace(1) %ptr, i32 %data syncscope("agent") seq_cst, align 4
- ret void
-}
-
-define void @global_atomic_usub_sat_offset_nortn(ptr addrspace(1) %ptr, i32 %data) {
-; GFX10-GISEL-LABEL: global_atomic_usub_sat_offset_nortn:
-; GFX10-GISEL: ; %bb.0:
-; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0
-; GFX10-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX10-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-GISEL-NEXT: global_atomic_csub v0, v[0:1], v2, off glc
-; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX10-GISEL-NEXT: buffer_gl1_inv
-; GFX10-GISEL-NEXT: buffer_gl0_inv
-; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-GISEL-LABEL: global_atomic_usub_sat_offset_nortn:
-; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0
-; GFX11-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-GISEL-NEXT: global_atomic_csub_u32 v0, v[0:1], v2, off glc
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT: buffer_gl1_inv
-; GFX11-GISEL-NEXT: buffer_gl0_inv
-; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-GISEL-LABEL: global_atomic_usub_sat_offset_nortn:
-; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-GISEL-NEXT: s_wait_expcnt 0x0
-; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
-; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
-; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: global_wb scope:SCOPE_DEV
-; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
-; GFX12-GISEL-NEXT: global_atomic_sub_clamp_u32 v0, v[0:1], v2, off offset:4096 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX12-GISEL-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX10-SDAG-LABEL: global_atomic_usub_sat_offset_nortn:
-; GFX10-SDAG: ; %bb.0:
-; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0
-; GFX10-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX10-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX10-SDAG-NEXT: global_atomic_csub v0, v[0:1], v2, off glc
-; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX10-SDAG-NEXT: buffer_gl1_inv
-; GFX10-SDAG-NEXT: buffer_gl0_inv
-; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-SDAG-LABEL: global_atomic_usub_sat_offset_nortn:
-; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0
-; GFX11-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
-; GFX11-SDAG-NEXT: global_atomic_csub_u32 v0, v[0:1], v2, off glc
-; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-NEXT: buffer_gl1_inv
-; GFX11-SDAG-NEXT: buffer_gl0_inv
-; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-SDAG-LABEL: global_atomic_usub_sat_offset_nortn:
-; GFX12-SDAG: ; %bb.0:
-; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
-; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
-; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
-; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: global_wb scope:SCOPE_DEV
-; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
-; GFX12-SDAG-NEXT: global_atomic_sub_clamp_u32 v0, v[0:1], v2, off offset:4096 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
- %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 1024
- %ret = atomicrmw usub_sat ptr addrspace(1) %gep, i32 %data syncscope("agent") seq_cst, align 4
- ret void
-}
-
-define amdgpu_kernel void @global_atomic_usub_sat_sgpr_base_offset(ptr addrspace(1) %ptr, i32 %data) {
-; GFX10-GISEL-LABEL: global_atomic_usub_sat_sgpr_base_offset:
-; GFX10-GISEL: ; %bb.0:
-; GFX10-GISEL-NEXT: s_clause 0x1
-; GFX10-GISEL-NEXT: s_load_dword s2, s[6:7], 0x8
-; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0x1000
-; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; GFX10-GISEL-NEXT: global_atomic_csub v0, v1, v0, s[0:1] glc
-; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX10-GISEL-NEXT: buffer_gl1_inv
-; GFX10-GISEL-NEXT: buffer_gl0_inv
-; GFX10-GISEL-NEXT: global_store_dword v[0:1], v0, off
-; GFX10-GISEL-NEXT: s_endpgm
-;
-; GFX11-GISEL-LABEL: global_atomic_usub_sat_sgpr_base_offset:
-; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_clause 0x1
-; GFX11-GISEL-NEXT: s_load_b32 s4, s[2:3], 0x8
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
-; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0x1000 :: v_dual_mov_b32 v0, s4
-; GFX11-GISEL-NEXT: global_atomic_csub_u32 v0, v1, v0, s[0:1] glc
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT: buffer_gl1_inv
-; GFX11-GISEL-NEXT: buffer_gl0_inv
-; GFX11-GISEL-NEXT: global_store_b32 v[0:1], v0, off
-; GFX11-GISEL-NEXT: s_nop 0
-; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-GISEL-NEXT: s_endpgm
-;
-; GFX12-GISEL-LABEL: global_atomic_usub_sat_sgpr_base_offset:
-; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x0
-; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
-; GFX12-GISEL-NEXT: global_wb scope:SCOPE_DEV
-; GFX12-GISEL-NEXT: global_atomic_sub_clamp_u32 v0, v1, v0, s[0:1] offset:4096 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX12-GISEL-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-GISEL-NEXT: global_store_b32 v[0:1], v0, off
-; GFX12-GISEL-NEXT: s_nop 0
-; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX12-GISEL-NEXT: s_endpgm
-;
-; GFX10-SDAG-LABEL: global_atomic_usub_sat_sgpr_base_offset:
-; GFX10-SDAG: ; %bb.0:
-; GFX10-SDAG-NEXT: s_clause 0x1
-; GFX10-SDAG-NEXT: s_load_dword s2, s[6:7], 0x8
-; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, 0x1000
-; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s2
-; GFX10-SDAG-NEXT: global_atomic_csub v0, v0, v1, s[0:1] glc
-; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX10-SDAG-NEXT: buffer_gl1_inv
-; GFX10-SDAG-NEXT: buffer_gl0_inv
-; GFX10-SDAG-NEXT: global_store_dword v[0:1], v0, off
-; GFX10-SDAG-NEXT: s_endpgm
-;
-; GFX11-SDAG-LABEL: global_atomic_usub_sat_sgpr_base_offset:
-; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_clause 0x1
-; GFX11-SDAG-NEXT: s_load_b32 s4, s[2:3], 0x8
-; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
-; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, 0x1000 :: v_dual_mov_b32 v1, s4
-; GFX11-SDAG-NEXT: global_atomic_csub_u32 v0, v0, v1, s[0:1] glc
-; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-NEXT: buffer_gl1_inv
-; GFX11-SDAG-NEXT: buffer_gl0_inv
-; GFX11-SDAG-NEXT: global_store_b32 v[0:1], v0, off
-; GFX11-SDAG-NEXT: s_nop 0
-; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-SDAG-NEXT: s_endpgm
-;
-; GFX12-SDAG-LABEL: global_atomic_usub_sat_sgpr_base_offset:
-; GFX12-SDAG: ; %bb.0:
-; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x0
-; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX12-SDAG-NEXT: global_wb scope:SCOPE_DEV
-; GFX12-SDAG-NEXT: global_atomic_sub_clamp_u32 v0, v0, v1, s[0:1] offset:4096 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-SDAG-NEXT: global_store_b32 v[0:1], v0, off
-; GFX12-SDAG-NEXT: s_nop 0
-; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX12-SDAG-NEXT: s_endpgm
- %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 1024
- %ret = atomicrmw usub_sat ptr addrspace(1) %gep, i32 %data syncscope("agent") seq_cst, align 4
- store i32 %ret, ptr addrspace(1) undef
- ret void
-}
-
-define amdgpu_kernel void @global_atomic_usub_sat_sgpr_base_offset_nortn(ptr addrspace(1) %ptr, i32 %data) {
-; GFX10-GISEL-LABEL: global_atomic_usub_sat_sgpr_base_offset_nortn:
-; GFX10-GISEL: ; %bb.0:
-; GFX10-GISEL-NEXT: s_clause 0x1
-; GFX10-GISEL-NEXT: s_load_dword s2, s[6:7], 0x8
-; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0x1000
-; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; GFX10-GISEL-NEXT: global_atomic_csub v0, v1, v0, s[0:1] glc
-; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX10-GISEL-NEXT: buffer_gl1_inv
-; GFX10-GISEL-NEXT: buffer_gl0_inv
-; GFX10-GISEL-NEXT: s_endpgm
-;
-; GFX11-GISEL-LABEL: global_atomic_usub_sat_sgpr_base_offset_nortn:
-; GFX11-GISEL: ; %bb.0:
-; GFX11-GISEL-NEXT: s_clause 0x1
-; GFX11-GISEL-NEXT: s_load_b32 s4, s[2:3], 0x8
-; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
-; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0x1000 :: v_dual_mov_b32 v0, s4
-; GFX11-GISEL-NEXT: global_atomic_csub_u32 v0, v1, v0, s[0:1] glc
-; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
-; GFX11-GISEL-NEXT: buffer_gl1_inv
-; GFX11-GISEL-NEXT: buffer_gl0_inv
-; GFX11-GISEL-NEXT: s_endpgm
-;
-; GFX12-GISEL-LABEL: global_atomic_usub_sat_sgpr_base_offset_nortn:
-; GFX12-GISEL: ; %bb.0:
-; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x0
-; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
-; GFX12-GISEL-NEXT: global_wb scope:SCOPE_DEV
-; GFX12-GISEL-NEXT: global_atomic_sub_clamp_u32 v0, v1, v0, s[0:1] offset:4096 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
-; GFX12-GISEL-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-GISEL-NEXT: s_endpgm
-;
-; GFX10-SDAG-LABEL: global_atomic_usub_sat_sgpr_base_offset_nortn:
-; GFX10-SDAG: ; %bb.0:
-; GFX10-SDAG-NEXT: s_clause 0x1
-; GFX10-SDAG-NEXT: s_load_dword s2, s[6:7], 0x8
-; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, 0x1000
-; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s2
-; GFX10-SDAG-NEXT: global_atomic_csub v0, v0, v1, s[0:1] glc
-; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX10-SDAG-NEXT: buffer_gl1_inv
-; GFX10-SDAG-NEXT: buffer_gl0_inv
-; GFX10-SDAG-NEXT: s_endpgm
-;
-; GFX11-SDAG-LABEL: global_atomic_usub_sat_sgpr_base_offset_nortn:
-; GFX11-SDAG: ; %bb.0:
-; GFX11-SDAG-NEXT: s_clause 0x1
-; GFX11-SDAG-NEXT: s_load_b32 s4, s[2:3], 0x8
-; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
-; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, 0x1000 :: v_dual_mov_b32 v1, s4
-; GFX11-SDAG-NEXT: global_atomic_csub_u32 v0, v0, v1, s[0:1] glc
-; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
-; GFX11-SDAG-NEXT: buffer_gl1_inv
-; GFX11-SDAG-NEXT: buffer_gl0_inv
-; GFX11-SDAG-NEXT: s_endpgm
-;
-; GFX12-SDAG-LABEL: global_atomic_usub_sat_sgpr_base_offset_nortn:
-; GFX12-SDAG: ; %bb.0:
-; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x0
-; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX12-SDAG-NEXT: global_wb scope:SCOPE_DEV
-; GFX12-SDAG-NEXT: global_atomic_sub_clamp_u32 v0, v0, v1, s[0:1] offset:4096 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-NEXT: global_inv scope:SCOPE_DEV
-; GFX12-SDAG-NEXT: s_endpgm
- %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 1024
- %ret = atomicrmw usub_sat ptr addrspace(1) %gep, i32 %data syncscope("agent") seq_cst, align 4
- ret void
-}
-
-attributes #0 = { nounwind willreturn }
-attributes #1 = { argmemonly nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll b/llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll
index 863bac80243c4..d281492c647f1 100644
--- a/llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll
@@ -7,11 +7,13 @@ define amdgpu_kernel void @flat_atomic_usub_cond_no_rtn_u32(ptr %addr, i32 %in)
; GFX12-SDAG: ; %bb.0: ; %entry
; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], -16
; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s2
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-SDAG-NEXT: global_wb scope:SCOPE_SYS
-; GFX12-SDAG-NEXT: flat_atomic_cond_sub_u32 v0, v[0:1], v2 offset:-16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
+; GFX12-SDAG-NEXT: flat_atomic_cond_sub_u32 v[0:1], v2 scope:SCOPE_SYS
+; GFX12-SDAG-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-SDAG-NEXT: global_inv scope:SCOPE_SYS
; GFX12-SDAG-NEXT: s_endpgm
;
@@ -19,16 +21,20 @@ define amdgpu_kernel void @flat_atomic_usub_cond_no_rtn_u32(ptr %addr, i32 %in)
; GFX12-GISEL: ; %bb.0: ; %entry
; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v1, s1
+; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, -16
+; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, -1
+; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s2
; GFX12-GISEL-NEXT: global_wb scope:SCOPE_SYS
-; GFX12-GISEL-NEXT: flat_atomic_cond_sub_u32 v0, v[0:1], v2 offset:-16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
+; GFX12-GISEL-NEXT: flat_atomic_cond_sub_u32 v[0:1], v2 scope:SCOPE_SYS
+; GFX12-GISEL-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-GISEL-NEXT: global_inv scope:SCOPE_SYS
; GFX12-GISEL-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr %addr, i32 -4
- %unused = atomicrmw usub_cond ptr %gep, i32 %in seq_cst
+ %unused = atomicrmw usub_cond ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
ret void
}
@@ -37,10 +43,12 @@ define amdgpu_kernel void @flat_atomic_usub_cond_no_rtn_u32_forced(ptr %addr, i3
; GFX12-SDAG: ; %bb.0: ; %entry
; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], -16
; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s2
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-SDAG-NEXT: global_wb scope:SCOPE_SYS
-; GFX12-SDAG-NEXT: flat_atomic_cond_sub_u32 v[0:1], v2 offset:-16 scope:SCOPE_SYS
+; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
+; GFX12-SDAG-NEXT: flat_atomic_cond_sub_u32 v[0:1], v2 scope:SCOPE_SYS
; GFX12-SDAG-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-SDAG-NEXT: global_inv scope:SCOPE_SYS
; GFX12-SDAG-NEXT: s_endpgm
@@ -49,16 +57,20 @@ define amdgpu_kernel void @flat_atomic_usub_cond_no_rtn_u32_forced(ptr %addr, i3
; GFX12-GISEL: ; %bb.0: ; %entry
; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v1, s1
+; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, -16
+; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, -1
+; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s2
; GFX12-GISEL-NEXT: global_wb scope:SCOPE_SYS
-; GFX12-GISEL-NEXT: flat_atomic_cond_sub_u32 v[0:1], v2 offset:-16 scope:SCOPE_SYS
+; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
+; GFX12-GISEL-NEXT: flat_atomic_cond_sub_u32 v[0:1], v2 scope:SCOPE_SYS
; GFX12-GISEL-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-GISEL-NEXT: global_inv scope:SCOPE_SYS
; GFX12-GISEL-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr %addr, i32 -4
- %unused = atomicrmw usub_cond ptr %gep, i32 %in seq_cst
+ %unused = atomicrmw usub_cond ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
ret void
}
@@ -69,13 +81,15 @@ define amdgpu_kernel void @flat_atomic_usub_cond_rtn_u32(ptr %addr, i32 %in, ptr
; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
; GFX12-SDAG-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], 16
; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; GFX12-SDAG-NEXT: flat_atomic_cond_sub_u32 v2, v[0:1], v2 offset:16 th:TH_ATOMIC_RETURN
-; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-SDAG-NEXT: global_wb scope:SCOPE_SYS
+; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
+; GFX12-SDAG-NEXT: flat_atomic_cond_sub_u32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-SDAG-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
; GFX12-SDAG-NEXT: flat_store_b32 v[0:1], v2
; GFX12-SDAG-NEXT: s_endpgm
;
@@ -85,18 +99,22 @@ define amdgpu_kernel void @flat_atomic_usub_cond_rtn_u32(ptr %addr, i32 %in, ptr
; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
; GFX12-GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v1, s1
-; GFX12-GISEL-NEXT: flat_atomic_cond_sub_u32 v2, v[0:1], v2 offset:16 th:TH_ATOMIC_RETURN
-; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, 16
+; GFX12-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 0
+; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX12-GISEL-NEXT: global_wb scope:SCOPE_SYS
+; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
+; GFX12-GISEL-NEXT: flat_atomic_cond_sub_u32 v2, v[0:1], v2 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-GISEL-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
; GFX12-GISEL-NEXT: flat_store_b32 v[0:1], v2
; GFX12-GISEL-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr %addr, i32 4
- %val = atomicrmw usub_cond ptr %gep, i32 %in seq_cst
+ %val = atomicrmw usub_cond ptr %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
store i32 %val, ptr %use
ret void
}
@@ -108,8 +126,9 @@ define amdgpu_kernel void @global_atomic_usub_cond_no_rtn_u32(ptr addrspace(1) %
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX12-SDAG-NEXT: global_wb scope:SCOPE_SYS
-; GFX12-SDAG-NEXT: global_atomic_cond_sub_u32 v0, v0, v1, s[0:1] offset:-16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
+; GFX12-SDAG-NEXT: global_atomic_cond_sub_u32 v0, v1, s[0:1] offset:-16 scope:SCOPE_SYS
+; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
; GFX12-SDAG-NEXT: global_inv scope:SCOPE_SYS
; GFX12-SDAG-NEXT: s_endpgm
;
@@ -119,13 +138,14 @@ define amdgpu_kernel void @global_atomic_usub_cond_no_rtn_u32(ptr addrspace(1) %
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
; GFX12-GISEL-NEXT: global_wb scope:SCOPE_SYS
-; GFX12-GISEL-NEXT: global_atomic_cond_sub_u32 v0, v1, v0, s[0:1] offset:-16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
+; GFX12-GISEL-NEXT: global_atomic_cond_sub_u32 v1, v0, s[0:1] offset:-16 scope:SCOPE_SYS
+; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
; GFX12-GISEL-NEXT: global_inv scope:SCOPE_SYS
; GFX12-GISEL-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr addrspace(1) %addr, i32 -4
- %unused = atomicrmw usub_cond ptr addrspace(1) %gep, i32 %in seq_cst
+ %unused = atomicrmw usub_cond ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
ret void
}
@@ -136,6 +156,7 @@ define amdgpu_kernel void @global_atomic_usub_cond_no_rtn_u32_forced(ptr addrspa
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX12-SDAG-NEXT: global_wb scope:SCOPE_SYS
+; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
; GFX12-SDAG-NEXT: global_atomic_cond_sub_u32 v0, v1, s[0:1] offset:-16 scope:SCOPE_SYS
; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
; GFX12-SDAG-NEXT: global_inv scope:SCOPE_SYS
@@ -147,31 +168,31 @@ define amdgpu_kernel void @global_atomic_usub_cond_no_rtn_u32_forced(ptr addrspa
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
; GFX12-GISEL-NEXT: global_wb scope:SCOPE_SYS
+; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
; GFX12-GISEL-NEXT: global_atomic_cond_sub_u32 v1, v0, s[0:1] offset:-16 scope:SCOPE_SYS
; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
; GFX12-GISEL-NEXT: global_inv scope:SCOPE_SYS
; GFX12-GISEL-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr addrspace(1) %addr, i32 -4
- %unused = atomicrmw usub_cond ptr addrspace(1) %gep, i32 %in seq_cst
+ %unused = atomicrmw usub_cond ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
ret void
}
define amdgpu_kernel void @global_atomic_usub_cond_rtn_u32(ptr addrspace(1) %addr, i32 %in, ptr addrspace(1) %use) {
; GFX12-SDAG-LABEL: global_atomic_usub_cond_rtn_u32:
; GFX12-SDAG: ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT: s_clause 0x1
; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0
; GFX12-SDAG-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s6
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2
; GFX12-SDAG-NEXT: global_wb scope:SCOPE_SYS
-; GFX12-SDAG-NEXT: global_atomic_cond_sub_u32 v1, v0, v1, s[4:5] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
+; GFX12-SDAG-NEXT: global_atomic_cond_sub_u32 v1, v0, v1, s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX12-SDAG-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
-; GFX12-SDAG-NEXT: s_nop 0
-; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[4:5]
; GFX12-SDAG-NEXT: s_endpgm
;
; GFX12-GISEL-LABEL: global_atomic_usub_cond_rtn_u32:
@@ -180,18 +201,17 @@ define amdgpu_kernel void @global_atomic_usub_cond_rtn_u32(ptr addrspace(1) %add
; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
; GFX12-GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
; GFX12-GISEL-NEXT: global_wb scope:SCOPE_SYS
-; GFX12-GISEL-NEXT: global_atomic_cond_sub_u32 v0, v1, v0, s[4:5] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
+; GFX12-GISEL-NEXT: global_atomic_cond_sub_u32 v0, v1, v0, s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX12-GISEL-NEXT: global_inv scope:SCOPE_SYS
-; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
-; GFX12-GISEL-NEXT: s_nop 0
-; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX12-GISEL-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr addrspace(1) %addr, i32 4
- %val = atomicrmw usub_cond ptr addrspace(1) %gep, i32 %in seq_cst
+ %val = atomicrmw usub_cond ptr addrspace(1) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
store i32 %val, ptr addrspace(1) %use
ret void
}
@@ -204,8 +224,7 @@ define amdgpu_kernel void @ds_usub_cond_no_rtn_u32(ptr addrspace(3) %addr, i32 %
; GFX12-SDAG-NEXT: s_add_co_i32 s0, s0, -16
; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v0, s0
-; GFX12-SDAG-NEXT: global_wb scope:SCOPE_SE
-; GFX12-SDAG-NEXT: ds_cond_sub_rtn_u32 v0, v0, v1
+; GFX12-SDAG-NEXT: ds_cond_sub_u32 v0, v1
; GFX12-SDAG-NEXT: s_wait_dscnt 0x0
; GFX12-SDAG-NEXT: global_inv scope:SCOPE_SE
; GFX12-SDAG-NEXT: s_endpgm
@@ -218,10 +237,12 @@ define amdgpu_kernel void @ds_usub_cond_no_rtn_u32(ptr addrspace(3) %addr, i32 %
; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v0, s0
; GFX12-GISEL-NEXT: ds_cond_sub_u32 v0, v1
+; GFX12-GISEL-NEXT: s_wait_dscnt 0x0
+; GFX12-GISEL-NEXT: global_inv scope:SCOPE_SE
; GFX12-GISEL-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr addrspace(3) %addr, i32 -4
- %unused = atomicrmw usub_cond ptr addrspace(3) %gep, i32 %in seq_cst
+ %unused = atomicrmw usub_cond ptr addrspace(3) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
ret void
}
@@ -233,7 +254,6 @@ define amdgpu_kernel void @ds_usub_cond_no_rtn_u32_forced(ptr addrspace(3) %addr
; GFX12-SDAG-NEXT: s_add_co_i32 s0, s0, -16
; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v0, s0
-; GFX12-SDAG-NEXT: global_wb scope:SCOPE_SE
; GFX12-SDAG-NEXT: ds_cond_sub_u32 v0, v1
; GFX12-SDAG-NEXT: s_wait_dscnt 0x0
; GFX12-SDAG-NEXT: global_inv scope:SCOPE_SE
@@ -246,14 +266,13 @@ define amdgpu_kernel void @ds_usub_cond_no_rtn_u32_forced(ptr addrspace(3) %addr
; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, -16
; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v0, s0
-; GFX12-GISEL-NEXT: global_wb scope:SCOPE_SE
; GFX12-GISEL-NEXT: ds_cond_sub_u32 v0, v1
; GFX12-GISEL-NEXT: s_wait_dscnt 0x0
; GFX12-GISEL-NEXT: global_inv scope:SCOPE_SE
; GFX12-GISEL-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr addrspace(3) %addr, i32 -4
- %unused = atomicrmw usub_cond ptr addrspace(3) %gep, i32 %in seq_cst
+ %unused = atomicrmw usub_cond ptr addrspace(3) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
ret void
}
@@ -263,7 +282,6 @@ define amdgpu_kernel void @ds_usub_cond_rtn_u32(ptr addrspace(3) %addr, i32 %in,
; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-SDAG-NEXT: global_wb scope:SCOPE_SE
; GFX12-SDAG-NEXT: ds_cond_sub_rtn_u32 v0, v0, v1 offset:16
; GFX12-SDAG-NEXT: s_wait_dscnt 0x0
; GFX12-SDAG-NEXT: global_inv scope:SCOPE_SE
@@ -276,7 +294,6 @@ define amdgpu_kernel void @ds_usub_cond_rtn_u32(ptr addrspace(3) %addr, i32 %in,
; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
-; GFX12-GISEL-NEXT: global_wb scope:SCOPE_SE
; GFX12-GISEL-NEXT: ds_cond_sub_rtn_u32 v0, v1, v0 offset:16
; GFX12-GISEL-NEXT: s_wait_dscnt 0x0
; GFX12-GISEL-NEXT: global_inv scope:SCOPE_SE
@@ -285,7 +302,9 @@ define amdgpu_kernel void @ds_usub_cond_rtn_u32(ptr addrspace(3) %addr, i32 %in,
; GFX12-GISEL-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr addrspace(3) %addr, i32 4
- %val = atomicrmw usub_cond ptr addrspace(3) %gep, i32 %in seq_cst
+ %val = atomicrmw usub_cond ptr addrspace(3) %gep, i32 %in seq_cst, !amdgpu.no.remote.memory !0
store i32 %val, ptr addrspace(3) %use
ret void
}
+
+!0 = !{}
More information about the llvm-commits
mailing list