[llvm] AMDGPU: Drop and upgrade llvm.amdgcn.atomic.csub/cond.sub to atomicrmw (PR #105553)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Sep 9 07:42:23 PDT 2024
https://github.com/anjenner updated https://github.com/llvm/llvm-project/pull/105553
>From 37a1e1cb21afc20e0892c7623f1b2f88af9720ec Mon Sep 17 00:00:00 2001
From: Andrew Jenner <Andrew.Jenner at amd.com>
Date: Thu, 15 Aug 2024 10:00:29 -0400
Subject: [PATCH 1/2] AMDGPU: Drop and upgrade llvm.amdgcn.atomic.csub/cond.sub
to atomicrmw
---
llvm/docs/AMDGPUUsage.rst | 5 -
llvm/docs/ReleaseNotes.rst | 4 +
llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 8 -
llvm/lib/IR/AutoUpgrade.cpp | 11 +-
llvm/lib/Target/AMDGPU/AMDGPUGISel.td | 2 +
.../AMDGPU/AMDGPUInstructionSelector.cpp | 2 +
llvm/lib/Target/AMDGPU/AMDGPUInstructions.td | 6 +-
.../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 10 +-
.../AMDGPU/AMDGPULowerBufferFatPointers.cpp | 10 +-
.../Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 4 +-
.../Target/AMDGPU/AMDGPUSearchableTables.td | 6 -
llvm/lib/Target/AMDGPU/BUFInstructions.td | 2 +-
llvm/lib/Target/AMDGPU/DSInstructions.td | 46 +-
llvm/lib/Target/AMDGPU/FLATInstructions.td | 23 +-
llvm/lib/Target/AMDGPU/R600ISelLowering.cpp | 8 +
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 35 +-
llvm/lib/Target/AMDGPU/SIInstrInfo.td | 2 +
.../AMDGPU/MIR/atomics-gmir.mir | 6 +
.../UniformityAnalysis/AMDGPU/atomics.ll | 57 --
llvm/test/Bitcode/amdgcn-atomic.ll | 147 ++++
.../llvm.amdgcn.global.atomic.csub.ll | 215 -----
.../test/CodeGen/AMDGPU/atomicrmw_cond_sub.ll | 197 +++++
.../CodeGen/AMDGPU/atomicrmw_sub_clamp.ll | 495 +++++++++++
llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll | 160 ++--
.../AMDGPU/cgp-addressing-modes-gfx1030.ll | 12 +-
.../AMDGPU/global-saddr-atomics.gfx1030.ll | 18 +-
.../AMDGPU/llvm.amdgcn.atomic.cond.sub.ll | 225 -----
.../AMDGPU/llvm.amdgcn.global.atomic.csub.ll | 10 +-
.../CodeGen/AMDGPU/private-memory-atomics.ll | 52 ++
llvm/test/CodeGen/AMDGPU/shl_add_ptr_csub.ll | 2 +-
.../AtomicExpand/AMDGPU/expand-atomic-i16.ll | 358 ++++++++
.../AtomicExpand/AMDGPU/expand-atomic-i8.ll | 798 ++++++++++++++++++
32 files changed, 2277 insertions(+), 659 deletions(-)
delete mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/atomicrmw_cond_sub.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/atomicrmw_sub_clamp.ll
delete mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.cond.sub.ll
diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst
index ba62a68c4a509e..359f703d6ecc9e 100644
--- a/llvm/docs/AMDGPUUsage.rst
+++ b/llvm/docs/AMDGPUUsage.rst
@@ -1360,11 +1360,6 @@ The AMDGPU backend implements the following LLVM IR intrinsics.
The iglp_opt strategy implementations are subject to change.
- llvm.amdgcn.atomic.cond.sub.u32 Provides direct access to flat_atomic_cond_sub_u32, global_atomic_cond_sub_u32
- and ds_cond_sub_u32 based on address space on gfx12 targets. This
- performs subtraction only if the memory value is greater than or
- equal to the data value.
-
llvm.amdgcn.s.getpc Provides access to the s_getpc_b64 instruction, but with the return value
sign-extended from the width of the underlying PC hardware register even on
processors where the s_getpc_b64 instruction returns a zero-extended value.
diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst
index 52456896f2fc6c..b1d5d5098652cf 100644
--- a/llvm/docs/ReleaseNotes.rst
+++ b/llvm/docs/ReleaseNotes.rst
@@ -82,6 +82,10 @@ Changes to the AMDGPU Backend
``llvm.amdgcn.global.atomic.fadd`` intrinsics. Users should use the
:ref:`atomicrmw <i_atomicrmw>` instruction with `fadd` and
addrspace(0) or addrspace(1) instead.
+* Removed ``llvm.amdgcn.atomic.cond.sub.u32`` and
+ ``llvm.amdgcn.global.atomic.csub`` intrinsics. Users should use the
+ :ref:`atomicrmw <i_atomicrmw>` instruction with ``usub_cond`` and
+ ``usub_sat`` instead.
Changes to the ARM Backend
--------------------------
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index a5259ba9eec36e..e4a99e71076803 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -1353,7 +1353,6 @@ def int_amdgcn_raw_buffer_atomic_or : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_xor : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_inc : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_dec : AMDGPURawBufferAtomic;
-def int_amdgcn_raw_buffer_atomic_cond_sub_u32 : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_cmpswap : Intrinsic<
[llvm_anyint_ty],
[LLVMMatchType<0>, // src(VGPR)
@@ -1390,7 +1389,6 @@ def int_amdgcn_raw_ptr_buffer_atomic_or : AMDGPURawPtrBufferAtomic;
def int_amdgcn_raw_ptr_buffer_atomic_xor : AMDGPURawPtrBufferAtomic;
def int_amdgcn_raw_ptr_buffer_atomic_inc : AMDGPURawPtrBufferAtomic;
def int_amdgcn_raw_ptr_buffer_atomic_dec : AMDGPURawPtrBufferAtomic;
-def int_amdgcn_raw_ptr_buffer_atomic_cond_sub_u32 : AMDGPURawPtrBufferAtomic;
def int_amdgcn_raw_ptr_buffer_atomic_cmpswap : Intrinsic<
[llvm_anyint_ty],
[LLVMMatchType<0>, // src(VGPR)
@@ -1431,7 +1429,6 @@ def int_amdgcn_struct_buffer_atomic_or : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_xor : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_inc : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_dec : AMDGPUStructBufferAtomic;
-def int_amdgcn_struct_buffer_atomic_cond_sub_u32 : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_cmpswap : Intrinsic<
[llvm_anyint_ty],
[LLVMMatchType<0>, // src(VGPR)
@@ -1467,7 +1464,6 @@ def int_amdgcn_struct_ptr_buffer_atomic_or : AMDGPUStructPtrBufferAtomic;
def int_amdgcn_struct_ptr_buffer_atomic_xor : AMDGPUStructPtrBufferAtomic;
def int_amdgcn_struct_ptr_buffer_atomic_inc : AMDGPUStructPtrBufferAtomic;
def int_amdgcn_struct_ptr_buffer_atomic_dec : AMDGPUStructPtrBufferAtomic;
-def int_amdgcn_struct_ptr_buffer_atomic_cond_sub_u32 : AMDGPUStructPtrBufferAtomic;
def int_amdgcn_struct_ptr_buffer_atomic_cmpswap : Intrinsic<
[llvm_anyint_ty],
[LLVMMatchType<0>, // src(VGPR)
@@ -2463,8 +2459,6 @@ class AMDGPUAtomicRtn<LLVMType vt, LLVMType pt = llvm_anyptr_ty> : Intrinsic <
[IntrArgMemOnly, IntrWillReturn, NoCapture<ArgIndex<0>>, IntrNoCallback, IntrNoFree], "",
[SDNPMemOperand]>;
-def int_amdgcn_global_atomic_csub : AMDGPUAtomicRtn<llvm_i32_ty>;
-
// uint4 llvm.amdgcn.image.bvh.intersect.ray <node_ptr>, <ray_extent>, <ray_origin>,
// <ray_dir>, <ray_inv_dir>, <texture_descr>
// <node_ptr> is i32 or i64.
@@ -2664,8 +2658,6 @@ def int_amdgcn_flat_atomic_fmax_num : AMDGPUAtomicRtn<llvm_anyfloat_ty>;
def int_amdgcn_global_atomic_fmin_num : AMDGPUAtomicRtn<llvm_anyfloat_ty>;
def int_amdgcn_global_atomic_fmax_num : AMDGPUAtomicRtn<llvm_anyfloat_ty>;
-def int_amdgcn_atomic_cond_sub_u32 : AMDGPUAtomicRtn<llvm_i32_ty>;
-
class AMDGPULoadIntrinsic<LLVMType ptr_ty>:
Intrinsic<
[llvm_any_ty],
diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp
index 69dae5e32dbbe8..a0205b40f00f61 100644
--- a/llvm/lib/IR/AutoUpgrade.cpp
+++ b/llvm/lib/IR/AutoUpgrade.cpp
@@ -1024,9 +1024,10 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn,
}
if (Name.consume_front("atomic.")) {
- if (Name.starts_with("inc") || Name.starts_with("dec")) {
- // These were replaced with atomicrmw uinc_wrap and udec_wrap, so
- // there's no new declaration.
+ if (Name.starts_with("inc") || Name.starts_with("dec") ||
+ Name.starts_with("cond.sub") || Name.starts_with("csub")) {
+ // These were replaced with atomicrmw uinc_wrap, udec_wrap, usub_cond
+ // and usub_sat so there's no new declaration.
NewFn = nullptr;
return true;
}
@@ -4053,7 +4054,9 @@ static Value *upgradeAMDGCNIntrinsicCall(StringRef Name, CallBase *CI,
.StartsWith("atomic.inc.", AtomicRMWInst::UIncWrap)
.StartsWith("atomic.dec.", AtomicRMWInst::UDecWrap)
.StartsWith("global.atomic.fadd", AtomicRMWInst::FAdd)
- .StartsWith("flat.atomic.fadd", AtomicRMWInst::FAdd);
+ .StartsWith("flat.atomic.fadd", AtomicRMWInst::FAdd)
+ .StartsWith("atomic.cond.sub", AtomicRMWInst::USubCond)
+ .StartsWith("atomic.csub", AtomicRMWInst::USubSat);
unsigned NumOperands = CI->getNumOperands();
if (NumOperands < 3) // Malformed bitcode.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
index 118271af879937..72eae609e93bce 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
@@ -271,6 +271,8 @@ def : GINodeEquiv<G_AMDGPU_TBUFFER_STORE_FORMAT_D16, SItbuffer_store_d16>;
// FIXME: Check MMO is atomic
def : GINodeEquiv<G_ATOMICRMW_UINC_WRAP, atomic_load_uinc_wrap_glue>;
def : GINodeEquiv<G_ATOMICRMW_UDEC_WRAP, atomic_load_udec_wrap_glue>;
+def : GINodeEquiv<G_ATOMICRMW_USUB_COND, atomic_load_usub_cond_glue>;
+def : GINodeEquiv<G_ATOMICRMW_USUB_SAT, atomic_load_usub_sat_glue>;
def : GINodeEquiv<G_ATOMICRMW_FMIN, atomic_load_fmin_glue>;
def : GINodeEquiv<G_ATOMICRMW_FMAX, atomic_load_fmax_glue>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 9bebd418bb426e..4359c9196f1708 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -3536,6 +3536,8 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) {
case TargetOpcode::G_ATOMICRMW_UMAX:
case TargetOpcode::G_ATOMICRMW_UINC_WRAP:
case TargetOpcode::G_ATOMICRMW_UDEC_WRAP:
+ case TargetOpcode::G_ATOMICRMW_USUB_COND:
+ case TargetOpcode::G_ATOMICRMW_USUB_SAT:
case TargetOpcode::G_ATOMICRMW_FADD:
case TargetOpcode::G_ATOMICRMW_FMIN:
case TargetOpcode::G_ATOMICRMW_FMAX:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
index aa5b151adef3a4..c6d01662a113b1 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
@@ -622,15 +622,11 @@ defm int_amdgcn_flat_atomic_fmin : noret_op;
defm int_amdgcn_flat_atomic_fmax : noret_op;
defm int_amdgcn_global_atomic_fmin : noret_op;
defm int_amdgcn_global_atomic_fmax : noret_op;
-defm int_amdgcn_global_atomic_csub : noret_op;
defm int_amdgcn_global_atomic_ordered_add_b64 : noret_op;
defm int_amdgcn_flat_atomic_fmin_num : noret_op;
defm int_amdgcn_flat_atomic_fmax_num : noret_op;
defm int_amdgcn_global_atomic_fmin_num : noret_op;
defm int_amdgcn_global_atomic_fmax_num : noret_op;
-defm int_amdgcn_atomic_cond_sub_u32 : local_addr_space_atomic_op;
-defm int_amdgcn_atomic_cond_sub_u32 : flat_addr_space_atomic_op;
-defm int_amdgcn_atomic_cond_sub_u32 : global_addr_space_atomic_op;
multiclass noret_binary_atomic_op<SDNode atomic_op> {
let HasNoUse = true in
@@ -681,6 +677,8 @@ defm atomic_load_fmin : binary_atomic_op_fp_all_as<atomic_load_fmin>;
defm atomic_load_fmax : binary_atomic_op_fp_all_as<atomic_load_fmax>;
defm atomic_load_uinc_wrap : binary_atomic_op_all_as<atomic_load_uinc_wrap>;
defm atomic_load_udec_wrap : binary_atomic_op_all_as<atomic_load_udec_wrap>;
+defm atomic_load_usub_cond : binary_atomic_op_all_as<atomic_load_usub_cond>;
+defm atomic_load_usub_sat : binary_atomic_op_all_as<atomic_load_usub_sat>;
defm AMDGPUatomic_cmp_swap : binary_atomic_op_all_as<AMDGPUatomic_cmp_swap>;
def load_align8_local : PatFrag<(ops node:$ptr), (load_local node:$ptr)>,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 3f6486d44f0ee5..1b99718aa355f6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -1647,6 +1647,13 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
}
+ auto &Atomics32 =
+ getActionDefinitionsBuilder({G_ATOMICRMW_USUB_COND, G_ATOMICRMW_USUB_SAT})
+ .legalFor({{S32, GlobalPtr}, {S32, LocalPtr}, {S32, RegionPtr}});
+ if (ST.hasFlatAddressSpace()) {
+ Atomics32.legalFor({{S32, FlatPtr}});
+ }
+
// TODO: v2bf16 operations, and fat buffer pointer support.
auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
if (ST.hasLDSFPAtomicAddF32()) {
@@ -6150,9 +6157,6 @@ static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
- case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
- case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
- return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_COND_SUB_U32;
default:
llvm_unreachable("unhandled atomic opcode");
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
index 77971323aa1ec6..7cdec5a956a49f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
@@ -1150,7 +1150,15 @@ Value *SplitPtrStructs::handleMemoryInst(Instruction *I, Value *Arg, Value *Ptr,
case AtomicRMWInst::UIncWrap:
case AtomicRMWInst::UDecWrap:
report_fatal_error("wrapping increment/decrement not supported for "
- "buffer resources and should've ben expanded away");
+ "buffer resources and should've been expanded away");
+ break;
+ case AtomicRMWInst::USubCond:
+ report_fatal_error("conditional subtract not supported for buffer "
+ "resources and should've been expanded away");
+ break;
+ case AtomicRMWInst::USubSat:
+ report_fatal_error("subtract with clamp not supported for buffer "
+ "resources and should've been expanded away");
break;
case AtomicRMWInst::BAD_BINOP:
llvm_unreachable("Not sure how we got a bad binop");
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index a2e6842b760f65..e3aedd30214031 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4906,7 +4906,6 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
break;
}
- case Intrinsic::amdgcn_global_atomic_csub:
case Intrinsic::amdgcn_global_atomic_fmin:
case Intrinsic::amdgcn_global_atomic_fmax:
case Intrinsic::amdgcn_global_atomic_fmin_num:
@@ -4915,7 +4914,6 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_flat_atomic_fmax:
case Intrinsic::amdgcn_flat_atomic_fmin_num:
case Intrinsic::amdgcn_flat_atomic_fmax_num:
- case Intrinsic::amdgcn_atomic_cond_sub_u32:
case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
case Intrinsic::amdgcn_global_load_tr_b64:
case Intrinsic::amdgcn_global_load_tr_b128:
@@ -5247,6 +5245,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case AMDGPU::G_ATOMICRMW_FMAX:
case AMDGPU::G_ATOMICRMW_UINC_WRAP:
case AMDGPU::G_ATOMICRMW_UDEC_WRAP:
+ case AMDGPU::G_ATOMICRMW_USUB_COND:
+ case AMDGPU::G_ATOMICRMW_USUB_SAT:
case AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG: {
OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
index 95c4859674ecc4..bb4ecfd749ae40 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
@@ -237,8 +237,6 @@ def : SourceOfDivergence<int_amdgcn_mbcnt_lo>;
def : SourceOfDivergence<int_r600_read_tidig_x>;
def : SourceOfDivergence<int_r600_read_tidig_y>;
def : SourceOfDivergence<int_r600_read_tidig_z>;
-def : SourceOfDivergence<int_amdgcn_atomic_cond_sub_u32>;
-def : SourceOfDivergence<int_amdgcn_global_atomic_csub>;
def : SourceOfDivergence<int_amdgcn_global_atomic_fmin>;
def : SourceOfDivergence<int_amdgcn_global_atomic_fmax>;
def : SourceOfDivergence<int_amdgcn_global_atomic_fmin_num>;
@@ -264,7 +262,6 @@ def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_fadd>;
def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_fmin>;
def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_fmax>;
def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_cmpswap>;
-def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_cond_sub_u32>;
def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_swap>;
def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_add>;
def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_sub>;
@@ -281,7 +278,6 @@ def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_fadd>;
def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_fmin>;
def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_fmax>;
def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_cmpswap>;
-def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_cond_sub_u32>;
def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_swap>;
def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_add>;
def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_sub>;
@@ -298,7 +294,6 @@ def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_fadd>;
def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_fmin>;
def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_fmax>;
def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_cmpswap>;
-def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_cond_sub_u32>;
def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_swap>;
def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_add>;
def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_sub>;
@@ -315,7 +310,6 @@ def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_fadd>;
def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_fmin>;
def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_fmax>;
def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_cmpswap>;
-def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_cond_sub_u32>;
def : SourceOfDivergence<int_amdgcn_ps_live>;
def : SourceOfDivergence<int_amdgcn_live_mask>;
def : SourceOfDivergence<int_amdgcn_ds_swizzle>;
diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td
index c6668b24f4ef67..db672d5e1bd800 100644
--- a/llvm/lib/Target/AMDGPU/BUFInstructions.td
+++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td
@@ -1128,7 +1128,7 @@ defm BUFFER_ATOMIC_DEC_X2 : MUBUF_Pseudo_Atomics <
let OtherPredicates = [HasGFX10_BEncoding] in {
defm BUFFER_ATOMIC_CSUB : MUBUF_Pseudo_Atomics <
- "buffer_atomic_csub", VGPR_32, i32, int_amdgcn_global_atomic_csub
+ "buffer_atomic_csub", VGPR_32, i32
>;
}
diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td
index e9283fde85a48d..b1b8d3b6268b59 100644
--- a/llvm/lib/Target/AMDGPU/DSInstructions.td
+++ b/llvm/lib/Target/AMDGPU/DSInstructions.td
@@ -734,17 +734,6 @@ defm DS_COND_SUB_RTN_U32 : DS_1A1D_RET_mc<"ds_cond_sub_rtn_u32", VGPR_32>;
defm DS_SUB_CLAMP_U32 : DS_1A1D_NORET_mc<"ds_sub_clamp_u32">;
defm DS_SUB_CLAMP_RTN_U32 : DS_1A1D_RET_mc<"ds_sub_clamp_rtn_u32", VGPR_32>;
-multiclass DSAtomicRetNoRetPatIntrinsic_mc<DS_Pseudo inst, DS_Pseudo noRetInst,
- ValueType vt, string frag> {
- def : DSAtomicRetPat<inst, vt,
- !cast<PatFrag>(frag#"_local_addrspace")>;
-
- let OtherPredicates = [HasAtomicCSubNoRtnInsts] in
- def : DSAtomicRetPat<noRetInst, vt,
- !cast<PatFrag>(frag#"_noret_local_addrspace"), /* complexity */ 1>;
-}
-
-defm : DSAtomicRetNoRetPatIntrinsic_mc<DS_COND_SUB_RTN_U32, DS_COND_SUB_U32, i32, "int_amdgcn_atomic_cond_sub_u32">;
} // let SubtargetPredicate = isGFX12Plus
//===----------------------------------------------------------------------===//
@@ -1006,7 +995,34 @@ multiclass DSAtomicRetNoRetPat_mc<DS_Pseudo inst, DS_Pseudo noRetInst,
}
}
+multiclass DSAtomicRetNoRetPatCondSub_mc<DS_Pseudo inst, DS_Pseudo noRetInst,
+ ValueType vt, string frag> {
+ let OtherPredicates = [LDSRequiresM0Init] in {
+ def : DSAtomicRetPat<inst, vt,
+ !cast<PatFrag>(frag#"_local_m0_"#vt)>;
+ let OtherPredicates = [HasAtomicCSubNoRtnInsts] in
+ def : DSAtomicRetPat<noRetInst, vt,
+ !cast<PatFrag>(frag#"_local_m0_noret_"#vt), /* complexity */ 1>;
+ }
+
+ let OtherPredicates = [NotLDSRequiresM0Init] in {
+ def : DSAtomicRetPat<!cast<DS_Pseudo>(!cast<string>(inst)#"_gfx9"), vt,
+ !cast<PatFrag>(frag#"_local_"#vt)>;
+ let OtherPredicates = [HasAtomicCSubNoRtnInsts] in
+ def : DSAtomicRetPat<!cast<DS_Pseudo>(!cast<string>(noRetInst)#"_gfx9"), vt,
+ !cast<PatFrag>(frag#"_local_noret_"#vt), /* complexity */ 1>;
+ }
+ let OtherPredicates = [HasGDS] in {
+ def : DSAtomicRetPat<inst, vt,
+ !cast<PatFrag>(frag#"_region_m0_"#vt),
+ /* complexity */ 0, /* gds */ 1>;
+ let OtherPredicates = [HasAtomicCSubNoRtnInsts] in
+ def : DSAtomicRetPat<noRetInst, vt,
+ !cast<PatFrag>(frag#"_region_m0_noret_"#vt),
+ /* complexity */ 1, /* gds */ 1>;
+ }
+}
let SubtargetPredicate = isGFX6GFX7GFX8GFX9GFX10 in {
// Caution, the order of src and cmp is the *opposite* of the BUFFER_ATOMIC_CMPSWAP opcode.
@@ -1089,6 +1105,14 @@ defm : DSAtomicRetNoRetPat_mc<DS_PK_ADD_RTN_F16, DS_PK_ADD_F16, v2f16, "atomic_l
defm : DSAtomicRetNoRetPat_mc<DS_PK_ADD_RTN_BF16, DS_PK_ADD_BF16, v2bf16, "atomic_load_fadd">;
}
+let SubtargetPredicate = isGFX12Plus in {
+
+defm : DSAtomicRetNoRetPatCondSub_mc<DS_COND_SUB_RTN_U32, DS_COND_SUB_U32, i32, "atomic_load_usub_cond">;
+
+defm : DSAtomicRetNoRetPat_mc<DS_SUB_CLAMP_RTN_U32, DS_SUB_CLAMP_U32, i32, "atomic_load_usub_sat">;
+
+} // let SubtargetPredicate = isGFX12Plus
+
let SubtargetPredicate = isGFX6GFX7GFX8GFX9GFX10 in {
defm : DSAtomicCmpXChgSwapped_mc<DS_CMPST_RTN_B32, DS_CMPST_B32, i32, "atomic_cmp_swap">;
}
diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td
index 7b3822067072e5..9f9c90f3671d3a 100644
--- a/llvm/lib/Target/AMDGPU/FLATInstructions.td
+++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -1110,10 +1110,6 @@ multiclass FlatAtomicNoRtnPatBase <string inst, string node, ValueType vt,
(!cast<FLAT_Pseudo>(inst) VReg_64:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)>;
}
-multiclass FlatAtomicNoRtnPatWithAddrSpace<string inst, string node, string addrSpaceSuffix,
- ValueType vt> :
- FlatAtomicNoRtnPatBase<inst, node # "_noret_" # addrSpaceSuffix, vt, vt>;
-
multiclass FlatAtomicNoRtnPat <string inst, string node, ValueType vt,
ValueType data_vt = vt, bit isIntr = 0> :
FlatAtomicNoRtnPatBase<inst, node # "_noret" # !if(isIntr, "", "_"#vt), vt, data_vt>;
@@ -1128,10 +1124,6 @@ multiclass FlatAtomicRtnPatBase <string inst, string node, ValueType vt,
(!cast<FLAT_Pseudo>(inst#"_RTN") VReg_64:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)>;
}
-multiclass FlatAtomicRtnPatWithAddrSpace<string inst, string intr, string addrSpaceSuffix,
- ValueType vt> :
- FlatAtomicRtnPatBase<inst, intr # "_" # addrSpaceSuffix, vt, vt>;
-
multiclass FlatAtomicRtnPat <string inst, string node, ValueType vt,
ValueType data_vt = vt, bit isIntr = 0> :
FlatAtomicRtnPatBase<inst, node # !if(isIntr, "", "_"#vt), vt, data_vt>;
@@ -1438,14 +1430,13 @@ defm : FlatAtomicPat <"FLAT_ATOMIC_MIN_F64", "atomic_load_fmin_"#as, f64>;
defm : FlatAtomicPat <"FLAT_ATOMIC_MAX_F64", "atomic_load_fmax_"#as, f64>;
}
-} // end foreach as
-
let SubtargetPredicate = isGFX12Plus in {
- defm : FlatAtomicRtnPatWithAddrSpace<"FLAT_ATOMIC_COND_SUB_U32", "int_amdgcn_atomic_cond_sub_u32", "flat_addrspace", i32 >;
+ defm : FlatAtomicRtnPat<"FLAT_ATOMIC_COND_SUB_U32", "atomic_load_usub_cond_"#as, i32 >;
let OtherPredicates = [HasAtomicCSubNoRtnInsts] in
- defm : FlatAtomicNoRtnPatWithAddrSpace<"FLAT_ATOMIC_COND_SUB_U32", "int_amdgcn_atomic_cond_sub_u32", "flat_addrspace", i32>;
+ defm : FlatAtomicNoRtnPat<"FLAT_ATOMIC_COND_SUB_U32", "atomic_load_usub_cond_"#as, i32>;
}
+} // end foreach as
def : FlatStorePat <FLAT_STORE_BYTE, truncstorei8_flat, i16>;
def : FlatStorePat <FLAT_STORE_SHORT, store_flat, i16>;
@@ -1560,10 +1551,10 @@ defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_OR", "atomic_load_or_global", i32>;
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_SWAP", "atomic_swap_global", i32>;
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_CMPSWAP", "AMDGPUatomic_cmp_swap_global", i32, v2i32>;
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_XOR", "atomic_load_xor_global", i32>;
-defm : GlobalFLATAtomicPatsRtn <"GLOBAL_ATOMIC_CSUB", "int_amdgcn_global_atomic_csub", i32, i32, /* isIntr */ 1>;
+defm : GlobalFLATAtomicPatsRtn <"GLOBAL_ATOMIC_CSUB", "atomic_load_usub_sat_global", i32>;
let OtherPredicates = [HasAtomicCSubNoRtnInsts] in
-defm : GlobalFLATAtomicPatsNoRtn <"GLOBAL_ATOMIC_CSUB", "int_amdgcn_global_atomic_csub", i32, i32, /* isIntr */ 1>;
+defm : GlobalFLATAtomicPatsNoRtn <"GLOBAL_ATOMIC_CSUB", "atomic_load_usub_sat_global", i32>;
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_ADD_X2", "atomic_load_add_global", i64>;
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_SUB_X2", "atomic_load_sub_global", i64>;
@@ -1580,10 +1571,10 @@ defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_CMPSWAP_X2", "AMDGPUatomic_cmp_swap_
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_XOR_X2", "atomic_load_xor_global", i64>;
let SubtargetPredicate = isGFX12Plus in {
- defm : GlobalFLATAtomicPatsRtnWithAddrSpace <"GLOBAL_ATOMIC_COND_SUB_U32", "int_amdgcn_atomic_cond_sub_u32", "global_addrspace", i32>;
+ defm : GlobalFLATAtomicPatsRtn <"GLOBAL_ATOMIC_COND_SUB_U32", "atomic_load_usub_cond_global", i32>;
let OtherPredicates = [HasAtomicCSubNoRtnInsts] in
- defm : GlobalFLATAtomicPatsNoRtnWithAddrSpace <"GLOBAL_ATOMIC_COND_SUB_U32", "int_amdgcn_atomic_cond_sub_u32", "global_addrspace", i32>;
+ defm : GlobalFLATAtomicPatsNoRtn <"GLOBAL_ATOMIC_COND_SUB_U32", "atomic_load_usub_cond_global", i32>;
}
let OtherPredicates = [isGFX12Plus] in {
diff --git a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
index 7e4d9d21a0b397..5faa98dd85ce62 100644
--- a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
@@ -2185,6 +2185,14 @@ R600TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
// FIXME: Cayman at least appears to have instructions for this, but the
// instruction defintions appear to be missing.
return AtomicExpansionKind::CmpXChg;
+ case AtomicRMWInst::USubCond:
+ case AtomicRMWInst::USubSat:
+ if (auto *IntTy = dyn_cast<IntegerType>(RMW->getType())) {
+ unsigned Size = IntTy->getBitWidth();
+ if (Size == 32)
+ return AtomicExpansionKind::None;
+ }
+ return AtomicExpansionKind::CmpXChg;
case AtomicRMWInst::Xchg: {
const DataLayout &DL = RMW->getFunction()->getDataLayout();
unsigned ValSize = DL.getTypeSizeInBits(RMW->getType());
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index accc3084217f2b..04eb0be89ed4df 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -950,6 +950,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
ISD::ATOMIC_LOAD_FMAX,
ISD::ATOMIC_LOAD_UINC_WRAP,
ISD::ATOMIC_LOAD_UDEC_WRAP,
+ ISD::ATOMIC_LOAD_USUB_COND,
+ ISD::ATOMIC_LOAD_USUB_SAT,
ISD::INTRINSIC_VOID,
ISD::INTRINSIC_W_CHAIN});
@@ -1331,16 +1333,6 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
return true;
}
- case Intrinsic::amdgcn_global_atomic_csub: {
- Info.opc = ISD::INTRINSIC_W_CHAIN;
- Info.memVT = MVT::getVT(CI.getType());
- Info.ptrVal = CI.getOperand(0);
- Info.align.reset();
- Info.flags |= MachineMemOperand::MOLoad |
- MachineMemOperand::MOStore |
- MachineMemOperand::MOVolatile;
- return true;
- }
case Intrinsic::amdgcn_image_bvh_intersect_ray: {
Info.opc = ISD::INTRINSIC_W_CHAIN;
Info.memVT = MVT::getVT(CI.getType()); // XXX: what is correct VT?
@@ -1359,8 +1351,7 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
case Intrinsic::amdgcn_flat_atomic_fmin:
case Intrinsic::amdgcn_flat_atomic_fmax:
case Intrinsic::amdgcn_flat_atomic_fmin_num:
- case Intrinsic::amdgcn_flat_atomic_fmax_num:
- case Intrinsic::amdgcn_atomic_cond_sub_u32: {
+ case Intrinsic::amdgcn_flat_atomic_fmax_num: {
Info.opc = ISD::INTRINSIC_W_CHAIN;
Info.memVT = MVT::getVT(CI.getType());
Info.ptrVal = CI.getOperand(0);
@@ -1464,7 +1455,6 @@ bool SITargetLowering::getAddrModeArguments(IntrinsicInst *II,
Type *&AccessTy) const {
Value *Ptr = nullptr;
switch (II->getIntrinsicID()) {
- case Intrinsic::amdgcn_atomic_cond_sub_u32:
case Intrinsic::amdgcn_ds_append:
case Intrinsic::amdgcn_ds_consume:
case Intrinsic::amdgcn_ds_ordered_add:
@@ -1473,7 +1463,6 @@ bool SITargetLowering::getAddrModeArguments(IntrinsicInst *II,
case Intrinsic::amdgcn_flat_atomic_fmax_num:
case Intrinsic::amdgcn_flat_atomic_fmin:
case Intrinsic::amdgcn_flat_atomic_fmin_num:
- case Intrinsic::amdgcn_global_atomic_csub:
case Intrinsic::amdgcn_global_atomic_fmax:
case Intrinsic::amdgcn_global_atomic_fmax_num:
case Intrinsic::amdgcn_global_atomic_fmin:
@@ -9041,9 +9030,6 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
case Intrinsic::amdgcn_raw_buffer_atomic_dec:
case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
- case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
- return lowerRawBufferAtomicIntrin(Op, DAG,
- AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
case Intrinsic::amdgcn_struct_buffer_atomic_swap:
case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
return lowerStructBufferAtomicIntrin(Op, DAG,
@@ -9085,9 +9071,6 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
case Intrinsic::amdgcn_struct_buffer_atomic_dec:
case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
- case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
- return lowerStructBufferAtomicIntrin(Op, DAG,
- AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap: {
@@ -15998,7 +15981,6 @@ bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode *N,
case AMDGPUISD::BUFFER_ATOMIC_INC:
case AMDGPUISD::BUFFER_ATOMIC_DEC:
case AMDGPUISD::BUFFER_ATOMIC_CMPSWAP:
- case AMDGPUISD::BUFFER_ATOMIC_CSUB:
case AMDGPUISD::BUFFER_ATOMIC_FADD:
case AMDGPUISD::BUFFER_ATOMIC_FMIN:
case AMDGPUISD::BUFFER_ATOMIC_FMAX:
@@ -16108,10 +16090,10 @@ static bool isV2BF16(Type *Ty) {
}
/// \return true if atomicrmw integer ops work for the type.
-static bool isAtomicRMWLegalIntTy(Type *Ty) {
+static bool isAtomicRMWLegalIntTy(Type *Ty, bool Allow64 = true) {
if (auto *IT = dyn_cast<IntegerType>(Ty)) {
unsigned BW = IT->getBitWidth();
- return BW == 32 || BW == 64;
+ return BW == 32 || (BW == 64 && Allow64);
}
return false;
@@ -16163,8 +16145,8 @@ static bool globalMemoryFPAtomicIsLegal(const GCNSubtarget &Subtarget,
/// \return Action to perform on AtomicRMWInsts for integer operations.
static TargetLowering::AtomicExpansionKind
-atomicSupportedIfLegalIntType(const AtomicRMWInst *RMW) {
- return isAtomicRMWLegalIntTy(RMW->getType())
+atomicSupportedIfLegalIntType(const AtomicRMWInst *RMW, bool Allow64 = true) {
+ return isAtomicRMWLegalIntTy(RMW->getType(), Allow64)
? TargetLowering::AtomicExpansionKind::None
: TargetLowering::AtomicExpansionKind::CmpXChg;
}
@@ -16204,6 +16186,9 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
case AtomicRMWInst::UIncWrap:
case AtomicRMWInst::UDecWrap:
return atomicSupportedIfLegalIntType(RMW);
+ case AtomicRMWInst::USubCond:
+ case AtomicRMWInst::USubSat:
+ return atomicSupportedIfLegalIntType(RMW, false);
case AtomicRMWInst::Sub:
case AtomicRMWInst::Or:
case AtomicRMWInst::Xor: {
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index faa8ca282e7ab8..de714d8fd9f4da 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -725,6 +725,8 @@ defm atomic_load_add : SIAtomicM0Glue2 <"LOAD_ADD">;
defm atomic_load_sub : SIAtomicM0Glue2 <"LOAD_SUB">;
defm atomic_load_uinc_wrap : SIAtomicM0Glue2 <"LOAD_UINC_WRAP">;
defm atomic_load_udec_wrap : SIAtomicM0Glue2 <"LOAD_UDEC_WRAP">;
+defm atomic_load_usub_cond : SIAtomicM0Glue2 <"LOAD_USUB_COND">;
+defm atomic_load_usub_sat : SIAtomicM0Glue2 <"LOAD_USUB_SAT">;
defm atomic_load_and : SIAtomicM0Glue2 <"LOAD_AND">;
defm atomic_load_min : SIAtomicM0Glue2 <"LOAD_MIN">;
defm atomic_load_max : SIAtomicM0Glue2 <"LOAD_MAX">;
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/atomics-gmir.mir b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/atomics-gmir.mir
index f2ba7f8b219323..86507f9d0e2ffc 100644
--- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/atomics-gmir.mir
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/atomics-gmir.mir
@@ -81,6 +81,12 @@ body: |
; CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s32) = G_ATOMICRMW_UDEC_WRAP
%20:_(s32) = G_ATOMICRMW_UDEC_WRAP %1, %5
+ ; CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s32) = G_ATOMICRMW_USUB_COND
+ %21:_(s32) = G_ATOMICRMW_USUB_COND %1, %5
+
+ ; CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s32) = G_ATOMICRMW_USUB_SAT
+ %22:_(s32) = G_ATOMICRMW_USUB_SAT %1, %5
+
$vgpr0 = COPY %4(s32)
SI_RETURN implicit $vgpr0
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/atomics.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/atomics.ll
index 15355ea1392053..d9e51c39c2042a 100644
--- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/atomics.ll
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/atomics.ll
@@ -15,62 +15,5 @@ define amdgpu_kernel void @test2(ptr %ptr, i32 %cmp, i32 %new) {
ret void
}
-; CHECK: DIVERGENT: %ret = call i32 @llvm.amdgcn.global.atomic.csub.p1(ptr addrspace(1) %ptr, i32 %val)
-define amdgpu_kernel void @test_atomic_csub_i32(ptr addrspace(1) %ptr, i32 %val) #0 {
- %ret = call i32 @llvm.amdgcn.global.atomic.csub.p1(ptr addrspace(1) %ptr, i32 %val)
- store i32 %ret, ptr addrspace(1) %ptr, align 4
- ret void
-}
-
-; CHECK: DIVERGENT: %val = call i32 @llvm.amdgcn.atomic.cond.sub.u32.p3(ptr addrspace(3) %gep, i32 %in)
-define amdgpu_kernel void @test_ds_atomic_cond_sub_rtn_u32(ptr addrspace(3) %addr, i32 %in, ptr addrspace(3) %use) #0 {
-entry:
- %gep = getelementptr i32, ptr addrspace(3) %addr, i32 4
- %val = call i32 @llvm.amdgcn.atomic.cond.sub.u32.p3(ptr addrspace(3) %gep, i32 %in)
- store i32 %val, ptr addrspace(3) %use
- ret void
-}
-
-; CHECK: DIVERGENT: %val = call i32 @llvm.amdgcn.atomic.cond.sub.u32.p0(ptr %gep, i32 %in)
-define amdgpu_kernel void @test_flat_atomic_cond_sub_u32(ptr %addr, i32 %in, ptr %use) #0 {
-entry:
- %gep = getelementptr i32, ptr %addr, i32 4
- %val = call i32 @llvm.amdgcn.atomic.cond.sub.u32.p0(ptr %gep, i32 %in)
- store i32 %val, ptr %use
- ret void
-}
-
-; CHECK: DIVERGENT: %val = call i32 @llvm.amdgcn.atomic.cond.sub.u32.p1(ptr addrspace(1) %gep, i32 %in)
-define amdgpu_kernel void @test_global_atomic_cond_u32(ptr addrspace(1) %addr, i32 %in, ptr addrspace(1) %use) #0 {
-entry:
- %gep = getelementptr i32, ptr addrspace(1) %addr, i32 4
- %val = call i32 @llvm.amdgcn.atomic.cond.sub.u32.p1(ptr addrspace(1) %gep, i32 %in)
- store i32 %val, ptr addrspace(1) %use
- ret void
-}
-
-; CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.raw.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 0)
-define float @test_raw_buffer_atomic_cond_sub_u32(<4 x i32> inreg %rsrc, i32 inreg %data) #0 {
-entry:
- %orig = call i32 @llvm.amdgcn.raw.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 0)
- %r = bitcast i32 %orig to float
- ret float %r
-}
-
-; CHECK: DIVERGENT: %orig = call i32 @llvm.amdgcn.struct.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0)
-define float @test_struct_buffer_atomic_cond_sub_u32(<4 x i32> inreg %rsrc, i32 inreg %data) #0 {
-entry:
- %orig = call i32 @llvm.amdgcn.struct.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0)
- %r = bitcast i32 %orig to float
- ret float %r
-}
-
-declare i32 @llvm.amdgcn.global.atomic.csub.p1(ptr addrspace(1) nocapture, i32) #1
-declare i32 @llvm.amdgcn.atomic.cond.sub.u32.p3(ptr addrspace(3), i32) #1
-declare i32 @llvm.amdgcn.atomic.cond.sub.u32.p0(ptr, i32) #1
-declare i32 @llvm.amdgcn.atomic.cond.sub.u32.p1(ptr addrspace(1), i32) #1
-declare i32 @llvm.amdgcn.raw.buffer.atomic.cond.sub.u32.i32(i32, <4 x i32>, i32, i32, i32) #1
-declare i32 @llvm.amdgcn.struct.buffer.atomic.cond.sub.u32.i32(i32, <4 x i32>, i32, i32, i32, i32) #1
-
attributes #0 = { nounwind }
attributes #1 = { argmemonly nounwind willreturn }
diff --git a/llvm/test/Bitcode/amdgcn-atomic.ll b/llvm/test/Bitcode/amdgcn-atomic.ll
index d642372799f56b..678a9859f59df3 100644
--- a/llvm/test/Bitcode/amdgcn-atomic.ll
+++ b/llvm/test/Bitcode/amdgcn-atomic.ll
@@ -355,3 +355,150 @@ define float @upgrade_amdgcn_global_atomic_fadd_f32_p1_f32(ptr addrspace(1) %ptr
}
attributes #0 = { argmemonly nounwind willreturn }
+
+define void @atomic_usub_cond(ptr %ptr0, ptr addrspace(1) %ptr1, ptr addrspace(3) %ptr3) {
+ ; CHECK: atomicrmw usub_cond ptr %ptr0, i32 42 syncscope("agent") seq_cst, align 4
+ %result0 = call i32 @llvm.amdgcn.atomic.cond.sub.i32.p0(ptr %ptr0, i32 42, i32 0, i32 0, i1 false)
+
+ ; CHECK: atomicrmw usub_cond ptr addrspace(1) %ptr1, i32 43 syncscope("agent") seq_cst, align 4
+ %result1 = call i32 @llvm.amdgcn.atomic.cond.sub.i32.p1(ptr addrspace(1) %ptr1, i32 43, i32 0, i32 0, i1 false)
+
+ ; CHECK: atomicrmw usub_cond ptr addrspace(3) %ptr3, i32 46 syncscope("agent") seq_cst, align 4
+ %result2 = call i32 @llvm.amdgcn.atomic.cond.sub.i32.p3(ptr addrspace(3) %ptr3, i32 46, i32 0, i32 0, i1 false)
+
+ ; CHECK: atomicrmw usub_cond ptr %ptr0, i64 42 syncscope("agent") seq_cst, align 8
+ %result3 = call i64 @llvm.amdgcn.atomic.cond.sub.i64.p0(ptr %ptr0, i64 42, i64 0, i64 0, i1 false)
+
+ ; CHECK: atomicrmw usub_cond ptr addrspace(1) %ptr1, i64 43 syncscope("agent") seq_cst, align 8
+ %result4 = call i64 @llvm.amdgcn.atomic.cond.sub.i64.p1(ptr addrspace(1) %ptr1, i64 43, i64 0, i64 0, i1 false)
+
+ ; CHECK: atomicrmw usub_cond ptr addrspace(3) %ptr3, i64 46 syncscope("agent") seq_cst, align 8
+ %result5 = call i64 @llvm.amdgcn.atomic.cond.sub.i64.p3(ptr addrspace(3) %ptr3, i64 46, i64 0, i64 0, i1 false)
+ ret void
+}
+
+define void @atomic_usub_sat(ptr %ptr0, ptr addrspace(1) %ptr1, ptr addrspace(3) %ptr3) {
+ ; CHECK: atomicrmw usub_sat ptr %ptr0, i32 42 syncscope("agent") seq_cst, align 4
+ %result0 = call i32 @llvm.amdgcn.atomic.csub.i32.p0(ptr %ptr0, i32 42, i32 0, i32 0, i1 false)
+
+ ; CHECK: atomicrmw usub_sat ptr addrspace(1) %ptr1, i32 43 syncscope("agent") seq_cst, align 4
+ %result1 = call i32 @llvm.amdgcn.atomic.csub.i32.p1(ptr addrspace(1) %ptr1, i32 43, i32 0, i32 0, i1 false)
+
+ ; CHECK: atomicrmw usub_sat ptr addrspace(3) %ptr3, i32 46 syncscope("agent") seq_cst, align 4
+ %result2 = call i32 @llvm.amdgcn.atomic.csub.i32.p3(ptr addrspace(3) %ptr3, i32 46, i32 0, i32 0, i1 false)
+
+ ; CHECK: atomicrmw usub_sat ptr %ptr0, i64 42 syncscope("agent") seq_cst, align 8
+ %result3 = call i64 @llvm.amdgcn.atomic.csub.i64.p0(ptr %ptr0, i64 42, i64 0, i64 0, i1 false)
+
+ ; CHECK: atomicrmw usub_sat ptr addrspace(1) %ptr1, i64 43 syncscope("agent") seq_cst, align 8
+ %result4 = call i64 @llvm.amdgcn.atomic.csub.i64.p1(ptr addrspace(1) %ptr1, i64 43, i64 0, i64 0, i1 false)
+
+ ; CHECK: atomicrmw usub_sat ptr addrspace(3) %ptr3, i64 46 syncscope("agent") seq_cst, align 8
+ %result5 = call i64 @llvm.amdgcn.atomic.csub.i64.p3(ptr addrspace(3) %ptr3, i64 46, i64 0, i64 0, i1 false)
+ ret void
+}
+
+; Test how some invalid ordering operands are handled
+define void @ordering_usub_cond_usub_sat(ptr %ptr0, ptr addrspace(1) %ptr1, ptr addrspace(3) %ptr3) {
+ ; CHECK: atomicrmw volatile usub_cond ptr %ptr0, i32 42 syncscope("agent") seq_cst, align 4
+ %result0 = call i32 @llvm.amdgcn.atomic.cond.sub.i32.p0(ptr %ptr0, i32 42, i32 -1, i32 0, i1 true)
+
+ ; CHECK: atomicrmw volatile usub_cond ptr addrspace(1) %ptr1, i32 43 syncscope("agent") seq_cst, align 4
+ %result1 = call i32 @llvm.amdgcn.atomic.cond.sub.i32.p1(ptr addrspace(1) %ptr1, i32 43, i32 0, i32 0, i1 true)
+
+ ; CHECK: atomicrmw usub_cond ptr addrspace(1) %ptr1, i32 43 syncscope("agent") seq_cst, align 4
+ %result2 = call i32 @llvm.amdgcn.atomic.cond.sub.i32.p1(ptr addrspace(1) %ptr1, i32 43, i32 1, i32 0, i1 false)
+
+ ; CHECK: atomicrmw volatile usub_cond ptr addrspace(1) %ptr1, i32 43 syncscope("agent") monotonic, align 4
+ %result3 = call i32 @llvm.amdgcn.atomic.cond.sub.i32.p1(ptr addrspace(1) %ptr1, i32 43, i32 2, i32 0, i1 true)
+
+ ; CHECK: atomicrmw usub_cond ptr addrspace(1) %ptr1, i32 43 syncscope("agent") seq_cst, align 4
+ %result4 = call i32 @llvm.amdgcn.atomic.cond.sub.i32.p1(ptr addrspace(1) %ptr1, i32 43, i32 3, i32 0, i1 false)
+
+ ; CHECK: atomicrmw volatile usub_sat ptr %ptr0, i32 42 syncscope("agent") seq_cst, align 4
+ %result5 = call i32 @llvm.amdgcn.atomic.csub.i32.p0(ptr %ptr0, i32 42, i32 0, i32 4, i1 true)
+
+ ; CHECK: atomicrmw usub_sat ptr %ptr0, i32 42 syncscope("agent") seq_cst, align 4
+ %result6 = call i32 @llvm.amdgcn.atomic.csub.i32.p0(ptr %ptr0, i32 42, i32 0, i32 5, i1 false)
+
+ ; CHECK: atomicrmw volatile usub_sat ptr %ptr0, i32 42 syncscope("agent") seq_cst, align 4
+ %result7 = call i32 @llvm.amdgcn.atomic.csub.i32.p0(ptr %ptr0, i32 42, i32 0, i32 6, i1 true)
+
+ ; CHECK: atomicrmw usub_sat ptr %ptr0, i32 42 syncscope("agent") seq_cst, align 4
+ %result8 = call i32 @llvm.amdgcn.atomic.csub.i32.p0(ptr %ptr0, i32 42, i32 0, i32 7, i1 false)
+
+ ; CHECK:= atomicrmw volatile usub_sat ptr %ptr0, i32 42 syncscope("agent") seq_cst, align 4
+ %result9 = call i32 @llvm.amdgcn.atomic.csub.i32.p0(ptr %ptr0, i32 42, i32 0, i32 8, i1 true)
+
+ ; CHECK:= atomicrmw volatile usub_sat ptr addrspace(1) %ptr1, i32 43 syncscope("agent") seq_cst, align 4
+ %result10 = call i32 @llvm.amdgcn.atomic.csub.i32.p1(ptr addrspace(1) %ptr1, i32 43, i32 3, i32 0, i1 true)
+
+ ; CHECK: atomicrmw volatile usub_cond ptr %ptr0, i64 42 syncscope("agent") seq_cst, align 8
+ %result11 = call i64 @llvm.amdgcn.atomic.cond.sub.i64.p0(ptr %ptr0, i64 42, i64 -1, i64 0, i1 true)
+
+ ; CHECK: atomicrmw volatile usub_cond ptr addrspace(1) %ptr1, i64 43 syncscope("agent") seq_cst, align 8
+ %result12 = call i64 @llvm.amdgcn.atomic.cond.sub.i64.p1(ptr addrspace(1) %ptr1, i64 43, i64 0, i64 0, i1 true)
+
+ ; CHECK: atomicrmw usub_cond ptr addrspace(1) %ptr1, i64 43 syncscope("agent") seq_cst, align 8
+ %result13 = call i64 @llvm.amdgcn.atomic.cond.sub.i64.p1(ptr addrspace(1) %ptr1, i64 43, i64 1, i64 0, i1 false)
+
+ ; CHECK: atomicrmw volatile usub_cond ptr addrspace(1) %ptr1, i64 43 syncscope("agent") monotonic, align 8
+ %result14 = call i64 @llvm.amdgcn.atomic.cond.sub.i64.p1(ptr addrspace(1) %ptr1, i64 43, i64 2, i64 0, i1 true)
+
+ ; CHECK: atomicrmw usub_cond ptr addrspace(1) %ptr1, i64 43 syncscope("agent") seq_cst, align 8
+ %result15 = call i64 @llvm.amdgcn.atomic.cond.sub.i64.p1(ptr addrspace(1) %ptr1, i64 43, i64 3, i64 0, i1 false)
+
+ ; CHECK: atomicrmw volatile usub_sat ptr %ptr0, i64 42 syncscope("agent") seq_cst, align 8
+ %result16 = call i64 @llvm.amdgcn.atomic.csub.i64.p0(ptr %ptr0, i64 42, i64 0, i64 4, i1 true)
+
+ ; CHECK: atomicrmw usub_sat ptr %ptr0, i64 42 syncscope("agent") seq_cst, align 8
+ %result17 = call i64 @llvm.amdgcn.atomic.csub.i64.p0(ptr %ptr0, i64 42, i64 0, i64 5, i1 false)
+
+ ; CHECK: atomicrmw volatile usub_sat ptr %ptr0, i64 42 syncscope("agent") seq_cst, align 8
+ %result18 = call i64 @llvm.amdgcn.atomic.csub.i64.p0(ptr %ptr0, i64 42, i64 0, i64 6, i1 true)
+
+ ; CHECK: atomicrmw usub_sat ptr %ptr0, i64 42 syncscope("agent") seq_cst, align 8
+ %result19 = call i64 @llvm.amdgcn.atomic.csub.i64.p0(ptr %ptr0, i64 42, i64 0, i64 7, i1 false)
+
+ ; CHECK:= atomicrmw volatile usub_sat ptr %ptr0, i64 42 syncscope("agent") seq_cst, align 8
+ %result20 = call i64 @llvm.amdgcn.atomic.csub.i64.p0(ptr %ptr0, i64 42, i64 0, i64 8, i1 true)
+
+ ; CHECK:= atomicrmw volatile usub_sat ptr addrspace(1) %ptr1, i64 43 syncscope("agent") seq_cst, align 8
+ %result21 = call i64 @llvm.amdgcn.atomic.csub.i64.p1(ptr addrspace(1) %ptr1, i64 43, i64 3, i64 0, i1 true)
+ ret void
+}
+
+define void @immarg_violations_usub_sat(ptr %ptr0, i32 %val32, i1 %val1, i64 %val64) {
+ ; CHECK: atomicrmw usub_sat ptr %ptr0, i32 42 syncscope("agent") seq_cst, align 4
+ %result0 = call i32 @llvm.amdgcn.atomic.csub.i32.p0(ptr %ptr0, i32 42, i32 %val32, i32 0, i1 false)
+
+ ; CHECK: atomicrmw usub_sat ptr %ptr0, i32 42 syncscope("agent") monotonic, align 4
+ %result1 = call i32 @llvm.amdgcn.atomic.csub.i32.p0(ptr %ptr0, i32 42, i32 2, i32 %val32, i1 false)
+
+ ; CHECK: atomicrmw volatile usub_sat ptr %ptr0, i32 42 syncscope("agent") monotonic, align 4
+ %result2 = call i32 @llvm.amdgcn.atomic.csub.i32.p0(ptr %ptr0, i32 42, i32 2, i32 0, i1 %val1)
+
+ ; CHECK: atomicrmw usub_sat ptr %ptr0, i64 42 syncscope("agent") seq_cst, align 8
+ %result3 = call i64 @llvm.amdgcn.atomic.csub.i64.p0(ptr %ptr0, i64 42, i64 %val64, i64 0, i1 false)
+
+ ; CHECK: atomicrmw usub_sat ptr %ptr0, i64 42 syncscope("agent") monotonic, align 8
+ %result4 = call i64 @llvm.amdgcn.atomic.csub.i64.p0(ptr %ptr0, i64 42, i64 2, i64 %val64, i1 false)
+
+ ; CHECK: atomicrmw volatile usub_sat ptr %ptr0, i64 42 syncscope("agent") monotonic, align 8
+ %result5 = call i64 @llvm.amdgcn.atomic.csub.i64.p0(ptr %ptr0, i64 42, i64 2, i64 0, i1 %val1)
+ ret void
+}
+
+declare i32 @llvm.amdgcn.atomic.cond.sub.i32.p1(ptr addrspace(1) nocapture, i32, i32 immarg, i32 immarg, i1 immarg) #0
+declare i32 @llvm.amdgcn.atomic.cond.sub.i32.p3(ptr addrspace(3) nocapture, i32, i32 immarg, i32 immarg, i1 immarg) #0
+declare i32 @llvm.amdgcn.atomic.cond.sub.i32.p0(ptr nocapture, i32, i32 immarg, i32 immarg, i1 immarg) #0
+declare i64 @llvm.amdgcn.atomic.cond.sub.i64.p1(ptr addrspace(1) nocapture, i64, i64 immarg, i64 immarg, i1 immarg) #0
+declare i64 @llvm.amdgcn.atomic.cond.sub.i64.p3(ptr addrspace(3) nocapture, i64, i64 immarg, i64 immarg, i1 immarg) #0
+declare i64 @llvm.amdgcn.atomic.cond.sub.i64.p0(ptr nocapture, i64, i64 immarg, i64 immarg, i1 immarg) #0
+
+declare i32 @llvm.amdgcn.atomic.csub.i32.p1(ptr addrspace(1) nocapture, i32, i32 immarg, i32 immarg, i1 immarg) #0
+declare i32 @llvm.amdgcn.atomic.csub.i32.p3(ptr addrspace(3) nocapture, i32, i32 immarg, i32 immarg, i1 immarg) #0
+declare i32 @llvm.amdgcn.atomic.csub.i32.p0(ptr nocapture, i32, i32 immarg, i32 immarg, i1 immarg) #0
+declare i64 @llvm.amdgcn.atomic.csub.i64.p1(ptr addrspace(1) nocapture, i64, i64 immarg, i64 immarg, i1 immarg) #0
+declare i64 @llvm.amdgcn.atomic.csub.i64.p3(ptr addrspace(3) nocapture, i64, i64 immarg, i64 immarg, i1 immarg) #0
+declare i64 @llvm.amdgcn.atomic.csub.i64.p0(ptr nocapture, i64, i64 immarg, i64 immarg, i1 immarg) #0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll
deleted file mode 100644
index 59818b0b1bc39b..00000000000000
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll
+++ /dev/null
@@ -1,215 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GFX10
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1031 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GFX10
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GFX11
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GFX12
-
-define i32 @global_atomic_csub(ptr addrspace(1) %ptr, i32 %data) {
-; GFX10-LABEL: global_atomic_csub:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_atomic_csub v0, v[0:1], v2, off glc
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: global_atomic_csub:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_atomic_csub_u32 v0, v[0:1], v2, off glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-LABEL: global_atomic_csub:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_atomic_sub_clamp_u32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
- %ret = call i32 @llvm.amdgcn.global.atomic.csub.p1(ptr addrspace(1) %ptr, i32 %data)
- ret i32 %ret
-}
-
-define i32 @global_atomic_csub_offset(ptr addrspace(1) %ptr, i32 %data) {
-; GFX10-LABEL: global_atomic_csub_offset:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0
-; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX10-NEXT: global_atomic_csub v0, v[0:1], v2, off glc
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: global_atomic_csub_offset:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX11-NEXT: global_atomic_csub_u32 v0, v[0:1], v2, off glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-LABEL: global_atomic_csub_offset:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_atomic_sub_clamp_u32 v0, v[0:1], v2, off offset:4096 th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
- %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 1024
- %ret = call i32 @llvm.amdgcn.global.atomic.csub.p1(ptr addrspace(1) %gep, i32 %data)
- ret i32 %ret
-}
-
-define void @global_atomic_csub_nortn(ptr addrspace(1) %ptr, i32 %data) {
-; GFX10-LABEL: global_atomic_csub_nortn:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: global_atomic_csub v0, v[0:1], v2, off glc
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: global_atomic_csub_nortn:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: global_atomic_csub_u32 v0, v[0:1], v2, off glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-LABEL: global_atomic_csub_nortn:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_atomic_sub_clamp_u32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
- %ret = call i32 @llvm.amdgcn.global.atomic.csub.p1(ptr addrspace(1) %ptr, i32 %data)
- ret void
-}
-
-define void @global_atomic_csub_offset_nortn(ptr addrspace(1) %ptr, i32 %data) {
-; GFX10-LABEL: global_atomic_csub_offset_nortn:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0
-; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX10-NEXT: global_atomic_csub v0, v[0:1], v2, off glc
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX11-LABEL: global_atomic_csub_offset_nortn:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0
-; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX11-NEXT: global_atomic_csub_u32 v0, v[0:1], v2, off glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX12-LABEL: global_atomic_csub_offset_nortn:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_atomic_sub_clamp_u32 v0, v[0:1], v2, off offset:4096 th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
- %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 1024
- %ret = call i32 @llvm.amdgcn.global.atomic.csub.p1(ptr addrspace(1) %gep, i32 %data)
- ret void
-}
-
-define amdgpu_kernel void @global_atomic_csub_sgpr_base_offset(ptr addrspace(1) %ptr, i32 %data) {
-; GFX10-LABEL: global_atomic_csub_sgpr_base_offset:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_clause 0x1
-; GFX10-NEXT: s_load_dword s2, s[6:7], 0x8
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX10-NEXT: v_mov_b32_e32 v1, 0x1000
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v0, s2
-; GFX10-NEXT: global_atomic_csub v0, v1, v0, s[0:1] glc
-; GFX10-NEXT: s_waitcnt vmcnt(0)
-; GFX10-NEXT: global_store_dword v[0:1], v0, off
-; GFX10-NEXT: s_endpgm
-;
-; GFX11-LABEL: global_atomic_csub_sgpr_base_offset:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8
-; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v1, 0x1000 :: v_dual_mov_b32 v0, s4
-; GFX11-NEXT: global_atomic_csub_u32 v0, v1, v0, s[0:1] glc
-; GFX11-NEXT: s_waitcnt vmcnt(0)
-; GFX11-NEXT: global_store_b32 v[0:1], v0, off
-; GFX11-NEXT: s_nop 0
-; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX11-NEXT: s_endpgm
-;
-; GFX12-LABEL: global_atomic_csub_sgpr_base_offset:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b96 s[0:2], s[2:3], 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
-; GFX12-NEXT: global_atomic_sub_clamp_u32 v0, v1, v0, s[0:1] offset:4096 th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: global_store_b32 v[0:1], v0, off
-; GFX12-NEXT: s_nop 0
-; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX12-NEXT: s_endpgm
- %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 1024
- %ret = call i32 @llvm.amdgcn.global.atomic.csub.p1(ptr addrspace(1) %gep, i32 %data)
- store i32 %ret, ptr addrspace(1) undef
- ret void
-}
-
-define amdgpu_kernel void @global_atomic_csub_sgpr_base_offset_nortn(ptr addrspace(1) %ptr, i32 %data) {
-; GFX10-LABEL: global_atomic_csub_sgpr_base_offset_nortn:
-; GFX10: ; %bb.0:
-; GFX10-NEXT: s_clause 0x1
-; GFX10-NEXT: s_load_dword s2, s[6:7], 0x8
-; GFX10-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
-; GFX10-NEXT: v_mov_b32_e32 v1, 0x1000
-; GFX10-NEXT: s_waitcnt lgkmcnt(0)
-; GFX10-NEXT: v_mov_b32_e32 v0, s2
-; GFX10-NEXT: global_atomic_csub v0, v1, v0, s[0:1] glc
-; GFX10-NEXT: s_endpgm
-;
-; GFX11-LABEL: global_atomic_csub_sgpr_base_offset_nortn:
-; GFX11: ; %bb.0:
-; GFX11-NEXT: s_clause 0x1
-; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x8
-; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
-; GFX11-NEXT: s_waitcnt lgkmcnt(0)
-; GFX11-NEXT: v_dual_mov_b32 v1, 0x1000 :: v_dual_mov_b32 v0, s4
-; GFX11-NEXT: global_atomic_csub_u32 v0, v1, v0, s[0:1] glc
-; GFX11-NEXT: s_endpgm
-;
-; GFX12-LABEL: global_atomic_csub_sgpr_base_offset_nortn:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_load_b96 s[0:2], s[2:3], 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
-; GFX12-NEXT: global_atomic_sub_clamp_u32 v0, v1, v0, s[0:1] offset:4096 th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_endpgm
- %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 1024
- %ret = call i32 @llvm.amdgcn.global.atomic.csub.p1(ptr addrspace(1) %gep, i32 %data)
- ret void
-}
-
-declare i32 @llvm.amdgcn.global.atomic.csub.p1(ptr addrspace(1) nocapture, i32) #1
-
-attributes #0 = { nounwind willreturn }
-attributes #1 = { argmemonly nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/atomicrmw_cond_sub.ll b/llvm/test/CodeGen/AMDGPU/atomicrmw_cond_sub.ll
new file mode 100644
index 00000000000000..1aca8fbe7323cc
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/atomicrmw_cond_sub.ll
@@ -0,0 +1,197 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck %s -check-prefix=GFX12-GISEL
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck %s -check-prefix=GFX12-SDAG
+
+define i32 @global_atomic_usub_cond(ptr addrspace(1) %ptr, i32 %data) {
+; GFX12-GISEL-LABEL: global_atomic_usub_cond:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT: s_wait_expcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
+; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: global_wb scope:SCOPE_DEV
+; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
+; GFX12-GISEL-NEXT: global_atomic_cond_sub_u32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-LABEL: global_atomic_usub_cond:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: global_wb scope:SCOPE_DEV
+; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
+; GFX12-SDAG-NEXT: global_atomic_cond_sub_u32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
+ %ret = atomicrmw usub_cond ptr addrspace(1) %ptr, i32 %data syncscope("agent") seq_cst, align 4
+ ret i32 %ret
+}
+
+define i32 @global_atomic_usub_cond_offset(ptr addrspace(1) %ptr, i32 %data) {
+; GFX12-GISEL-LABEL: global_atomic_usub_cond_offset:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT: s_wait_expcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
+; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: global_wb scope:SCOPE_DEV
+; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
+; GFX12-GISEL-NEXT: global_atomic_cond_sub_u32 v0, v[0:1], v2, off offset:4096 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-LABEL: global_atomic_usub_cond_offset:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: global_wb scope:SCOPE_DEV
+; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
+; GFX12-SDAG-NEXT: global_atomic_cond_sub_u32 v0, v[0:1], v2, off offset:4096 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
+ %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 1024
+ %ret = atomicrmw usub_cond ptr addrspace(1) %gep, i32 %data syncscope("agent") seq_cst, align 4
+ ret i32 %ret
+}
+
+define void @global_atomic_usub_cond_nortn(ptr addrspace(1) %ptr, i32 %data) {
+; GFX12-GISEL-LABEL: global_atomic_usub_cond_nortn:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT: s_wait_expcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
+; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: global_wb scope:SCOPE_DEV
+; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
+; GFX12-GISEL-NEXT: global_atomic_cond_sub_u32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-LABEL: global_atomic_usub_cond_nortn:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: global_wb scope:SCOPE_DEV
+; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
+; GFX12-SDAG-NEXT: global_atomic_cond_sub_u32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
+ %ret = atomicrmw usub_cond ptr addrspace(1) %ptr, i32 %data syncscope("agent") seq_cst, align 4
+ ret void
+}
+
+define void @global_atomic_usub_cond_offset_nortn(ptr addrspace(1) %ptr, i32 %data) {
+; GFX12-GISEL-LABEL: global_atomic_usub_cond_offset_nortn:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT: s_wait_expcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
+; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: global_wb scope:SCOPE_DEV
+; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
+; GFX12-GISEL-NEXT: global_atomic_cond_sub_u32 v0, v[0:1], v2, off offset:4096 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-LABEL: global_atomic_usub_cond_offset_nortn:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: global_wb scope:SCOPE_DEV
+; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
+; GFX12-SDAG-NEXT: global_atomic_cond_sub_u32 v0, v[0:1], v2, off offset:4096 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
+ %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 1024
+ %ret = atomicrmw usub_cond ptr addrspace(1) %gep, i32 %data syncscope("agent") seq_cst, align 4
+ ret void
+}
+
+define amdgpu_kernel void @global_atomic_usub_cond_sgpr_base_offset(ptr addrspace(1) %ptr, i32 %data) {
+; GFX12-GISEL-LABEL: global_atomic_usub_cond_sgpr_base_offset:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
+; GFX12-GISEL-NEXT: global_wb scope:SCOPE_DEV
+; GFX12-GISEL-NEXT: global_atomic_cond_sub_u32 v0, v1, v0, s[0:1] offset:4096 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-GISEL-NEXT: global_store_b32 v[0:1], v0, off
+; GFX12-GISEL-NEXT: s_nop 0
+; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-GISEL-NEXT: s_endpgm
+;
+; GFX12-SDAG-LABEL: global_atomic_usub_cond_sgpr_base_offset:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-SDAG-NEXT: global_wb scope:SCOPE_DEV
+; GFX12-SDAG-NEXT: global_atomic_cond_sub_u32 v0, v0, v1, s[0:1] offset:4096 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-SDAG-NEXT: global_store_b32 v[0:1], v0, off
+; GFX12-SDAG-NEXT: s_nop 0
+; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-SDAG-NEXT: s_endpgm
+ %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 1024
+ %ret = atomicrmw usub_cond ptr addrspace(1) %gep, i32 %data syncscope("agent") seq_cst, align 4
+ store i32 %ret, ptr addrspace(1) undef
+ ret void
+}
+
+define amdgpu_kernel void @global_atomic_usub_cond_sgpr_base_offset_nortn(ptr addrspace(1) %ptr, i32 %data) {
+; GFX12-GISEL-LABEL: global_atomic_usub_cond_sgpr_base_offset_nortn:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
+; GFX12-GISEL-NEXT: global_wb scope:SCOPE_DEV
+; GFX12-GISEL-NEXT: global_atomic_cond_sub_u32 v0, v1, v0, s[0:1] offset:4096 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-GISEL-NEXT: s_endpgm
+;
+; GFX12-SDAG-LABEL: global_atomic_usub_cond_sgpr_base_offset_nortn:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-SDAG-NEXT: global_wb scope:SCOPE_DEV
+; GFX12-SDAG-NEXT: global_atomic_cond_sub_u32 v0, v0, v1, s[0:1] offset:4096 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-SDAG-NEXT: s_endpgm
+ %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 1024
+ %ret = atomicrmw usub_cond ptr addrspace(1) %gep, i32 %data syncscope("agent") seq_cst, align 4
+ ret void
+}
+
+attributes #0 = { nounwind willreturn }
+attributes #1 = { argmemonly nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/atomicrmw_sub_clamp.ll b/llvm/test/CodeGen/AMDGPU/atomicrmw_sub_clamp.ll
new file mode 100644
index 00000000000000..7ee3d0f870729c
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/atomicrmw_sub_clamp.ll
@@ -0,0 +1,495 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 < %s | FileCheck %s -check-prefix=GFX10-GISEL
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1031 < %s | FileCheck %s -check-prefix=GFX10-GISEL
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck %s -check-prefix=GFX11-GISEL
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck %s -check-prefix=GFX12-GISEL
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 < %s | FileCheck %s -check-prefix=GFX10-SDAG
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1031 < %s | FileCheck %s -check-prefix=GFX10-SDAG
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck %s -check-prefix=GFX11-SDAG
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck %s -check-prefix=GFX12-SDAG
+
+define i32 @global_atomic_usub_sat(ptr addrspace(1) %ptr, i32 %data) {
+; GFX10-GISEL-LABEL: global_atomic_usub_sat:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-GISEL-NEXT: global_atomic_csub v0, v[0:1], v2, off glc
+; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX10-GISEL-NEXT: buffer_gl1_inv
+; GFX10-GISEL-NEXT: buffer_gl0_inv
+; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: global_atomic_usub_sat:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-GISEL-NEXT: global_atomic_csub_u32 v0, v[0:1], v2, off glc
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT: buffer_gl1_inv
+; GFX11-GISEL-NEXT: buffer_gl0_inv
+; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-LABEL: global_atomic_usub_sat:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT: s_wait_expcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
+; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: global_wb scope:SCOPE_DEV
+; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
+; GFX12-GISEL-NEXT: global_atomic_sub_clamp_u32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: global_atomic_usub_sat:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-SDAG-NEXT: global_atomic_csub v0, v[0:1], v2, off glc
+; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX10-SDAG-NEXT: buffer_gl1_inv
+; GFX10-SDAG-NEXT: buffer_gl0_inv
+; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: global_atomic_usub_sat:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-NEXT: global_atomic_csub_u32 v0, v[0:1], v2, off glc
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT: buffer_gl1_inv
+; GFX11-SDAG-NEXT: buffer_gl0_inv
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-LABEL: global_atomic_usub_sat:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: global_wb scope:SCOPE_DEV
+; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
+; GFX12-SDAG-NEXT: global_atomic_sub_clamp_u32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
+ %ret = atomicrmw usub_sat ptr addrspace(1) %ptr, i32 %data syncscope("agent") seq_cst, align 4
+ ret i32 %ret
+}
+
+define i32 @global_atomic_usub_sat_offset(ptr addrspace(1) %ptr, i32 %data) {
+; GFX10-GISEL-LABEL: global_atomic_usub_sat_offset:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0
+; GFX10-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX10-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-GISEL-NEXT: global_atomic_csub v0, v[0:1], v2, off glc
+; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX10-GISEL-NEXT: buffer_gl1_inv
+; GFX10-GISEL-NEXT: buffer_gl0_inv
+; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: global_atomic_usub_sat_offset:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0
+; GFX11-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-GISEL-NEXT: global_atomic_csub_u32 v0, v[0:1], v2, off glc
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT: buffer_gl1_inv
+; GFX11-GISEL-NEXT: buffer_gl0_inv
+; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-LABEL: global_atomic_usub_sat_offset:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT: s_wait_expcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
+; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: global_wb scope:SCOPE_DEV
+; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
+; GFX12-GISEL-NEXT: global_atomic_sub_clamp_u32 v0, v[0:1], v2, off offset:4096 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: global_atomic_usub_sat_offset:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0
+; GFX10-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX10-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-SDAG-NEXT: global_atomic_csub v0, v[0:1], v2, off glc
+; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX10-SDAG-NEXT: buffer_gl1_inv
+; GFX10-SDAG-NEXT: buffer_gl0_inv
+; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: global_atomic_usub_sat_offset:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-NEXT: global_atomic_csub_u32 v0, v[0:1], v2, off glc
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT: buffer_gl1_inv
+; GFX11-SDAG-NEXT: buffer_gl0_inv
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-LABEL: global_atomic_usub_sat_offset:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: global_wb scope:SCOPE_DEV
+; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
+; GFX12-SDAG-NEXT: global_atomic_sub_clamp_u32 v0, v[0:1], v2, off offset:4096 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
+ %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 1024
+ %ret = atomicrmw usub_sat ptr addrspace(1) %gep, i32 %data syncscope("agent") seq_cst, align 4
+ ret i32 %ret
+}
+
+define void @global_atomic_usub_sat_nortn(ptr addrspace(1) %ptr, i32 %data) {
+; GFX10-GISEL-LABEL: global_atomic_usub_sat_nortn:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-GISEL-NEXT: global_atomic_csub v0, v[0:1], v2, off glc
+; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX10-GISEL-NEXT: buffer_gl1_inv
+; GFX10-GISEL-NEXT: buffer_gl0_inv
+; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: global_atomic_usub_sat_nortn:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-GISEL-NEXT: global_atomic_csub_u32 v0, v[0:1], v2, off glc
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT: buffer_gl1_inv
+; GFX11-GISEL-NEXT: buffer_gl0_inv
+; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-LABEL: global_atomic_usub_sat_nortn:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT: s_wait_expcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
+; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: global_wb scope:SCOPE_DEV
+; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
+; GFX12-GISEL-NEXT: global_atomic_sub_clamp_u32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: global_atomic_usub_sat_nortn:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-SDAG-NEXT: global_atomic_csub v0, v[0:1], v2, off glc
+; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX10-SDAG-NEXT: buffer_gl1_inv
+; GFX10-SDAG-NEXT: buffer_gl0_inv
+; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: global_atomic_usub_sat_nortn:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-NEXT: global_atomic_csub_u32 v0, v[0:1], v2, off glc
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT: buffer_gl1_inv
+; GFX11-SDAG-NEXT: buffer_gl0_inv
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-LABEL: global_atomic_usub_sat_nortn:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: global_wb scope:SCOPE_DEV
+; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
+; GFX12-SDAG-NEXT: global_atomic_sub_clamp_u32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
+ %ret = atomicrmw usub_sat ptr addrspace(1) %ptr, i32 %data syncscope("agent") seq_cst, align 4
+ ret void
+}
+
+define void @global_atomic_usub_sat_offset_nortn(ptr addrspace(1) %ptr, i32 %data) {
+; GFX10-GISEL-LABEL: global_atomic_usub_sat_offset_nortn:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0
+; GFX10-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX10-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-GISEL-NEXT: global_atomic_csub v0, v[0:1], v2, off glc
+; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX10-GISEL-NEXT: buffer_gl1_inv
+; GFX10-GISEL-NEXT: buffer_gl0_inv
+; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: global_atomic_usub_sat_offset_nortn:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0
+; GFX11-GISEL-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-GISEL-NEXT: global_atomic_csub_u32 v0, v[0:1], v2, off glc
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT: buffer_gl1_inv
+; GFX11-GISEL-NEXT: buffer_gl0_inv
+; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-LABEL: global_atomic_usub_sat_offset_nortn:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT: s_wait_expcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
+; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: global_wb scope:SCOPE_DEV
+; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
+; GFX12-GISEL-NEXT: global_atomic_sub_clamp_u32 v0, v[0:1], v2, off offset:4096 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: global_atomic_usub_sat_offset_nortn:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0
+; GFX10-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX10-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-SDAG-NEXT: global_atomic_csub v0, v[0:1], v2, off glc
+; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX10-SDAG-NEXT: buffer_gl1_inv
+; GFX10-SDAG-NEXT: buffer_gl0_inv
+; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: global_atomic_usub_sat_offset_nortn:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
+; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-NEXT: global_atomic_csub_u32 v0, v[0:1], v2, off glc
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT: buffer_gl1_inv
+; GFX11-SDAG-NEXT: buffer_gl0_inv
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-LABEL: global_atomic_usub_sat_offset_nortn:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: global_wb scope:SCOPE_DEV
+; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
+; GFX12-SDAG-NEXT: global_atomic_sub_clamp_u32 v0, v[0:1], v2, off offset:4096 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
+ %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 1024
+ %ret = atomicrmw usub_sat ptr addrspace(1) %gep, i32 %data syncscope("agent") seq_cst, align 4
+ ret void
+}
+
+define amdgpu_kernel void @global_atomic_usub_sat_sgpr_base_offset(ptr addrspace(1) %ptr, i32 %data) {
+; GFX10-GISEL-LABEL: global_atomic_usub_sat_sgpr_base_offset:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_clause 0x1
+; GFX10-GISEL-NEXT: s_load_dword s2, s[6:7], 0x8
+; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0x1000
+; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX10-GISEL-NEXT: global_atomic_csub v0, v1, v0, s[0:1] glc
+; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX10-GISEL-NEXT: buffer_gl1_inv
+; GFX10-GISEL-NEXT: buffer_gl0_inv
+; GFX10-GISEL-NEXT: global_store_dword v[0:1], v0, off
+; GFX10-GISEL-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: global_atomic_usub_sat_sgpr_base_offset:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_clause 0x1
+; GFX11-GISEL-NEXT: s_load_b32 s4, s[2:3], 0x8
+; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0x1000 :: v_dual_mov_b32 v0, s4
+; GFX11-GISEL-NEXT: global_atomic_csub_u32 v0, v1, v0, s[0:1] glc
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT: buffer_gl1_inv
+; GFX11-GISEL-NEXT: buffer_gl0_inv
+; GFX11-GISEL-NEXT: global_store_b32 v[0:1], v0, off
+; GFX11-GISEL-NEXT: s_nop 0
+; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-GISEL-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: global_atomic_usub_sat_sgpr_base_offset:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
+; GFX12-GISEL-NEXT: global_wb scope:SCOPE_DEV
+; GFX12-GISEL-NEXT: global_atomic_sub_clamp_u32 v0, v1, v0, s[0:1] offset:4096 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-GISEL-NEXT: global_store_b32 v[0:1], v0, off
+; GFX12-GISEL-NEXT: s_nop 0
+; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-GISEL-NEXT: s_endpgm
+;
+; GFX10-SDAG-LABEL: global_atomic_usub_sat_sgpr_base_offset:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_clause 0x1
+; GFX10-SDAG-NEXT: s_load_dword s2, s[6:7], 0x8
+; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, 0x1000
+; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s2
+; GFX10-SDAG-NEXT: global_atomic_csub v0, v0, v1, s[0:1] glc
+; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX10-SDAG-NEXT: buffer_gl1_inv
+; GFX10-SDAG-NEXT: buffer_gl0_inv
+; GFX10-SDAG-NEXT: global_store_dword v[0:1], v0, off
+; GFX10-SDAG-NEXT: s_endpgm
+;
+; GFX11-SDAG-LABEL: global_atomic_usub_sat_sgpr_base_offset:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_clause 0x1
+; GFX11-SDAG-NEXT: s_load_b32 s4, s[2:3], 0x8
+; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, 0x1000 :: v_dual_mov_b32 v1, s4
+; GFX11-SDAG-NEXT: global_atomic_csub_u32 v0, v0, v1, s[0:1] glc
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT: buffer_gl1_inv
+; GFX11-SDAG-NEXT: buffer_gl0_inv
+; GFX11-SDAG-NEXT: global_store_b32 v[0:1], v0, off
+; GFX11-SDAG-NEXT: s_nop 0
+; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX12-SDAG-LABEL: global_atomic_usub_sat_sgpr_base_offset:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-SDAG-NEXT: global_wb scope:SCOPE_DEV
+; GFX12-SDAG-NEXT: global_atomic_sub_clamp_u32 v0, v0, v1, s[0:1] offset:4096 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-SDAG-NEXT: global_store_b32 v[0:1], v0, off
+; GFX12-SDAG-NEXT: s_nop 0
+; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-SDAG-NEXT: s_endpgm
+ %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 1024
+ %ret = atomicrmw usub_sat ptr addrspace(1) %gep, i32 %data syncscope("agent") seq_cst, align 4
+ store i32 %ret, ptr addrspace(1) undef
+ ret void
+}
+
+define amdgpu_kernel void @global_atomic_usub_sat_sgpr_base_offset_nortn(ptr addrspace(1) %ptr, i32 %data) {
+; GFX10-GISEL-LABEL: global_atomic_usub_sat_sgpr_base_offset_nortn:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_clause 0x1
+; GFX10-GISEL-NEXT: s_load_dword s2, s[6:7], 0x8
+; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0x1000
+; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX10-GISEL-NEXT: global_atomic_csub v0, v1, v0, s[0:1] glc
+; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX10-GISEL-NEXT: buffer_gl1_inv
+; GFX10-GISEL-NEXT: buffer_gl0_inv
+; GFX10-GISEL-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: global_atomic_usub_sat_sgpr_base_offset_nortn:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_clause 0x1
+; GFX11-GISEL-NEXT: s_load_b32 s4, s[2:3], 0x8
+; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0x1000 :: v_dual_mov_b32 v0, s4
+; GFX11-GISEL-NEXT: global_atomic_csub_u32 v0, v1, v0, s[0:1] glc
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT: buffer_gl1_inv
+; GFX11-GISEL-NEXT: buffer_gl0_inv
+; GFX11-GISEL-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: global_atomic_usub_sat_sgpr_base_offset_nortn:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
+; GFX12-GISEL-NEXT: global_wb scope:SCOPE_DEV
+; GFX12-GISEL-NEXT: global_atomic_sub_clamp_u32 v0, v1, v0, s[0:1] offset:4096 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-GISEL-NEXT: s_endpgm
+;
+; GFX10-SDAG-LABEL: global_atomic_usub_sat_sgpr_base_offset_nortn:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_clause 0x1
+; GFX10-SDAG-NEXT: s_load_dword s2, s[6:7], 0x8
+; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[6:7], 0x0
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, 0x1000
+; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s2
+; GFX10-SDAG-NEXT: global_atomic_csub v0, v0, v1, s[0:1] glc
+; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX10-SDAG-NEXT: buffer_gl1_inv
+; GFX10-SDAG-NEXT: buffer_gl0_inv
+; GFX10-SDAG-NEXT: s_endpgm
+;
+; GFX11-SDAG-LABEL: global_atomic_usub_sat_sgpr_base_offset_nortn:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_clause 0x1
+; GFX11-SDAG-NEXT: s_load_b32 s4, s[2:3], 0x8
+; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x0
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, 0x1000 :: v_dual_mov_b32 v1, s4
+; GFX11-SDAG-NEXT: global_atomic_csub_u32 v0, v0, v1, s[0:1] glc
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT: buffer_gl1_inv
+; GFX11-SDAG-NEXT: buffer_gl0_inv
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX12-SDAG-LABEL: global_atomic_usub_sat_sgpr_base_offset_nortn:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-SDAG-NEXT: global_wb scope:SCOPE_DEV
+; GFX12-SDAG-NEXT: global_atomic_sub_clamp_u32 v0, v0, v1, s[0:1] offset:4096 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-SDAG-NEXT: s_endpgm
+ %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 1024
+ %ret = atomicrmw usub_sat ptr addrspace(1) %gep, i32 %data syncscope("agent") seq_cst, align 4
+ ret void
+}
+
+attributes #0 = { nounwind willreturn }
+attributes #1 = { argmemonly nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll b/llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll
index 417d38990505b6..a57d14c960cc1e 100644
--- a/llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll
@@ -2,60 +2,68 @@
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12-SDAG %s
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12-GISEL %s
-declare i32 @llvm.amdgcn.atomic.cond.sub.u32.p3(ptr addrspace(3), i32)
-declare i32 @llvm.amdgcn.atomic.cond.sub.u32.p1(ptr addrspace(1), i32)
-declare i32 @llvm.amdgcn.atomic.cond.sub.u32.p0(ptr, i32)
-
-define amdgpu_kernel void @flat_atomic_cond_sub_no_rtn_u32(ptr %addr, i32 %in) {
-; GFX12-SDAG-LABEL: flat_atomic_cond_sub_no_rtn_u32:
+define amdgpu_kernel void @flat_atomic_usub_cond_no_rtn_u32(ptr %addr, i32 %in) {
+; GFX12-SDAG-LABEL: flat_atomic_usub_cond_no_rtn_u32:
; GFX12-SDAG: ; %bb.0: ; %entry
; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x24
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; GFX12-SDAG-NEXT: flat_atomic_cond_sub_u32 v0, v[0:1], v2 offset:-16 th:TH_ATOMIC_RETURN
+; GFX12-SDAG-NEXT: global_wb scope:SCOPE_SYS
+; GFX12-SDAG-NEXT: flat_atomic_cond_sub_u32 v0, v[0:1], v2 offset:-16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: global_inv scope:SCOPE_SYS
; GFX12-SDAG-NEXT: s_endpgm
;
-; GFX12-GISEL-LABEL: flat_atomic_cond_sub_no_rtn_u32:
+; GFX12-GISEL-LABEL: flat_atomic_usub_cond_no_rtn_u32:
; GFX12-GISEL: ; %bb.0: ; %entry
; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v1, s1
-; GFX12-GISEL-NEXT: flat_atomic_cond_sub_u32 v0, v[0:1], v2 offset:-16 th:TH_ATOMIC_RETURN
+; GFX12-GISEL-NEXT: global_wb scope:SCOPE_SYS
+; GFX12-GISEL-NEXT: flat_atomic_cond_sub_u32 v0, v[0:1], v2 offset:-16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT: global_inv scope:SCOPE_SYS
; GFX12-GISEL-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr %addr, i32 -4
- %unused = call i32 @llvm.amdgcn.atomic.cond.sub.u32.p0(ptr %gep, i32 %in)
+ %unused = atomicrmw usub_cond ptr %gep, i32 %in seq_cst
ret void
}
-define amdgpu_kernel void @flat_atomic_cond_sub_no_rtn_u32_forced(ptr %addr, i32 %in) "target-features"="+atomic-csub-no-rtn-insts" {
-; GFX12-SDAG-LABEL: flat_atomic_cond_sub_no_rtn_u32_forced:
+define amdgpu_kernel void @flat_atomic_usub_cond_no_rtn_u32_forced(ptr %addr, i32 %in) "target-features"="+atomic-csub-no-rtn-insts" {
+; GFX12-SDAG-LABEL: flat_atomic_usub_cond_no_rtn_u32_forced:
; GFX12-SDAG: ; %bb.0: ; %entry
; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x24
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; GFX12-SDAG-NEXT: flat_atomic_cond_sub_u32 v[0:1], v2 offset:-16
+; GFX12-SDAG-NEXT: global_wb scope:SCOPE_SYS
+; GFX12-SDAG-NEXT: flat_atomic_cond_sub_u32 v[0:1], v2 offset:-16 scope:SCOPE_SYS
+; GFX12-SDAG-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX12-SDAG-NEXT: global_inv scope:SCOPE_SYS
; GFX12-SDAG-NEXT: s_endpgm
;
-; GFX12-GISEL-LABEL: flat_atomic_cond_sub_no_rtn_u32_forced:
+; GFX12-GISEL-LABEL: flat_atomic_usub_cond_no_rtn_u32_forced:
; GFX12-GISEL: ; %bb.0: ; %entry
; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v1, s1
-; GFX12-GISEL-NEXT: flat_atomic_cond_sub_u32 v[0:1], v2 offset:-16
+; GFX12-GISEL-NEXT: global_wb scope:SCOPE_SYS
+; GFX12-GISEL-NEXT: flat_atomic_cond_sub_u32 v[0:1], v2 offset:-16 scope:SCOPE_SYS
+; GFX12-GISEL-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX12-GISEL-NEXT: global_inv scope:SCOPE_SYS
; GFX12-GISEL-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr %addr, i32 -4
- %unused = call i32 @llvm.amdgcn.atomic.cond.sub.u32.p0(ptr %gep, i32 %in)
+ %unused = atomicrmw usub_cond ptr %gep, i32 %in seq_cst
ret void
}
-define amdgpu_kernel void @flat_atomic_cond_sub_rtn_u32(ptr %addr, i32 %in, ptr %use) {
-; GFX12-SDAG-LABEL: flat_atomic_cond_sub_rtn_u32:
+define amdgpu_kernel void @flat_atomic_usub_cond_rtn_u32(ptr %addr, i32 %in, ptr %use) {
+; GFX12-SDAG-LABEL: flat_atomic_usub_cond_rtn_u32:
; GFX12-SDAG: ; %bb.0: ; %entry
; GFX12-SDAG-NEXT: s_clause 0x1
; GFX12-SDAG-NEXT: s_load_b96 s[4:6], s[2:3], 0x24
@@ -63,13 +71,15 @@ define amdgpu_kernel void @flat_atomic_cond_sub_rtn_u32(ptr %addr, i32 %in, ptr
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s6
-; GFX12-SDAG-NEXT: flat_atomic_cond_sub_u32 v2, v[0:1], v2 offset:16 th:TH_ATOMIC_RETURN
-; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-SDAG-NEXT: global_wb scope:SCOPE_SYS
+; GFX12-SDAG-NEXT: flat_atomic_cond_sub_u32 v2, v[0:1], v2 offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-SDAG-NEXT: flat_store_b32 v[0:1], v2
; GFX12-SDAG-NEXT: s_endpgm
;
-; GFX12-GISEL-LABEL: flat_atomic_cond_sub_rtn_u32:
+; GFX12-GISEL-LABEL: flat_atomic_usub_cond_rtn_u32:
; GFX12-GISEL: ; %bb.0: ; %entry
; GFX12-GISEL-NEXT: s_clause 0x1
; GFX12-GISEL-NEXT: s_load_b96 s[4:6], s[2:3], 0x24
@@ -77,178 +87,208 @@ define amdgpu_kernel void @flat_atomic_cond_sub_rtn_u32(ptr %addr, i32 %in, ptr
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s4
; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v1, s5
-; GFX12-GISEL-NEXT: flat_atomic_cond_sub_u32 v2, v[0:1], v2 offset:16 th:TH_ATOMIC_RETURN
-; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-GISEL-NEXT: global_wb scope:SCOPE_SYS
+; GFX12-GISEL-NEXT: flat_atomic_cond_sub_u32 v2, v[0:1], v2 offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-GISEL-NEXT: flat_store_b32 v[0:1], v2
; GFX12-GISEL-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr %addr, i32 4
- %val = call i32 @llvm.amdgcn.atomic.cond.sub.u32.p0(ptr %gep, i32 %in)
+ %val = atomicrmw usub_cond ptr %gep, i32 %in seq_cst
store i32 %val, ptr %use
ret void
}
-define amdgpu_kernel void @global_atomic_cond_sub_no_rtn_u32(ptr addrspace(1) %addr, i32 %in) {
-; GFX12-SDAG-LABEL: global_atomic_cond_sub_no_rtn_u32:
+define amdgpu_kernel void @global_atomic_usub_cond_no_rtn_u32(ptr addrspace(1) %addr, i32 %in) {
+; GFX12-SDAG-LABEL: global_atomic_usub_cond_no_rtn_u32:
; GFX12-SDAG: ; %bb.0: ; %entry
; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x24
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX12-SDAG-NEXT: global_atomic_cond_sub_u32 v0, v0, v1, s[0:1] offset:-16 th:TH_ATOMIC_RETURN
+; GFX12-SDAG-NEXT: global_wb scope:SCOPE_SYS
+; GFX12-SDAG-NEXT: global_atomic_cond_sub_u32 v0, v0, v1, s[0:1] offset:-16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT: global_inv scope:SCOPE_SYS
; GFX12-SDAG-NEXT: s_endpgm
;
-; GFX12-GISEL-LABEL: global_atomic_cond_sub_no_rtn_u32:
+; GFX12-GISEL-LABEL: global_atomic_usub_cond_no_rtn_u32:
; GFX12-GISEL: ; %bb.0: ; %entry
; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
-; GFX12-GISEL-NEXT: global_atomic_cond_sub_u32 v0, v1, v0, s[0:1] offset:-16 th:TH_ATOMIC_RETURN
+; GFX12-GISEL-NEXT: global_wb scope:SCOPE_SYS
+; GFX12-GISEL-NEXT: global_atomic_cond_sub_u32 v0, v1, v0, s[0:1] offset:-16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT: global_inv scope:SCOPE_SYS
; GFX12-GISEL-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr addrspace(1) %addr, i32 -4
- %unused = call i32 @llvm.amdgcn.atomic.cond.sub.u32.p1(ptr addrspace(1) %gep, i32 %in)
+ %unused = atomicrmw usub_cond ptr addrspace(1) %gep, i32 %in seq_cst
ret void
}
-define amdgpu_kernel void @global_atomic_cond_sub_no_rtn_u32_forced(ptr addrspace(1) %addr, i32 %in) "target-features"="+atomic-csub-no-rtn-insts" {
-; GFX12-SDAG-LABEL: global_atomic_cond_sub_no_rtn_u32_forced:
+define amdgpu_kernel void @global_atomic_usub_cond_no_rtn_u32_forced(ptr addrspace(1) %addr, i32 %in) "target-features"="+atomic-csub-no-rtn-insts" {
+; GFX12-SDAG-LABEL: global_atomic_usub_cond_no_rtn_u32_forced:
; GFX12-SDAG: ; %bb.0: ; %entry
; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x24
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX12-SDAG-NEXT: global_atomic_cond_sub_u32 v0, v1, s[0:1] offset:-16
-; GFX12-SDAG-NEXT: s_nop 0
-; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-SDAG-NEXT: global_wb scope:SCOPE_SYS
+; GFX12-SDAG-NEXT: global_atomic_cond_sub_u32 v0, v1, s[0:1] offset:-16 scope:SCOPE_SYS
+; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
+; GFX12-SDAG-NEXT: global_inv scope:SCOPE_SYS
; GFX12-SDAG-NEXT: s_endpgm
;
-; GFX12-GISEL-LABEL: global_atomic_cond_sub_no_rtn_u32_forced:
+; GFX12-GISEL-LABEL: global_atomic_usub_cond_no_rtn_u32_forced:
; GFX12-GISEL: ; %bb.0: ; %entry
; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
-; GFX12-GISEL-NEXT: global_atomic_cond_sub_u32 v1, v0, s[0:1] offset:-16
-; GFX12-GISEL-NEXT: s_nop 0
-; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-GISEL-NEXT: global_wb scope:SCOPE_SYS
+; GFX12-GISEL-NEXT: global_atomic_cond_sub_u32 v1, v0, s[0:1] offset:-16 scope:SCOPE_SYS
+; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
+; GFX12-GISEL-NEXT: global_inv scope:SCOPE_SYS
; GFX12-GISEL-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr addrspace(1) %addr, i32 -4
- %unused = call i32 @llvm.amdgcn.atomic.cond.sub.u32.p1(ptr addrspace(1) %gep, i32 %in)
+ %unused = atomicrmw usub_cond ptr addrspace(1) %gep, i32 %in seq_cst
ret void
}
-define amdgpu_kernel void @global_atomic_cond_sub_rtn_u32(ptr addrspace(1) %addr, i32 %in, ptr addrspace(1) %use) {
-; GFX12-SDAG-LABEL: global_atomic_cond_sub_rtn_u32:
+define amdgpu_kernel void @global_atomic_usub_cond_rtn_u32(ptr addrspace(1) %addr, i32 %in, ptr addrspace(1) %use) {
+; GFX12-SDAG-LABEL: global_atomic_usub_cond_rtn_u32:
; GFX12-SDAG: ; %bb.0: ; %entry
; GFX12-SDAG-NEXT: s_clause 0x1
; GFX12-SDAG-NEXT: s_load_b96 s[4:6], s[2:3], 0x24
; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x34
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s6
-; GFX12-SDAG-NEXT: global_atomic_cond_sub_u32 v1, v0, v1, s[4:5] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-SDAG-NEXT: global_wb scope:SCOPE_SYS
+; GFX12-SDAG-NEXT: global_atomic_cond_sub_u32 v1, v0, v1, s[4:5] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT: global_inv scope:SCOPE_SYS
; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-SDAG-NEXT: s_nop 0
; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-SDAG-NEXT: s_endpgm
;
-; GFX12-GISEL-LABEL: global_atomic_cond_sub_rtn_u32:
+; GFX12-GISEL-LABEL: global_atomic_usub_cond_rtn_u32:
; GFX12-GISEL: ; %bb.0: ; %entry
; GFX12-GISEL-NEXT: s_clause 0x1
; GFX12-GISEL-NEXT: s_load_b96 s[4:6], s[2:3], 0x24
; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x34
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6
-; GFX12-GISEL-NEXT: global_atomic_cond_sub_u32 v0, v1, v0, s[4:5] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-GISEL-NEXT: global_wb scope:SCOPE_SYS
+; GFX12-GISEL-NEXT: global_atomic_cond_sub_u32 v0, v1, v0, s[4:5] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT: global_inv scope:SCOPE_SYS
; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX12-GISEL-NEXT: s_nop 0
; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
; GFX12-GISEL-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr addrspace(1) %addr, i32 4
- %val = call i32 @llvm.amdgcn.atomic.cond.sub.u32.p1(ptr addrspace(1) %gep, i32 %in)
+ %val = atomicrmw usub_cond ptr addrspace(1) %gep, i32 %in seq_cst
store i32 %val, ptr addrspace(1) %use
ret void
}
-define amdgpu_kernel void @ds_cond_sub_no_rtn_u32(ptr addrspace(3) %addr, i32 %in) {
-; GFX12-SDAG-LABEL: ds_cond_sub_no_rtn_u32:
+define amdgpu_kernel void @ds_usub_cond_no_rtn_u32(ptr addrspace(3) %addr, i32 %in) {
+; GFX12-SDAG-LABEL: ds_usub_cond_no_rtn_u32:
; GFX12-SDAG: ; %bb.0: ; %entry
; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: s_add_co_i32 s0, s0, -16
; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v0, s0
+; GFX12-SDAG-NEXT: global_wb scope:SCOPE_SE
; GFX12-SDAG-NEXT: ds_cond_sub_rtn_u32 v0, v0, v1
+; GFX12-SDAG-NEXT: s_wait_dscnt 0x0
+; GFX12-SDAG-NEXT: global_inv scope:SCOPE_SE
; GFX12-SDAG-NEXT: s_endpgm
;
-; GFX12-GISEL-LABEL: ds_cond_sub_no_rtn_u32:
+; GFX12-GISEL-LABEL: ds_usub_cond_no_rtn_u32:
; GFX12-GISEL: ; %bb.0: ; %entry
; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, -16
; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v0, s0
+; GFX12-GISEL-NEXT: global_wb scope:SCOPE_SE
; GFX12-GISEL-NEXT: ds_cond_sub_rtn_u32 v0, v0, v1
+; GFX12-GISEL-NEXT: s_wait_dscnt 0x0
+; GFX12-GISEL-NEXT: global_inv scope:SCOPE_SE
; GFX12-GISEL-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr addrspace(3) %addr, i32 -4
- %unused = call i32 @llvm.amdgcn.atomic.cond.sub.u32.p3(ptr addrspace(3) %gep, i32 %in)
+ %unused = atomicrmw usub_cond ptr addrspace(3) %gep, i32 %in seq_cst
ret void
}
-define amdgpu_kernel void @ds_cond_sub_no_rtn_u32_forced(ptr addrspace(3) %addr, i32 %in) "target-features"="+atomic-csub-no-rtn-insts" {
-; GFX12-SDAG-LABEL: ds_cond_sub_no_rtn_u32_forced:
+define amdgpu_kernel void @ds_usub_cond_no_rtn_u32_forced(ptr addrspace(3) %addr, i32 %in) "target-features"="+atomic-csub-no-rtn-insts" {
+; GFX12-SDAG-LABEL: ds_usub_cond_no_rtn_u32_forced:
; GFX12-SDAG: ; %bb.0: ; %entry
; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: s_add_co_i32 s0, s0, -16
; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v0, s0
+; GFX12-SDAG-NEXT: global_wb scope:SCOPE_SE
; GFX12-SDAG-NEXT: ds_cond_sub_u32 v0, v1
+; GFX12-SDAG-NEXT: s_wait_dscnt 0x0
+; GFX12-SDAG-NEXT: global_inv scope:SCOPE_SE
; GFX12-SDAG-NEXT: s_endpgm
;
-; GFX12-GISEL-LABEL: ds_cond_sub_no_rtn_u32_forced:
+; GFX12-GISEL-LABEL: ds_usub_cond_no_rtn_u32_forced:
; GFX12-GISEL: ; %bb.0: ; %entry
; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, -16
; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v0, s0
+; GFX12-GISEL-NEXT: global_wb scope:SCOPE_SE
; GFX12-GISEL-NEXT: ds_cond_sub_u32 v0, v1
+; GFX12-GISEL-NEXT: s_wait_dscnt 0x0
+; GFX12-GISEL-NEXT: global_inv scope:SCOPE_SE
; GFX12-GISEL-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr addrspace(3) %addr, i32 -4
- %unused = call i32 @llvm.amdgcn.atomic.cond.sub.u32.p3(ptr addrspace(3) %gep, i32 %in)
+ %unused = atomicrmw usub_cond ptr addrspace(3) %gep, i32 %in seq_cst
ret void
}
-define amdgpu_kernel void @ds_cond_sub_rtn_u32(ptr addrspace(3) %addr, i32 %in, ptr addrspace(3) %use) {
-; GFX12-SDAG-LABEL: ds_cond_sub_rtn_u32:
+define amdgpu_kernel void @ds_usub_cond_rtn_u32(ptr addrspace(3) %addr, i32 %in, ptr addrspace(3) %use) {
+; GFX12-SDAG-LABEL: ds_usub_cond_rtn_u32:
; GFX12-SDAG: ; %bb.0: ; %entry
; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x24
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-SDAG-NEXT: global_wb scope:SCOPE_SE
; GFX12-SDAG-NEXT: ds_cond_sub_rtn_u32 v0, v0, v1 offset:16
-; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2
; GFX12-SDAG-NEXT: s_wait_dscnt 0x0
+; GFX12-SDAG-NEXT: global_inv scope:SCOPE_SE
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2
; GFX12-SDAG-NEXT: ds_store_b32 v1, v0
; GFX12-SDAG-NEXT: s_endpgm
;
-; GFX12-GISEL-LABEL: ds_cond_sub_rtn_u32:
+; GFX12-GISEL-LABEL: ds_usub_cond_rtn_u32:
; GFX12-GISEL: ; %bb.0: ; %entry
; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX12-GISEL-NEXT: global_wb scope:SCOPE_SE
; GFX12-GISEL-NEXT: ds_cond_sub_rtn_u32 v0, v1, v0 offset:16
-; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s2
; GFX12-GISEL-NEXT: s_wait_dscnt 0x0
+; GFX12-GISEL-NEXT: global_inv scope:SCOPE_SE
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s2
; GFX12-GISEL-NEXT: ds_store_b32 v1, v0
; GFX12-GISEL-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr addrspace(3) %addr, i32 4
- %val = call i32 @llvm.amdgcn.atomic.cond.sub.u32.p3(ptr addrspace(3) %gep, i32 %in)
+ %val = atomicrmw usub_cond ptr addrspace(3) %gep, i32 %in seq_cst
store i32 %val, ptr addrspace(3) %use
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx1030.ll b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx1030.ll
index b23249570faa7d..7daa4b4baada4c 100644
--- a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx1030.ll
+++ b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx1030.ll
@@ -8,12 +8,12 @@
define amdgpu_kernel void @test_sink_small_offset_global_atomic_csub_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; OPT-LABEL: @test_sink_small_offset_global_atomic_csub_i32(
; OPT-NEXT: entry:
-; OPT-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #[[ATTR3:[0-9]+]]
+; OPT-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #[[ATTR2:[0-9]+]]
; OPT-NEXT: [[CMP:%.*]] = icmp eq i32 [[TID]], 0
; OPT-NEXT: br i1 [[CMP]], label [[ENDIF:%.*]], label [[IF:%.*]]
; OPT: if:
; OPT-NEXT: [[IN_GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[IN:%.*]], i32 7
-; OPT-NEXT: [[VAL:%.*]] = call i32 @llvm.amdgcn.global.atomic.csub.p1(ptr addrspace(1) [[IN_GEP]], i32 2)
+; OPT-NEXT: [[VAL:%.*]] = atomicrmw usub_sat ptr addrspace(1) [[IN_GEP]], i32 2 seq_cst, align 4
; OPT-NEXT: br label [[ENDIF]]
; OPT: endif:
; OPT-NEXT: [[X:%.*]] = phi i32 [ [[VAL]], [[IF]] ], [ 0, [[ENTRY:%.*]] ]
@@ -36,10 +36,13 @@ define amdgpu_kernel void @test_sink_small_offset_global_atomic_csub_i32(ptr add
; GCN-NEXT: v_mov_b32_e32 v1, 2
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: global_atomic_csub v0, v0, v1, s[2:3] offset:28 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_gl1_inv
+; GCN-NEXT: buffer_gl0_inv
; GCN-NEXT: .LBB0_2: ; %endif
; GCN-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GCN-NEXT: v_mov_b32_e32 v1, 0x3d0800
-; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: global_store_dword v1, v0, s[0:1] offset:252
; GCN-NEXT: s_endpgm
entry:
@@ -49,7 +52,7 @@ entry:
if:
%in.gep = getelementptr i32, ptr addrspace(1) %in, i32 7
- %val = call i32 @llvm.amdgcn.global.atomic.csub.p1(ptr addrspace(1) %in.gep, i32 2)
+ %val = atomicrmw usub_sat ptr addrspace(1) %in.gep, i32 2 seq_cst
br label %endif
endif:
@@ -62,7 +65,6 @@ done:
ret void
}
-declare i32 @llvm.amdgcn.global.atomic.csub.p1(ptr addrspace(1) nocapture, i32) #0
declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #1
attributes #0 = { argmemonly nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/global-saddr-atomics.gfx1030.ll b/llvm/test/CodeGen/AMDGPU/global-saddr-atomics.gfx1030.ll
index 79de55eb63bf81..bd0fd79bab8348 100644
--- a/llvm/test/CodeGen/AMDGPU/global-saddr-atomics.gfx1030.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-saddr-atomics.gfx1030.ll
@@ -10,10 +10,12 @@ define amdgpu_ps float @global_csub_saddr_i32_rtn(ptr addrspace(1) inreg %sbase,
; GCN: ; %bb.0:
; GCN-NEXT: global_atomic_csub v0, v0, v1, s[2:3] glc
; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_gl1_inv
+; GCN-NEXT: buffer_gl0_inv
; GCN-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
- %rtn = call i32 @llvm.amdgcn.global.atomic.csub.p1(ptr addrspace(1) %gep0, i32 %data)
+ %rtn = atomicrmw usub_sat ptr addrspace(1) %gep0, i32 %data seq_cst
%cast.rtn = bitcast i32 %rtn to float
ret float %cast.rtn
}
@@ -23,11 +25,13 @@ define amdgpu_ps float @global_csub_saddr_i32_rtn_neg128(ptr addrspace(1) inreg
; GCN: ; %bb.0:
; GCN-NEXT: global_atomic_csub v0, v0, v1, s[2:3] offset:-128 glc
; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_gl1_inv
+; GCN-NEXT: buffer_gl0_inv
; GCN-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
- %rtn = call i32 @llvm.amdgcn.global.atomic.csub.p1(ptr addrspace(1) %gep1, i32 %data)
+ %rtn = atomicrmw usub_sat ptr addrspace(1) %gep1, i32 %data seq_cst
%cast.rtn = bitcast i32 %rtn to float
ret float %cast.rtn
}
@@ -36,10 +40,13 @@ define amdgpu_ps void @global_csub_saddr_i32_nortn(ptr addrspace(1) inreg %sbase
; GCN-LABEL: global_csub_saddr_i32_nortn:
; GCN: ; %bb.0:
; GCN-NEXT: global_atomic_csub v0, v0, v1, s[2:3] glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_gl1_inv
+; GCN-NEXT: buffer_gl0_inv
; GCN-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
- %unused = call i32 @llvm.amdgcn.global.atomic.csub.p1(ptr addrspace(1) %gep0, i32 %data)
+ %unused = atomicrmw usub_sat ptr addrspace(1) %gep0, i32 %data seq_cst
ret void
}
@@ -47,11 +54,14 @@ define amdgpu_ps void @global_csub_saddr_i32_nortn_neg128(ptr addrspace(1) inreg
; GCN-LABEL: global_csub_saddr_i32_nortn_neg128:
; GCN: ; %bb.0:
; GCN-NEXT: global_atomic_csub v0, v0, v1, s[2:3] offset:-128 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_gl1_inv
+; GCN-NEXT: buffer_gl0_inv
; GCN-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
- %unused = call i32 @llvm.amdgcn.global.atomic.csub.p1(ptr addrspace(1) %gep1, i32 %data)
+ %unused = atomicrmw usub_sat ptr addrspace(1) %gep1, i32 %data seq_cst
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.cond.sub.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.cond.sub.ll
deleted file mode 100644
index 99f4fbf3599483..00000000000000
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.cond.sub.ll
+++ /dev/null
@@ -1,225 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GFX12
-
-define float @raw_buffer_atomic_cond_sub_return(<4 x i32> inreg %rsrc, i32 inreg %data) #0 {
-; GFX12-LABEL: raw_buffer_atomic_cond_sub_return:
-; GFX12: ; %bb.0: ; %main_body
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v0, s6
-; GFX12-NEXT: buffer_atomic_cond_sub_u32 v0, off, s[0:3], null th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
-main_body:
- %orig = call i32 @llvm.amdgcn.raw.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 0)
- %r = bitcast i32 %orig to float
- ret float %r
-}
-
-define void @raw_buffer_atomic_cond_sub_no_return(<4 x i32> inreg %rsrc, i32 inreg %data) #0 {
-; GFX12-LABEL: raw_buffer_atomic_cond_sub_no_return:
-; GFX12: ; %bb.0: ; %main_body
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v0, s6
-; GFX12-NEXT: buffer_atomic_cond_sub_u32 v0, off, s[0:3], null th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
-main_body:
- %unused = call i32 @llvm.amdgcn.raw.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 0)
- ret void
-}
-
-define void @raw_buffer_atomic_cond_sub_no_return_forced(<4 x i32> inreg %rsrc, i32 inreg %data) #1 {
-; GFX12-LABEL: raw_buffer_atomic_cond_sub_no_return_forced:
-; GFX12: ; %bb.0: ; %main_body
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v0, s6
-; GFX12-NEXT: buffer_atomic_cond_sub_u32 v0, off, s[0:3], null
-; GFX12-NEXT: s_setpc_b64 s[30:31]
-main_body:
- %unused = call i32 @llvm.amdgcn.raw.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 0)
- ret void
-}
-
-define float @raw_buffer_atomic_cond_sub_imm_soff_return(<4 x i32> inreg %rsrc, i32 inreg %data) #0 {
-; GFX12-LABEL: raw_buffer_atomic_cond_sub_imm_soff_return:
-; GFX12: ; %bb.0: ; %main_body
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v0, s6
-; GFX12-NEXT: s_mov_b32 s4, 4
-; GFX12-NEXT: buffer_atomic_cond_sub_u32 v0, off, s[0:3], s4 th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_setpc_b64 s[30:31]
-main_body:
- %orig = call i32 @llvm.amdgcn.raw.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 4, i32 0)
- %r = bitcast i32 %orig to float
- ret float %r
-}
-
-define void @raw_buffer_atomic_cond_sub_imm_soff_no_return(<4 x i32> inreg %rsrc, i32 inreg %data) #0 {
-; GFX12-LABEL: raw_buffer_atomic_cond_sub_imm_soff_no_return:
-; GFX12: ; %bb.0: ; %main_body
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v0, s6
-; GFX12-NEXT: s_mov_b32 s4, 4
-; GFX12-NEXT: buffer_atomic_cond_sub_u32 v0, off, s[0:3], s4 th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_setpc_b64 s[30:31]
-main_body:
- %unused = call i32 @llvm.amdgcn.raw.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 4, i32 0)
- ret void
-}
-
-define void @raw_buffer_atomic_cond_sub_imm_soff_no_return_forced(<4 x i32> inreg %rsrc, i32 inreg %data) #1 {
-; GFX12-LABEL: raw_buffer_atomic_cond_sub_imm_soff_no_return_forced:
-; GFX12: ; %bb.0: ; %main_body
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_mov_b32_e32 v0, s6
-; GFX12-NEXT: s_mov_b32 s4, 4
-; GFX12-NEXT: buffer_atomic_cond_sub_u32 v0, off, s[0:3], s4
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_setpc_b64 s[30:31]
-main_body:
- %unused = call i32 @llvm.amdgcn.raw.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 4, i32 0)
- ret void
-}
-
-define float @struct_buffer_atomic_cond_sub_return(<4 x i32> inreg %rsrc, i32 inreg %data) #0 {
-; GFX12-LABEL: struct_buffer_atomic_cond_sub_return:
-; GFX12: ; %bb.0: ; %main_body
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6
-; GFX12-NEXT: buffer_atomic_cond_sub_u32 v0, v1, s[0:3], null idxen th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
-main_body:
- %orig = call i32 @llvm.amdgcn.struct.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0)
- %r = bitcast i32 %orig to float
- ret float %r
-}
-
-define void @struct_buffer_atomic_cond_sub_no_return(<4 x i32> inreg %rsrc, i32 inreg %data) #0 {
-; GFX12-LABEL: struct_buffer_atomic_cond_sub_no_return:
-; GFX12: ; %bb.0: ; %main_body
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s6
-; GFX12-NEXT: buffer_atomic_cond_sub_u32 v1, v0, s[0:3], null idxen th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
-main_body:
- %unused = call i32 @llvm.amdgcn.struct.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0)
- ret void
-}
-
-define void @struct_buffer_atomic_cond_sub_no_return_forced(<4 x i32> inreg %rsrc, i32 inreg %data) #1 {
-; GFX12-LABEL: struct_buffer_atomic_cond_sub_no_return_forced:
-; GFX12: ; %bb.0: ; %main_body
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s6
-; GFX12-NEXT: buffer_atomic_cond_sub_u32 v1, v0, s[0:3], null idxen
-; GFX12-NEXT: s_setpc_b64 s[30:31]
-main_body:
- %unused = call i32 @llvm.amdgcn.struct.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0)
- ret void
-}
-
-define float @struct_buffer_atomic_cond_sub_imm_soff_return(<4 x i32> inreg %rsrc, i32 inreg %data) #0 {
-; GFX12-LABEL: struct_buffer_atomic_cond_sub_imm_soff_return:
-; GFX12: ; %bb.0: ; %main_body
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6
-; GFX12-NEXT: s_mov_b32 s4, 4
-; GFX12-NEXT: buffer_atomic_cond_sub_u32 v0, v1, s[0:3], s4 idxen th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_setpc_b64 s[30:31]
-main_body:
- %orig = call i32 @llvm.amdgcn.struct.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 4, i32 0)
- %r = bitcast i32 %orig to float
- ret float %r
-}
-
-define void @struct_buffer_atomic_cond_sub_imm_soff_no_return(<4 x i32> inreg %rsrc, i32 inreg %data) #0 {
-; GFX12-LABEL: struct_buffer_atomic_cond_sub_imm_soff_no_return:
-; GFX12: ; %bb.0: ; %main_body
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s6
-; GFX12-NEXT: s_mov_b32 s4, 4
-; GFX12-NEXT: buffer_atomic_cond_sub_u32 v1, v0, s[0:3], s4 idxen th:TH_ATOMIC_RETURN
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_setpc_b64 s[30:31]
-main_body:
- %unused = call i32 @llvm.amdgcn.struct.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 4, i32 0)
- ret void
-}
-
-define void @struct_buffer_atomic_cond_sub_imm_soff_no_return_forced(<4 x i32> inreg %rsrc, i32 inreg %data) #1 {
-; GFX12-LABEL: struct_buffer_atomic_cond_sub_imm_soff_no_return_forced:
-; GFX12: ; %bb.0: ; %main_body
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s6
-; GFX12-NEXT: s_mov_b32 s4, 4
-; GFX12-NEXT: buffer_atomic_cond_sub_u32 v1, v0, s[0:3], s4 idxen
-; GFX12-NEXT: s_wait_alu 0xfffe
-; GFX12-NEXT: s_setpc_b64 s[30:31]
-main_body:
- %unused = call i32 @llvm.amdgcn.struct.buffer.atomic.cond.sub.u32.i32(i32 %data, <4 x i32> %rsrc, i32 0, i32 0, i32 4, i32 0)
- ret void
-}
-
-declare i32 @llvm.amdgcn.raw.buffer.atomic.cond.sub.u32.i32(i32, <4 x i32>, i32, i32, i32) #0
-declare i32 @llvm.amdgcn.struct.buffer.atomic.cond.sub.u32.i32(i32, <4 x i32>, i32, i32, i32, i32) #0
-
-attributes #0 = { nounwind }
-attributes #1 = { nounwind "target-features"="+atomic-csub-no-rtn-insts" }
-
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.atomic.csub.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.atomic.csub.ll
index 4a66b761306f3d..ccf9a52f786263 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.atomic.csub.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.atomic.csub.ll
@@ -2,14 +2,12 @@
; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1031 -verify-machineinstrs | FileCheck %s -check-prefixes=GCN,PREGFX12
; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs | FileCheck %s -check-prefixes=GCN,GFX12PLUS
-declare i32 @llvm.amdgcn.global.atomic.csub(ptr addrspace(1), i32)
-
; GCN-LABEL: {{^}}global_atomic_csub_rtn:
; PREGFX12: global_atomic_csub v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9:]+}}, s{{\[[0-9]+:[0-9]+\]}} glc
; GFX12PLUS: global_atomic_sub_clamp_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} th:TH_ATOMIC_RETURN
define amdgpu_kernel void @global_atomic_csub_rtn(ptr addrspace(1) %ptr, i32 %data) {
main_body:
- %ret = call i32 @llvm.amdgcn.global.atomic.csub(ptr addrspace(1) %ptr, i32 %data)
+ %ret = atomicrmw usub_sat ptr addrspace(1) %ptr, i32 %data seq_cst
ret void
}
@@ -18,7 +16,7 @@ main_body:
; GFX12PLUS: global_atomic_sub_clamp_u32 v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @global_atomic_csub_no_rtn(ptr addrspace(1) %ptr, i32 %data) #0 {
main_body:
- %ret = call i32 @llvm.amdgcn.global.atomic.csub(ptr addrspace(1) %ptr, i32 %data)
+ %ret = atomicrmw usub_sat ptr addrspace(1) %ptr, i32 %data seq_cst
ret void
}
@@ -28,7 +26,7 @@ main_body:
define amdgpu_kernel void @global_atomic_csub_off4_rtn(ptr addrspace(1) %ptr, i32 %data) {
main_body:
%p = getelementptr i32, ptr addrspace(1) %ptr, i64 1
- %ret = call i32 @llvm.amdgcn.global.atomic.csub(ptr addrspace(1) %p, i32 %data)
+ %ret = atomicrmw usub_sat ptr addrspace(1) %p, i32 %data seq_cst
ret void
}
@@ -38,7 +36,7 @@ main_body:
define amdgpu_kernel void @global_atomic_csub_off4_no_rtn(ptr addrspace(1) %ptr, i32 %data) #0 {
main_body:
%p = getelementptr i32, ptr addrspace(1) %ptr, i64 1
- %ret = call i32 @llvm.amdgcn.global.atomic.csub(ptr addrspace(1) %p, i32 %data)
+ %ret = atomicrmw usub_sat ptr addrspace(1) %p, i32 %data seq_cst
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/private-memory-atomics.ll b/llvm/test/CodeGen/AMDGPU/private-memory-atomics.ll
index 0d88466fc31b3e..5b09e50738ea59 100644
--- a/llvm/test/CodeGen/AMDGPU/private-memory-atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/private-memory-atomics.ll
@@ -632,3 +632,55 @@ define i32 @atomicrmw_dec_private_i32(ptr addrspace(5) %ptr) {
%result = atomicrmw udec_wrap ptr addrspace(5) %ptr, i32 4 seq_cst
ret i32 %result
}
+
+define i32 @atomicrmw_usub_cond_private_i32(ptr addrspace(5) %ptr) {
+; IR-LABEL: define i32 @atomicrmw_usub_cond_private_i32(
+; IR-SAME: ptr addrspace(5) [[PTR:%.*]]) #[[ATTR0]] {
+; IR-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[PTR]], align 4
+; IR-NEXT: [[TMP2:%.*]] = icmp uge i32 [[TMP1]], 4
+; IR-NEXT: [[TMP3:%.*]] = sub i32 [[TMP1]], 4
+; IR-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i32 [[TMP3]], i32 4
+; IR-NEXT: store i32 [[NEW]], ptr addrspace(5) [[PTR]], align 4
+; IR-NEXT: ret i32 [[TMP1]]
+;
+; GCN-LABEL: atomicrmw_usub_cond_private_i32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_add_i32_e32 v2, vcc, -4, v1
+; GCN-NEXT: v_cmp_lt_u32_e32 vcc, 3, v1
+; GCN-NEXT: v_cndmask_b32_e32 v2, 4, v2, vcc
+; GCN-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
+; GCN-NEXT: v_mov_b32_e32 v0, v1
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = atomicrmw usub_cond ptr addrspace(5) %ptr, i32 4 seq_cst
+ ret i32 %result
+}
+
+define i32 @atomicrmw_usub_sat_private_i32(ptr addrspace(5) %ptr) {
+; IR-LABEL: define i32 @atomicrmw_usub_sat_private_i32(
+; IR-SAME: ptr addrspace(5) [[PTR:%.*]]) #[[ATTR0]] {
+; IR-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[PTR]], align 4
+; IR-NEXT: [[TMP2:%.*]] = icmp uge i32 [[TMP1]], 4
+; IR-NEXT: [[TMP3:%.*]] = sub i32 [[TMP1]], 4
+; IR-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i32 [[TMP3]], i32 0
+; IR-NEXT: store i32 [[NEW]], ptr addrspace(5) [[PTR]], align 4
+; IR-NEXT: ret i32 [[TMP1]]
+;
+; GCN-LABEL: atomicrmw_usub_sat_private_i32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_add_i32_e32 v2, vcc, -4, v1
+; GCN-NEXT: v_cmp_lt_u32_e32 vcc, 3, v1
+; GCN-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; GCN-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
+; GCN-NEXT: v_mov_b32_e32 v0, v1
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = atomicrmw usub_sat ptr addrspace(5) %ptr, i32 4 seq_cst
+ ret i32 %result
+}
diff --git a/llvm/test/CodeGen/AMDGPU/shl_add_ptr_csub.ll b/llvm/test/CodeGen/AMDGPU/shl_add_ptr_csub.ll
index c04cb89e9527b6..2beef9fd8e718f 100644
--- a/llvm/test/CodeGen/AMDGPU/shl_add_ptr_csub.ll
+++ b/llvm/test/CodeGen/AMDGPU/shl_add_ptr_csub.ll
@@ -12,7 +12,7 @@ define i32 @shl_base_atomicrmw_global_atomic_csub_ptr(ptr addrspace(1) %out, ptr
%cast = ptrtoint ptr addrspace(1) %arrayidx0 to i64
%shl = shl i64 %cast, 2
%castback = inttoptr i64 %shl to ptr addrspace(1)
- %val = call i32 @llvm.amdgcn.global.atomic.csub.p1(ptr addrspace(1) %castback, i32 43)
+ %val = atomicrmw usub_sat ptr addrspace(1) %castback, i32 43 seq_cst
store volatile i64 %cast, ptr addrspace(1) %extra.use, align 4
ret i32 %val
}
diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i16.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i16.ll
index b8196cfcc35108..2d2f0c48861176 100644
--- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i16.ll
+++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i16.ll
@@ -1366,6 +1366,364 @@ define i16 @test_atomicrmw_add_i16_buffer_fat_agent_align4(ptr addrspace(7) %ptr
ret i16 %res
}
+define i16 @test_atomicrmw_usub_cond_i16_global_agent(ptr addrspace(1) %ptr, i16 %value) {
+; CHECK-LABEL: @test_atomicrmw_usub_cond_i16_global_agent(
+; CHECK-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4)
+; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64
+; CHECK-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
+; CHECK-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
+; CHECK-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
+; CHECK-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]]
+; CHECK-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
+; CHECK: atomicrmw.start:
+; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
+; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
+; CHECK-NEXT: [[TMP5:%.*]] = icmp uge i16 [[EXTRACTED]], [[VALUE:%.*]]
+; CHECK-NEXT: [[TMP7:%.*]] = sub i16 [[EXTRACTED]], [[VALUE]]
+; CHECK-NEXT: [[NEW:%.*]] = select i1 [[TMP5]], i16 [[TMP7]], i16 [[VALUE]]
+; CHECK-NEXT: [[EXTENDED:%.*]] = zext i16 [[NEW]] to i32
+; CHECK-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
+; CHECK-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
+; CHECK-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
+; CHECK-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK: atomicrmw.end:
+; CHECK-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
+; CHECK-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
+; CHECK-NEXT: ret i16 [[EXTRACTED3]]
+;
+ %res = atomicrmw usub_cond ptr addrspace(1) %ptr, i16 %value syncscope("agent") seq_cst
+ ret i16 %res
+}
+
+define i16 @test_atomicrmw_usub_cond_i16_global_agent_align4(ptr addrspace(1) %ptr, i16 %value) {
+; CHECK-LABEL: @test_atomicrmw_usub_cond_i16_global_agent_align4(
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR:%.*]], align 4
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
+; CHECK: atomicrmw.start:
+; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[LOADED]] to i16
+; CHECK-NEXT: [[TMP3:%.*]] = icmp uge i16 [[EXTRACTED]], [[VALUE:%.*]]
+; CHECK-NEXT: [[TMP5:%.*]] = sub i16 [[EXTRACTED]], [[VALUE]]
+; CHECK-NEXT: [[NEW:%.*]] = select i1 [[TMP3]], i16 [[TMP5]], i16 [[VALUE]]
+; CHECK-NEXT: [[EXTENDED:%.*]] = zext i16 [[NEW]] to i32
+; CHECK-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], -65536
+; CHECK-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[EXTENDED]]
+; CHECK-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK: atomicrmw.end:
+; CHECK-NEXT: [[EXTRACTED1:%.*]] = trunc i32 [[NEWLOADED]] to i16
+; CHECK-NEXT: ret i16 [[EXTRACTED1]]
+;
+ %res = atomicrmw usub_cond ptr addrspace(1) %ptr, i16 %value syncscope("agent") seq_cst, align 4
+ ret i16 %res
+}
+
+define i16 @test_atomicrmw_usub_cond_i16_local(ptr addrspace(3) %ptr, i16 %value) {
+; CHECK-LABEL: @test_atomicrmw_usub_cond_i16_local(
+; CHECK-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4)
+; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i32
+; CHECK-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
+; CHECK-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
+; CHECK-NEXT: [[MASK:%.*]] = shl i32 65535, [[TMP2]]
+; CHECK-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(3) [[ALIGNEDADDR]], align 4
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
+; CHECK: atomicrmw.start:
+; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]]
+; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
+; CHECK-NEXT: [[TMP5:%.*]] = icmp uge i16 [[EXTRACTED]], [[VALUE:%.*]]
+; CHECK-NEXT: [[TMP7:%.*]] = sub i16 [[EXTRACTED]], [[VALUE]]
+; CHECK-NEXT: [[NEW:%.*]] = select i1 [[TMP5]], i16 [[TMP7]], i16 [[VALUE]]
+; CHECK-NEXT: [[EXTENDED:%.*]] = zext i16 [[NEW]] to i32
+; CHECK-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]]
+; CHECK-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
+; CHECK-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
+; CHECK-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(3) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK: atomicrmw.end:
+; CHECK-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
+; CHECK-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
+; CHECK-NEXT: ret i16 [[EXTRACTED3]]
+;
+ %res = atomicrmw usub_cond ptr addrspace(3) %ptr, i16 %value syncscope("agent") seq_cst
+ ret i16 %res
+}
+
+define i16 @test_atomicrmw_usub_cond_i16_local_align4(ptr addrspace(3) %ptr, i16 %value) {
+; CHECK-LABEL: @test_atomicrmw_usub_cond_i16_local_align4(
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(3) [[PTR:%.*]], align 4
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
+; CHECK: atomicrmw.start:
+; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[LOADED]] to i16
+; CHECK-NEXT: [[TMP3:%.*]] = icmp uge i16 [[EXTRACTED]], [[VALUE:%.*]]
+; CHECK-NEXT: [[TMP5:%.*]] = sub i16 [[EXTRACTED]], [[VALUE]]
+; CHECK-NEXT: [[NEW:%.*]] = select i1 [[TMP3]], i16 [[TMP5]], i16 [[VALUE]]
+; CHECK-NEXT: [[EXTENDED:%.*]] = zext i16 [[NEW]] to i32
+; CHECK-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], -65536
+; CHECK-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[EXTENDED]]
+; CHECK-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK: atomicrmw.end:
+; CHECK-NEXT: [[EXTRACTED1:%.*]] = trunc i32 [[NEWLOADED]] to i16
+; CHECK-NEXT: ret i16 [[EXTRACTED1]]
+;
+ %res = atomicrmw usub_cond ptr addrspace(3) %ptr, i16 %value syncscope("agent") seq_cst, align 4
+ ret i16 %res
+}
+
+define i16 @test_atomicrmw_usub_cond_i16_flat_agent(ptr %ptr, i16 %value) {
+; CHECK-LABEL: @test_atomicrmw_usub_cond_i16_flat_agent(
+; CHECK-NEXT: [[ALIGNEDADDR:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[PTR:%.*]], i64 -4)
+; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
+; CHECK-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
+; CHECK-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
+; CHECK-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]]
+; CHECK-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[ALIGNEDADDR]], align 4
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
+; CHECK: atomicrmw.start:
+; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
+; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
+; CHECK-NEXT: [[TMP5:%.*]] = icmp uge i16 [[EXTRACTED]], [[VALUE:%.*]]
+; CHECK-NEXT: [[TMP7:%.*]] = sub i16 [[EXTRACTED]], [[VALUE]]
+; CHECK-NEXT: [[NEW:%.*]] = select i1 [[TMP5]], i16 [[TMP7]], i16 [[VALUE]]
+; CHECK-NEXT: [[EXTENDED:%.*]] = zext i16 [[NEW]] to i32
+; CHECK-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
+; CHECK-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
+; CHECK-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
+; CHECK-NEXT: [[TMP6:%.*]] = cmpxchg ptr [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK: atomicrmw.end:
+; CHECK-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
+; CHECK-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
+; CHECK-NEXT: ret i16 [[EXTRACTED3]]
+;
+ %res = atomicrmw usub_cond ptr %ptr, i16 %value syncscope("agent") seq_cst
+ ret i16 %res
+}
+
+define i16 @test_atomicrmw_usub_cond_i16_flat_agent_align4(ptr %ptr, i16 %value) {
+; CHECK-LABEL: @test_atomicrmw_usub_cond_i16_flat_agent_align4(
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[PTR:%.*]], align 4
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
+; CHECK: atomicrmw.start:
+; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[LOADED]] to i16
+; CHECK-NEXT: [[TMP3:%.*]] = icmp uge i16 [[EXTRACTED]], [[VALUE:%.*]]
+; CHECK-NEXT: [[TMP5:%.*]] = sub i16 [[EXTRACTED]], [[VALUE]]
+; CHECK-NEXT: [[NEW:%.*]] = select i1 [[TMP3]], i16 [[TMP5]], i16 [[VALUE]]
+; CHECK-NEXT: [[EXTENDED:%.*]] = zext i16 [[NEW]] to i32
+; CHECK-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], -65536
+; CHECK-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[EXTENDED]]
+; CHECK-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK: atomicrmw.end:
+; CHECK-NEXT: [[EXTRACTED1:%.*]] = trunc i32 [[NEWLOADED]] to i16
+; CHECK-NEXT: ret i16 [[EXTRACTED1]]
+;
+ %res = atomicrmw usub_cond ptr %ptr, i16 %value syncscope("agent") seq_cst, align 4
+ ret i16 %res
+}
+
+define i16 @test_atomicrmw_usub_sat_i16_global_agent(ptr addrspace(1) %ptr, i16 %value) {
+; CHECK-LABEL: @test_atomicrmw_usub_sat_i16_global_agent(
+; CHECK-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4)
+; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64
+; CHECK-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
+; CHECK-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
+; CHECK-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
+; CHECK-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]]
+; CHECK-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
+; CHECK: atomicrmw.start:
+; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
+; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
+; CHECK-NEXT: [[TMP4:%.*]] = icmp uge i16 [[EXTRACTED]], [[VALUE:%.*]]
+; CHECK-NEXT: [[TMP5:%.*]] = sub i16 [[EXTRACTED]], [[VALUE]]
+; CHECK-NEXT: [[NEW:%.*]] = select i1 [[TMP4]], i16 [[TMP5]], i16 0
+; CHECK-NEXT: [[EXTENDED:%.*]] = zext i16 [[NEW]] to i32
+; CHECK-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
+; CHECK-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
+; CHECK-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
+; CHECK-NEXT: [[TMP8:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP8]], 1
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP8]], 0
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK: atomicrmw.end:
+; CHECK-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
+; CHECK-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
+; CHECK-NEXT: ret i16 [[EXTRACTED3]]
+;
+ %res = atomicrmw usub_sat ptr addrspace(1) %ptr, i16 %value syncscope("agent") seq_cst
+ ret i16 %res
+}
+
+define i16 @test_atomicrmw_usub_sat_i16_global_agent_align4(ptr addrspace(1) %ptr, i16 %value) {
+; CHECK-LABEL: @test_atomicrmw_usub_sat_i16_global_agent_align4(
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR:%.*]], align 4
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
+; CHECK: atomicrmw.start:
+; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[LOADED]] to i16
+; CHECK-NEXT: [[TMP2:%.*]] = icmp uge i16 [[EXTRACTED]], [[VALUE:%.*]]
+; CHECK-NEXT: [[TMP3:%.*]] = sub i16 [[EXTRACTED]], [[VALUE]]
+; CHECK-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i16 [[TMP3]], i16 0
+; CHECK-NEXT: [[EXTENDED:%.*]] = zext i16 [[NEW]] to i32
+; CHECK-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], -65536
+; CHECK-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[EXTENDED]]
+; CHECK-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK: atomicrmw.end:
+; CHECK-NEXT: [[EXTRACTED1:%.*]] = trunc i32 [[NEWLOADED]] to i16
+; CHECK-NEXT: ret i16 [[EXTRACTED1]]
+;
+ %res = atomicrmw usub_sat ptr addrspace(1) %ptr, i16 %value syncscope("agent") seq_cst, align 4
+ ret i16 %res
+}
+
+define i16 @test_atomicrmw_usub_sat_i16_local(ptr addrspace(3) %ptr, i16 %value) {
+; CHECK-LABEL: @test_atomicrmw_usub_sat_i16_local(
+; CHECK-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4)
+; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i32
+; CHECK-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
+; CHECK-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
+; CHECK-NEXT: [[MASK:%.*]] = shl i32 65535, [[TMP2]]
+; CHECK-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(3) [[ALIGNEDADDR]], align 4
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
+; CHECK: atomicrmw.start:
+; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]]
+; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
+; CHECK-NEXT: [[TMP4:%.*]] = icmp uge i16 [[EXTRACTED]], [[VALUE:%.*]]
+; CHECK-NEXT: [[TMP5:%.*]] = sub i16 [[EXTRACTED]], [[VALUE]]
+; CHECK-NEXT: [[NEW:%.*]] = select i1 [[TMP4]], i16 [[TMP5]], i16 0
+; CHECK-NEXT: [[EXTENDED:%.*]] = zext i16 [[NEW]] to i32
+; CHECK-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]]
+; CHECK-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
+; CHECK-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
+; CHECK-NEXT: [[TMP8:%.*]] = cmpxchg ptr addrspace(3) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP8]], 1
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP8]], 0
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK: atomicrmw.end:
+; CHECK-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
+; CHECK-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
+; CHECK-NEXT: ret i16 [[EXTRACTED3]]
+;
+ %res = atomicrmw usub_sat ptr addrspace(3) %ptr, i16 %value seq_cst
+ ret i16 %res
+}
+
+define i16 @test_atomicrmw_usub_sat_i16_local_align4(ptr addrspace(3) %ptr, i16 %value) {
+; CHECK-LABEL: @test_atomicrmw_usub_sat_i16_local_align4(
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(3) [[PTR:%.*]], align 4
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
+; CHECK: atomicrmw.start:
+; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[LOADED]] to i16
+; CHECK-NEXT: [[TMP2:%.*]] = icmp uge i16 [[EXTRACTED]], [[VALUE:%.*]]
+; CHECK-NEXT: [[TMP3:%.*]] = sub i16 [[EXTRACTED]], [[VALUE]]
+; CHECK-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i16 [[TMP3]], i16 0
+; CHECK-NEXT: [[EXTENDED:%.*]] = zext i16 [[NEW]] to i32
+; CHECK-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], -65536
+; CHECK-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[EXTENDED]]
+; CHECK-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK: atomicrmw.end:
+; CHECK-NEXT: [[EXTRACTED1:%.*]] = trunc i32 [[NEWLOADED]] to i16
+; CHECK-NEXT: ret i16 [[EXTRACTED1]]
+;
+ %res = atomicrmw usub_sat ptr addrspace(3) %ptr, i16 %value seq_cst, align 4
+ ret i16 %res
+}
+
+define i16 @test_atomicrmw_usub_sat_i16_flat_agent(ptr %ptr, i16 %value) {
+; CHECK-LABEL: @test_atomicrmw_usub_sat_i16_flat_agent(
+; CHECK-NEXT: [[ALIGNEDADDR:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[PTR:%.*]], i64 -4)
+; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
+; CHECK-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
+; CHECK-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
+; CHECK-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]]
+; CHECK-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[ALIGNEDADDR]], align 4
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
+; CHECK: atomicrmw.start:
+; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
+; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
+; CHECK-NEXT: [[TMP4:%.*]] = icmp uge i16 [[EXTRACTED]], [[VALUE:%.*]]
+; CHECK-NEXT: [[TMP5:%.*]] = sub i16 [[EXTRACTED]], [[VALUE]]
+; CHECK-NEXT: [[NEW:%.*]] = select i1 [[TMP4]], i16 [[TMP5]], i16 0
+; CHECK-NEXT: [[EXTENDED:%.*]] = zext i16 [[NEW]] to i32
+; CHECK-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
+; CHECK-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
+; CHECK-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
+; CHECK-NEXT: [[TMP8:%.*]] = cmpxchg ptr [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP8]], 1
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP8]], 0
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK: atomicrmw.end:
+; CHECK-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
+; CHECK-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
+; CHECK-NEXT: ret i16 [[EXTRACTED3]]
+;
+ %res = atomicrmw usub_sat ptr %ptr, i16 %value syncscope("agent") seq_cst
+ ret i16 %res
+}
+
+define i16 @test_atomicrmw_usub_sat_i16_flat_agent_align4(ptr %ptr, i16 %value) {
+; CHECK-LABEL: @test_atomicrmw_usub_sat_i16_flat_agent_align4(
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[PTR:%.*]], align 4
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
+; CHECK: atomicrmw.start:
+; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[LOADED]] to i16
+; CHECK-NEXT: [[TMP2:%.*]] = icmp uge i16 [[EXTRACTED]], [[VALUE:%.*]]
+; CHECK-NEXT: [[TMP3:%.*]] = sub i16 [[EXTRACTED]], [[VALUE]]
+; CHECK-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i16 [[TMP3]], i16 0
+; CHECK-NEXT: [[EXTENDED:%.*]] = zext i16 [[NEW]] to i32
+; CHECK-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], -65536
+; CHECK-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[EXTENDED]]
+; CHECK-NEXT: [[TMP6:%.*]] = cmpxchg ptr [[PTR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK: atomicrmw.end:
+; CHECK-NEXT: [[EXTRACTED1:%.*]] = trunc i32 [[NEWLOADED]] to i16
+; CHECK-NEXT: ret i16 [[EXTRACTED1]]
+;
+ %res = atomicrmw usub_sat ptr %ptr, i16 %value syncscope("agent") seq_cst, align 4
+ ret i16 %res
+}
+
!0 = !{}
!1 = !{!"foo", !"bar"}
!2 = !{!3}
diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i8.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i8.ll
index 590ee63001615a..6d0540b46b2004 100644
--- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i8.ll
+++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i8.ll
@@ -1712,3 +1712,801 @@ define i8 @test_atomicrmw_add_i8_buffer_fat_agent_align4(ptr addrspace(7) %ptr,
%res = atomicrmw add ptr addrspace(7) %ptr, i8 %value syncscope("agent") seq_cst, align 4
ret i8 %res
}
+
+define i8 @test_atomicrmw_usub_cond_i8_global_agent(ptr addrspace(1) %ptr, i8 %value) {
+; GCN-LABEL: @test_atomicrmw_usub_cond_i8_global_agent(
+; GCN-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4)
+; GCN-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64
+; GCN-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
+; GCN-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
+; GCN-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
+; GCN-NEXT: [[MASK:%.*]] = shl i32 255, [[SHIFTAMT]]
+; GCN-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
+; GCN-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4
+; GCN-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GCN: atomicrmw.start:
+; GCN-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; GCN-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
+; GCN-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
+; GCN-NEXT: [[TMP5:%.*]] = icmp uge i8 [[EXTRACTED]], [[VALUE:%.*]]
+; GCN-NEXT: [[TMP7:%.*]] = sub i8 [[EXTRACTED]], [[VALUE]]
+; GCN-NEXT: [[NEW:%.*]] = select i1 [[TMP5]], i8 [[TMP7]], i8 [[VALUE]]
+; GCN-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32
+; GCN-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
+; GCN-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
+; GCN-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
+; GCN-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4
+; GCN-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
+; GCN-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
+; GCN-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GCN: atomicrmw.end:
+; GCN-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
+; GCN-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8
+; GCN-NEXT: ret i8 [[EXTRACTED3]]
+;
+; R600-LABEL: @test_atomicrmw_usub_cond_i8_global_agent(
+; R600-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i32(ptr addrspace(1) [[PTR:%.*]], i32 -4)
+; R600-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i32
+; R600-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
+; R600-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
+; R600-NEXT: [[MASK:%.*]] = shl i32 255, [[TMP2]]
+; R600-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
+; R600-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4
+; R600-NEXT: br label [[ATOMICRMW_START:%.*]]
+; R600: atomicrmw.start:
+; R600-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; R600-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]]
+; R600-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
+; R600-NEXT: [[TMP5:%.*]] = icmp uge i8 [[EXTRACTED]], [[VALUE:%.*]]
+; R600-NEXT: [[TMP7:%.*]] = sub i8 [[EXTRACTED]], [[VALUE]]
+; R600-NEXT: [[NEW:%.*]] = select i1 [[TMP5]], i8 [[TMP7]], i8 [[VALUE]]
+; R600-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32
+; R600-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]]
+; R600-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
+; R600-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
+; R600-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4
+; R600-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
+; R600-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
+; R600-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; R600: atomicrmw.end:
+; R600-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
+; R600-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8
+; R600-NEXT: ret i8 [[EXTRACTED3]]
+;
+ %res = atomicrmw usub_cond ptr addrspace(1) %ptr, i8 %value syncscope("agent") seq_cst
+ ret i8 %res
+ }
+
+define i8 @test_atomicrmw_usub_cond_i8_global_agent_align2(ptr addrspace(1) %ptr, i8 %value) {
+; GCN-LABEL: @test_atomicrmw_usub_cond_i8_global_agent_align2(
+; GCN-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4)
+; GCN-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64
+; GCN-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
+; GCN-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
+; GCN-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
+; GCN-NEXT: [[MASK:%.*]] = shl i32 255, [[SHIFTAMT]]
+; GCN-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
+; GCN-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4
+; GCN-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GCN: atomicrmw.start:
+; GCN-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; GCN-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
+; GCN-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
+; GCN-NEXT: [[TMP5:%.*]] = icmp uge i8 [[EXTRACTED]], [[VALUE:%.*]]
+; GCN-NEXT: [[TMP7:%.*]] = sub i8 [[EXTRACTED]], [[VALUE]]
+; GCN-NEXT: [[NEW:%.*]] = select i1 [[TMP5]], i8 [[TMP7]], i8 [[VALUE]]
+; GCN-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32
+; GCN-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
+; GCN-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
+; GCN-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
+; GCN-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4
+; GCN-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
+; GCN-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
+; GCN-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GCN: atomicrmw.end:
+; GCN-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
+; GCN-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8
+; GCN-NEXT: ret i8 [[EXTRACTED3]]
+;
+; R600-LABEL: @test_atomicrmw_usub_cond_i8_global_agent_align2(
+; R600-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i32(ptr addrspace(1) [[PTR:%.*]], i32 -4)
+; R600-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i32
+; R600-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
+; R600-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
+; R600-NEXT: [[MASK:%.*]] = shl i32 255, [[TMP2]]
+; R600-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
+; R600-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4
+; R600-NEXT: br label [[ATOMICRMW_START:%.*]]
+; R600: atomicrmw.start:
+; R600-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; R600-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]]
+; R600-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
+; R600-NEXT: [[TMP5:%.*]] = icmp uge i8 [[EXTRACTED]], [[VALUE:%.*]]
+; R600-NEXT: [[TMP7:%.*]] = sub i8 [[EXTRACTED]], [[VALUE]]
+; R600-NEXT: [[NEW:%.*]] = select i1 [[TMP5]], i8 [[TMP7]], i8 [[VALUE]]
+; R600-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32
+; R600-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]]
+; R600-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
+; R600-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
+; R600-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4
+; R600-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
+; R600-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
+; R600-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; R600: atomicrmw.end:
+; R600-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
+; R600-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8
+; R600-NEXT: ret i8 [[EXTRACTED3]]
+;
+ %res = atomicrmw usub_cond ptr addrspace(1) %ptr, i8 %value syncscope("agent") seq_cst, align 2
+ ret i8 %res
+ }
+
+; Sub-word usub_cond: i8 operand on a global (addrspace(1)) pointer, agent
+; scope, explicit align 4. The expected expansion is a cmpxchg loop on the i32
+; at %ptr itself; no ptrmask or shift is emitted and the low byte is replaced
+; using the constant mask -256.
+; NOTE(review): the select's false arm is the operand ([[VALUE]]), not the
+; loaded byte - confirm that is the intended usub_cond result.
+define i8 @test_atomicrmw_usub_cond_i8_global_agent_align4(ptr addrspace(1) %ptr, i8 %value) {
+; CHECK-LABEL: @test_atomicrmw_usub_cond_i8_global_agent_align4(
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR:%.*]], align 4
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
+; CHECK: atomicrmw.start:
+; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[LOADED]] to i8
+; CHECK-NEXT: [[TMP3:%.*]] = icmp uge i8 [[EXTRACTED]], [[VALUE:%.*]]
+; CHECK-NEXT: [[TMP5:%.*]] = sub i8 [[EXTRACTED]], [[VALUE]]
+; CHECK-NEXT: [[NEW:%.*]] = select i1 [[TMP3]], i8 [[TMP5]], i8 [[VALUE]]
+; CHECK-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32
+; CHECK-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], -256
+; CHECK-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[EXTENDED]]
+; CHECK-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK: atomicrmw.end:
+; CHECK-NEXT: [[EXTRACTED1:%.*]] = trunc i32 [[NEWLOADED]] to i8
+; CHECK-NEXT: ret i8 [[EXTRACTED1]]
+;
+ %res = atomicrmw usub_cond ptr addrspace(1) %ptr, i8 %value syncscope("agent") seq_cst, align 4
+ ret i8 %res
+ }
+
+; Sub-word usub_cond: i8 operand on a local (addrspace(3)) pointer with no
+; explicit alignment or syncscope. The expected expansion widens to the
+; containing i32: the pointer is masked with -4, the shift amount is
+; (ptr & 3) * 8, and a cmpxchg loop replaces only the selected byte
+; (mask 255 << shift).
+; NOTE(review): the select's false arm is the operand ([[VALUE]]), not the
+; loaded byte - confirm that is the intended usub_cond result.
+define i8 @test_atomicrmw_usub_cond_i8_local(ptr addrspace(3) %ptr, i8 %value) {
+; CHECK-LABEL: @test_atomicrmw_usub_cond_i8_local(
+; CHECK-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4)
+; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i32
+; CHECK-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
+; CHECK-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
+; CHECK-NEXT: [[MASK:%.*]] = shl i32 255, [[TMP2]]
+; CHECK-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(3) [[ALIGNEDADDR]], align 4
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
+; CHECK: atomicrmw.start:
+; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]]
+; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
+; CHECK-NEXT: [[TMP5:%.*]] = icmp uge i8 [[EXTRACTED]], [[VALUE:%.*]]
+; CHECK-NEXT: [[TMP7:%.*]] = sub i8 [[EXTRACTED]], [[VALUE]]
+; CHECK-NEXT: [[NEW:%.*]] = select i1 [[TMP5]], i8 [[TMP7]], i8 [[VALUE]]
+; CHECK-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32
+; CHECK-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]]
+; CHECK-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
+; CHECK-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
+; CHECK-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(3) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK: atomicrmw.end:
+; CHECK-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
+; CHECK-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8
+; CHECK-NEXT: ret i8 [[EXTRACTED3]]
+;
+ %res = atomicrmw usub_cond ptr addrspace(3) %ptr, i8 %value seq_cst
+ ret i8 %res
+ }
+
+; Sub-word usub_cond: i8 operand on a local (addrspace(3)) pointer with
+; explicit align 2. Two-byte alignment does not pin the byte's position inside
+; the i32, so the expected expansion still masks the pointer with -4, derives
+; the shift amount as (ptr & 3) * 8, and runs a cmpxchg loop on the containing
+; i32 that replaces only the selected byte.
+; NOTE(review): the select's false arm is the operand ([[VALUE]]), not the
+; loaded byte - confirm that is the intended usub_cond result.
+define i8 @test_atomicrmw_usub_cond_i8_local_align2(ptr addrspace(3) %ptr, i8 %value) {
+; CHECK-LABEL: @test_atomicrmw_usub_cond_i8_local_align2(
+; CHECK-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4)
+; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i32
+; CHECK-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
+; CHECK-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
+; CHECK-NEXT: [[MASK:%.*]] = shl i32 255, [[TMP2]]
+; CHECK-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(3) [[ALIGNEDADDR]], align 4
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
+; CHECK: atomicrmw.start:
+; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]]
+; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
+; CHECK-NEXT: [[TMP5:%.*]] = icmp uge i8 [[EXTRACTED]], [[VALUE:%.*]]
+; CHECK-NEXT: [[TMP7:%.*]] = sub i8 [[EXTRACTED]], [[VALUE]]
+; CHECK-NEXT: [[NEW:%.*]] = select i1 [[TMP5]], i8 [[TMP7]], i8 [[VALUE]]
+; CHECK-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32
+; CHECK-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]]
+; CHECK-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
+; CHECK-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
+; CHECK-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(3) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK: atomicrmw.end:
+; CHECK-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
+; CHECK-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8
+; CHECK-NEXT: ret i8 [[EXTRACTED3]]
+;
+ %res = atomicrmw usub_cond ptr addrspace(3) %ptr, i8 %value seq_cst, align 2
+ ret i8 %res
+ }
+
+; Sub-word usub_cond: i8 operand on a local (addrspace(3)) pointer with
+; explicit align 4. The expected expansion is a cmpxchg loop on the i32 at
+; %ptr itself; no ptrmask or shift is emitted and the low byte is replaced
+; using the constant mask -256.
+; NOTE(review): the select's false arm is the operand ([[VALUE]]), not the
+; loaded byte - confirm that is the intended usub_cond result.
+define i8 @test_atomicrmw_usub_cond_i8_local_align4(ptr addrspace(3) %ptr, i8 %value) {
+; CHECK-LABEL: @test_atomicrmw_usub_cond_i8_local_align4(
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(3) [[PTR:%.*]], align 4
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
+; CHECK: atomicrmw.start:
+; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[LOADED]] to i8
+; CHECK-NEXT: [[TMP3:%.*]] = icmp uge i8 [[EXTRACTED]], [[VALUE:%.*]]
+; CHECK-NEXT: [[TMP5:%.*]] = sub i8 [[EXTRACTED]], [[VALUE]]
+; CHECK-NEXT: [[NEW:%.*]] = select i1 [[TMP3]], i8 [[TMP5]], i8 [[VALUE]]
+; CHECK-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32
+; CHECK-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], -256
+; CHECK-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[EXTENDED]]
+; CHECK-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK: atomicrmw.end:
+; CHECK-NEXT: [[EXTRACTED1:%.*]] = trunc i32 [[NEWLOADED]] to i8
+; CHECK-NEXT: ret i8 [[EXTRACTED1]]
+;
+ %res = atomicrmw usub_cond ptr addrspace(3) %ptr, i8 %value seq_cst, align 4
+ ret i8 %res
+ }
+
+; Sub-word usub_cond: i8 operand on a flat (default address space) pointer,
+; agent scope, no explicit alignment. The expected expansion masks the pointer
+; with -4 and runs a cmpxchg loop on the containing i32, replacing only the
+; byte selected by the shift amount (ptr & 3) * 8.
+; Under the GCN prefix the address arithmetic is done in i64 and truncated to
+; an i32 shift amount; under the R600 prefix it is done directly in i32.
+; NOTE(review): the select's false arm is the operand ([[VALUE]]), not the
+; loaded byte - confirm that is the intended usub_cond result.
+define i8 @test_atomicrmw_usub_cond_i8_flat_agent(ptr %ptr, i8 %value) {
+; GCN-LABEL: @test_atomicrmw_usub_cond_i8_flat_agent(
+; GCN-NEXT: [[ALIGNEDADDR:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[PTR:%.*]], i64 -4)
+; GCN-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[PTR]] to i64
+; GCN-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
+; GCN-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
+; GCN-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
+; GCN-NEXT: [[MASK:%.*]] = shl i32 255, [[SHIFTAMT]]
+; GCN-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
+; GCN-NEXT: [[TMP3:%.*]] = load i32, ptr [[ALIGNEDADDR]], align 4
+; GCN-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GCN: atomicrmw.start:
+; GCN-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; GCN-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
+; GCN-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
+; GCN-NEXT: [[TMP5:%.*]] = icmp uge i8 [[EXTRACTED]], [[VALUE:%.*]]
+; GCN-NEXT: [[TMP7:%.*]] = sub i8 [[EXTRACTED]], [[VALUE]]
+; GCN-NEXT: [[NEW:%.*]] = select i1 [[TMP5]], i8 [[TMP7]], i8 [[VALUE]]
+; GCN-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32
+; GCN-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
+; GCN-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
+; GCN-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
+; GCN-NEXT: [[TMP6:%.*]] = cmpxchg ptr [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4
+; GCN-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
+; GCN-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
+; GCN-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GCN: atomicrmw.end:
+; GCN-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
+; GCN-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8
+; GCN-NEXT: ret i8 [[EXTRACTED3]]
+;
+; R600-LABEL: @test_atomicrmw_usub_cond_i8_flat_agent(
+; R600-NEXT: [[ALIGNEDADDR:%.*]] = call ptr @llvm.ptrmask.p0.i32(ptr [[PTR:%.*]], i32 -4)
+; R600-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[PTR]] to i32
+; R600-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
+; R600-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
+; R600-NEXT: [[MASK:%.*]] = shl i32 255, [[TMP2]]
+; R600-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
+; R600-NEXT: [[TMP3:%.*]] = load i32, ptr [[ALIGNEDADDR]], align 4
+; R600-NEXT: br label [[ATOMICRMW_START:%.*]]
+; R600: atomicrmw.start:
+; R600-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; R600-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]]
+; R600-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
+; R600-NEXT: [[TMP5:%.*]] = icmp uge i8 [[EXTRACTED]], [[VALUE:%.*]]
+; R600-NEXT: [[TMP7:%.*]] = sub i8 [[EXTRACTED]], [[VALUE]]
+; R600-NEXT: [[NEW:%.*]] = select i1 [[TMP5]], i8 [[TMP7]], i8 [[VALUE]]
+; R600-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32
+; R600-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]]
+; R600-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
+; R600-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
+; R600-NEXT: [[TMP6:%.*]] = cmpxchg ptr [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4
+; R600-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
+; R600-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
+; R600-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; R600: atomicrmw.end:
+; R600-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
+; R600-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8
+; R600-NEXT: ret i8 [[EXTRACTED3]]
+;
+ %res = atomicrmw usub_cond ptr %ptr, i8 %value syncscope("agent") seq_cst
+ ret i8 %res
+ }
+
+; Sub-word usub_cond: i8 operand on a flat (default address space) pointer,
+; agent scope, explicit align 2. Two-byte alignment does not pin the byte's
+; position inside the i32, so the expected expansion still masks the pointer
+; with -4 and runs a cmpxchg loop on the containing i32, replacing only the
+; byte selected by the shift amount (ptr & 3) * 8.
+; Under the GCN prefix the address arithmetic is done in i64 and truncated to
+; an i32 shift amount; under the R600 prefix it is done directly in i32.
+; NOTE(review): the select's false arm is the operand ([[VALUE]]), not the
+; loaded byte - confirm that is the intended usub_cond result.
+define i8 @test_atomicrmw_usub_cond_i8_flat_agent_align2(ptr %ptr, i8 %value) {
+; GCN-LABEL: @test_atomicrmw_usub_cond_i8_flat_agent_align2(
+; GCN-NEXT: [[ALIGNEDADDR:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[PTR:%.*]], i64 -4)
+; GCN-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[PTR]] to i64
+; GCN-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
+; GCN-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
+; GCN-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
+; GCN-NEXT: [[MASK:%.*]] = shl i32 255, [[SHIFTAMT]]
+; GCN-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
+; GCN-NEXT: [[TMP3:%.*]] = load i32, ptr [[ALIGNEDADDR]], align 4
+; GCN-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GCN: atomicrmw.start:
+; GCN-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; GCN-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
+; GCN-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
+; GCN-NEXT: [[TMP5:%.*]] = icmp uge i8 [[EXTRACTED]], [[VALUE:%.*]]
+; GCN-NEXT: [[TMP7:%.*]] = sub i8 [[EXTRACTED]], [[VALUE]]
+; GCN-NEXT: [[NEW:%.*]] = select i1 [[TMP5]], i8 [[TMP7]], i8 [[VALUE]]
+; GCN-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32
+; GCN-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
+; GCN-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
+; GCN-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
+; GCN-NEXT: [[TMP6:%.*]] = cmpxchg ptr [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4
+; GCN-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
+; GCN-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
+; GCN-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GCN: atomicrmw.end:
+; GCN-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
+; GCN-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8
+; GCN-NEXT: ret i8 [[EXTRACTED3]]
+;
+; R600-LABEL: @test_atomicrmw_usub_cond_i8_flat_agent_align2(
+; R600-NEXT: [[ALIGNEDADDR:%.*]] = call ptr @llvm.ptrmask.p0.i32(ptr [[PTR:%.*]], i32 -4)
+; R600-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[PTR]] to i32
+; R600-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
+; R600-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
+; R600-NEXT: [[MASK:%.*]] = shl i32 255, [[TMP2]]
+; R600-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
+; R600-NEXT: [[TMP3:%.*]] = load i32, ptr [[ALIGNEDADDR]], align 4
+; R600-NEXT: br label [[ATOMICRMW_START:%.*]]
+; R600: atomicrmw.start:
+; R600-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; R600-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]]
+; R600-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
+; R600-NEXT: [[TMP5:%.*]] = icmp uge i8 [[EXTRACTED]], [[VALUE:%.*]]
+; R600-NEXT: [[TMP7:%.*]] = sub i8 [[EXTRACTED]], [[VALUE]]
+; R600-NEXT: [[NEW:%.*]] = select i1 [[TMP5]], i8 [[TMP7]], i8 [[VALUE]]
+; R600-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32
+; R600-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]]
+; R600-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
+; R600-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
+; R600-NEXT: [[TMP6:%.*]] = cmpxchg ptr [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4
+; R600-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
+; R600-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
+; R600-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; R600: atomicrmw.end:
+; R600-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
+; R600-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8
+; R600-NEXT: ret i8 [[EXTRACTED3]]
+;
+ %res = atomicrmw usub_cond ptr %ptr, i8 %value syncscope("agent") seq_cst, align 2
+ ret i8 %res
+ }
+
+; Sub-word usub_cond: i8 operand on a flat (default address space) pointer,
+; agent scope, explicit align 4. The expected expansion is a cmpxchg loop on
+; the i32 at %ptr itself; no ptrmask or shift is emitted and the low byte is
+; replaced using the constant mask -256.
+; NOTE(review): the select's false arm is the operand ([[VALUE]]), not the
+; loaded byte - confirm that is the intended usub_cond result.
+define i8 @test_atomicrmw_usub_cond_i8_flat_agent_align4(ptr %ptr, i8 %value) {
+; CHECK-LABEL: @test_atomicrmw_usub_cond_i8_flat_agent_align4(
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[PTR:%.*]], align 4
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
+; CHECK: atomicrmw.start:
+; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[LOADED]] to i8
+; CHECK-NEXT: [[TMP3:%.*]] = icmp uge i8 [[EXTRACTED]], [[VALUE:%.*]]
+; CHECK-NEXT: [[TMP5:%.*]] = sub i8 [[EXTRACTED]], [[VALUE]]
+; CHECK-NEXT: [[NEW:%.*]] = select i1 [[TMP3]], i8 [[TMP5]], i8 [[VALUE]]
+; CHECK-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32
+; CHECK-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], -256
+; CHECK-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[EXTENDED]]
+; CHECK-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK: atomicrmw.end:
+; CHECK-NEXT: [[EXTRACTED1:%.*]] = trunc i32 [[NEWLOADED]] to i8
+; CHECK-NEXT: ret i8 [[EXTRACTED1]]
+;
+ %res = atomicrmw usub_cond ptr %ptr, i8 %value syncscope("agent") seq_cst, align 4
+ ret i8 %res
+ }
+
+; Sub-word usub_sat: i8 operand on a global (addrspace(1)) pointer, agent
+; scope, no explicit alignment. The expected expansion masks the pointer with
+; -4 and runs a cmpxchg loop on the containing i32, replacing only the byte
+; selected by the shift amount (ptr & 3) * 8. The new byte is
+; (old u>= value) ? old - value : 0.
+; Under the GCN prefix the address arithmetic is done in i64 and truncated to
+; an i32 shift amount; under the R600 prefix it is done directly in i32.
+define i8 @test_atomicrmw_usub_sat_i8_global_agent(ptr addrspace(1) %ptr, i8 %value) {
+; GCN-LABEL: @test_atomicrmw_usub_sat_i8_global_agent(
+; GCN-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4)
+; GCN-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64
+; GCN-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
+; GCN-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
+; GCN-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
+; GCN-NEXT: [[MASK:%.*]] = shl i32 255, [[SHIFTAMT]]
+; GCN-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
+; GCN-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4
+; GCN-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GCN: atomicrmw.start:
+; GCN-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; GCN-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
+; GCN-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
+; GCN-NEXT: [[TMP4:%.*]] = icmp uge i8 [[EXTRACTED]], [[VALUE:%.*]]
+; GCN-NEXT: [[TMP5:%.*]] = sub i8 [[EXTRACTED]], [[VALUE]]
+; GCN-NEXT: [[NEW:%.*]] = select i1 [[TMP4]], i8 [[TMP5]], i8 0
+; GCN-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32
+; GCN-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
+; GCN-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
+; GCN-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
+; GCN-NEXT: [[TMP8:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4
+; GCN-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP8]], 1
+; GCN-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP8]], 0
+; GCN-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GCN: atomicrmw.end:
+; GCN-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
+; GCN-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8
+; GCN-NEXT: ret i8 [[EXTRACTED3]]
+;
+; R600-LABEL: @test_atomicrmw_usub_sat_i8_global_agent(
+; R600-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i32(ptr addrspace(1) [[PTR:%.*]], i32 -4)
+; R600-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i32
+; R600-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
+; R600-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
+; R600-NEXT: [[MASK:%.*]] = shl i32 255, [[TMP2]]
+; R600-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
+; R600-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4
+; R600-NEXT: br label [[ATOMICRMW_START:%.*]]
+; R600: atomicrmw.start:
+; R600-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; R600-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]]
+; R600-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
+; R600-NEXT: [[TMP4:%.*]] = icmp uge i8 [[EXTRACTED]], [[VALUE:%.*]]
+; R600-NEXT: [[TMP5:%.*]] = sub i8 [[EXTRACTED]], [[VALUE]]
+; R600-NEXT: [[NEW:%.*]] = select i1 [[TMP4]], i8 [[TMP5]], i8 0
+; R600-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32
+; R600-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]]
+; R600-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
+; R600-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
+; R600-NEXT: [[TMP8:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4
+; R600-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP8]], 1
+; R600-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP8]], 0
+; R600-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; R600: atomicrmw.end:
+; R600-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
+; R600-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8
+; R600-NEXT: ret i8 [[EXTRACTED3]]
+;
+ %res = atomicrmw usub_sat ptr addrspace(1) %ptr, i8 %value syncscope("agent") seq_cst
+ ret i8 %res
+ }
+
+; Sub-word usub_sat: i8 operand on a global (addrspace(1)) pointer, agent
+; scope, explicit align 2. Two-byte alignment does not pin the byte's position
+; inside the i32, so the expected expansion still masks the pointer with -4
+; and runs a cmpxchg loop on the containing i32, replacing only the byte
+; selected by the shift amount (ptr & 3) * 8. The new byte is
+; (old u>= value) ? old - value : 0.
+; Under the GCN prefix the address arithmetic is done in i64 and truncated to
+; an i32 shift amount; under the R600 prefix it is done directly in i32.
+define i8 @test_atomicrmw_usub_sat_i8_global_agent_align2(ptr addrspace(1) %ptr, i8 %value) {
+; GCN-LABEL: @test_atomicrmw_usub_sat_i8_global_agent_align2(
+; GCN-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4)
+; GCN-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64
+; GCN-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
+; GCN-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
+; GCN-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
+; GCN-NEXT: [[MASK:%.*]] = shl i32 255, [[SHIFTAMT]]
+; GCN-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
+; GCN-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4
+; GCN-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GCN: atomicrmw.start:
+; GCN-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; GCN-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
+; GCN-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
+; GCN-NEXT: [[TMP4:%.*]] = icmp uge i8 [[EXTRACTED]], [[VALUE:%.*]]
+; GCN-NEXT: [[TMP5:%.*]] = sub i8 [[EXTRACTED]], [[VALUE]]
+; GCN-NEXT: [[NEW:%.*]] = select i1 [[TMP4]], i8 [[TMP5]], i8 0
+; GCN-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32
+; GCN-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
+; GCN-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
+; GCN-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
+; GCN-NEXT: [[TMP8:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4
+; GCN-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP8]], 1
+; GCN-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP8]], 0
+; GCN-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GCN: atomicrmw.end:
+; GCN-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
+; GCN-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8
+; GCN-NEXT: ret i8 [[EXTRACTED3]]
+;
+; R600-LABEL: @test_atomicrmw_usub_sat_i8_global_agent_align2(
+; R600-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i32(ptr addrspace(1) [[PTR:%.*]], i32 -4)
+; R600-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i32
+; R600-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
+; R600-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
+; R600-NEXT: [[MASK:%.*]] = shl i32 255, [[TMP2]]
+; R600-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
+; R600-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4
+; R600-NEXT: br label [[ATOMICRMW_START:%.*]]
+; R600: atomicrmw.start:
+; R600-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; R600-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]]
+; R600-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
+; R600-NEXT: [[TMP4:%.*]] = icmp uge i8 [[EXTRACTED]], [[VALUE:%.*]]
+; R600-NEXT: [[TMP5:%.*]] = sub i8 [[EXTRACTED]], [[VALUE]]
+; R600-NEXT: [[NEW:%.*]] = select i1 [[TMP4]], i8 [[TMP5]], i8 0
+; R600-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32
+; R600-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]]
+; R600-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
+; R600-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
+; R600-NEXT: [[TMP8:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4
+; R600-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP8]], 1
+; R600-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP8]], 0
+; R600-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; R600: atomicrmw.end:
+; R600-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
+; R600-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8
+; R600-NEXT: ret i8 [[EXTRACTED3]]
+;
+ %res = atomicrmw usub_sat ptr addrspace(1) %ptr, i8 %value syncscope("agent") seq_cst, align 2
+ ret i8 %res
+ }
+
+; Sub-word usub_sat: i8 operand on a global (addrspace(1)) pointer, agent
+; scope, explicit align 4. The expected expansion is a cmpxchg loop on the i32
+; at %ptr itself; no ptrmask or shift is emitted and the low byte is replaced
+; using the constant mask -256. The new byte is
+; (old u>= value) ? old - value : 0.
+define i8 @test_atomicrmw_usub_sat_i8_global_agent_align4(ptr addrspace(1) %ptr, i8 %value) {
+; CHECK-LABEL: @test_atomicrmw_usub_sat_i8_global_agent_align4(
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR:%.*]], align 4
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
+; CHECK: atomicrmw.start:
+; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[LOADED]] to i8
+; CHECK-NEXT: [[TMP2:%.*]] = icmp uge i8 [[EXTRACTED]], [[VALUE:%.*]]
+; CHECK-NEXT: [[TMP3:%.*]] = sub i8 [[EXTRACTED]], [[VALUE]]
+; CHECK-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i8 [[TMP3]], i8 0
+; CHECK-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32
+; CHECK-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], -256
+; CHECK-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[EXTENDED]]
+; CHECK-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK: atomicrmw.end:
+; CHECK-NEXT: [[EXTRACTED1:%.*]] = trunc i32 [[NEWLOADED]] to i8
+; CHECK-NEXT: ret i8 [[EXTRACTED1]]
+;
+ %res = atomicrmw usub_sat ptr addrspace(1) %ptr, i8 %value syncscope("agent") seq_cst, align 4
+ ret i8 %res
+ }
+
+; Sub-word usub_sat: i8 operand on a local (addrspace(3)) pointer with no
+; explicit alignment or syncscope. The expected expansion widens to the
+; containing i32: the pointer is masked with -4, the shift amount is
+; (ptr & 3) * 8, and a cmpxchg loop replaces only the selected byte
+; (mask 255 << shift). The new byte is (old u>= value) ? old - value : 0.
+define i8 @test_atomicrmw_usub_sat_i8_local(ptr addrspace(3) %ptr, i8 %value) {
+; CHECK-LABEL: @test_atomicrmw_usub_sat_i8_local(
+; CHECK-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4)
+; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i32
+; CHECK-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
+; CHECK-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
+; CHECK-NEXT: [[MASK:%.*]] = shl i32 255, [[TMP2]]
+; CHECK-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(3) [[ALIGNEDADDR]], align 4
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
+; CHECK: atomicrmw.start:
+; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]]
+; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
+; CHECK-NEXT: [[TMP4:%.*]] = icmp uge i8 [[EXTRACTED]], [[VALUE:%.*]]
+; CHECK-NEXT: [[TMP5:%.*]] = sub i8 [[EXTRACTED]], [[VALUE]]
+; CHECK-NEXT: [[NEW:%.*]] = select i1 [[TMP4]], i8 [[TMP5]], i8 0
+; CHECK-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32
+; CHECK-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]]
+; CHECK-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
+; CHECK-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
+; CHECK-NEXT: [[TMP8:%.*]] = cmpxchg ptr addrspace(3) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP8]], 1
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP8]], 0
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK: atomicrmw.end:
+; CHECK-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
+; CHECK-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8
+; CHECK-NEXT: ret i8 [[EXTRACTED3]]
+;
+ %res = atomicrmw usub_sat ptr addrspace(3) %ptr, i8 %value seq_cst
+ ret i8 %res
+ }
+
+; Sub-word usub_sat: i8 operand on a local (addrspace(3)) pointer with
+; explicit align 2. Two-byte alignment does not pin the byte's position inside
+; the i32, so the expected expansion still masks the pointer with -4, derives
+; the shift amount as (ptr & 3) * 8, and runs a cmpxchg loop on the containing
+; i32 that replaces only the selected byte. The new byte is
+; (old u>= value) ? old - value : 0.
+define i8 @test_atomicrmw_usub_sat_i8_local_align2(ptr addrspace(3) %ptr, i8 %value) {
+; CHECK-LABEL: @test_atomicrmw_usub_sat_i8_local_align2(
+; CHECK-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4)
+; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i32
+; CHECK-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
+; CHECK-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
+; CHECK-NEXT: [[MASK:%.*]] = shl i32 255, [[TMP2]]
+; CHECK-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(3) [[ALIGNEDADDR]], align 4
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
+; CHECK: atomicrmw.start:
+; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]]
+; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
+; CHECK-NEXT: [[TMP4:%.*]] = icmp uge i8 [[EXTRACTED]], [[VALUE:%.*]]
+; CHECK-NEXT: [[TMP5:%.*]] = sub i8 [[EXTRACTED]], [[VALUE]]
+; CHECK-NEXT: [[NEW:%.*]] = select i1 [[TMP4]], i8 [[TMP5]], i8 0
+; CHECK-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32
+; CHECK-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]]
+; CHECK-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
+; CHECK-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
+; CHECK-NEXT: [[TMP8:%.*]] = cmpxchg ptr addrspace(3) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP8]], 1
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP8]], 0
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK: atomicrmw.end:
+; CHECK-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
+; CHECK-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8
+; CHECK-NEXT: ret i8 [[EXTRACTED3]]
+;
+ %res = atomicrmw usub_sat ptr addrspace(3) %ptr, i8 %value seq_cst, align 2
+ ret i8 %res
+ }
+
+; Sub-word usub_sat: i8 operand on a local (addrspace(3)) pointer with
+; explicit align 4. The expected expansion is a cmpxchg loop on the i32 at
+; %ptr itself; no ptrmask or shift is emitted and the low byte is replaced
+; using the constant mask -256. The new byte is
+; (old u>= value) ? old - value : 0.
+define i8 @test_atomicrmw_usub_sat_i8_local_align4(ptr addrspace(3) %ptr, i8 %value) {
+; CHECK-LABEL: @test_atomicrmw_usub_sat_i8_local_align4(
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(3) [[PTR:%.*]], align 4
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
+; CHECK: atomicrmw.start:
+; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[LOADED]] to i8
+; CHECK-NEXT: [[TMP2:%.*]] = icmp uge i8 [[EXTRACTED]], [[VALUE:%.*]]
+; CHECK-NEXT: [[TMP3:%.*]] = sub i8 [[EXTRACTED]], [[VALUE]]
+; CHECK-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i8 [[TMP3]], i8 0
+; CHECK-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32
+; CHECK-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], -256
+; CHECK-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[EXTENDED]]
+; CHECK-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK: atomicrmw.end:
+; CHECK-NEXT: [[EXTRACTED1:%.*]] = trunc i32 [[NEWLOADED]] to i8
+; CHECK-NEXT: ret i8 [[EXTRACTED1]]
+;
+ %res = atomicrmw usub_sat ptr addrspace(3) %ptr, i8 %value seq_cst, align 4
+ ret i8 %res
+ }
+
+; Sub-word usub_sat: i8 operand on a flat (default address space) pointer,
+; agent scope, no explicit alignment. The expected expansion masks the pointer
+; with -4 and runs a cmpxchg loop on the containing i32, replacing only the
+; byte selected by the shift amount (ptr & 3) * 8. The new byte is
+; (old u>= value) ? old - value : 0.
+; Under the GCN prefix the address arithmetic is done in i64 and truncated to
+; an i32 shift amount; under the R600 prefix it is done directly in i32.
+define i8 @test_atomicrmw_usub_sat_i8_flat_agent(ptr %ptr, i8 %value) {
+; GCN-LABEL: @test_atomicrmw_usub_sat_i8_flat_agent(
+; GCN-NEXT: [[ALIGNEDADDR:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[PTR:%.*]], i64 -4)
+; GCN-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[PTR]] to i64
+; GCN-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
+; GCN-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
+; GCN-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
+; GCN-NEXT: [[MASK:%.*]] = shl i32 255, [[SHIFTAMT]]
+; GCN-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
+; GCN-NEXT: [[TMP3:%.*]] = load i32, ptr [[ALIGNEDADDR]], align 4
+; GCN-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GCN: atomicrmw.start:
+; GCN-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; GCN-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
+; GCN-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
+; GCN-NEXT: [[TMP4:%.*]] = icmp uge i8 [[EXTRACTED]], [[VALUE:%.*]]
+; GCN-NEXT: [[TMP5:%.*]] = sub i8 [[EXTRACTED]], [[VALUE]]
+; GCN-NEXT: [[NEW:%.*]] = select i1 [[TMP4]], i8 [[TMP5]], i8 0
+; GCN-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32
+; GCN-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
+; GCN-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
+; GCN-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
+; GCN-NEXT: [[TMP8:%.*]] = cmpxchg ptr [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4
+; GCN-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP8]], 1
+; GCN-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP8]], 0
+; GCN-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GCN: atomicrmw.end:
+; GCN-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
+; GCN-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8
+; GCN-NEXT: ret i8 [[EXTRACTED3]]
+;
+; R600-LABEL: @test_atomicrmw_usub_sat_i8_flat_agent(
+; R600-NEXT: [[ALIGNEDADDR:%.*]] = call ptr @llvm.ptrmask.p0.i32(ptr [[PTR:%.*]], i32 -4)
+; R600-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[PTR]] to i32
+; R600-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
+; R600-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
+; R600-NEXT: [[MASK:%.*]] = shl i32 255, [[TMP2]]
+; R600-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
+; R600-NEXT: [[TMP3:%.*]] = load i32, ptr [[ALIGNEDADDR]], align 4
+; R600-NEXT: br label [[ATOMICRMW_START:%.*]]
+; R600: atomicrmw.start:
+; R600-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; R600-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]]
+; R600-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
+; R600-NEXT: [[TMP4:%.*]] = icmp uge i8 [[EXTRACTED]], [[VALUE:%.*]]
+; R600-NEXT: [[TMP5:%.*]] = sub i8 [[EXTRACTED]], [[VALUE]]
+; R600-NEXT: [[NEW:%.*]] = select i1 [[TMP4]], i8 [[TMP5]], i8 0
+; R600-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32
+; R600-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]]
+; R600-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
+; R600-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
+; R600-NEXT: [[TMP8:%.*]] = cmpxchg ptr [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4
+; R600-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP8]], 1
+; R600-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP8]], 0
+; R600-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; R600: atomicrmw.end:
+; R600-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
+; R600-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8
+; R600-NEXT: ret i8 [[EXTRACTED3]]
+;
+ %res = atomicrmw usub_sat ptr %ptr, i8 %value syncscope("agent") seq_cst
+ ret i8 %res
+ }
+
+define i8 @test_atomicrmw_usub_sat_i8_flat_agent_align2(ptr %ptr, i8 %value) {
+; GCN-LABEL: @test_atomicrmw_usub_sat_i8_flat_agent_align2(
+; GCN-NEXT: [[ALIGNEDADDR:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[PTR:%.*]], i64 -4)
+; GCN-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[PTR]] to i64
+; GCN-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
+; GCN-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
+; GCN-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
+; GCN-NEXT: [[MASK:%.*]] = shl i32 255, [[SHIFTAMT]]
+; GCN-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
+; GCN-NEXT: [[TMP3:%.*]] = load i32, ptr [[ALIGNEDADDR]], align 4
+; GCN-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GCN: atomicrmw.start:
+; GCN-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; GCN-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
+; GCN-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
+; GCN-NEXT: [[TMP4:%.*]] = icmp uge i8 [[EXTRACTED]], [[VALUE:%.*]]
+; GCN-NEXT: [[TMP5:%.*]] = sub i8 [[EXTRACTED]], [[VALUE]]
+; GCN-NEXT: [[NEW:%.*]] = select i1 [[TMP4]], i8 [[TMP5]], i8 0
+; GCN-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32
+; GCN-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
+; GCN-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
+; GCN-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
+; GCN-NEXT: [[TMP8:%.*]] = cmpxchg ptr [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4
+; GCN-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP8]], 1
+; GCN-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP8]], 0
+; GCN-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GCN: atomicrmw.end:
+; GCN-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
+; GCN-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8
+; GCN-NEXT: ret i8 [[EXTRACTED3]]
+;
+; R600-LABEL: @test_atomicrmw_usub_sat_i8_flat_agent_align2(
+; R600-NEXT: [[ALIGNEDADDR:%.*]] = call ptr @llvm.ptrmask.p0.i32(ptr [[PTR:%.*]], i32 -4)
+; R600-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[PTR]] to i32
+; R600-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
+; R600-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
+; R600-NEXT: [[MASK:%.*]] = shl i32 255, [[TMP2]]
+; R600-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
+; R600-NEXT: [[TMP3:%.*]] = load i32, ptr [[ALIGNEDADDR]], align 4
+; R600-NEXT: br label [[ATOMICRMW_START:%.*]]
+; R600: atomicrmw.start:
+; R600-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; R600-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]]
+; R600-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
+; R600-NEXT: [[TMP4:%.*]] = icmp uge i8 [[EXTRACTED]], [[VALUE:%.*]]
+; R600-NEXT: [[TMP5:%.*]] = sub i8 [[EXTRACTED]], [[VALUE]]
+; R600-NEXT: [[NEW:%.*]] = select i1 [[TMP4]], i8 [[TMP5]], i8 0
+; R600-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32
+; R600-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]]
+; R600-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
+; R600-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
+; R600-NEXT: [[TMP8:%.*]] = cmpxchg ptr [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4
+; R600-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP8]], 1
+; R600-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP8]], 0
+; R600-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; R600: atomicrmw.end:
+; R600-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
+; R600-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8
+; R600-NEXT: ret i8 [[EXTRACTED3]]
+;
+ %res = atomicrmw usub_sat ptr %ptr, i8 %value syncscope("agent") seq_cst, align 2
+ ret i8 %res
+ }
+
+define i8 @test_atomicrmw_usub_sat_i8_flat_agent_align4(ptr %ptr, i8 %value) {
+; CHECK-LABEL: @test_atomicrmw_usub_sat_i8_flat_agent_align4(
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[PTR:%.*]], align 4
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
+; CHECK: atomicrmw.start:
+; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[LOADED]] to i8
+; CHECK-NEXT: [[TMP2:%.*]] = icmp uge i8 [[EXTRACTED]], [[VALUE:%.*]]
+; CHECK-NEXT: [[TMP3:%.*]] = sub i8 [[EXTRACTED]], [[VALUE]]
+; CHECK-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i8 [[TMP3]], i8 0
+; CHECK-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32
+; CHECK-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], -256
+; CHECK-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[EXTENDED]]
+; CHECK-NEXT: [[TMP6:%.*]] = cmpxchg ptr [[PTR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK: atomicrmw.end:
+; CHECK-NEXT: [[EXTRACTED1:%.*]] = trunc i32 [[NEWLOADED]] to i8
+; CHECK-NEXT: ret i8 [[EXTRACTED1]]
+;
+ %res = atomicrmw usub_sat ptr %ptr, i8 %value syncscope("agent") seq_cst, align 4
+ ret i8 %res
+}
>From 2eb357a694c1aae0710cb0737b58ca90191dfaf8 Mon Sep 17 00:00:00 2001
From: Andrew Jenner <Andrew.Jenner at amd.com>
Date: Thu, 22 Aug 2024 10:51:02 -0400
Subject: [PATCH 2/2] [AMDGPU] Feedback from pull request.
---
.../CodeGen/AMDGPU/private-memory-atomics.ll | 23 +-
.../AtomicExpand/AMDGPU/expand-atomic-i16.ll | 96 ++++----
.../AtomicExpand/AMDGPU/expand-atomic-i8.ll | 208 ++++++++----------
3 files changed, 142 insertions(+), 185 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/private-memory-atomics.ll b/llvm/test/CodeGen/AMDGPU/private-memory-atomics.ll
index 5b09e50738ea59..cb167560c827ac 100644
--- a/llvm/test/CodeGen/AMDGPU/private-memory-atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/private-memory-atomics.ll
@@ -634,12 +634,11 @@ define i32 @atomicrmw_dec_private_i32(ptr addrspace(5) %ptr) {
}
define i32 @atomicrmw_usub_cond_private_i32(ptr addrspace(5) %ptr) {
-; IR-LABEL: define i32 @atomicrmw_usub_cond_private_i32(
-; IR-SAME: ptr addrspace(5) [[PTR:%.*]]) #[[ATTR0]] {
-; IR-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[PTR]], align 4
+; IR-LABEL: @atomicrmw_usub_cond_private_i32(
+; IR-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[PTR:%.*]], align 4
; IR-NEXT: [[TMP2:%.*]] = icmp uge i32 [[TMP1]], 4
; IR-NEXT: [[TMP3:%.*]] = sub i32 [[TMP1]], 4
-; IR-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i32 [[TMP3]], i32 4
+; IR-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i32 [[TMP3]], i32 [[TMP1]]
; IR-NEXT: store i32 [[NEW]], ptr addrspace(5) [[PTR]], align 4
; IR-NEXT: ret i32 [[TMP1]]
;
@@ -650,7 +649,7 @@ define i32 @atomicrmw_usub_cond_private_i32(ptr addrspace(5) %ptr) {
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_add_i32_e32 v2, vcc, -4, v1
; GCN-NEXT: v_cmp_lt_u32_e32 vcc, 3, v1
-; GCN-NEXT: v_cndmask_b32_e32 v2, 4, v2, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v2, v1, v2, vcc
; GCN-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
; GCN-NEXT: v_mov_b32_e32 v0, v1
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
@@ -660,12 +659,9 @@ define i32 @atomicrmw_usub_cond_private_i32(ptr addrspace(5) %ptr) {
}
define i32 @atomicrmw_usub_sat_private_i32(ptr addrspace(5) %ptr) {
-; IR-LABEL: define i32 @atomicrmw_usub_sat_private_i32(
-; IR-SAME: ptr addrspace(5) [[PTR:%.*]]) #[[ATTR0]] {
-; IR-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[PTR]], align 4
-; IR-NEXT: [[TMP2:%.*]] = icmp uge i32 [[TMP1]], 4
-; IR-NEXT: [[TMP3:%.*]] = sub i32 [[TMP1]], 4
-; IR-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i32 [[TMP3]], i32 0
+; IR-LABEL: @atomicrmw_usub_sat_private_i32(
+; IR-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[PTR:%.*]], align 4
+; IR-NEXT: [[NEW:%.*]] = call i32 @llvm.usub.sat.i32(i32 [[TMP1]], i32 4)
; IR-NEXT: store i32 [[NEW]], ptr addrspace(5) [[PTR]], align 4
; IR-NEXT: ret i32 [[TMP1]]
;
@@ -674,9 +670,8 @@ define i32 @atomicrmw_usub_sat_private_i32(ptr addrspace(5) %ptr) {
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_add_i32_e32 v2, vcc, -4, v1
-; GCN-NEXT: v_cmp_lt_u32_e32 vcc, 3, v1
-; GCN-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc
+; GCN-NEXT: v_max_u32_e32 v2, 4, v1
+; GCN-NEXT: v_add_i32_e32 v2, vcc, -4, v2
; GCN-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
; GCN-NEXT: v_mov_b32_e32 v0, v1
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i16.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i16.ll
index 2d2f0c48861176..1d9993cc774f94 100644
--- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i16.ll
+++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i16.ll
@@ -1381,9 +1381,9 @@ define i16 @test_atomicrmw_usub_cond_i16_global_agent(ptr addrspace(1) %ptr, i16
; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
-; CHECK-NEXT: [[TMP5:%.*]] = icmp uge i16 [[EXTRACTED]], [[VALUE:%.*]]
-; CHECK-NEXT: [[TMP7:%.*]] = sub i16 [[EXTRACTED]], [[VALUE]]
-; CHECK-NEXT: [[NEW:%.*]] = select i1 [[TMP5]], i16 [[TMP7]], i16 [[VALUE]]
+; CHECK-NEXT: [[TMP4:%.*]] = icmp uge i16 [[EXTRACTED]], [[VALUE:%.*]]
+; CHECK-NEXT: [[TMP5:%.*]] = sub i16 [[EXTRACTED]], [[VALUE]]
+; CHECK-NEXT: [[NEW:%.*]] = select i1 [[TMP4]], i16 [[TMP5]], i16 [[EXTRACTED]]
; CHECK-NEXT: [[EXTENDED:%.*]] = zext i16 [[NEW]] to i32
; CHECK-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
; CHECK-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
@@ -1408,9 +1408,9 @@ define i16 @test_atomicrmw_usub_cond_i16_global_agent_align4(ptr addrspace(1) %p
; CHECK: atomicrmw.start:
; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[LOADED]] to i16
-; CHECK-NEXT: [[TMP3:%.*]] = icmp uge i16 [[EXTRACTED]], [[VALUE:%.*]]
-; CHECK-NEXT: [[TMP5:%.*]] = sub i16 [[EXTRACTED]], [[VALUE]]
-; CHECK-NEXT: [[NEW:%.*]] = select i1 [[TMP3]], i16 [[TMP5]], i16 [[VALUE]]
+; CHECK-NEXT: [[TMP2:%.*]] = icmp uge i16 [[EXTRACTED]], [[VALUE:%.*]]
+; CHECK-NEXT: [[TMP3:%.*]] = sub i16 [[EXTRACTED]], [[VALUE]]
+; CHECK-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i16 [[TMP3]], i16 [[EXTRACTED]]
; CHECK-NEXT: [[EXTENDED:%.*]] = zext i16 [[NEW]] to i32
; CHECK-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], -65536
; CHECK-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[EXTENDED]]
@@ -1440,9 +1440,9 @@ define i16 @test_atomicrmw_usub_cond_i16_local(ptr addrspace(3) %ptr, i16 %value
; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]]
; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
-; CHECK-NEXT: [[TMP5:%.*]] = icmp uge i16 [[EXTRACTED]], [[VALUE:%.*]]
-; CHECK-NEXT: [[TMP7:%.*]] = sub i16 [[EXTRACTED]], [[VALUE]]
-; CHECK-NEXT: [[NEW:%.*]] = select i1 [[TMP5]], i16 [[TMP7]], i16 [[VALUE]]
+; CHECK-NEXT: [[TMP4:%.*]] = icmp uge i16 [[EXTRACTED]], [[VALUE:%.*]]
+; CHECK-NEXT: [[TMP5:%.*]] = sub i16 [[EXTRACTED]], [[VALUE]]
+; CHECK-NEXT: [[NEW:%.*]] = select i1 [[TMP4]], i16 [[TMP5]], i16 [[EXTRACTED]]
; CHECK-NEXT: [[EXTENDED:%.*]] = zext i16 [[NEW]] to i32
; CHECK-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]]
; CHECK-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
@@ -1467,9 +1467,9 @@ define i16 @test_atomicrmw_usub_cond_i16_local_align4(ptr addrspace(3) %ptr, i16
; CHECK: atomicrmw.start:
; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[LOADED]] to i16
-; CHECK-NEXT: [[TMP3:%.*]] = icmp uge i16 [[EXTRACTED]], [[VALUE:%.*]]
-; CHECK-NEXT: [[TMP5:%.*]] = sub i16 [[EXTRACTED]], [[VALUE]]
-; CHECK-NEXT: [[NEW:%.*]] = select i1 [[TMP3]], i16 [[TMP5]], i16 [[VALUE]]
+; CHECK-NEXT: [[TMP2:%.*]] = icmp uge i16 [[EXTRACTED]], [[VALUE:%.*]]
+; CHECK-NEXT: [[TMP3:%.*]] = sub i16 [[EXTRACTED]], [[VALUE]]
+; CHECK-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i16 [[TMP3]], i16 [[EXTRACTED]]
; CHECK-NEXT: [[EXTENDED:%.*]] = zext i16 [[NEW]] to i32
; CHECK-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], -65536
; CHECK-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[EXTENDED]]
@@ -1500,9 +1500,9 @@ define i16 @test_atomicrmw_usub_cond_i16_flat_agent(ptr %ptr, i16 %value) {
; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
-; CHECK-NEXT: [[TMP5:%.*]] = icmp uge i16 [[EXTRACTED]], [[VALUE:%.*]]
-; CHECK-NEXT: [[TMP7:%.*]] = sub i16 [[EXTRACTED]], [[VALUE]]
-; CHECK-NEXT: [[NEW:%.*]] = select i1 [[TMP5]], i16 [[TMP7]], i16 [[VALUE]]
+; CHECK-NEXT: [[TMP4:%.*]] = icmp uge i16 [[EXTRACTED]], [[VALUE:%.*]]
+; CHECK-NEXT: [[TMP5:%.*]] = sub i16 [[EXTRACTED]], [[VALUE]]
+; CHECK-NEXT: [[NEW:%.*]] = select i1 [[TMP4]], i16 [[TMP5]], i16 [[EXTRACTED]]
; CHECK-NEXT: [[EXTENDED:%.*]] = zext i16 [[NEW]] to i32
; CHECK-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
; CHECK-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
@@ -1527,9 +1527,9 @@ define i16 @test_atomicrmw_usub_cond_i16_flat_agent_align4(ptr %ptr, i16 %value)
; CHECK: atomicrmw.start:
; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[LOADED]] to i16
-; CHECK-NEXT: [[TMP3:%.*]] = icmp uge i16 [[EXTRACTED]], [[VALUE:%.*]]
-; CHECK-NEXT: [[TMP5:%.*]] = sub i16 [[EXTRACTED]], [[VALUE]]
-; CHECK-NEXT: [[NEW:%.*]] = select i1 [[TMP3]], i16 [[TMP5]], i16 [[VALUE]]
+; CHECK-NEXT: [[TMP2:%.*]] = icmp uge i16 [[EXTRACTED]], [[VALUE:%.*]]
+; CHECK-NEXT: [[TMP3:%.*]] = sub i16 [[EXTRACTED]], [[VALUE]]
+; CHECK-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i16 [[TMP3]], i16 [[EXTRACTED]]
; CHECK-NEXT: [[EXTENDED:%.*]] = zext i16 [[NEW]] to i32
; CHECK-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], -65536
; CHECK-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[EXTENDED]]
@@ -1560,16 +1560,14 @@ define i16 @test_atomicrmw_usub_sat_i16_global_agent(ptr addrspace(1) %ptr, i16
; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
-; CHECK-NEXT: [[TMP4:%.*]] = icmp uge i16 [[EXTRACTED]], [[VALUE:%.*]]
-; CHECK-NEXT: [[TMP5:%.*]] = sub i16 [[EXTRACTED]], [[VALUE]]
-; CHECK-NEXT: [[NEW:%.*]] = select i1 [[TMP4]], i16 [[TMP5]], i16 0
+; CHECK-NEXT: [[NEW:%.*]] = call i16 @llvm.usub.sat.i16(i16 [[EXTRACTED]], i16 [[VALUE:%.*]])
; CHECK-NEXT: [[EXTENDED:%.*]] = zext i16 [[NEW]] to i32
; CHECK-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
; CHECK-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
; CHECK-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
-; CHECK-NEXT: [[TMP8:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP8]], 1
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP8]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0
; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
@@ -1587,15 +1585,13 @@ define i16 @test_atomicrmw_usub_sat_i16_global_agent_align4(ptr addrspace(1) %pt
; CHECK: atomicrmw.start:
; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[LOADED]] to i16
-; CHECK-NEXT: [[TMP2:%.*]] = icmp uge i16 [[EXTRACTED]], [[VALUE:%.*]]
-; CHECK-NEXT: [[TMP3:%.*]] = sub i16 [[EXTRACTED]], [[VALUE]]
-; CHECK-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i16 [[TMP3]], i16 0
+; CHECK-NEXT: [[NEW:%.*]] = call i16 @llvm.usub.sat.i16(i16 [[EXTRACTED]], i16 [[VALUE:%.*]])
; CHECK-NEXT: [[EXTENDED:%.*]] = zext i16 [[NEW]] to i32
; CHECK-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], -65536
; CHECK-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[EXTENDED]]
-; CHECK-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
+; CHECK-NEXT: [[TMP2:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP2]], 1
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP2]], 0
; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: [[EXTRACTED1:%.*]] = trunc i32 [[NEWLOADED]] to i16
@@ -1619,16 +1615,14 @@ define i16 @test_atomicrmw_usub_sat_i16_local(ptr addrspace(3) %ptr, i16 %value)
; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]]
; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
-; CHECK-NEXT: [[TMP4:%.*]] = icmp uge i16 [[EXTRACTED]], [[VALUE:%.*]]
-; CHECK-NEXT: [[TMP5:%.*]] = sub i16 [[EXTRACTED]], [[VALUE]]
-; CHECK-NEXT: [[NEW:%.*]] = select i1 [[TMP4]], i16 [[TMP5]], i16 0
+; CHECK-NEXT: [[NEW:%.*]] = call i16 @llvm.usub.sat.i16(i16 [[EXTRACTED]], i16 [[VALUE:%.*]])
; CHECK-NEXT: [[EXTENDED:%.*]] = zext i16 [[NEW]] to i32
; CHECK-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]]
; CHECK-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
; CHECK-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
-; CHECK-NEXT: [[TMP8:%.*]] = cmpxchg ptr addrspace(3) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP8]], 1
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP8]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0
; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
@@ -1646,15 +1640,13 @@ define i16 @test_atomicrmw_usub_sat_i16_local_align4(ptr addrspace(3) %ptr, i16
; CHECK: atomicrmw.start:
; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[LOADED]] to i16
-; CHECK-NEXT: [[TMP2:%.*]] = icmp uge i16 [[EXTRACTED]], [[VALUE:%.*]]
-; CHECK-NEXT: [[TMP3:%.*]] = sub i16 [[EXTRACTED]], [[VALUE]]
-; CHECK-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i16 [[TMP3]], i16 0
+; CHECK-NEXT: [[NEW:%.*]] = call i16 @llvm.usub.sat.i16(i16 [[EXTRACTED]], i16 [[VALUE:%.*]])
; CHECK-NEXT: [[EXTENDED:%.*]] = zext i16 [[NEW]] to i32
; CHECK-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], -65536
; CHECK-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[EXTENDED]]
-; CHECK-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
+; CHECK-NEXT: [[TMP2:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP2]], 1
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP2]], 0
; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: [[EXTRACTED1:%.*]] = trunc i32 [[NEWLOADED]] to i16
@@ -1679,16 +1671,14 @@ define i16 @test_atomicrmw_usub_sat_i16_flat_agent(ptr %ptr, i16 %value) {
; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
-; CHECK-NEXT: [[TMP4:%.*]] = icmp uge i16 [[EXTRACTED]], [[VALUE:%.*]]
-; CHECK-NEXT: [[TMP5:%.*]] = sub i16 [[EXTRACTED]], [[VALUE]]
-; CHECK-NEXT: [[NEW:%.*]] = select i1 [[TMP4]], i16 [[TMP5]], i16 0
+; CHECK-NEXT: [[NEW:%.*]] = call i16 @llvm.usub.sat.i16(i16 [[EXTRACTED]], i16 [[VALUE:%.*]])
; CHECK-NEXT: [[EXTENDED:%.*]] = zext i16 [[NEW]] to i32
; CHECK-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
; CHECK-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
; CHECK-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
-; CHECK-NEXT: [[TMP8:%.*]] = cmpxchg ptr [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP8]], 1
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP8]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0
; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
@@ -1706,15 +1696,13 @@ define i16 @test_atomicrmw_usub_sat_i16_flat_agent_align4(ptr %ptr, i16 %value)
; CHECK: atomicrmw.start:
; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[LOADED]] to i16
-; CHECK-NEXT: [[TMP2:%.*]] = icmp uge i16 [[EXTRACTED]], [[VALUE:%.*]]
-; CHECK-NEXT: [[TMP3:%.*]] = sub i16 [[EXTRACTED]], [[VALUE]]
-; CHECK-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i16 [[TMP3]], i16 0
+; CHECK-NEXT: [[NEW:%.*]] = call i16 @llvm.usub.sat.i16(i16 [[EXTRACTED]], i16 [[VALUE:%.*]])
; CHECK-NEXT: [[EXTENDED:%.*]] = zext i16 [[NEW]] to i32
; CHECK-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], -65536
; CHECK-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[EXTENDED]]
-; CHECK-NEXT: [[TMP6:%.*]] = cmpxchg ptr [[PTR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
+; CHECK-NEXT: [[TMP2:%.*]] = cmpxchg ptr [[PTR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP2]], 1
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP2]], 0
; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: [[EXTRACTED1:%.*]] = trunc i32 [[NEWLOADED]] to i16
diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i8.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i8.ll
index 6d0540b46b2004..42ace87e526b9d 100644
--- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i8.ll
+++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i8.ll
@@ -1728,9 +1728,9 @@ define i8 @test_atomicrmw_usub_cond_i8_global_agent(ptr addrspace(1) %ptr, i8 %v
; GCN-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
; GCN-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
; GCN-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
-; GCN-NEXT: [[TMP5:%.*]] = icmp uge i8 [[EXTRACTED]], [[VALUE:%.*]]
-; GCN-NEXT: [[TMP7:%.*]] = sub i8 [[EXTRACTED]], [[VALUE]]
-; GCN-NEXT: [[NEW:%.*]] = select i1 [[TMP5]], i8 [[TMP7]], i8 [[VALUE]]
+; GCN-NEXT: [[TMP4:%.*]] = icmp uge i8 [[EXTRACTED]], [[VALUE:%.*]]
+; GCN-NEXT: [[TMP5:%.*]] = sub i8 [[EXTRACTED]], [[VALUE]]
+; GCN-NEXT: [[NEW:%.*]] = select i1 [[TMP4]], i8 [[TMP5]], i8 [[EXTRACTED]]
; GCN-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32
; GCN-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
; GCN-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
@@ -1757,9 +1757,9 @@ define i8 @test_atomicrmw_usub_cond_i8_global_agent(ptr addrspace(1) %ptr, i8 %v
; R600-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
; R600-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]]
; R600-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
-; R600-NEXT: [[TMP5:%.*]] = icmp uge i8 [[EXTRACTED]], [[VALUE:%.*]]
-; R600-NEXT: [[TMP7:%.*]] = sub i8 [[EXTRACTED]], [[VALUE]]
-; R600-NEXT: [[NEW:%.*]] = select i1 [[TMP5]], i8 [[TMP7]], i8 [[VALUE]]
+; R600-NEXT: [[TMP4:%.*]] = icmp uge i8 [[EXTRACTED]], [[VALUE:%.*]]
+; R600-NEXT: [[TMP5:%.*]] = sub i8 [[EXTRACTED]], [[VALUE]]
+; R600-NEXT: [[NEW:%.*]] = select i1 [[TMP4]], i8 [[TMP5]], i8 [[EXTRACTED]]
; R600-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32
; R600-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]]
; R600-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
@@ -1792,9 +1792,9 @@ define i8 @test_atomicrmw_usub_cond_i8_global_agent_align2(ptr addrspace(1) %ptr
; GCN-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
; GCN-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
; GCN-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
-; GCN-NEXT: [[TMP5:%.*]] = icmp uge i8 [[EXTRACTED]], [[VALUE:%.*]]
-; GCN-NEXT: [[TMP7:%.*]] = sub i8 [[EXTRACTED]], [[VALUE]]
-; GCN-NEXT: [[NEW:%.*]] = select i1 [[TMP5]], i8 [[TMP7]], i8 [[VALUE]]
+; GCN-NEXT: [[TMP4:%.*]] = icmp uge i8 [[EXTRACTED]], [[VALUE:%.*]]
+; GCN-NEXT: [[TMP5:%.*]] = sub i8 [[EXTRACTED]], [[VALUE]]
+; GCN-NEXT: [[NEW:%.*]] = select i1 [[TMP4]], i8 [[TMP5]], i8 [[EXTRACTED]]
; GCN-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32
; GCN-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
; GCN-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
@@ -1821,9 +1821,9 @@ define i8 @test_atomicrmw_usub_cond_i8_global_agent_align2(ptr addrspace(1) %ptr
; R600-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
; R600-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]]
; R600-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
-; R600-NEXT: [[TMP5:%.*]] = icmp uge i8 [[EXTRACTED]], [[VALUE:%.*]]
-; R600-NEXT: [[TMP7:%.*]] = sub i8 [[EXTRACTED]], [[VALUE]]
-; R600-NEXT: [[NEW:%.*]] = select i1 [[TMP5]], i8 [[TMP7]], i8 [[VALUE]]
+; R600-NEXT: [[TMP4:%.*]] = icmp uge i8 [[EXTRACTED]], [[VALUE:%.*]]
+; R600-NEXT: [[TMP5:%.*]] = sub i8 [[EXTRACTED]], [[VALUE]]
+; R600-NEXT: [[NEW:%.*]] = select i1 [[TMP4]], i8 [[TMP5]], i8 [[EXTRACTED]]
; R600-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32
; R600-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]]
; R600-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
@@ -1848,9 +1848,9 @@ define i8 @test_atomicrmw_usub_cond_i8_global_agent_align4(ptr addrspace(1) %ptr
; CHECK: atomicrmw.start:
; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[LOADED]] to i8
-; CHECK-NEXT: [[TMP3:%.*]] = icmp uge i8 [[EXTRACTED]], [[VALUE:%.*]]
-; CHECK-NEXT: [[TMP5:%.*]] = sub i8 [[EXTRACTED]], [[VALUE]]
-; CHECK-NEXT: [[NEW:%.*]] = select i1 [[TMP3]], i8 [[TMP5]], i8 [[VALUE]]
+; CHECK-NEXT: [[TMP2:%.*]] = icmp uge i8 [[EXTRACTED]], [[VALUE:%.*]]
+; CHECK-NEXT: [[TMP3:%.*]] = sub i8 [[EXTRACTED]], [[VALUE]]
+; CHECK-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i8 [[TMP3]], i8 [[EXTRACTED]]
; CHECK-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32
; CHECK-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], -256
; CHECK-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[EXTENDED]]
@@ -1880,9 +1880,9 @@ define i8 @test_atomicrmw_usub_cond_i8_local(ptr addrspace(3) %ptr, i8 %value) {
; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]]
; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
-; CHECK-NEXT: [[TMP5:%.*]] = icmp uge i8 [[EXTRACTED]], [[VALUE:%.*]]
-; CHECK-NEXT: [[TMP7:%.*]] = sub i8 [[EXTRACTED]], [[VALUE]]
-; CHECK-NEXT: [[NEW:%.*]] = select i1 [[TMP5]], i8 [[TMP7]], i8 [[VALUE]]
+; CHECK-NEXT: [[TMP4:%.*]] = icmp uge i8 [[EXTRACTED]], [[VALUE:%.*]]
+; CHECK-NEXT: [[TMP5:%.*]] = sub i8 [[EXTRACTED]], [[VALUE]]
+; CHECK-NEXT: [[NEW:%.*]] = select i1 [[TMP4]], i8 [[TMP5]], i8 [[EXTRACTED]]
; CHECK-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32
; CHECK-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]]
; CHECK-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
@@ -1914,9 +1914,9 @@ define i8 @test_atomicrmw_usub_cond_i8_local_align2(ptr addrspace(3) %ptr, i8 %v
; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]]
; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
-; CHECK-NEXT: [[TMP5:%.*]] = icmp uge i8 [[EXTRACTED]], [[VALUE:%.*]]
-; CHECK-NEXT: [[TMP7:%.*]] = sub i8 [[EXTRACTED]], [[VALUE]]
-; CHECK-NEXT: [[NEW:%.*]] = select i1 [[TMP5]], i8 [[TMP7]], i8 [[VALUE]]
+; CHECK-NEXT: [[TMP4:%.*]] = icmp uge i8 [[EXTRACTED]], [[VALUE:%.*]]
+; CHECK-NEXT: [[TMP5:%.*]] = sub i8 [[EXTRACTED]], [[VALUE]]
+; CHECK-NEXT: [[NEW:%.*]] = select i1 [[TMP4]], i8 [[TMP5]], i8 [[EXTRACTED]]
; CHECK-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32
; CHECK-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]]
; CHECK-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
@@ -1941,9 +1941,9 @@ define i8 @test_atomicrmw_usub_cond_i8_local_align4(ptr addrspace(3) %ptr, i8 %v
; CHECK: atomicrmw.start:
; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[LOADED]] to i8
-; CHECK-NEXT: [[TMP3:%.*]] = icmp uge i8 [[EXTRACTED]], [[VALUE:%.*]]
-; CHECK-NEXT: [[TMP5:%.*]] = sub i8 [[EXTRACTED]], [[VALUE]]
-; CHECK-NEXT: [[NEW:%.*]] = select i1 [[TMP3]], i8 [[TMP5]], i8 [[VALUE]]
+; CHECK-NEXT: [[TMP2:%.*]] = icmp uge i8 [[EXTRACTED]], [[VALUE:%.*]]
+; CHECK-NEXT: [[TMP3:%.*]] = sub i8 [[EXTRACTED]], [[VALUE]]
+; CHECK-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i8 [[TMP3]], i8 [[EXTRACTED]]
; CHECK-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32
; CHECK-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], -256
; CHECK-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[EXTENDED]]
@@ -1974,9 +1974,9 @@ define i8 @test_atomicrmw_usub_cond_i8_flat_agent(ptr %ptr, i8 %value) {
; GCN-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
; GCN-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
; GCN-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
-; GCN-NEXT: [[TMP5:%.*]] = icmp uge i8 [[EXTRACTED]], [[VALUE:%.*]]
-; GCN-NEXT: [[TMP7:%.*]] = sub i8 [[EXTRACTED]], [[VALUE]]
-; GCN-NEXT: [[NEW:%.*]] = select i1 [[TMP5]], i8 [[TMP7]], i8 [[VALUE]]
+; GCN-NEXT: [[TMP4:%.*]] = icmp uge i8 [[EXTRACTED]], [[VALUE:%.*]]
+; GCN-NEXT: [[TMP5:%.*]] = sub i8 [[EXTRACTED]], [[VALUE]]
+; GCN-NEXT: [[NEW:%.*]] = select i1 [[TMP4]], i8 [[TMP5]], i8 [[EXTRACTED]]
; GCN-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32
; GCN-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
; GCN-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
@@ -2003,9 +2003,9 @@ define i8 @test_atomicrmw_usub_cond_i8_flat_agent(ptr %ptr, i8 %value) {
; R600-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
; R600-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]]
; R600-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
-; R600-NEXT: [[TMP5:%.*]] = icmp uge i8 [[EXTRACTED]], [[VALUE:%.*]]
-; R600-NEXT: [[TMP7:%.*]] = sub i8 [[EXTRACTED]], [[VALUE]]
-; R600-NEXT: [[NEW:%.*]] = select i1 [[TMP5]], i8 [[TMP7]], i8 [[VALUE]]
+; R600-NEXT: [[TMP4:%.*]] = icmp uge i8 [[EXTRACTED]], [[VALUE:%.*]]
+; R600-NEXT: [[TMP5:%.*]] = sub i8 [[EXTRACTED]], [[VALUE]]
+; R600-NEXT: [[NEW:%.*]] = select i1 [[TMP4]], i8 [[TMP5]], i8 [[EXTRACTED]]
; R600-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32
; R600-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]]
; R600-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
@@ -2038,9 +2038,9 @@ define i8 @test_atomicrmw_usub_cond_i8_flat_agent_align2(ptr %ptr, i8 %value) {
; GCN-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
; GCN-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
; GCN-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
-; GCN-NEXT: [[TMP5:%.*]] = icmp uge i8 [[EXTRACTED]], [[VALUE:%.*]]
-; GCN-NEXT: [[TMP7:%.*]] = sub i8 [[EXTRACTED]], [[VALUE]]
-; GCN-NEXT: [[NEW:%.*]] = select i1 [[TMP5]], i8 [[TMP7]], i8 [[VALUE]]
+; GCN-NEXT: [[TMP4:%.*]] = icmp uge i8 [[EXTRACTED]], [[VALUE:%.*]]
+; GCN-NEXT: [[TMP5:%.*]] = sub i8 [[EXTRACTED]], [[VALUE]]
+; GCN-NEXT: [[NEW:%.*]] = select i1 [[TMP4]], i8 [[TMP5]], i8 [[EXTRACTED]]
; GCN-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32
; GCN-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
; GCN-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
@@ -2067,9 +2067,9 @@ define i8 @test_atomicrmw_usub_cond_i8_flat_agent_align2(ptr %ptr, i8 %value) {
; R600-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
; R600-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]]
; R600-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
-; R600-NEXT: [[TMP5:%.*]] = icmp uge i8 [[EXTRACTED]], [[VALUE:%.*]]
-; R600-NEXT: [[TMP7:%.*]] = sub i8 [[EXTRACTED]], [[VALUE]]
-; R600-NEXT: [[NEW:%.*]] = select i1 [[TMP5]], i8 [[TMP7]], i8 [[VALUE]]
+; R600-NEXT: [[TMP4:%.*]] = icmp uge i8 [[EXTRACTED]], [[VALUE:%.*]]
+; R600-NEXT: [[TMP5:%.*]] = sub i8 [[EXTRACTED]], [[VALUE]]
+; R600-NEXT: [[NEW:%.*]] = select i1 [[TMP4]], i8 [[TMP5]], i8 [[EXTRACTED]]
; R600-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32
; R600-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]]
; R600-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
@@ -2094,9 +2094,9 @@ define i8 @test_atomicrmw_usub_cond_i8_flat_agent_align4(ptr %ptr, i8 %value) {
; CHECK: atomicrmw.start:
; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[LOADED]] to i8
-; CHECK-NEXT: [[TMP3:%.*]] = icmp uge i8 [[EXTRACTED]], [[VALUE:%.*]]
-; CHECK-NEXT: [[TMP5:%.*]] = sub i8 [[EXTRACTED]], [[VALUE]]
-; CHECK-NEXT: [[NEW:%.*]] = select i1 [[TMP3]], i8 [[TMP5]], i8 [[VALUE]]
+; CHECK-NEXT: [[TMP2:%.*]] = icmp uge i8 [[EXTRACTED]], [[VALUE:%.*]]
+; CHECK-NEXT: [[TMP3:%.*]] = sub i8 [[EXTRACTED]], [[VALUE]]
+; CHECK-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i8 [[TMP3]], i8 [[EXTRACTED]]
; CHECK-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32
; CHECK-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], -256
; CHECK-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[EXTENDED]]
@@ -2127,16 +2127,14 @@ define i8 @test_atomicrmw_usub_sat_i8_global_agent(ptr addrspace(1) %ptr, i8 %va
; GCN-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
; GCN-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
; GCN-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
-; GCN-NEXT: [[TMP4:%.*]] = icmp uge i8 [[EXTRACTED]], [[VALUE:%.*]]
-; GCN-NEXT: [[TMP5:%.*]] = sub i8 [[EXTRACTED]], [[VALUE]]
-; GCN-NEXT: [[NEW:%.*]] = select i1 [[TMP4]], i8 [[TMP5]], i8 0
+; GCN-NEXT: [[NEW:%.*]] = call i8 @llvm.usub.sat.i8(i8 [[EXTRACTED]], i8 [[VALUE:%.*]])
; GCN-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32
; GCN-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
; GCN-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
; GCN-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
-; GCN-NEXT: [[TMP8:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4
-; GCN-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP8]], 1
-; GCN-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP8]], 0
+; GCN-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4
+; GCN-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GCN-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0
; GCN-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
; GCN: atomicrmw.end:
; GCN-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
@@ -2156,16 +2154,14 @@ define i8 @test_atomicrmw_usub_sat_i8_global_agent(ptr addrspace(1) %ptr, i8 %va
; R600-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
; R600-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]]
; R600-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
-; R600-NEXT: [[TMP4:%.*]] = icmp uge i8 [[EXTRACTED]], [[VALUE:%.*]]
-; R600-NEXT: [[TMP5:%.*]] = sub i8 [[EXTRACTED]], [[VALUE]]
-; R600-NEXT: [[NEW:%.*]] = select i1 [[TMP4]], i8 [[TMP5]], i8 0
+; R600-NEXT: [[NEW:%.*]] = call i8 @llvm.usub.sat.i8(i8 [[EXTRACTED]], i8 [[VALUE:%.*]])
; R600-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32
; R600-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]]
; R600-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
; R600-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
-; R600-NEXT: [[TMP8:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4
-; R600-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP8]], 1
-; R600-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP8]], 0
+; R600-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4
+; R600-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; R600-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0
; R600-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
; R600: atomicrmw.end:
; R600-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
@@ -2191,16 +2187,14 @@ define i8 @test_atomicrmw_usub_sat_i8_global_agent_align2(ptr addrspace(1) %ptr,
; GCN-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
; GCN-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
; GCN-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
-; GCN-NEXT: [[TMP4:%.*]] = icmp uge i8 [[EXTRACTED]], [[VALUE:%.*]]
-; GCN-NEXT: [[TMP5:%.*]] = sub i8 [[EXTRACTED]], [[VALUE]]
-; GCN-NEXT: [[NEW:%.*]] = select i1 [[TMP4]], i8 [[TMP5]], i8 0
+; GCN-NEXT: [[NEW:%.*]] = call i8 @llvm.usub.sat.i8(i8 [[EXTRACTED]], i8 [[VALUE:%.*]])
; GCN-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32
; GCN-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
; GCN-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
; GCN-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
-; GCN-NEXT: [[TMP8:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4
-; GCN-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP8]], 1
-; GCN-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP8]], 0
+; GCN-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4
+; GCN-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GCN-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0
; GCN-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
; GCN: atomicrmw.end:
; GCN-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
@@ -2220,16 +2214,14 @@ define i8 @test_atomicrmw_usub_sat_i8_global_agent_align2(ptr addrspace(1) %ptr,
; R600-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
; R600-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]]
; R600-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
-; R600-NEXT: [[TMP4:%.*]] = icmp uge i8 [[EXTRACTED]], [[VALUE:%.*]]
-; R600-NEXT: [[TMP5:%.*]] = sub i8 [[EXTRACTED]], [[VALUE]]
-; R600-NEXT: [[NEW:%.*]] = select i1 [[TMP4]], i8 [[TMP5]], i8 0
+; R600-NEXT: [[NEW:%.*]] = call i8 @llvm.usub.sat.i8(i8 [[EXTRACTED]], i8 [[VALUE:%.*]])
; R600-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32
; R600-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]]
; R600-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
; R600-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
-; R600-NEXT: [[TMP8:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4
-; R600-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP8]], 1
-; R600-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP8]], 0
+; R600-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4
+; R600-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; R600-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0
; R600-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
; R600: atomicrmw.end:
; R600-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
@@ -2247,15 +2239,13 @@ define i8 @test_atomicrmw_usub_sat_i8_global_agent_align4(ptr addrspace(1) %ptr,
; CHECK: atomicrmw.start:
; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[LOADED]] to i8
-; CHECK-NEXT: [[TMP2:%.*]] = icmp uge i8 [[EXTRACTED]], [[VALUE:%.*]]
-; CHECK-NEXT: [[TMP3:%.*]] = sub i8 [[EXTRACTED]], [[VALUE]]
-; CHECK-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i8 [[TMP3]], i8 0
+; CHECK-NEXT: [[NEW:%.*]] = call i8 @llvm.usub.sat.i8(i8 [[EXTRACTED]], i8 [[VALUE:%.*]])
; CHECK-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32
; CHECK-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], -256
; CHECK-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[EXTENDED]]
-; CHECK-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
+; CHECK-NEXT: [[TMP2:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP2]], 1
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP2]], 0
; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: [[EXTRACTED1:%.*]] = trunc i32 [[NEWLOADED]] to i8
@@ -2279,16 +2269,14 @@ define i8 @test_atomicrmw_usub_sat_i8_local(ptr addrspace(3) %ptr, i8 %value) {
; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]]
; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
-; CHECK-NEXT: [[TMP4:%.*]] = icmp uge i8 [[EXTRACTED]], [[VALUE:%.*]]
-; CHECK-NEXT: [[TMP5:%.*]] = sub i8 [[EXTRACTED]], [[VALUE]]
-; CHECK-NEXT: [[NEW:%.*]] = select i1 [[TMP4]], i8 [[TMP5]], i8 0
+; CHECK-NEXT: [[NEW:%.*]] = call i8 @llvm.usub.sat.i8(i8 [[EXTRACTED]], i8 [[VALUE:%.*]])
; CHECK-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32
; CHECK-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]]
; CHECK-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
; CHECK-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
-; CHECK-NEXT: [[TMP8:%.*]] = cmpxchg ptr addrspace(3) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP8]], 1
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP8]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0
; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
@@ -2313,16 +2301,14 @@ define i8 @test_atomicrmw_usub_sat_i8_local_align2(ptr addrspace(3) %ptr, i8 %va
; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]]
; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
-; CHECK-NEXT: [[TMP4:%.*]] = icmp uge i8 [[EXTRACTED]], [[VALUE:%.*]]
-; CHECK-NEXT: [[TMP5:%.*]] = sub i8 [[EXTRACTED]], [[VALUE]]
-; CHECK-NEXT: [[NEW:%.*]] = select i1 [[TMP4]], i8 [[TMP5]], i8 0
+; CHECK-NEXT: [[NEW:%.*]] = call i8 @llvm.usub.sat.i8(i8 [[EXTRACTED]], i8 [[VALUE:%.*]])
; CHECK-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32
; CHECK-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]]
; CHECK-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
; CHECK-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
-; CHECK-NEXT: [[TMP8:%.*]] = cmpxchg ptr addrspace(3) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP8]], 1
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP8]], 0
+; CHECK-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0
; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
@@ -2340,15 +2326,13 @@ define i8 @test_atomicrmw_usub_sat_i8_local_align4(ptr addrspace(3) %ptr, i8 %va
; CHECK: atomicrmw.start:
; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[LOADED]] to i8
-; CHECK-NEXT: [[TMP2:%.*]] = icmp uge i8 [[EXTRACTED]], [[VALUE:%.*]]
-; CHECK-NEXT: [[TMP3:%.*]] = sub i8 [[EXTRACTED]], [[VALUE]]
-; CHECK-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i8 [[TMP3]], i8 0
+; CHECK-NEXT: [[NEW:%.*]] = call i8 @llvm.usub.sat.i8(i8 [[EXTRACTED]], i8 [[VALUE:%.*]])
; CHECK-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32
; CHECK-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], -256
; CHECK-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[EXTENDED]]
-; CHECK-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
+; CHECK-NEXT: [[TMP2:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP2]], 1
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP2]], 0
; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: [[EXTRACTED1:%.*]] = trunc i32 [[NEWLOADED]] to i8
@@ -2373,16 +2357,14 @@ define i8 @test_atomicrmw_usub_sat_i8_flat_agent(ptr %ptr, i8 %value) {
; GCN-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
; GCN-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
; GCN-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
-; GCN-NEXT: [[TMP4:%.*]] = icmp uge i8 [[EXTRACTED]], [[VALUE:%.*]]
-; GCN-NEXT: [[TMP5:%.*]] = sub i8 [[EXTRACTED]], [[VALUE]]
-; GCN-NEXT: [[NEW:%.*]] = select i1 [[TMP4]], i8 [[TMP5]], i8 0
+; GCN-NEXT: [[NEW:%.*]] = call i8 @llvm.usub.sat.i8(i8 [[EXTRACTED]], i8 [[VALUE:%.*]])
; GCN-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32
; GCN-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
; GCN-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
; GCN-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
-; GCN-NEXT: [[TMP8:%.*]] = cmpxchg ptr [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4
-; GCN-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP8]], 1
-; GCN-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP8]], 0
+; GCN-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4
+; GCN-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GCN-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0
; GCN-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
; GCN: atomicrmw.end:
; GCN-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
@@ -2402,16 +2384,14 @@ define i8 @test_atomicrmw_usub_sat_i8_flat_agent(ptr %ptr, i8 %value) {
; R600-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
; R600-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]]
; R600-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
-; R600-NEXT: [[TMP4:%.*]] = icmp uge i8 [[EXTRACTED]], [[VALUE:%.*]]
-; R600-NEXT: [[TMP5:%.*]] = sub i8 [[EXTRACTED]], [[VALUE]]
-; R600-NEXT: [[NEW:%.*]] = select i1 [[TMP4]], i8 [[TMP5]], i8 0
+; R600-NEXT: [[NEW:%.*]] = call i8 @llvm.usub.sat.i8(i8 [[EXTRACTED]], i8 [[VALUE:%.*]])
; R600-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32
; R600-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]]
; R600-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
; R600-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
-; R600-NEXT: [[TMP8:%.*]] = cmpxchg ptr [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4
-; R600-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP8]], 1
-; R600-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP8]], 0
+; R600-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4
+; R600-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; R600-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0
; R600-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
; R600: atomicrmw.end:
; R600-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
@@ -2437,16 +2417,14 @@ define i8 @test_atomicrmw_usub_sat_i8_flat_agent_align2(ptr %ptr, i8 %value) {
; GCN-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
; GCN-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
; GCN-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
-; GCN-NEXT: [[TMP4:%.*]] = icmp uge i8 [[EXTRACTED]], [[VALUE:%.*]]
-; GCN-NEXT: [[TMP5:%.*]] = sub i8 [[EXTRACTED]], [[VALUE]]
-; GCN-NEXT: [[NEW:%.*]] = select i1 [[TMP4]], i8 [[TMP5]], i8 0
+; GCN-NEXT: [[NEW:%.*]] = call i8 @llvm.usub.sat.i8(i8 [[EXTRACTED]], i8 [[VALUE:%.*]])
; GCN-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32
; GCN-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
; GCN-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
; GCN-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
-; GCN-NEXT: [[TMP8:%.*]] = cmpxchg ptr [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4
-; GCN-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP8]], 1
-; GCN-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP8]], 0
+; GCN-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4
+; GCN-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GCN-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0
; GCN-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
; GCN: atomicrmw.end:
; GCN-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
@@ -2466,16 +2444,14 @@ define i8 @test_atomicrmw_usub_sat_i8_flat_agent_align2(ptr %ptr, i8 %value) {
; R600-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
; R600-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]]
; R600-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
-; R600-NEXT: [[TMP4:%.*]] = icmp uge i8 [[EXTRACTED]], [[VALUE:%.*]]
-; R600-NEXT: [[TMP5:%.*]] = sub i8 [[EXTRACTED]], [[VALUE]]
-; R600-NEXT: [[NEW:%.*]] = select i1 [[TMP4]], i8 [[TMP5]], i8 0
+; R600-NEXT: [[NEW:%.*]] = call i8 @llvm.usub.sat.i8(i8 [[EXTRACTED]], i8 [[VALUE:%.*]])
; R600-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32
; R600-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]]
; R600-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
; R600-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
-; R600-NEXT: [[TMP8:%.*]] = cmpxchg ptr [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4
-; R600-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP8]], 1
-; R600-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP8]], 0
+; R600-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4
+; R600-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; R600-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0
; R600-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
; R600: atomicrmw.end:
; R600-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
@@ -2493,15 +2469,13 @@ define i8 @test_atomicrmw_usub_sat_i8_flat_agent_align4(ptr %ptr, i8 %value) {
; CHECK: atomicrmw.start:
; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[LOADED]] to i8
-; CHECK-NEXT: [[TMP2:%.*]] = icmp uge i8 [[EXTRACTED]], [[VALUE:%.*]]
-; CHECK-NEXT: [[TMP3:%.*]] = sub i8 [[EXTRACTED]], [[VALUE]]
-; CHECK-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i8 [[TMP3]], i8 0
+; CHECK-NEXT: [[NEW:%.*]] = call i8 @llvm.usub.sat.i8(i8 [[EXTRACTED]], i8 [[VALUE:%.*]])
; CHECK-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32
; CHECK-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], -256
; CHECK-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[EXTENDED]]
-; CHECK-NEXT: [[TMP6:%.*]] = cmpxchg ptr [[PTR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4
-; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
-; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
+; CHECK-NEXT: [[TMP2:%.*]] = cmpxchg ptr [[PTR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP2]], 1
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP2]], 0
; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
; CHECK: atomicrmw.end:
; CHECK-NEXT: [[EXTRACTED1:%.*]] = trunc i32 [[NEWLOADED]] to i8
More information about the llvm-commits mailing list