[llvm] AMDGPU: Add codegen for atomicrmw operations usub_cond and usub_sat (PR #141068)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Jul 24 07:14:33 PDT 2025
https://github.com/anjenner updated https://github.com/llvm/llvm-project/pull/141068
>From 4a09a30d1e1af5f6b074f7980b46d2786e818c39 Mon Sep 17 00:00:00 2001
From: Andrew Jenner <Andrew.Jenner at amd.com>
Date: Thu, 22 May 2025 10:07:12 -0400
Subject: [PATCH 1/4] AMDGPU: Add codegen for atomicrmw operations usub_cond
and usub_sat
Split off from https://github.com/llvm/llvm-project/pull/105553 as per discussion there.
---
llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 2 +
llvm/lib/Target/AMDGPU/AMDGPUGISel.td | 2 +
.../AMDGPU/AMDGPUInstructionSelector.cpp | 2 +
llvm/lib/Target/AMDGPU/AMDGPUInstructions.td | 2 +
.../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 7 +
.../AMDGPU/AMDGPULowerBufferFatPointers.cpp | 11 +-
.../Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 2 +
llvm/lib/Target/AMDGPU/DSInstructions.td | 35 +
llvm/lib/Target/AMDGPU/FLATInstructions.td | 14 +-
llvm/lib/Target/AMDGPU/R600ISelLowering.cpp | 8 +
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 13 +-
llvm/lib/Target/AMDGPU/SIInstrInfo.td | 2 +
.../AMDGPU/MIR/atomics-gmir.mir | 6 +
.../CodeGen/AMDGPU/atomicrmw_usub_cond.ll | 482 +++++++++++
.../test/CodeGen/AMDGPU/atomicrmw_usub_sat.ll | 487 +++++++++++
.../AMDGPU/cgp-addressing-modes-gfx1030.ll | 12 +-
.../AMDGPU/global-saddr-atomics.gfx1030.ll | 18 +-
.../AMDGPU/llvm.amdgcn.global.atomic.csub.ll | 10 +-
.../CodeGen/AMDGPU/private-memory-atomics.ll | 46 ++
llvm/test/CodeGen/AMDGPU/shl_add_ptr_csub.ll | 2 +-
.../AMDGPU/expand-atomic-i16-system.ll | 233 ++++++
.../AtomicExpand/AMDGPU/expand-atomic-i16.ll | 346 ++++++++
.../AMDGPU/expand-atomic-i32-agent.ll | 90 ++
.../AMDGPU/expand-atomic-i32-system.ll | 90 ++
.../AMDGPU/expand-atomic-i64-agent.ll | 170 ++++
.../AMDGPU/expand-atomic-i64-system.ll | 170 ++++
.../AMDGPU/expand-atomic-i8-system.ll | 368 +++++++++
.../AtomicExpand/AMDGPU/expand-atomic-i8.ll | 772 ++++++++++++++++++
28 files changed, 3374 insertions(+), 28 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/atomicrmw_usub_cond.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/atomicrmw_usub_sat.ll
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index 8bfa34584c3a4..a6555ef77ab28 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -1624,6 +1624,8 @@ def int_amdgcn_raw_ptr_buffer_atomic_or : AMDGPURawPtrBufferAtomic;
def int_amdgcn_raw_ptr_buffer_atomic_xor : AMDGPURawPtrBufferAtomic;
def int_amdgcn_raw_ptr_buffer_atomic_inc : AMDGPURawPtrBufferAtomic;
def int_amdgcn_raw_ptr_buffer_atomic_dec : AMDGPURawPtrBufferAtomic;
+def int_amdgcn_raw_ptr_buffer_atomic_usub_cond : AMDGPURawPtrBufferAtomic;
+def int_amdgcn_raw_ptr_buffer_atomic_usub_sat : AMDGPURawPtrBufferAtomic;
def int_amdgcn_raw_ptr_buffer_atomic_cond_sub_u32 : AMDGPURawPtrBufferAtomic;
def int_amdgcn_raw_ptr_buffer_atomic_cmpswap : Intrinsic<
[llvm_anyint_ty],
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
index 891d362503f15..7c5a06a46d0b1 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
@@ -288,6 +288,8 @@ def : GINodeEquiv<G_AMDGPU_TBUFFER_STORE_FORMAT_D16, SItbuffer_store_d16>;
// FIXME: Check MMO is atomic
def : GINodeEquiv<G_ATOMICRMW_UINC_WRAP, atomic_load_uinc_wrap_glue>;
def : GINodeEquiv<G_ATOMICRMW_UDEC_WRAP, atomic_load_udec_wrap_glue>;
+def : GINodeEquiv<G_ATOMICRMW_USUB_COND, atomic_load_usub_cond_glue>;
+def : GINodeEquiv<G_ATOMICRMW_USUB_SAT, atomic_load_usub_sat_glue>;
def : GINodeEquiv<G_ATOMICRMW_FMIN, atomic_load_fmin_glue>;
def : GINodeEquiv<G_ATOMICRMW_FMAX, atomic_load_fmax_glue>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 877c3ac34d555..bff9660153a11 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -4154,6 +4154,8 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) {
case TargetOpcode::G_ATOMICRMW_UMAX:
case TargetOpcode::G_ATOMICRMW_UINC_WRAP:
case TargetOpcode::G_ATOMICRMW_UDEC_WRAP:
+ case TargetOpcode::G_ATOMICRMW_USUB_COND:
+ case TargetOpcode::G_ATOMICRMW_USUB_SAT:
case TargetOpcode::G_ATOMICRMW_FADD:
case TargetOpcode::G_ATOMICRMW_FMIN:
case TargetOpcode::G_ATOMICRMW_FMAX:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
index 7a50923ffedc6..e3d4dfc039799 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
@@ -694,6 +694,8 @@ defm atomic_load_fmin : binary_atomic_op_fp_all_as<atomic_load_fmin>;
defm atomic_load_fmax : binary_atomic_op_fp_all_as<atomic_load_fmax>;
defm atomic_load_uinc_wrap : binary_atomic_op_all_as<atomic_load_uinc_wrap>;
defm atomic_load_udec_wrap : binary_atomic_op_all_as<atomic_load_udec_wrap>;
+defm atomic_load_usub_cond : binary_atomic_op_all_as<atomic_load_usub_cond>;
+defm atomic_load_usub_sat : binary_atomic_op_all_as<atomic_load_usub_sat>;
defm AMDGPUatomic_cmp_swap : binary_atomic_op_all_as<AMDGPUatomic_cmp_swap>;
def load_align8_local : PatFrag<(ops node:$ptr), (load_local node:$ptr)>,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index fedfa3f9dd900..040cdf1d3b775 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -1668,6 +1668,13 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
}
+ auto &Atomics32 =
+ getActionDefinitionsBuilder({G_ATOMICRMW_USUB_COND, G_ATOMICRMW_USUB_SAT})
+ .legalFor({{S32, GlobalPtr}, {S32, LocalPtr}, {S32, RegionPtr}});
+ if (ST.hasFlatAddressSpace()) {
+ Atomics32.legalFor({{S32, FlatPtr}});
+ }
+
// TODO: v2bf16 operations, and fat buffer pointer support.
auto &Atomic = getActionDefinitionsBuilder(G_ATOMICRMW_FADD);
if (ST.hasLDSFPAtomicAddF32()) {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
index 304e91ec184f1..7d5a390a66162 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
@@ -1742,6 +1742,12 @@ Value *SplitPtrStructs::handleMemoryInst(Instruction *I, Value *Arg, Value *Ptr,
case AtomicRMWInst::FMin:
IID = Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin;
break;
+ case AtomicRMWInst::USubCond:
+ IID = Intrinsic::amdgcn_raw_ptr_buffer_atomic_usub_cond;
+ break;
+ case AtomicRMWInst::USubSat:
+ IID = Intrinsic::amdgcn_raw_ptr_buffer_atomic_usub_sat;
+ break;
case AtomicRMWInst::FSub: {
reportFatalUsageError(
"atomic floating point subtraction not supported for "
@@ -1768,13 +1774,10 @@ Value *SplitPtrStructs::handleMemoryInst(Instruction *I, Value *Arg, Value *Ptr,
case AtomicRMWInst::UIncWrap:
case AtomicRMWInst::UDecWrap:
reportFatalUsageError("wrapping increment/decrement not supported for "
- "buffer resources and should've ben expanded away");
+ "buffer resources and should've been expanded away");
break;
case AtomicRMWInst::BAD_BINOP:
llvm_unreachable("Not sure how we got a bad binop");
- case AtomicRMWInst::USubCond:
- case AtomicRMWInst::USubSat:
- break;
}
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 9b05f7c339738..5276c96433b53 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -5518,6 +5518,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case AMDGPU::G_ATOMICRMW_FMAX:
case AMDGPU::G_ATOMICRMW_UINC_WRAP:
case AMDGPU::G_ATOMICRMW_UDEC_WRAP:
+ case AMDGPU::G_ATOMICRMW_USUB_COND:
+ case AMDGPU::G_ATOMICRMW_USUB_SAT:
case AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG: {
OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());
diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td
index 319cc9d1da181..1198051de793c 100644
--- a/llvm/lib/Target/AMDGPU/DSInstructions.td
+++ b/llvm/lib/Target/AMDGPU/DSInstructions.td
@@ -1133,7 +1133,34 @@ multiclass DSAtomicRetNoRetPat_mc<DS_Pseudo inst, DS_Pseudo noRetInst,
}
}
+multiclass DSAtomicRetNoRetPatCondSub_mc<DS_Pseudo inst, DS_Pseudo noRetInst,
+ ValueType vt, string frag> {
+ let OtherPredicates = [LDSRequiresM0Init] in {
+ def : DSAtomicRetPat<inst, vt,
+ !cast<PatFrag>(frag#"_local_m0_"#vt)>;
+ let OtherPredicates = [HasAtomicCSubNoRtnInsts] in
+ def : DSAtomicRetPat<noRetInst, vt,
+ !cast<PatFrag>(frag#"_local_m0_noret_"#vt), /* complexity */ 1>;
+ }
+ let OtherPredicates = [NotLDSRequiresM0Init] in {
+ def : DSAtomicRetPat<!cast<DS_Pseudo>(!cast<string>(inst)#"_gfx9"), vt,
+ !cast<PatFrag>(frag#"_local_"#vt)>;
+ let OtherPredicates = [HasAtomicCSubNoRtnInsts] in
+ def : DSAtomicRetPat<!cast<DS_Pseudo>(!cast<string>(noRetInst)#"_gfx9"), vt,
+ !cast<PatFrag>(frag#"_local_noret_"#vt), /* complexity */ 1>;
+ }
+
+ let OtherPredicates = [HasGDS] in {
+ def : DSAtomicRetPat<inst, vt,
+ !cast<PatFrag>(frag#"_region_m0_"#vt),
+ /* complexity */ 0, /* gds */ 1>;
+ let OtherPredicates = [HasAtomicCSubNoRtnInsts] in
+ def : DSAtomicRetPat<noRetInst, vt,
+ !cast<PatFrag>(frag#"_region_m0_noret_"#vt),
+ /* complexity */ 1, /* gds */ 1>;
+ }
+}
let SubtargetPredicate = isGFX6GFX7GFX8GFX9GFX10 in {
// Caution, the order of src and cmp is the *opposite* of the BUFFER_ATOMIC_CMPSWAP opcode.
@@ -1216,6 +1243,14 @@ defm : DSAtomicRetNoRetPat_mc<DS_PK_ADD_RTN_F16, DS_PK_ADD_F16, v2f16, "atomic_l
defm : DSAtomicRetNoRetPat_mc<DS_PK_ADD_RTN_BF16, DS_PK_ADD_BF16, v2bf16, "atomic_load_fadd">;
}
+let SubtargetPredicate = isGFX12Plus in {
+
+defm : DSAtomicRetNoRetPatCondSub_mc<DS_COND_SUB_RTN_U32, DS_COND_SUB_U32, i32, "atomic_load_usub_cond">;
+
+defm : DSAtomicRetNoRetPat_mc<DS_SUB_CLAMP_RTN_U32, DS_SUB_CLAMP_U32, i32, "atomic_load_usub_sat">;
+
+} // let SubtargetPredicate = isGFX12Plus
+
let SubtargetPredicate = isGFX6GFX7GFX8GFX9GFX10 in {
defm : DSAtomicCmpXChgSwapped_mc<DS_CMPST_RTN_B32, DS_CMPST_B32, i32, "atomic_cmp_swap">;
}
diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td
index 679c55dd0ea48..b3b76f99f3571 100644
--- a/llvm/lib/Target/AMDGPU/FLATInstructions.td
+++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -1806,6 +1806,12 @@ defm : FlatAtomicPat <"FLAT_ATOMIC_MIN_F64", "atomic_load_fmin_"#as, f64>;
defm : FlatAtomicPat <"FLAT_ATOMIC_MAX_F64", "atomic_load_fmax_"#as, f64>;
}
+let SubtargetPredicate = isGFX12Plus in {
+ defm : FlatAtomicRtnPat<"FLAT_ATOMIC_COND_SUB_U32", "atomic_load_usub_cond_" #as, i32 >;
+
+ let OtherPredicates = [HasAtomicCSubNoRtnInsts] in
+ defm : FlatAtomicNoRtnPat<"FLAT_ATOMIC_COND_SUB_U32", "atomic_load_usub_cond_"#as, i32>;
+}
} // end foreach as
defm : FlatStorePats <FLAT_STORE_BYTE, truncstorei8_flat, i16>;
@@ -1963,10 +1969,10 @@ defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_OR", "atomic_load_or_global", i32>;
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_SWAP", "atomic_swap_global", i32>;
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_CMPSWAP", "AMDGPUatomic_cmp_swap_global", i32, v2i32>;
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_XOR", "atomic_load_xor_global", i32>;
-defm : GlobalFLATAtomicPatsRtn <"GLOBAL_ATOMIC_CSUB", "int_amdgcn_global_atomic_csub", i32, i32, /* isIntr */ 1>;
+defm : GlobalFLATAtomicPatsRtn <"GLOBAL_ATOMIC_CSUB", "atomic_load_usub_sat_global", i32>;
let OtherPredicates = [HasAtomicCSubNoRtnInsts] in
-defm : GlobalFLATAtomicPatsNoRtn <"GLOBAL_ATOMIC_CSUB", "int_amdgcn_global_atomic_csub", i32, i32, /* isIntr */ 1>;
+defm : GlobalFLATAtomicPatsNoRtn <"GLOBAL_ATOMIC_CSUB", "atomic_load_usub_sat_global", i32>;
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_ADD_X2", "atomic_load_add_global", i64>;
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_SUB_X2", "atomic_load_sub_global", i64>;
@@ -1983,10 +1989,10 @@ defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_CMPSWAP_X2", "AMDGPUatomic_cmp_swap_
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_XOR_X2", "atomic_load_xor_global", i64>;
let SubtargetPredicate = isGFX12Plus in {
- defm : GlobalFLATAtomicPatsRtnWithAddrSpace <"GLOBAL_ATOMIC_COND_SUB_U32", "int_amdgcn_atomic_cond_sub_u32", "global_addrspace", i32>;
+ defm : GlobalFLATAtomicPatsRtn <"GLOBAL_ATOMIC_COND_SUB_U32", "atomic_load_usub_cond_global", i32>;
let OtherPredicates = [HasAtomicCSubNoRtnInsts] in
- defm : GlobalFLATAtomicPatsNoRtnWithAddrSpace <"GLOBAL_ATOMIC_COND_SUB_U32", "int_amdgcn_atomic_cond_sub_u32", "global_addrspace", i32>;
+ defm : GlobalFLATAtomicPatsNoRtn <"GLOBAL_ATOMIC_COND_SUB_U32", "atomic_load_usub_cond_global", i32>;
}
let OtherPredicates = [isGFX12PlusNot12_50] in
diff --git a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
index 3c8bfa629ed3d..e452e206dad2d 100644
--- a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
@@ -2191,6 +2191,14 @@ R600TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
// FIXME: Cayman at least appears to have instructions for this, but the
// instruction defintions appear to be missing.
return AtomicExpansionKind::CmpXChg;
+ case AtomicRMWInst::USubCond:
+ case AtomicRMWInst::USubSat:
+ if (auto *IntTy = dyn_cast<IntegerType>(RMW->getType())) {
+ unsigned Size = IntTy->getBitWidth();
+ if (Size == 32)
+ return AtomicExpansionKind::None;
+ }
+ return AtomicExpansionKind::CmpXChg;
case AtomicRMWInst::Xchg: {
const DataLayout &DL = RMW->getFunction()->getDataLayout();
unsigned ValSize = DL.getTypeSizeInBits(RMW->getType());
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index f1a8ee118356e..9aac5fc2152a3 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1035,6 +1035,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
ISD::ATOMIC_LOAD_FMAX,
ISD::ATOMIC_LOAD_UINC_WRAP,
ISD::ATOMIC_LOAD_UDEC_WRAP,
+ ISD::ATOMIC_LOAD_USUB_COND,
+ ISD::ATOMIC_LOAD_USUB_SAT,
ISD::INTRINSIC_VOID,
ISD::INTRINSIC_W_CHAIN});
@@ -17436,10 +17438,10 @@ static bool isV2BF16(Type *Ty) {
}
/// \return true if atomicrmw integer ops work for the type.
-static bool isAtomicRMWLegalIntTy(Type *Ty) {
+static bool isAtomicRMWLegalIntTy(Type *Ty, bool Allow64 = true) {
if (auto *IT = dyn_cast<IntegerType>(Ty)) {
unsigned BW = IT->getBitWidth();
- return BW == 32 || BW == 64;
+ return BW == 32 || (BW == 64 && Allow64);
}
return false;
@@ -17491,8 +17493,8 @@ static bool globalMemoryFPAtomicIsLegal(const GCNSubtarget &Subtarget,
/// \return Action to perform on AtomicRMWInsts for integer operations.
static TargetLowering::AtomicExpansionKind
-atomicSupportedIfLegalIntType(const AtomicRMWInst *RMW) {
- return isAtomicRMWLegalIntTy(RMW->getType())
+atomicSupportedIfLegalIntType(const AtomicRMWInst *RMW, bool Allow64 = true) {
+ return isAtomicRMWLegalIntTy(RMW->getType(), Allow64)
? TargetLowering::AtomicExpansionKind::None
: TargetLowering::AtomicExpansionKind::CmpXChg;
}
@@ -17561,6 +17563,9 @@ SITargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
case AtomicRMWInst::UIncWrap:
case AtomicRMWInst::UDecWrap:
return atomicSupportedIfLegalIntType(RMW);
+ case AtomicRMWInst::USubCond:
+ case AtomicRMWInst::USubSat:
+ return atomicSupportedIfLegalIntType(RMW, false);
case AtomicRMWInst::Sub:
case AtomicRMWInst::Or:
case AtomicRMWInst::Xor: {
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 485ca78db93a7..8e68d2e5febe9 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -816,6 +816,8 @@ defm atomic_load_add : SIAtomicM0Glue2 <"LOAD_ADD">;
defm atomic_load_sub : SIAtomicM0Glue2 <"LOAD_SUB">;
defm atomic_load_uinc_wrap : SIAtomicM0Glue2 <"LOAD_UINC_WRAP">;
defm atomic_load_udec_wrap : SIAtomicM0Glue2 <"LOAD_UDEC_WRAP">;
+defm atomic_load_usub_cond : SIAtomicM0Glue2 <"LOAD_USUB_COND">;
+defm atomic_load_usub_sat : SIAtomicM0Glue2 <"LOAD_USUB_SAT">;
defm atomic_load_and : SIAtomicM0Glue2 <"LOAD_AND">;
defm atomic_load_min : SIAtomicM0Glue2 <"LOAD_MIN">;
defm atomic_load_max : SIAtomicM0Glue2 <"LOAD_MAX">;
diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/atomics-gmir.mir b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/atomics-gmir.mir
index 6729faf233c35..d7862e8373121 100644
--- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/atomics-gmir.mir
+++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/MIR/atomics-gmir.mir
@@ -82,6 +82,12 @@ body: |
; CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s32) = G_ATOMICRMW_UDEC_WRAP
%20:_(s32) = G_ATOMICRMW_UDEC_WRAP %1, %5
+ ; CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s32) = G_ATOMICRMW_USUB_COND
+ %21:_(s32) = G_ATOMICRMW_USUB_COND %1, %5
+
+ ; CHECK: DIVERGENT: %{{[0-9]*}}: %{{[0-9]*}}:_(s32) = G_ATOMICRMW_USUB_SAT
+ %22:_(s32) = G_ATOMICRMW_USUB_SAT %1, %5
+
$vgpr0 = COPY %4(s32)
SI_RETURN implicit $vgpr0
diff --git a/llvm/test/CodeGen/AMDGPU/atomicrmw_usub_cond.ll b/llvm/test/CodeGen/AMDGPU/atomicrmw_usub_cond.ll
new file mode 100644
index 0000000000000..f897e2817ec66
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/atomicrmw_usub_cond.ll
@@ -0,0 +1,482 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12-SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12-GISEL %s
+
+define amdgpu_kernel void @flat_atomic_usub_cond_no_rtn_u32(ptr %addr, i32 %in) {
+; GFX12-SDAG-LABEL: flat_atomic_usub_cond_no_rtn_u32:
+; GFX12-SDAG: ; %bb.0: ; %entry
+; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s2
+; GFX12-SDAG-NEXT: global_wb scope:SCOPE_SYS
+; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
+; GFX12-SDAG-NEXT: flat_atomic_cond_sub_u32 v0, v[0:1], v2 offset:-16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-SDAG-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: flat_atomic_usub_cond_no_rtn_u32:
+; GFX12-GISEL: ; %bb.0: ; %entry
+; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v1, s1
+; GFX12-GISEL-NEXT: global_wb scope:SCOPE_SYS
+; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
+; GFX12-GISEL-NEXT: flat_atomic_cond_sub_u32 v0, v[0:1], v2 offset:-16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-GISEL-NEXT: s_endpgm
+entry:
+ %gep = getelementptr i32, ptr %addr, i32 -4
+ %unused = atomicrmw usub_cond ptr %gep, i32 %in seq_cst
+ ret void
+}
+
+define amdgpu_kernel void @flat_atomic_usub_cond_no_rtn_u32_forced(ptr %addr, i32 %in) "target-features"="+atomic-csub-no-rtn-insts" {
+; GFX12-SDAG-LABEL: flat_atomic_usub_cond_no_rtn_u32_forced:
+; GFX12-SDAG: ; %bb.0: ; %entry
+; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s2
+; GFX12-SDAG-NEXT: global_wb scope:SCOPE_SYS
+; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
+; GFX12-SDAG-NEXT: flat_atomic_cond_sub_u32 v[0:1], v2 offset:-16 scope:SCOPE_SYS
+; GFX12-SDAG-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX12-SDAG-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-SDAG-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: flat_atomic_usub_cond_no_rtn_u32_forced:
+; GFX12-GISEL: ; %bb.0: ; %entry
+; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v1, s1
+; GFX12-GISEL-NEXT: global_wb scope:SCOPE_SYS
+; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
+; GFX12-GISEL-NEXT: flat_atomic_cond_sub_u32 v[0:1], v2 offset:-16 scope:SCOPE_SYS
+; GFX12-GISEL-NEXT: s_wait_storecnt_dscnt 0x0
+; GFX12-GISEL-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-GISEL-NEXT: s_endpgm
+entry:
+ %gep = getelementptr i32, ptr %addr, i32 -4
+ %unused = atomicrmw usub_cond ptr %gep, i32 %in seq_cst
+ ret void
+}
+
+define amdgpu_kernel void @flat_atomic_usub_cond_rtn_u32(ptr %addr, i32 %in, ptr %use) {
+; GFX12-SDAG-LABEL: flat_atomic_usub_cond_rtn_u32:
+; GFX12-SDAG: ; %bb.0: ; %entry
+; GFX12-SDAG-NEXT: s_clause 0x1
+; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
+; GFX12-SDAG-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s2
+; GFX12-SDAG-NEXT: global_wb scope:SCOPE_SYS
+; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
+; GFX12-SDAG-NEXT: flat_atomic_cond_sub_u32 v2, v[0:1], v2 offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-SDAG-NEXT: flat_store_b32 v[0:1], v2
+; GFX12-SDAG-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: flat_atomic_usub_cond_rtn_u32:
+; GFX12-GISEL: ; %bb.0: ; %entry
+; GFX12-GISEL-NEXT: s_clause 0x1
+; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
+; GFX12-GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v1, s1
+; GFX12-GISEL-NEXT: global_wb scope:SCOPE_SYS
+; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
+; GFX12-GISEL-NEXT: flat_atomic_cond_sub_u32 v2, v[0:1], v2 offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX12-GISEL-NEXT: flat_store_b32 v[0:1], v2
+; GFX12-GISEL-NEXT: s_endpgm
+entry:
+ %gep = getelementptr i32, ptr %addr, i32 4
+ %val = atomicrmw usub_cond ptr %gep, i32 %in seq_cst
+ store i32 %val, ptr %use
+ ret void
+}
+
+define amdgpu_kernel void @global_atomic_usub_cond_no_rtn_u32(ptr addrspace(1) %addr, i32 %in) {
+; GFX12-SDAG-LABEL: global_atomic_usub_cond_no_rtn_u32:
+; GFX12-SDAG: ; %bb.0: ; %entry
+; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-SDAG-NEXT: global_wb scope:SCOPE_SYS
+; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
+; GFX12-SDAG-NEXT: global_atomic_cond_sub_u32 v0, v0, v1, s[0:1] offset:-16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-SDAG-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: global_atomic_usub_cond_no_rtn_u32:
+; GFX12-GISEL: ; %bb.0: ; %entry
+; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
+; GFX12-GISEL-NEXT: global_wb scope:SCOPE_SYS
+; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
+; GFX12-GISEL-NEXT: global_atomic_cond_sub_u32 v0, v1, v0, s[0:1] offset:-16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-GISEL-NEXT: s_endpgm
+entry:
+ %gep = getelementptr i32, ptr addrspace(1) %addr, i32 -4
+ %unused = atomicrmw usub_cond ptr addrspace(1) %gep, i32 %in seq_cst
+ ret void
+}
+
+define amdgpu_kernel void @global_atomic_usub_cond_no_rtn_u32_forced(ptr addrspace(1) %addr, i32 %in) "target-features"="+atomic-csub-no-rtn-insts" {
+; GFX12-SDAG-LABEL: global_atomic_usub_cond_no_rtn_u32_forced:
+; GFX12-SDAG: ; %bb.0: ; %entry
+; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-SDAG-NEXT: global_wb scope:SCOPE_SYS
+; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
+; GFX12-SDAG-NEXT: global_atomic_cond_sub_u32 v0, v1, s[0:1] offset:-16 scope:SCOPE_SYS
+; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
+; GFX12-SDAG-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-SDAG-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: global_atomic_usub_cond_no_rtn_u32_forced:
+; GFX12-GISEL: ; %bb.0: ; %entry
+; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
+; GFX12-GISEL-NEXT: global_wb scope:SCOPE_SYS
+; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
+; GFX12-GISEL-NEXT: global_atomic_cond_sub_u32 v1, v0, s[0:1] offset:-16 scope:SCOPE_SYS
+; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
+; GFX12-GISEL-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-GISEL-NEXT: s_endpgm
+entry:
+ %gep = getelementptr i32, ptr addrspace(1) %addr, i32 -4
+ %unused = atomicrmw usub_cond ptr addrspace(1) %gep, i32 %in seq_cst
+ ret void
+}
+
+define amdgpu_kernel void @global_atomic_usub_cond_rtn_u32(ptr addrspace(1) %addr, i32 %in, ptr addrspace(1) %use) {
+; GFX12-SDAG-LABEL: global_atomic_usub_cond_rtn_u32:
+; GFX12-SDAG: ; %bb.0: ; %entry
+; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-SDAG-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2
+; GFX12-SDAG-NEXT: global_wb scope:SCOPE_SYS
+; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
+; GFX12-SDAG-NEXT: global_atomic_cond_sub_u32 v1, v0, v1, s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[4:5]
+; GFX12-SDAG-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: global_atomic_usub_cond_rtn_u32:
+; GFX12-GISEL: ; %bb.0: ; %entry
+; GFX12-GISEL-NEXT: s_clause 0x1
+; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
+; GFX12-GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
+; GFX12-GISEL-NEXT: global_wb scope:SCOPE_SYS
+; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
+; GFX12-GISEL-NEXT: global_atomic_cond_sub_u32 v0, v1, v0, s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[4:5]
+; GFX12-GISEL-NEXT: s_endpgm
+entry:
+ %gep = getelementptr i32, ptr addrspace(1) %addr, i32 4
+ %val = atomicrmw usub_cond ptr addrspace(1) %gep, i32 %in seq_cst
+ store i32 %val, ptr addrspace(1) %use
+ ret void
+}
+
+define amdgpu_kernel void @ds_usub_cond_no_rtn_u32(ptr addrspace(3) %addr, i32 %in) {
+; GFX12-SDAG-LABEL: ds_usub_cond_no_rtn_u32:
+; GFX12-SDAG: ; %bb.0: ; %entry
+; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: s_add_co_i32 s0, s0, -16
+; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v0, s0
+; GFX12-SDAG-NEXT: ds_cond_sub_rtn_u32 v0, v0, v1
+; GFX12-SDAG-NEXT: s_wait_dscnt 0x0
+; GFX12-SDAG-NEXT: global_inv scope:SCOPE_SE
+; GFX12-SDAG-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: ds_usub_cond_no_rtn_u32:
+; GFX12-GISEL: ; %bb.0: ; %entry
+; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, -16
+; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v0, s0
+; GFX12-GISEL-NEXT: ds_cond_sub_rtn_u32 v0, v0, v1
+; GFX12-GISEL-NEXT: s_wait_dscnt 0x0
+; GFX12-GISEL-NEXT: global_inv scope:SCOPE_SE
+; GFX12-GISEL-NEXT: s_endpgm
+entry:
+ %gep = getelementptr i32, ptr addrspace(3) %addr, i32 -4
+ %unused = atomicrmw usub_cond ptr addrspace(3) %gep, i32 %in seq_cst
+ ret void
+}
+
+define amdgpu_kernel void @ds_usub_cond_no_rtn_u32_forced(ptr addrspace(3) %addr, i32 %in) "target-features"="+atomic-csub-no-rtn-insts" {
+; GFX12-SDAG-LABEL: ds_usub_cond_no_rtn_u32_forced:
+; GFX12-SDAG: ; %bb.0: ; %entry
+; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: s_add_co_i32 s0, s0, -16
+; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v0, s0
+; GFX12-SDAG-NEXT: ds_cond_sub_u32 v0, v1
+; GFX12-SDAG-NEXT: s_wait_dscnt 0x0
+; GFX12-SDAG-NEXT: global_inv scope:SCOPE_SE
+; GFX12-SDAG-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: ds_usub_cond_no_rtn_u32_forced:
+; GFX12-GISEL: ; %bb.0: ; %entry
+; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, -16
+; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v0, s0
+; GFX12-GISEL-NEXT: ds_cond_sub_u32 v0, v1
+; GFX12-GISEL-NEXT: s_wait_dscnt 0x0
+; GFX12-GISEL-NEXT: global_inv scope:SCOPE_SE
+; GFX12-GISEL-NEXT: s_endpgm
+entry:
+ %gep = getelementptr i32, ptr addrspace(3) %addr, i32 -4
+ %unused = atomicrmw usub_cond ptr addrspace(3) %gep, i32 %in seq_cst
+ ret void
+}
+
+define amdgpu_kernel void @ds_usub_cond_rtn_u32(ptr addrspace(3) %addr, i32 %in, ptr addrspace(3) %use) {
+; GFX12-SDAG-LABEL: ds_usub_cond_rtn_u32:
+; GFX12-SDAG: ; %bb.0: ; %entry
+; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-SDAG-NEXT: ds_cond_sub_rtn_u32 v0, v0, v1 offset:16
+; GFX12-SDAG-NEXT: s_wait_dscnt 0x0
+; GFX12-SDAG-NEXT: global_inv scope:SCOPE_SE
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2
+; GFX12-SDAG-NEXT: ds_store_b32 v1, v0
+; GFX12-SDAG-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: ds_usub_cond_rtn_u32:
+; GFX12-GISEL: ; %bb.0: ; %entry
+; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0
+; GFX12-GISEL-NEXT: ds_cond_sub_rtn_u32 v0, v1, v0 offset:16
+; GFX12-GISEL-NEXT: s_wait_dscnt 0x0
+; GFX12-GISEL-NEXT: global_inv scope:SCOPE_SE
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX12-GISEL-NEXT: ds_store_b32 v1, v0
+; GFX12-GISEL-NEXT: s_endpgm
+entry:
+ %gep = getelementptr i32, ptr addrspace(3) %addr, i32 4
+ %val = atomicrmw usub_cond ptr addrspace(3) %gep, i32 %in seq_cst
+ store i32 %val, ptr addrspace(3) %use
+ ret void
+}
+
+define i32 @global_atomic_usub_cond(ptr addrspace(1) %ptr, i32 %data) {
+; GFX12-SDAG-LABEL: global_atomic_usub_cond:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
+; GFX12-SDAG-NEXT: global_atomic_cond_sub_u32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-LABEL: global_atomic_usub_cond:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT: s_wait_expcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
+; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
+; GFX12-GISEL-NEXT: global_atomic_cond_sub_u32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %ret = atomicrmw usub_cond ptr addrspace(1) %ptr, i32 %data syncscope("agent") seq_cst, align 4
+ ret i32 %ret
+}
+
+define i32 @global_atomic_usub_cond_offset(ptr addrspace(1) %ptr, i32 %data) {
+; GFX12-SDAG-LABEL: global_atomic_usub_cond_offset:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
+; GFX12-SDAG-NEXT: global_atomic_cond_sub_u32 v0, v[0:1], v2, off offset:4096 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-LABEL: global_atomic_usub_cond_offset:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT: s_wait_expcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
+; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
+; GFX12-GISEL-NEXT: global_atomic_cond_sub_u32 v0, v[0:1], v2, off offset:4096 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 1024
+ %ret = atomicrmw usub_cond ptr addrspace(1) %gep, i32 %data syncscope("agent") seq_cst, align 4
+ ret i32 %ret
+}
+
+define void @global_atomic_usub_cond_nortn(ptr addrspace(1) %ptr, i32 %data) {
+; GFX12-SDAG-LABEL: global_atomic_usub_cond_nortn:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
+; GFX12-SDAG-NEXT: global_atomic_cond_sub_u32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-LABEL: global_atomic_usub_cond_nortn:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT: s_wait_expcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
+; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
+; GFX12-GISEL-NEXT: global_atomic_cond_sub_u32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %ret = atomicrmw usub_cond ptr addrspace(1) %ptr, i32 %data syncscope("agent") seq_cst, align 4
+ ret void
+}
+
+define void @global_atomic_usub_cond_offset_nortn(ptr addrspace(1) %ptr, i32 %data) {
+; GFX12-SDAG-LABEL: global_atomic_usub_cond_offset_nortn:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
+; GFX12-SDAG-NEXT: global_atomic_cond_sub_u32 v0, v[0:1], v2, off offset:4096 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-LABEL: global_atomic_usub_cond_offset_nortn:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT: s_wait_expcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
+; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
+; GFX12-GISEL-NEXT: global_atomic_cond_sub_u32 v0, v[0:1], v2, off offset:4096 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 1024
+ %ret = atomicrmw usub_cond ptr addrspace(1) %gep, i32 %data syncscope("agent") seq_cst, align 4
+ ret void
+}
+
+define amdgpu_kernel void @global_atomic_usub_cond_sgpr_base_offset(ptr addrspace(1) %ptr, i32 %data) {
+; GFX12-SDAG-LABEL: global_atomic_usub_cond_sgpr_base_offset:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-SDAG-NEXT: global_atomic_cond_sub_u32 v0, v0, v1, s[0:1] offset:4096 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-SDAG-NEXT: global_store_b32 v[0:1], v0, off
+; GFX12-SDAG-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: global_atomic_usub_cond_sgpr_base_offset:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
+; GFX12-GISEL-NEXT: global_atomic_cond_sub_u32 v0, v1, v0, s[0:1] offset:4096 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-GISEL-NEXT: global_store_b32 v[0:1], v0, off
+; GFX12-GISEL-NEXT: s_endpgm
+ %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 1024
+ %ret = atomicrmw usub_cond ptr addrspace(1) %gep, i32 %data syncscope("agent") seq_cst, align 4
+ store i32 %ret, ptr addrspace(1) undef
+ ret void
+}
+
+define amdgpu_kernel void @global_atomic_usub_cond_sgpr_base_offset_nortn(ptr addrspace(1) %ptr, i32 %data) {
+; GFX12-SDAG-LABEL: global_atomic_usub_cond_sgpr_base_offset_nortn:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-SDAG-NEXT: global_atomic_cond_sub_u32 v0, v0, v1, s[0:1] offset:4096 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-SDAG-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: global_atomic_usub_cond_sgpr_base_offset_nortn:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
+; GFX12-GISEL-NEXT: global_atomic_cond_sub_u32 v0, v1, v0, s[0:1] offset:4096 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-GISEL-NEXT: s_endpgm
+ %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 1024
+ %ret = atomicrmw usub_cond ptr addrspace(1) %gep, i32 %data syncscope("agent") seq_cst, align 4
+ ret void
+}
+
+attributes #0 = { nounwind willreturn }
+attributes #1 = { argmemonly nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/atomicrmw_usub_sat.ll b/llvm/test/CodeGen/AMDGPU/atomicrmw_usub_sat.ll
new file mode 100644
index 0000000000000..2e788d8eca50e
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/atomicrmw_usub_sat.ll
@@ -0,0 +1,487 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 < %s | FileCheck %s -check-prefix=GFX10-GISEL
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1031 < %s | FileCheck %s -check-prefix=GFX10-GISEL
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck %s -check-prefix=GFX11-GISEL
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck %s -check-prefix=GFX12-GISEL
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 < %s | FileCheck %s -check-prefix=GFX10-SDAG
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1031 < %s | FileCheck %s -check-prefix=GFX10-SDAG
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck %s -check-prefix=GFX11-SDAG
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck %s -check-prefix=GFX12-SDAG
+
+define i32 @global_atomic_usub_sat(ptr addrspace(1) %ptr, i32 %data) {
+; GFX10-GISEL-LABEL: global_atomic_usub_sat:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-GISEL-NEXT: global_atomic_csub v0, v[0:1], v2, off glc
+; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX10-GISEL-NEXT: buffer_gl1_inv
+; GFX10-GISEL-NEXT: buffer_gl0_inv
+; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: global_atomic_usub_sat:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-GISEL-NEXT: global_atomic_csub_u32 v0, v[0:1], v2, off glc
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT: buffer_gl1_inv
+; GFX11-GISEL-NEXT: buffer_gl0_inv
+; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-LABEL: global_atomic_usub_sat:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT: s_wait_expcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
+; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
+; GFX12-GISEL-NEXT: global_atomic_sub_clamp_u32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: global_atomic_usub_sat:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-SDAG-NEXT: global_atomic_csub v0, v[0:1], v2, off glc
+; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX10-SDAG-NEXT: buffer_gl1_inv
+; GFX10-SDAG-NEXT: buffer_gl0_inv
+; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: global_atomic_usub_sat:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-NEXT: global_atomic_csub_u32 v0, v[0:1], v2, off glc
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT: buffer_gl1_inv
+; GFX11-SDAG-NEXT: buffer_gl0_inv
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-LABEL: global_atomic_usub_sat:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
+; GFX12-SDAG-NEXT: global_atomic_sub_clamp_u32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
+ %ret = atomicrmw usub_sat ptr addrspace(1) %ptr, i32 %data syncscope("agent") seq_cst, align 4
+ ret i32 %ret
+}
+
+define i32 @global_atomic_usub_sat_offset(ptr addrspace(1) %ptr, i32 %data) {
+; GFX10-GISEL-LABEL: global_atomic_usub_sat_offset:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0
+; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX10-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-GISEL-NEXT: global_atomic_csub v0, v[0:1], v2, off glc
+; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX10-GISEL-NEXT: buffer_gl1_inv
+; GFX10-GISEL-NEXT: buffer_gl0_inv
+; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: global_atomic_usub_sat_offset:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-GISEL-NEXT: global_atomic_csub_u32 v0, v[0:1], v2, off glc
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT: buffer_gl1_inv
+; GFX11-GISEL-NEXT: buffer_gl0_inv
+; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-LABEL: global_atomic_usub_sat_offset:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT: s_wait_expcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
+; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
+; GFX12-GISEL-NEXT: global_atomic_sub_clamp_u32 v0, v[0:1], v2, off offset:4096 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: global_atomic_usub_sat_offset:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0
+; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX10-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-SDAG-NEXT: global_atomic_csub v0, v[0:1], v2, off glc
+; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX10-SDAG-NEXT: buffer_gl1_inv
+; GFX10-SDAG-NEXT: buffer_gl0_inv
+; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: global_atomic_usub_sat_offset:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-NEXT: global_atomic_csub_u32 v0, v[0:1], v2, off glc
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT: buffer_gl1_inv
+; GFX11-SDAG-NEXT: buffer_gl0_inv
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-LABEL: global_atomic_usub_sat_offset:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
+; GFX12-SDAG-NEXT: global_atomic_sub_clamp_u32 v0, v[0:1], v2, off offset:4096 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
+ %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 1024
+ %ret = atomicrmw usub_sat ptr addrspace(1) %gep, i32 %data syncscope("agent") seq_cst, align 4
+ ret i32 %ret
+}
+
+define void @global_atomic_usub_sat_nortn(ptr addrspace(1) %ptr, i32 %data) {
+; GFX10-GISEL-LABEL: global_atomic_usub_sat_nortn:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-GISEL-NEXT: global_atomic_csub v0, v[0:1], v2, off glc
+; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX10-GISEL-NEXT: buffer_gl1_inv
+; GFX10-GISEL-NEXT: buffer_gl0_inv
+; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: global_atomic_usub_sat_nortn:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-GISEL-NEXT: global_atomic_csub_u32 v0, v[0:1], v2, off glc
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT: buffer_gl1_inv
+; GFX11-GISEL-NEXT: buffer_gl0_inv
+; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-LABEL: global_atomic_usub_sat_nortn:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT: s_wait_expcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
+; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
+; GFX12-GISEL-NEXT: global_atomic_sub_clamp_u32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: global_atomic_usub_sat_nortn:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-SDAG-NEXT: global_atomic_csub v0, v[0:1], v2, off glc
+; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX10-SDAG-NEXT: buffer_gl1_inv
+; GFX10-SDAG-NEXT: buffer_gl0_inv
+; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: global_atomic_usub_sat_nortn:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-NEXT: global_atomic_csub_u32 v0, v[0:1], v2, off glc
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT: buffer_gl1_inv
+; GFX11-SDAG-NEXT: buffer_gl0_inv
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-LABEL: global_atomic_usub_sat_nortn:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
+; GFX12-SDAG-NEXT: global_atomic_sub_clamp_u32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
+ %ret = atomicrmw usub_sat ptr addrspace(1) %ptr, i32 %data syncscope("agent") seq_cst, align 4
+ ret void
+}
+
+define void @global_atomic_usub_sat_offset_nortn(ptr addrspace(1) %ptr, i32 %data) {
+; GFX10-GISEL-LABEL: global_atomic_usub_sat_offset_nortn:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0
+; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX10-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-GISEL-NEXT: global_atomic_csub v0, v[0:1], v2, off glc
+; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX10-GISEL-NEXT: buffer_gl1_inv
+; GFX10-GISEL-NEXT: buffer_gl0_inv
+; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: global_atomic_usub_sat_offset_nortn:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-GISEL-NEXT: global_atomic_csub_u32 v0, v[0:1], v2, off glc
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT: buffer_gl1_inv
+; GFX11-GISEL-NEXT: buffer_gl0_inv
+; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-LABEL: global_atomic_usub_sat_offset_nortn:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT: s_wait_expcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
+; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
+; GFX12-GISEL-NEXT: global_atomic_sub_clamp_u32 v0, v[0:1], v2, off offset:4096 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: global_atomic_usub_sat_offset_nortn:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0
+; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX10-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-SDAG-NEXT: global_atomic_csub v0, v[0:1], v2, off glc
+; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX10-SDAG-NEXT: buffer_gl1_inv
+; GFX10-SDAG-NEXT: buffer_gl0_inv
+; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: global_atomic_usub_sat_offset_nortn:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-NEXT: global_atomic_csub_u32 v0, v[0:1], v2, off glc
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT: buffer_gl1_inv
+; GFX11-SDAG-NEXT: buffer_gl0_inv
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-LABEL: global_atomic_usub_sat_offset_nortn:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
+; GFX12-SDAG-NEXT: global_atomic_sub_clamp_u32 v0, v[0:1], v2, off offset:4096 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
+ %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 1024
+ %ret = atomicrmw usub_sat ptr addrspace(1) %gep, i32 %data syncscope("agent") seq_cst, align 4
+ ret void
+}
+
+define amdgpu_kernel void @global_atomic_usub_sat_sgpr_base_offset(ptr addrspace(1) %ptr, i32 %data) {
+; GFX10-GISEL-LABEL: global_atomic_usub_sat_sgpr_base_offset:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_clause 0x1
+; GFX10-GISEL-NEXT: s_load_dword s2, s[8:9], 0x8
+; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0x1000
+; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX10-GISEL-NEXT: global_atomic_csub v0, v1, v0, s[0:1] glc
+; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX10-GISEL-NEXT: buffer_gl1_inv
+; GFX10-GISEL-NEXT: buffer_gl0_inv
+; GFX10-GISEL-NEXT: global_store_dword v[0:1], v0, off
+; GFX10-GISEL-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: global_atomic_usub_sat_sgpr_base_offset:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_clause 0x1
+; GFX11-GISEL-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0x1000 :: v_dual_mov_b32 v0, s2
+; GFX11-GISEL-NEXT: global_atomic_csub_u32 v0, v1, v0, s[0:1] glc
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT: buffer_gl1_inv
+; GFX11-GISEL-NEXT: buffer_gl0_inv
+; GFX11-GISEL-NEXT: global_store_b32 v[0:1], v0, off
+; GFX11-GISEL-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: global_atomic_usub_sat_sgpr_base_offset:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
+; GFX12-GISEL-NEXT: global_atomic_sub_clamp_u32 v0, v1, v0, s[0:1] offset:4096 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-GISEL-NEXT: global_store_b32 v[0:1], v0, off
+; GFX12-GISEL-NEXT: s_endpgm
+;
+; GFX10-SDAG-LABEL: global_atomic_usub_sat_sgpr_base_offset:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_clause 0x1
+; GFX10-SDAG-NEXT: s_load_dword s2, s[8:9], 0x8
+; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, 0x1000
+; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s2
+; GFX10-SDAG-NEXT: global_atomic_csub v0, v0, v1, s[0:1] glc
+; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX10-SDAG-NEXT: buffer_gl1_inv
+; GFX10-SDAG-NEXT: buffer_gl0_inv
+; GFX10-SDAG-NEXT: global_store_dword v[0:1], v0, off
+; GFX10-SDAG-NEXT: s_endpgm
+;
+; GFX11-SDAG-LABEL: global_atomic_usub_sat_sgpr_base_offset:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_clause 0x1
+; GFX11-SDAG-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, 0x1000 :: v_dual_mov_b32 v1, s2
+; GFX11-SDAG-NEXT: global_atomic_csub_u32 v0, v0, v1, s[0:1] glc
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT: buffer_gl1_inv
+; GFX11-SDAG-NEXT: buffer_gl0_inv
+; GFX11-SDAG-NEXT: global_store_b32 v[0:1], v0, off
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX12-SDAG-LABEL: global_atomic_usub_sat_sgpr_base_offset:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-SDAG-NEXT: global_atomic_sub_clamp_u32 v0, v0, v1, s[0:1] offset:4096 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-SDAG-NEXT: global_store_b32 v[0:1], v0, off
+; GFX12-SDAG-NEXT: s_endpgm
+ %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 1024
+ %ret = atomicrmw usub_sat ptr addrspace(1) %gep, i32 %data syncscope("agent") seq_cst, align 4
+ store i32 %ret, ptr addrspace(1) undef
+ ret void
+}
+
+define amdgpu_kernel void @global_atomic_usub_sat_sgpr_base_offset_nortn(ptr addrspace(1) %ptr, i32 %data) {
+; GFX10-GISEL-LABEL: global_atomic_usub_sat_sgpr_base_offset_nortn:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_clause 0x1
+; GFX10-GISEL-NEXT: s_load_dword s2, s[8:9], 0x8
+; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0x1000
+; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, s2
+; GFX10-GISEL-NEXT: global_atomic_csub v0, v1, v0, s[0:1] glc
+; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX10-GISEL-NEXT: buffer_gl1_inv
+; GFX10-GISEL-NEXT: buffer_gl0_inv
+; GFX10-GISEL-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: global_atomic_usub_sat_sgpr_base_offset_nortn:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_clause 0x1
+; GFX11-GISEL-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0x1000 :: v_dual_mov_b32 v0, s2
+; GFX11-GISEL-NEXT: global_atomic_csub_u32 v0, v1, v0, s[0:1] glc
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT: buffer_gl1_inv
+; GFX11-GISEL-NEXT: buffer_gl0_inv
+; GFX11-GISEL-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: global_atomic_usub_sat_sgpr_base_offset_nortn:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
+; GFX12-GISEL-NEXT: global_atomic_sub_clamp_u32 v0, v1, v0, s[0:1] offset:4096 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-GISEL-NEXT: s_endpgm
+;
+; GFX10-SDAG-LABEL: global_atomic_usub_sat_sgpr_base_offset_nortn:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_clause 0x1
+; GFX10-SDAG-NEXT: s_load_dword s2, s[8:9], 0x8
+; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, 0x1000
+; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s2
+; GFX10-SDAG-NEXT: global_atomic_csub v0, v0, v1, s[0:1] glc
+; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX10-SDAG-NEXT: buffer_gl1_inv
+; GFX10-SDAG-NEXT: buffer_gl0_inv
+; GFX10-SDAG-NEXT: s_endpgm
+;
+; GFX11-SDAG-LABEL: global_atomic_usub_sat_sgpr_base_offset_nortn:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_clause 0x1
+; GFX11-SDAG-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, 0x1000 :: v_dual_mov_b32 v1, s2
+; GFX11-SDAG-NEXT: global_atomic_csub_u32 v0, v0, v1, s[0:1] glc
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT: buffer_gl1_inv
+; GFX11-SDAG-NEXT: buffer_gl0_inv
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX12-SDAG-LABEL: global_atomic_usub_sat_sgpr_base_offset_nortn:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX12-SDAG-NEXT: global_atomic_sub_clamp_u32 v0, v0, v1, s[0:1] offset:4096 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-SDAG-NEXT: s_endpgm
+ %gep = getelementptr i32, ptr addrspace(1) %ptr, i64 1024
+ %ret = atomicrmw usub_sat ptr addrspace(1) %gep, i32 %data syncscope("agent") seq_cst, align 4
+ ret void
+}
+
+attributes #0 = { nounwind willreturn }
+attributes #1 = { argmemonly nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx1030.ll b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx1030.ll
index 81f768f303ca1..021223df6f313 100644
--- a/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx1030.ll
+++ b/llvm/test/CodeGen/AMDGPU/cgp-addressing-modes-gfx1030.ll
@@ -7,12 +7,12 @@
define amdgpu_kernel void @test_sink_small_offset_global_atomic_csub_i32(ptr addrspace(1) %out, ptr addrspace(1) %in) {
; OPT-LABEL: @test_sink_small_offset_global_atomic_csub_i32(
; OPT-NEXT: entry:
-; OPT-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #[[ATTR3:[0-9]+]]
+; OPT-NEXT: [[TID:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #[[ATTR2:[0-9]+]]
; OPT-NEXT: [[CMP:%.*]] = icmp eq i32 [[TID]], 0
; OPT-NEXT: br i1 [[CMP]], label [[ENDIF:%.*]], label [[IF:%.*]]
; OPT: if:
; OPT-NEXT: [[IN_GEP:%.*]] = getelementptr i32, ptr addrspace(1) [[IN:%.*]], i32 7
-; OPT-NEXT: [[VAL:%.*]] = call i32 @llvm.amdgcn.global.atomic.csub.p1(ptr addrspace(1) [[IN_GEP]], i32 2)
+; OPT-NEXT: [[VAL:%.*]] = atomicrmw usub_sat ptr addrspace(1) [[IN_GEP]], i32 2 seq_cst, align 4
; OPT-NEXT: br label [[ENDIF]]
; OPT: endif:
; OPT-NEXT: [[X:%.*]] = phi i32 [ [[VAL]], [[IF]] ], [ 0, [[ENTRY:%.*]] ]
@@ -35,10 +35,13 @@ define amdgpu_kernel void @test_sink_small_offset_global_atomic_csub_i32(ptr add
; GCN-NEXT: v_mov_b32_e32 v1, 2
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: global_atomic_csub v0, v0, v1, s[2:3] offset:28 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_gl1_inv
+; GCN-NEXT: buffer_gl0_inv
; GCN-NEXT: .LBB0_2: ; %endif
; GCN-NEXT: s_or_b32 exec_lo, exec_lo, s4
; GCN-NEXT: v_mov_b32_e32 v1, 0x3d0800
-; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: global_store_dword v1, v0, s[0:1] offset:252
; GCN-NEXT: s_endpgm
entry:
@@ -48,7 +51,7 @@ entry:
if:
%in.gep = getelementptr i32, ptr addrspace(1) %in, i32 7
- %val = call i32 @llvm.amdgcn.global.atomic.csub.p1(ptr addrspace(1) %in.gep, i32 2)
+ %val = atomicrmw usub_sat ptr addrspace(1) %in.gep, i32 2 seq_cst
br label %endif
endif:
@@ -61,7 +64,6 @@ done:
ret void
}
-declare i32 @llvm.amdgcn.global.atomic.csub.p1(ptr addrspace(1) nocapture, i32) #0
declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #1
attributes #0 = { argmemonly nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/global-saddr-atomics.gfx1030.ll b/llvm/test/CodeGen/AMDGPU/global-saddr-atomics.gfx1030.ll
index 79de55eb63bf8..bd0fd79bab834 100644
--- a/llvm/test/CodeGen/AMDGPU/global-saddr-atomics.gfx1030.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-saddr-atomics.gfx1030.ll
@@ -10,10 +10,12 @@ define amdgpu_ps float @global_csub_saddr_i32_rtn(ptr addrspace(1) inreg %sbase,
; GCN: ; %bb.0:
; GCN-NEXT: global_atomic_csub v0, v0, v1, s[2:3] glc
; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_gl1_inv
+; GCN-NEXT: buffer_gl0_inv
; GCN-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
- %rtn = call i32 @llvm.amdgcn.global.atomic.csub.p1(ptr addrspace(1) %gep0, i32 %data)
+ %rtn = atomicrmw usub_sat ptr addrspace(1) %gep0, i32 %data seq_cst
%cast.rtn = bitcast i32 %rtn to float
ret float %cast.rtn
}
@@ -23,11 +25,13 @@ define amdgpu_ps float @global_csub_saddr_i32_rtn_neg128(ptr addrspace(1) inreg
; GCN: ; %bb.0:
; GCN-NEXT: global_atomic_csub v0, v0, v1, s[2:3] offset:-128 glc
; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_gl1_inv
+; GCN-NEXT: buffer_gl0_inv
; GCN-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
- %rtn = call i32 @llvm.amdgcn.global.atomic.csub.p1(ptr addrspace(1) %gep1, i32 %data)
+ %rtn = atomicrmw usub_sat ptr addrspace(1) %gep1, i32 %data seq_cst
%cast.rtn = bitcast i32 %rtn to float
ret float %cast.rtn
}
@@ -36,10 +40,13 @@ define amdgpu_ps void @global_csub_saddr_i32_nortn(ptr addrspace(1) inreg %sbase
; GCN-LABEL: global_csub_saddr_i32_nortn:
; GCN: ; %bb.0:
; GCN-NEXT: global_atomic_csub v0, v0, v1, s[2:3] glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_gl1_inv
+; GCN-NEXT: buffer_gl0_inv
; GCN-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
- %unused = call i32 @llvm.amdgcn.global.atomic.csub.p1(ptr addrspace(1) %gep0, i32 %data)
+ %unused = atomicrmw usub_sat ptr addrspace(1) %gep0, i32 %data seq_cst
ret void
}
@@ -47,11 +54,14 @@ define amdgpu_ps void @global_csub_saddr_i32_nortn_neg128(ptr addrspace(1) inreg
; GCN-LABEL: global_csub_saddr_i32_nortn_neg128:
; GCN: ; %bb.0:
; GCN-NEXT: global_atomic_csub v0, v0, v1, s[2:3] offset:-128 glc
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: buffer_gl1_inv
+; GCN-NEXT: buffer_gl0_inv
; GCN-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr addrspace(1) %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 -128
- %unused = call i32 @llvm.amdgcn.global.atomic.csub.p1(ptr addrspace(1) %gep1, i32 %data)
+ %unused = atomicrmw usub_sat ptr addrspace(1) %gep1, i32 %data seq_cst
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.atomic.csub.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.atomic.csub.ll
index b05f141b76009..e7ff744ccc2d5 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.atomic.csub.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.atomic.csub.ll
@@ -2,14 +2,12 @@
; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1031 | FileCheck %s -check-prefixes=GCN,PREGFX12
; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1200 | FileCheck %s -check-prefixes=GCN,GFX12PLUS
-declare i32 @llvm.amdgcn.global.atomic.csub(ptr addrspace(1), i32)
-
; GCN-LABEL: {{^}}global_atomic_csub_rtn:
; PREGFX12: global_atomic_csub v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9:]+}}, s{{\[[0-9]+:[0-9]+\]}} glc
; GFX12PLUS: global_atomic_sub_clamp_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}} th:TH_ATOMIC_RETURN
define amdgpu_kernel void @global_atomic_csub_rtn(ptr addrspace(1) %ptr, i32 %data) {
main_body:
- %ret = call i32 @llvm.amdgcn.global.atomic.csub(ptr addrspace(1) %ptr, i32 %data)
+ %ret = atomicrmw usub_sat ptr addrspace(1) %ptr, i32 %data seq_cst
ret void
}
@@ -18,7 +16,7 @@ main_body:
; GFX12PLUS: global_atomic_sub_clamp_u32 v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}
define amdgpu_kernel void @global_atomic_csub_no_rtn(ptr addrspace(1) %ptr, i32 %data) #0 {
main_body:
- %ret = call i32 @llvm.amdgcn.global.atomic.csub(ptr addrspace(1) %ptr, i32 %data)
+ %ret = atomicrmw usub_sat ptr addrspace(1) %ptr, i32 %data seq_cst
ret void
}
@@ -28,7 +26,7 @@ main_body:
define amdgpu_kernel void @global_atomic_csub_off4_rtn(ptr addrspace(1) %ptr, i32 %data) {
main_body:
%p = getelementptr i32, ptr addrspace(1) %ptr, i64 1
- %ret = call i32 @llvm.amdgcn.global.atomic.csub(ptr addrspace(1) %p, i32 %data)
+ %ret = atomicrmw usub_sat ptr addrspace(1) %p, i32 %data seq_cst
ret void
}
@@ -38,7 +36,7 @@ main_body:
define amdgpu_kernel void @global_atomic_csub_off4_no_rtn(ptr addrspace(1) %ptr, i32 %data) #0 {
main_body:
%p = getelementptr i32, ptr addrspace(1) %ptr, i64 1
- %ret = call i32 @llvm.amdgcn.global.atomic.csub(ptr addrspace(1) %p, i32 %data)
+ %ret = atomicrmw usub_sat ptr addrspace(1) %p, i32 %data seq_cst
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/private-memory-atomics.ll b/llvm/test/CodeGen/AMDGPU/private-memory-atomics.ll
index 24a4d8fbde200..1b0c0f597ac1a 100644
--- a/llvm/test/CodeGen/AMDGPU/private-memory-atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/private-memory-atomics.ll
@@ -632,3 +632,49 @@ define i32 @atomicrmw_dec_private_i32(ptr addrspace(5) %ptr) {
%result = atomicrmw udec_wrap ptr addrspace(5) %ptr, i32 4 seq_cst
ret i32 %result
}
+
+define i32 @atomicrmw_usub_cond_private_i32(ptr addrspace(5) %ptr) {
+; IR-LABEL: @atomicrmw_usub_cond_private_i32(
+; IR-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[PTR:%.*]], align 4
+; IR-NEXT: [[TMP2:%.*]] = icmp uge i32 [[TMP1]], 4
+; IR-NEXT: [[TMP3:%.*]] = sub i32 [[TMP1]], 4
+; IR-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i32 [[TMP3]], i32 [[TMP1]]
+; IR-NEXT: store i32 [[NEW]], ptr addrspace(5) [[PTR]], align 4
+; IR-NEXT: ret i32 [[TMP1]]
+;
+; GCN-LABEL: atomicrmw_usub_cond_private_i32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_add_i32_e32 v2, vcc, -4, v1
+; GCN-NEXT: v_min_u32_e32 v2, v2, v1
+; GCN-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
+; GCN-NEXT: v_mov_b32_e32 v0, v1
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = atomicrmw usub_cond ptr addrspace(5) %ptr, i32 4 seq_cst
+ ret i32 %result
+}
+
+define i32 @atomicrmw_usub_sat_private_i32(ptr addrspace(5) %ptr) {
+; IR-LABEL: @atomicrmw_usub_sat_private_i32(
+; IR-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[PTR:%.*]], align 4
+; IR-NEXT: [[NEW:%.*]] = call i32 @llvm.usub.sat.i32(i32 [[TMP1]], i32 4)
+; IR-NEXT: store i32 [[NEW]], ptr addrspace(5) [[PTR]], align 4
+; IR-NEXT: ret i32 [[TMP1]]
+;
+; GCN-LABEL: atomicrmw_usub_sat_private_i32:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_max_u32_e32 v2, 4, v1
+; GCN-NEXT: v_add_i32_e32 v2, vcc, -4, v2
+; GCN-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen
+; GCN-NEXT: v_mov_b32_e32 v0, v1
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0)
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %result = atomicrmw usub_sat ptr addrspace(5) %ptr, i32 4 seq_cst
+ ret i32 %result
+}
diff --git a/llvm/test/CodeGen/AMDGPU/shl_add_ptr_csub.ll b/llvm/test/CodeGen/AMDGPU/shl_add_ptr_csub.ll
index d0377b426cc10..0940318ec3136 100644
--- a/llvm/test/CodeGen/AMDGPU/shl_add_ptr_csub.ll
+++ b/llvm/test/CodeGen/AMDGPU/shl_add_ptr_csub.ll
@@ -12,7 +12,7 @@ define i32 @shl_base_atomicrmw_global_atomic_csub_ptr(ptr addrspace(1) %out, ptr
%cast = ptrtoint ptr addrspace(1) %arrayidx0 to i64
%shl = shl i64 %cast, 2
%castback = inttoptr i64 %shl to ptr addrspace(1)
- %val = call i32 @llvm.amdgcn.global.atomic.csub.p1(ptr addrspace(1) %castback, i32 43)
+ %val = atomicrmw usub_sat ptr addrspace(1) %castback, i32 43 seq_cst
store volatile i64 %cast, ptr addrspace(1) %extra.use, align 4
ret i32 %val
}
diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i16-system.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i16-system.ll
index 19cb7d21471d9..43eba9d1f1dff 100644
--- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i16-system.ll
+++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i16-system.ll
@@ -857,3 +857,236 @@ define bfloat @test_atomicrmw_xchg_bf16_flat_system_align4(ptr %ptr, bfloat %val
%res = atomicrmw xchg ptr %ptr, bfloat %value seq_cst, align 4
ret bfloat %res
}
+
+define i16 @test_atomicrmw_usub_cond_i16_global_system(ptr addrspace(1) %ptr, i16 %value) {
+; CHECK-LABEL: @test_atomicrmw_usub_cond_i16_global_system(
+; CHECK-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4)
+; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64
+; CHECK-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
+; CHECK-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
+; CHECK-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
+; CHECK-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]]
+; CHECK-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
+; CHECK: atomicrmw.start:
+; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
+; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
+; CHECK-NEXT: [[TMP4:%.*]] = icmp uge i16 [[EXTRACTED]], [[VALUE:%.*]]
+; CHECK-NEXT: [[TMP5:%.*]] = sub i16 [[EXTRACTED]], [[VALUE]]
+; CHECK-NEXT: [[NEW:%.*]] = select i1 [[TMP4]], i16 [[TMP5]], i16 [[EXTRACTED]]
+; CHECK-NEXT: [[EXTENDED:%.*]] = zext i16 [[NEW]] to i32
+; CHECK-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
+; CHECK-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
+; CHECK-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
+; CHECK-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK: atomicrmw.end:
+; CHECK-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
+; CHECK-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
+; CHECK-NEXT: ret i16 [[EXTRACTED3]]
+;
+ %res = atomicrmw usub_cond ptr addrspace(1) %ptr, i16 %value seq_cst
+ ret i16 %res
+}
+
+define i16 @test_atomicrmw_usub_cond_i16_global_system_align4(ptr addrspace(1) %ptr, i16 %value) {
+; CHECK-LABEL: @test_atomicrmw_usub_cond_i16_global_system_align4(
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR:%.*]], align 4
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
+; CHECK: atomicrmw.start:
+; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[LOADED]] to i16
+; CHECK-NEXT: [[TMP2:%.*]] = icmp uge i16 [[EXTRACTED]], [[VALUE:%.*]]
+; CHECK-NEXT: [[TMP3:%.*]] = sub i16 [[EXTRACTED]], [[VALUE]]
+; CHECK-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i16 [[TMP3]], i16 [[EXTRACTED]]
+; CHECK-NEXT: [[EXTENDED:%.*]] = zext i16 [[NEW]] to i32
+; CHECK-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], -65536
+; CHECK-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[EXTENDED]]
+; CHECK-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK: atomicrmw.end:
+; CHECK-NEXT: [[EXTRACTED1:%.*]] = trunc i32 [[NEWLOADED]] to i16
+; CHECK-NEXT: ret i16 [[EXTRACTED1]]
+;
+ %res = atomicrmw usub_cond ptr addrspace(1) %ptr, i16 %value seq_cst, align 4
+ ret i16 %res
+}
+
+define i16 @test_atomicrmw_usub_cond_i16_flat_system(ptr %ptr, i16 %value) {
+; CHECK-LABEL: @test_atomicrmw_usub_cond_i16_flat_system(
+; CHECK-NEXT: [[ALIGNEDADDR:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[PTR:%.*]], i64 -4)
+; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
+; CHECK-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
+; CHECK-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
+; CHECK-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]]
+; CHECK-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[ALIGNEDADDR]], align 4
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
+; CHECK: atomicrmw.start:
+; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
+; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
+; CHECK-NEXT: [[TMP4:%.*]] = icmp uge i16 [[EXTRACTED]], [[VALUE:%.*]]
+; CHECK-NEXT: [[TMP5:%.*]] = sub i16 [[EXTRACTED]], [[VALUE]]
+; CHECK-NEXT: [[NEW:%.*]] = select i1 [[TMP4]], i16 [[TMP5]], i16 [[EXTRACTED]]
+; CHECK-NEXT: [[EXTENDED:%.*]] = zext i16 [[NEW]] to i32
+; CHECK-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
+; CHECK-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
+; CHECK-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
+; CHECK-NEXT: [[TMP6:%.*]] = cmpxchg ptr [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK: atomicrmw.end:
+; CHECK-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
+; CHECK-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
+; CHECK-NEXT: ret i16 [[EXTRACTED3]]
+;
+ %res = atomicrmw usub_cond ptr %ptr, i16 %value seq_cst
+ ret i16 %res
+}
+
+define i16 @test_atomicrmw_usub_cond_i16_flat_system_align4(ptr %ptr, i16 %value) {
+; CHECK-LABEL: @test_atomicrmw_usub_cond_i16_flat_system_align4(
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[PTR:%.*]], align 4
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
+; CHECK: atomicrmw.start:
+; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[LOADED]] to i16
+; CHECK-NEXT: [[TMP2:%.*]] = icmp uge i16 [[EXTRACTED]], [[VALUE:%.*]]
+; CHECK-NEXT: [[TMP3:%.*]] = sub i16 [[EXTRACTED]], [[VALUE]]
+; CHECK-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i16 [[TMP3]], i16 [[EXTRACTED]]
+; CHECK-NEXT: [[EXTENDED:%.*]] = zext i16 [[NEW]] to i32
+; CHECK-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], -65536
+; CHECK-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[EXTENDED]]
+; CHECK-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK: atomicrmw.end:
+; CHECK-NEXT: [[EXTRACTED1:%.*]] = trunc i32 [[NEWLOADED]] to i16
+; CHECK-NEXT: ret i16 [[EXTRACTED1]]
+;
+ %res = atomicrmw usub_cond ptr %ptr, i16 %value seq_cst, align 4
+ ret i16 %res
+}
+
+define i16 @test_atomicrmw_usub_sat_i16_global_system(ptr addrspace(1) %ptr, i16 %value) {
+; CHECK-LABEL: @test_atomicrmw_usub_sat_i16_global_system(
+; CHECK-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4)
+; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64
+; CHECK-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
+; CHECK-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
+; CHECK-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
+; CHECK-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]]
+; CHECK-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
+; CHECK: atomicrmw.start:
+; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
+; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
+; CHECK-NEXT: [[NEW:%.*]] = call i16 @llvm.usub.sat.i16(i16 [[EXTRACTED]], i16 [[VALUE:%.*]])
+; CHECK-NEXT: [[EXTENDED:%.*]] = zext i16 [[NEW]] to i32
+; CHECK-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
+; CHECK-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
+; CHECK-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
+; CHECK-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK: atomicrmw.end:
+; CHECK-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
+; CHECK-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
+; CHECK-NEXT: ret i16 [[EXTRACTED3]]
+;
+ %res = atomicrmw usub_sat ptr addrspace(1) %ptr, i16 %value seq_cst
+ ret i16 %res
+}
+
+define i16 @test_atomicrmw_usub_sat_i16_global_system_align4(ptr addrspace(1) %ptr, i16 %value) {
+; CHECK-LABEL: @test_atomicrmw_usub_sat_i16_global_system_align4(
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR:%.*]], align 4
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
+; CHECK: atomicrmw.start:
+; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[LOADED]] to i16
+; CHECK-NEXT: [[NEW:%.*]] = call i16 @llvm.usub.sat.i16(i16 [[EXTRACTED]], i16 [[VALUE:%.*]])
+; CHECK-NEXT: [[EXTENDED:%.*]] = zext i16 [[NEW]] to i32
+; CHECK-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], -65536
+; CHECK-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[EXTENDED]]
+; CHECK-NEXT: [[TMP2:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP2]], 1
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP2]], 0
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK: atomicrmw.end:
+; CHECK-NEXT: [[EXTRACTED1:%.*]] = trunc i32 [[NEWLOADED]] to i16
+; CHECK-NEXT: ret i16 [[EXTRACTED1]]
+;
+ %res = atomicrmw usub_sat ptr addrspace(1) %ptr, i16 %value seq_cst, align 4
+ ret i16 %res
+}
+
+define i16 @test_atomicrmw_usub_sat_i16_flat_system(ptr %ptr, i16 %value) {
+; CHECK-LABEL: @test_atomicrmw_usub_sat_i16_flat_system(
+; CHECK-NEXT: [[ALIGNEDADDR:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[PTR:%.*]], i64 -4)
+; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
+; CHECK-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
+; CHECK-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
+; CHECK-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]]
+; CHECK-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[ALIGNEDADDR]], align 4
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
+; CHECK: atomicrmw.start:
+; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
+; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
+; CHECK-NEXT: [[NEW:%.*]] = call i16 @llvm.usub.sat.i16(i16 [[EXTRACTED]], i16 [[VALUE:%.*]])
+; CHECK-NEXT: [[EXTENDED:%.*]] = zext i16 [[NEW]] to i32
+; CHECK-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
+; CHECK-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
+; CHECK-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
+; CHECK-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK: atomicrmw.end:
+; CHECK-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
+; CHECK-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
+; CHECK-NEXT: ret i16 [[EXTRACTED3]]
+;
+ %res = atomicrmw usub_sat ptr %ptr, i16 %value seq_cst
+ ret i16 %res
+}
+
+define i16 @test_atomicrmw_usub_sat_i16_flat_system_align4(ptr %ptr, i16 %value) {
+; CHECK-LABEL: @test_atomicrmw_usub_sat_i16_flat_system_align4(
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[PTR:%.*]], align 4
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
+; CHECK: atomicrmw.start:
+; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[LOADED]] to i16
+; CHECK-NEXT: [[NEW:%.*]] = call i16 @llvm.usub.sat.i16(i16 [[EXTRACTED]], i16 [[VALUE:%.*]])
+; CHECK-NEXT: [[EXTENDED:%.*]] = zext i16 [[NEW]] to i32
+; CHECK-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], -65536
+; CHECK-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[EXTENDED]]
+; CHECK-NEXT: [[TMP2:%.*]] = cmpxchg ptr [[PTR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP2]], 1
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP2]], 0
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK: atomicrmw.end:
+; CHECK-NEXT: [[EXTRACTED1:%.*]] = trunc i32 [[NEWLOADED]] to i16
+; CHECK-NEXT: ret i16 [[EXTRACTED1]]
+;
+ %res = atomicrmw usub_sat ptr %ptr, i16 %value seq_cst, align 4
+ ret i16 %res
+}
+
diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i16.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i16.ll
index c49909597c72c..67309d388f1f2 100644
--- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i16.ll
+++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i16.ll
@@ -1362,6 +1362,352 @@ define i16 @test_atomicrmw_add_i16_buffer_fat_agent_align4(ptr addrspace(7) %ptr
ret i16 %res
}
+define i16 @test_atomicrmw_usub_cond_i16_global_agent(ptr addrspace(1) %ptr, i16 %value) {
+; CHECK-LABEL: @test_atomicrmw_usub_cond_i16_global_agent(
+; CHECK-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4)
+; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64
+; CHECK-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
+; CHECK-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
+; CHECK-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
+; CHECK-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]]
+; CHECK-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
+; CHECK: atomicrmw.start:
+; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
+; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
+; CHECK-NEXT: [[TMP4:%.*]] = icmp uge i16 [[EXTRACTED]], [[VALUE:%.*]]
+; CHECK-NEXT: [[TMP5:%.*]] = sub i16 [[EXTRACTED]], [[VALUE]]
+; CHECK-NEXT: [[NEW:%.*]] = select i1 [[TMP4]], i16 [[TMP5]], i16 [[EXTRACTED]]
+; CHECK-NEXT: [[EXTENDED:%.*]] = zext i16 [[NEW]] to i32
+; CHECK-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
+; CHECK-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
+; CHECK-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
+; CHECK-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK: atomicrmw.end:
+; CHECK-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
+; CHECK-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
+; CHECK-NEXT: ret i16 [[EXTRACTED3]]
+;
+ %res = atomicrmw usub_cond ptr addrspace(1) %ptr, i16 %value syncscope("agent") seq_cst
+ ret i16 %res
+}
+
+define i16 @test_atomicrmw_usub_cond_i16_global_agent_align4(ptr addrspace(1) %ptr, i16 %value) {
+; CHECK-LABEL: @test_atomicrmw_usub_cond_i16_global_agent_align4(
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR:%.*]], align 4
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
+; CHECK: atomicrmw.start:
+; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[LOADED]] to i16
+; CHECK-NEXT: [[TMP2:%.*]] = icmp uge i16 [[EXTRACTED]], [[VALUE:%.*]]
+; CHECK-NEXT: [[TMP3:%.*]] = sub i16 [[EXTRACTED]], [[VALUE]]
+; CHECK-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i16 [[TMP3]], i16 [[EXTRACTED]]
+; CHECK-NEXT: [[EXTENDED:%.*]] = zext i16 [[NEW]] to i32
+; CHECK-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], -65536
+; CHECK-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[EXTENDED]]
+; CHECK-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK: atomicrmw.end:
+; CHECK-NEXT: [[EXTRACTED1:%.*]] = trunc i32 [[NEWLOADED]] to i16
+; CHECK-NEXT: ret i16 [[EXTRACTED1]]
+;
+ %res = atomicrmw usub_cond ptr addrspace(1) %ptr, i16 %value syncscope("agent") seq_cst, align 4
+ ret i16 %res
+}
+
+define i16 @test_atomicrmw_usub_cond_i16_local(ptr addrspace(3) %ptr, i16 %value) {
+; CHECK-LABEL: @test_atomicrmw_usub_cond_i16_local(
+; CHECK-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4)
+; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i32
+; CHECK-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
+; CHECK-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
+; CHECK-NEXT: [[MASK:%.*]] = shl i32 65535, [[TMP2]]
+; CHECK-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(3) [[ALIGNEDADDR]], align 4
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
+; CHECK: atomicrmw.start:
+; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]]
+; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
+; CHECK-NEXT: [[TMP4:%.*]] = icmp uge i16 [[EXTRACTED]], [[VALUE:%.*]]
+; CHECK-NEXT: [[TMP5:%.*]] = sub i16 [[EXTRACTED]], [[VALUE]]
+; CHECK-NEXT: [[NEW:%.*]] = select i1 [[TMP4]], i16 [[TMP5]], i16 [[EXTRACTED]]
+; CHECK-NEXT: [[EXTENDED:%.*]] = zext i16 [[NEW]] to i32
+; CHECK-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]]
+; CHECK-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
+; CHECK-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
+; CHECK-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(3) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK: atomicrmw.end:
+; CHECK-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
+; CHECK-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
+; CHECK-NEXT: ret i16 [[EXTRACTED3]]
+;
+ %res = atomicrmw usub_cond ptr addrspace(3) %ptr, i16 %value syncscope("agent") seq_cst
+ ret i16 %res
+}
+
+define i16 @test_atomicrmw_usub_cond_i16_local_align4(ptr addrspace(3) %ptr, i16 %value) {
+; CHECK-LABEL: @test_atomicrmw_usub_cond_i16_local_align4(
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(3) [[PTR:%.*]], align 4
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
+; CHECK: atomicrmw.start:
+; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[LOADED]] to i16
+; CHECK-NEXT: [[TMP2:%.*]] = icmp uge i16 [[EXTRACTED]], [[VALUE:%.*]]
+; CHECK-NEXT: [[TMP3:%.*]] = sub i16 [[EXTRACTED]], [[VALUE]]
+; CHECK-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i16 [[TMP3]], i16 [[EXTRACTED]]
+; CHECK-NEXT: [[EXTENDED:%.*]] = zext i16 [[NEW]] to i32
+; CHECK-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], -65536
+; CHECK-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[EXTENDED]]
+; CHECK-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK: atomicrmw.end:
+; CHECK-NEXT: [[EXTRACTED1:%.*]] = trunc i32 [[NEWLOADED]] to i16
+; CHECK-NEXT: ret i16 [[EXTRACTED1]]
+;
+ %res = atomicrmw usub_cond ptr addrspace(3) %ptr, i16 %value syncscope("agent") seq_cst, align 4
+ ret i16 %res
+}
+
+define i16 @test_atomicrmw_usub_cond_i16_flat_agent(ptr %ptr, i16 %value) {
+; CHECK-LABEL: @test_atomicrmw_usub_cond_i16_flat_agent(
+; CHECK-NEXT: [[ALIGNEDADDR:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[PTR:%.*]], i64 -4)
+; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
+; CHECK-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
+; CHECK-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
+; CHECK-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]]
+; CHECK-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[ALIGNEDADDR]], align 4
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
+; CHECK: atomicrmw.start:
+; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
+; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
+; CHECK-NEXT: [[TMP4:%.*]] = icmp uge i16 [[EXTRACTED]], [[VALUE:%.*]]
+; CHECK-NEXT: [[TMP5:%.*]] = sub i16 [[EXTRACTED]], [[VALUE]]
+; CHECK-NEXT: [[NEW:%.*]] = select i1 [[TMP4]], i16 [[TMP5]], i16 [[EXTRACTED]]
+; CHECK-NEXT: [[EXTENDED:%.*]] = zext i16 [[NEW]] to i32
+; CHECK-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
+; CHECK-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
+; CHECK-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
+; CHECK-NEXT: [[TMP6:%.*]] = cmpxchg ptr [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK: atomicrmw.end:
+; CHECK-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
+; CHECK-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
+; CHECK-NEXT: ret i16 [[EXTRACTED3]]
+;
+ %res = atomicrmw usub_cond ptr %ptr, i16 %value syncscope("agent") seq_cst
+ ret i16 %res
+}
+
+define i16 @test_atomicrmw_usub_cond_i16_flat_agent_align4(ptr %ptr, i16 %value) {
+; CHECK-LABEL: @test_atomicrmw_usub_cond_i16_flat_agent_align4(
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[PTR:%.*]], align 4
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
+; CHECK: atomicrmw.start:
+; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[LOADED]] to i16
+; CHECK-NEXT: [[TMP2:%.*]] = icmp uge i16 [[EXTRACTED]], [[VALUE:%.*]]
+; CHECK-NEXT: [[TMP3:%.*]] = sub i16 [[EXTRACTED]], [[VALUE]]
+; CHECK-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i16 [[TMP3]], i16 [[EXTRACTED]]
+; CHECK-NEXT: [[EXTENDED:%.*]] = zext i16 [[NEW]] to i32
+; CHECK-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], -65536
+; CHECK-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[EXTENDED]]
+; CHECK-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK: atomicrmw.end:
+; CHECK-NEXT: [[EXTRACTED1:%.*]] = trunc i32 [[NEWLOADED]] to i16
+; CHECK-NEXT: ret i16 [[EXTRACTED1]]
+;
+ %res = atomicrmw usub_cond ptr %ptr, i16 %value syncscope("agent") seq_cst, align 4
+ ret i16 %res
+}
+
+define i16 @test_atomicrmw_usub_sat_i16_global_agent(ptr addrspace(1) %ptr, i16 %value) {
+; CHECK-LABEL: @test_atomicrmw_usub_sat_i16_global_agent(
+; CHECK-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4)
+; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64
+; CHECK-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
+; CHECK-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
+; CHECK-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
+; CHECK-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]]
+; CHECK-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
+; CHECK: atomicrmw.start:
+; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
+; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
+; CHECK-NEXT: [[NEW:%.*]] = call i16 @llvm.usub.sat.i16(i16 [[EXTRACTED]], i16 [[VALUE:%.*]])
+; CHECK-NEXT: [[EXTENDED:%.*]] = zext i16 [[NEW]] to i32
+; CHECK-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
+; CHECK-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
+; CHECK-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
+; CHECK-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK: atomicrmw.end:
+; CHECK-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
+; CHECK-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
+; CHECK-NEXT: ret i16 [[EXTRACTED3]]
+;
+ %res = atomicrmw usub_sat ptr addrspace(1) %ptr, i16 %value syncscope("agent") seq_cst
+ ret i16 %res
+}
+
+define i16 @test_atomicrmw_usub_sat_i16_global_agent_align4(ptr addrspace(1) %ptr, i16 %value) {
+; CHECK-LABEL: @test_atomicrmw_usub_sat_i16_global_agent_align4(
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR:%.*]], align 4
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
+; CHECK: atomicrmw.start:
+; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[LOADED]] to i16
+; CHECK-NEXT: [[NEW:%.*]] = call i16 @llvm.usub.sat.i16(i16 [[EXTRACTED]], i16 [[VALUE:%.*]])
+; CHECK-NEXT: [[EXTENDED:%.*]] = zext i16 [[NEW]] to i32
+; CHECK-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], -65536
+; CHECK-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[EXTENDED]]
+; CHECK-NEXT: [[TMP2:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP2]], 1
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP2]], 0
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK: atomicrmw.end:
+; CHECK-NEXT: [[EXTRACTED1:%.*]] = trunc i32 [[NEWLOADED]] to i16
+; CHECK-NEXT: ret i16 [[EXTRACTED1]]
+;
+ %res = atomicrmw usub_sat ptr addrspace(1) %ptr, i16 %value syncscope("agent") seq_cst, align 4
+ ret i16 %res
+}
+
+define i16 @test_atomicrmw_usub_sat_i16_local(ptr addrspace(3) %ptr, i16 %value) {
+; CHECK-LABEL: @test_atomicrmw_usub_sat_i16_local(
+; CHECK-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4)
+; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i32
+; CHECK-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
+; CHECK-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
+; CHECK-NEXT: [[MASK:%.*]] = shl i32 65535, [[TMP2]]
+; CHECK-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(3) [[ALIGNEDADDR]], align 4
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
+; CHECK: atomicrmw.start:
+; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]]
+; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
+; CHECK-NEXT: [[NEW:%.*]] = call i16 @llvm.usub.sat.i16(i16 [[EXTRACTED]], i16 [[VALUE:%.*]])
+; CHECK-NEXT: [[EXTENDED:%.*]] = zext i16 [[NEW]] to i32
+; CHECK-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]]
+; CHECK-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
+; CHECK-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
+; CHECK-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK: atomicrmw.end:
+; CHECK-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
+; CHECK-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
+; CHECK-NEXT: ret i16 [[EXTRACTED3]]
+;
+ %res = atomicrmw usub_sat ptr addrspace(3) %ptr, i16 %value seq_cst
+ ret i16 %res
+}
+
+define i16 @test_atomicrmw_usub_sat_i16_local_align4(ptr addrspace(3) %ptr, i16 %value) {
+; CHECK-LABEL: @test_atomicrmw_usub_sat_i16_local_align4(
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(3) [[PTR:%.*]], align 4
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
+; CHECK: atomicrmw.start:
+; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[LOADED]] to i16
+; CHECK-NEXT: [[NEW:%.*]] = call i16 @llvm.usub.sat.i16(i16 [[EXTRACTED]], i16 [[VALUE:%.*]])
+; CHECK-NEXT: [[EXTENDED:%.*]] = zext i16 [[NEW]] to i32
+; CHECK-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], -65536
+; CHECK-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[EXTENDED]]
+; CHECK-NEXT: [[TMP2:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP2]], 1
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP2]], 0
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK: atomicrmw.end:
+; CHECK-NEXT: [[EXTRACTED1:%.*]] = trunc i32 [[NEWLOADED]] to i16
+; CHECK-NEXT: ret i16 [[EXTRACTED1]]
+;
+ %res = atomicrmw usub_sat ptr addrspace(3) %ptr, i16 %value seq_cst, align 4
+ ret i16 %res
+}
+
+define i16 @test_atomicrmw_usub_sat_i16_flat_agent(ptr %ptr, i16 %value) {
+; CHECK-LABEL: @test_atomicrmw_usub_sat_i16_flat_agent(
+; CHECK-NEXT: [[ALIGNEDADDR:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[PTR:%.*]], i64 -4)
+; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
+; CHECK-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
+; CHECK-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
+; CHECK-NEXT: [[MASK:%.*]] = shl i32 65535, [[SHIFTAMT]]
+; CHECK-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[ALIGNEDADDR]], align 4
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
+; CHECK: atomicrmw.start:
+; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
+; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i16
+; CHECK-NEXT: [[NEW:%.*]] = call i16 @llvm.usub.sat.i16(i16 [[EXTRACTED]], i16 [[VALUE:%.*]])
+; CHECK-NEXT: [[EXTENDED:%.*]] = zext i16 [[NEW]] to i32
+; CHECK-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
+; CHECK-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
+; CHECK-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
+; CHECK-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK: atomicrmw.end:
+; CHECK-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
+; CHECK-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i16
+; CHECK-NEXT: ret i16 [[EXTRACTED3]]
+;
+ %res = atomicrmw usub_sat ptr %ptr, i16 %value syncscope("agent") seq_cst
+ ret i16 %res
+}
+
+define i16 @test_atomicrmw_usub_sat_i16_flat_agent_align4(ptr %ptr, i16 %value) {
+; CHECK-LABEL: @test_atomicrmw_usub_sat_i16_flat_agent_align4(
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[PTR:%.*]], align 4
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
+; CHECK: atomicrmw.start:
+; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[LOADED]] to i16
+; CHECK-NEXT: [[NEW:%.*]] = call i16 @llvm.usub.sat.i16(i16 [[EXTRACTED]], i16 [[VALUE:%.*]])
+; CHECK-NEXT: [[EXTENDED:%.*]] = zext i16 [[NEW]] to i32
+; CHECK-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], -65536
+; CHECK-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[EXTENDED]]
+; CHECK-NEXT: [[TMP2:%.*]] = cmpxchg ptr [[PTR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP2]], 1
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP2]], 0
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK: atomicrmw.end:
+; CHECK-NEXT: [[EXTRACTED1:%.*]] = trunc i32 [[NEWLOADED]] to i16
+; CHECK-NEXT: ret i16 [[EXTRACTED1]]
+;
+ %res = atomicrmw usub_sat ptr %ptr, i16 %value syncscope("agent") seq_cst, align 4
+ ret i16 %res
+}
+
!0 = !{}
!1 = !{!"foo", !"bar"}
!2 = !{!3}
diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i32-agent.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i32-agent.ll
index a1007bacd522f..5fe53dbbd9f01 100644
--- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i32-agent.ll
+++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i32-agent.ll
@@ -639,6 +639,96 @@ define i32 @test_atomicrmw_udec_wrap_i32_global_agent__amdgpu_no_fine_grained_me
ret i32 %res
}
+;---------------------------------------------------------------------
+; atomicrmw usub_cond
+;---------------------------------------------------------------------
+
+; expansion is necessary, operation not supported over PCIe
+define i32 @test_atomicrmw_usub_cond_i32_global_agent(ptr addrspace(1) %ptr, i32 %value) {
+; COMMON-LABEL: define i32 @test_atomicrmw_usub_cond_i32_global_agent(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw usub_cond ptr addrspace(1) [[PTR]], i32 [[VALUE]] syncscope("agent") seq_cst, align 4
+; COMMON-NEXT: ret i32 [[NEWLOADED]]
+;
+ %res = atomicrmw usub_cond ptr addrspace(1) %ptr, i32 %value syncscope("agent") seq_cst
+ ret i32 %res
+}
+
+define i32 @test_atomicrmw_usub_cond_i32_global_agent__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, i32 %value) {
+; COMMON-LABEL: define i32 @test_atomicrmw_usub_cond_i32_global_agent__amdgpu_no_fine_grained_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw usub_cond ptr addrspace(1) [[PTR]], i32 [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; COMMON-NEXT: ret i32 [[NEWLOADED]]
+;
+ %res = atomicrmw usub_cond ptr addrspace(1) %ptr, i32 %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+ ret i32 %res
+}
+
+define i32 @test_atomicrmw_usub_cond_i32_global_agent__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i32 %value) {
+; COMMON-LABEL: define i32 @test_atomicrmw_usub_cond_i32_global_agent__amdgpu_no_remote_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw usub_cond ptr addrspace(1) [[PTR]], i32 [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; COMMON-NEXT: ret i32 [[NEWLOADED]]
+;
+ %res = atomicrmw usub_cond ptr addrspace(1) %ptr, i32 %value syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
+ ret i32 %res
+}
+
+define i32 @test_atomicrmw_usub_cond_i32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i32 %value) {
+; COMMON-LABEL: define i32 @test_atomicrmw_usub_cond_i32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw usub_cond ptr addrspace(1) [[PTR]], i32 [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; COMMON-NEXT: ret i32 [[NEWLOADED]]
+;
+ %res = atomicrmw usub_cond ptr addrspace(1) %ptr, i32 %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
+ ret i32 %res
+}
+
+;---------------------------------------------------------------------
+; atomicrmw usub_sat
+;---------------------------------------------------------------------
+
+; expansion is necessary, operation not supported over PCIe
+define i32 @test_atomicrmw_usub_sat_i32_global_agent(ptr addrspace(1) %ptr, i32 %value) {
+; COMMON-LABEL: define i32 @test_atomicrmw_usub_sat_i32_global_agent(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw usub_sat ptr addrspace(1) [[PTR]], i32 [[VALUE]] syncscope("agent") seq_cst, align 4
+; COMMON-NEXT: ret i32 [[NEWLOADED]]
+;
+ %res = atomicrmw usub_sat ptr addrspace(1) %ptr, i32 %value syncscope("agent") seq_cst
+ ret i32 %res
+}
+
+define i32 @test_atomicrmw_usub_sat_i32_global_agent__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, i32 %value) {
+; COMMON-LABEL: define i32 @test_atomicrmw_usub_sat_i32_global_agent__amdgpu_no_fine_grained_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw usub_sat ptr addrspace(1) [[PTR]], i32 [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; COMMON-NEXT: ret i32 [[NEWLOADED]]
+;
+ %res = atomicrmw usub_sat ptr addrspace(1) %ptr, i32 %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+ ret i32 %res
+}
+
+define i32 @test_atomicrmw_usub_sat_i32_global_agent__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i32 %value) {
+; COMMON-LABEL: define i32 @test_atomicrmw_usub_sat_i32_global_agent__amdgpu_no_remote_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw usub_sat ptr addrspace(1) [[PTR]], i32 [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; COMMON-NEXT: ret i32 [[NEWLOADED]]
+;
+ %res = atomicrmw usub_sat ptr addrspace(1) %ptr, i32 %value syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
+ ret i32 %res
+}
+
+define i32 @test_atomicrmw_usub_sat_i32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i32 %value) {
+; COMMON-LABEL: define i32 @test_atomicrmw_usub_sat_i32_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw usub_sat ptr addrspace(1) [[PTR]], i32 [[VALUE]] syncscope("agent") seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; COMMON-NEXT: ret i32 [[NEWLOADED]]
+;
+ %res = atomicrmw usub_sat ptr addrspace(1) %ptr, i32 %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
+ ret i32 %res
+}
+
!0 = !{}
;.
; GFX803: [[META0]] = !{}
diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i32-system.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i32-system.ll
index 08288848efd66..e5424b8bb3449 100644
--- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i32-system.ll
+++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i32-system.ll
@@ -799,6 +799,96 @@ define i32 @test_atomicrmw_udec_wrap_i32_global_system__amdgpu_no_fine_grained_m
ret i32 %res
}
+;---------------------------------------------------------------------
+; atomicrmw usub_cond
+;---------------------------------------------------------------------
+
+; expansion is necessary, operation not supported over PCIe
+define i32 @test_atomicrmw_usub_cond_i32_global_system(ptr addrspace(1) %ptr, i32 %value) {
+; COMMON-LABEL: define i32 @test_atomicrmw_usub_cond_i32_global_system(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw usub_cond ptr addrspace(1) [[PTR]], i32 [[VALUE]] seq_cst, align 4
+; COMMON-NEXT: ret i32 [[NEWLOADED]]
+;
+ %res = atomicrmw usub_cond ptr addrspace(1) %ptr, i32 %value seq_cst
+ ret i32 %res
+}
+
+define i32 @test_atomicrmw_usub_cond_i32_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, i32 %value) {
+; COMMON-LABEL: define i32 @test_atomicrmw_usub_cond_i32_global_system__amdgpu_no_fine_grained_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw usub_cond ptr addrspace(1) [[PTR]], i32 [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; COMMON-NEXT: ret i32 [[NEWLOADED]]
+;
+ %res = atomicrmw usub_cond ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.fine.grained.memory !0
+ ret i32 %res
+}
+
+define i32 @test_atomicrmw_usub_cond_i32_global_system__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i32 %value) {
+; COMMON-LABEL: define i32 @test_atomicrmw_usub_cond_i32_global_system__amdgpu_no_remote_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw usub_cond ptr addrspace(1) [[PTR]], i32 [[VALUE]] seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; COMMON-NEXT: ret i32 [[NEWLOADED]]
+;
+ %res = atomicrmw usub_cond ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.remote.memory !0
+ ret i32 %res
+}
+
+define i32 @test_atomicrmw_usub_cond_i32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i32 %value) {
+; COMMON-LABEL: define i32 @test_atomicrmw_usub_cond_i32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw usub_cond ptr addrspace(1) [[PTR]], i32 [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; COMMON-NEXT: ret i32 [[NEWLOADED]]
+;
+ %res = atomicrmw usub_cond ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
+ ret i32 %res
+}
+
+;---------------------------------------------------------------------
+; atomicrmw usub_sat
+;---------------------------------------------------------------------
+
+; expansion is necessary, operation not supported over PCIe
+define i32 @test_atomicrmw_usub_sat_i32_global_system(ptr addrspace(1) %ptr, i32 %value) {
+; COMMON-LABEL: define i32 @test_atomicrmw_usub_sat_i32_global_system(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw usub_sat ptr addrspace(1) [[PTR]], i32 [[VALUE]] seq_cst, align 4
+; COMMON-NEXT: ret i32 [[NEWLOADED]]
+;
+ %res = atomicrmw usub_sat ptr addrspace(1) %ptr, i32 %value seq_cst
+ ret i32 %res
+}
+
+define i32 @test_atomicrmw_usub_sat_i32_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, i32 %value) {
+; COMMON-LABEL: define i32 @test_atomicrmw_usub_sat_i32_global_system__amdgpu_no_fine_grained_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw usub_sat ptr addrspace(1) [[PTR]], i32 [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]]
+; COMMON-NEXT: ret i32 [[NEWLOADED]]
+;
+ %res = atomicrmw usub_sat ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.fine.grained.memory !0
+ ret i32 %res
+}
+
+define i32 @test_atomicrmw_usub_sat_i32_global_system__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i32 %value) {
+; COMMON-LABEL: define i32 @test_atomicrmw_usub_sat_i32_global_system__amdgpu_no_remote_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw usub_sat ptr addrspace(1) [[PTR]], i32 [[VALUE]] seq_cst, align 4, !amdgpu.no.remote.memory [[META0]]
+; COMMON-NEXT: ret i32 [[NEWLOADED]]
+;
+ %res = atomicrmw usub_sat ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.remote.memory !0
+ ret i32 %res
+}
+
+define i32 @test_atomicrmw_usub_sat_i32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i32 %value) {
+; COMMON-LABEL: define i32 @test_atomicrmw_usub_sat_i32_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT: [[NEWLOADED:%.*]] = atomicrmw usub_sat ptr addrspace(1) [[PTR]], i32 [[VALUE]] seq_cst, align 4, !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.no.remote.memory [[META0]]
+; COMMON-NEXT: ret i32 [[NEWLOADED]]
+;
+ %res = atomicrmw usub_sat ptr addrspace(1) %ptr, i32 %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
+ ret i32 %res
+}
+
!0 = !{}
;.
; GFX803: [[META0]] = !{}
diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i64-agent.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i64-agent.ll
index 7586a0af43c95..7d15aad08d91c 100644
--- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i64-agent.ll
+++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i64-agent.ll
@@ -639,6 +639,176 @@ define i64 @test_atomicrmw_udec_wrap_i64_global_agent__amdgpu_no_fine_grained_me
ret i64 %res
}
+;---------------------------------------------------------------------
+; atomicrmw usub_cond
+;---------------------------------------------------------------------
+
+; expansion is necessary, operation not supported over PCIe
+define i64 @test_atomicrmw_usub_cond_i64_global_agent(ptr addrspace(1) %ptr, i64 %value) {
+; COMMON-LABEL: define i64 @test_atomicrmw_usub_cond_i64_global_agent(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8
+; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
+; COMMON: atomicrmw.start:
+; COMMON-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT: [[TMP2:%.*]] = icmp uge i64 [[LOADED]], [[VALUE]]
+; COMMON-NEXT: [[TMP3:%.*]] = sub i64 [[LOADED]], [[VALUE]]
+; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i64 [[TMP3]], i64 [[LOADED]]
+; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] syncscope("agent") seq_cst seq_cst, align 8
+; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
+; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP4]], 0
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON: atomicrmw.end:
+; COMMON-NEXT: ret i64 [[NEWLOADED]]
+;
+ %res = atomicrmw usub_cond ptr addrspace(1) %ptr, i64 %value syncscope("agent") seq_cst
+ ret i64 %res
+}
+
+define i64 @test_atomicrmw_usub_cond_i64_global_agent__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, i64 %value) {
+; COMMON-LABEL: define i64 @test_atomicrmw_usub_cond_i64_global_agent__amdgpu_no_fine_grained_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8
+; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
+; COMMON: atomicrmw.start:
+; COMMON-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT: [[TMP2:%.*]] = icmp uge i64 [[LOADED]], [[VALUE]]
+; COMMON-NEXT: [[TMP3:%.*]] = sub i64 [[LOADED]], [[VALUE]]
+; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i64 [[TMP3]], i64 [[LOADED]]
+; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] syncscope("agent") seq_cst seq_cst, align 8
+; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
+; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP4]], 0
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON: atomicrmw.end:
+; COMMON-NEXT: ret i64 [[NEWLOADED]]
+;
+ %res = atomicrmw usub_cond ptr addrspace(1) %ptr, i64 %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+ ret i64 %res
+}
+
+define i64 @test_atomicrmw_usub_cond_i64_global_agent__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i64 %value) {
+; COMMON-LABEL: define i64 @test_atomicrmw_usub_cond_i64_global_agent__amdgpu_no_remote_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8
+; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
+; COMMON: atomicrmw.start:
+; COMMON-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT: [[TMP2:%.*]] = icmp uge i64 [[LOADED]], [[VALUE]]
+; COMMON-NEXT: [[TMP3:%.*]] = sub i64 [[LOADED]], [[VALUE]]
+; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i64 [[TMP3]], i64 [[LOADED]]
+; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] syncscope("agent") seq_cst seq_cst, align 8
+; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
+; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP4]], 0
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON: atomicrmw.end:
+; COMMON-NEXT: ret i64 [[NEWLOADED]]
+;
+ %res = atomicrmw usub_cond ptr addrspace(1) %ptr, i64 %value syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
+ ret i64 %res
+}
+
+define i64 @test_atomicrmw_usub_cond_i64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i64 %value) {
+; COMMON-LABEL: define i64 @test_atomicrmw_usub_cond_i64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8
+; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
+; COMMON: atomicrmw.start:
+; COMMON-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT: [[TMP2:%.*]] = icmp uge i64 [[LOADED]], [[VALUE]]
+; COMMON-NEXT: [[TMP3:%.*]] = sub i64 [[LOADED]], [[VALUE]]
+; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i64 [[TMP3]], i64 [[LOADED]]
+; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] syncscope("agent") seq_cst seq_cst, align 8
+; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
+; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP4]], 0
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON: atomicrmw.end:
+; COMMON-NEXT: ret i64 [[NEWLOADED]]
+;
+ %res = atomicrmw usub_cond ptr addrspace(1) %ptr, i64 %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
+ ret i64 %res
+}
+
+;---------------------------------------------------------------------
+; atomicrmw usub_sat
+;---------------------------------------------------------------------
+
+; expansion is necessary, operation not supported over PCIe
+define i64 @test_atomicrmw_usub_sat_i64_global_agent(ptr addrspace(1) %ptr, i64 %value) {
+; COMMON-LABEL: define i64 @test_atomicrmw_usub_sat_i64_global_agent(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8
+; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
+; COMMON: atomicrmw.start:
+; COMMON-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT: [[NEW:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[LOADED]], i64 [[VALUE]])
+; COMMON-NEXT: [[TMP2:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] syncscope("agent") seq_cst seq_cst, align 8
+; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP2]], 1
+; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP2]], 0
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON: atomicrmw.end:
+; COMMON-NEXT: ret i64 [[NEWLOADED]]
+;
+ %res = atomicrmw usub_sat ptr addrspace(1) %ptr, i64 %value syncscope("agent") seq_cst
+ ret i64 %res
+}
+
+define i64 @test_atomicrmw_usub_sat_i64_global_agent__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, i64 %value) {
+; COMMON-LABEL: define i64 @test_atomicrmw_usub_sat_i64_global_agent__amdgpu_no_fine_grained_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8
+; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
+; COMMON: atomicrmw.start:
+; COMMON-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT: [[NEW:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[LOADED]], i64 [[VALUE]])
+; COMMON-NEXT: [[TMP2:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] syncscope("agent") seq_cst seq_cst, align 8
+; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP2]], 1
+; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP2]], 0
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON: atomicrmw.end:
+; COMMON-NEXT: ret i64 [[NEWLOADED]]
+;
+ %res = atomicrmw usub_sat ptr addrspace(1) %ptr, i64 %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+ ret i64 %res
+}
+
+define i64 @test_atomicrmw_usub_sat_i64_global_agent__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i64 %value) {
+; COMMON-LABEL: define i64 @test_atomicrmw_usub_sat_i64_global_agent__amdgpu_no_remote_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8
+; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
+; COMMON: atomicrmw.start:
+; COMMON-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT: [[NEW:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[LOADED]], i64 [[VALUE]])
+; COMMON-NEXT: [[TMP2:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] syncscope("agent") seq_cst seq_cst, align 8
+; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP2]], 1
+; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP2]], 0
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON: atomicrmw.end:
+; COMMON-NEXT: ret i64 [[NEWLOADED]]
+;
+ %res = atomicrmw usub_sat ptr addrspace(1) %ptr, i64 %value syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
+ ret i64 %res
+}
+
+define i64 @test_atomicrmw_usub_sat_i64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i64 %value) {
+; COMMON-LABEL: define i64 @test_atomicrmw_usub_sat_i64_global_agent__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8
+; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
+; COMMON: atomicrmw.start:
+; COMMON-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT: [[NEW:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[LOADED]], i64 [[VALUE]])
+; COMMON-NEXT: [[TMP2:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] syncscope("agent") seq_cst seq_cst, align 8
+; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP2]], 1
+; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP2]], 0
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON: atomicrmw.end:
+; COMMON-NEXT: ret i64 [[NEWLOADED]]
+;
+ %res = atomicrmw usub_sat ptr addrspace(1) %ptr, i64 %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
+ ret i64 %res
+}
+
!0 = !{}
;.
; GFX803: [[META0]] = !{}
diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i64-system.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i64-system.ll
index 4f3979f25076e..11bbc0193cc3d 100644
--- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i64-system.ll
+++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i64-system.ll
@@ -799,6 +799,176 @@ define i64 @test_atomicrmw_udec_wrap_i64_global_system__amdgpu_no_fine_grained_m
ret i64 %res
}
+;---------------------------------------------------------------------
+; atomicrmw usub_cond
+;---------------------------------------------------------------------
+
+; expansion is necessary, operation not supported over PCIe
+define i64 @test_atomicrmw_usub_cond_i64_global_system(ptr addrspace(1) %ptr, i64 %value) {
+; COMMON-LABEL: define i64 @test_atomicrmw_usub_cond_i64_global_system(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8
+; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
+; COMMON: atomicrmw.start:
+; COMMON-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT: [[TMP2:%.*]] = icmp uge i64 [[LOADED]], [[VALUE]]
+; COMMON-NEXT: [[TMP3:%.*]] = sub i64 [[LOADED]], [[VALUE]]
+; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i64 [[TMP3]], i64 [[LOADED]]
+; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] seq_cst seq_cst, align 8
+; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
+; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP4]], 0
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON: atomicrmw.end:
+; COMMON-NEXT: ret i64 [[NEWLOADED]]
+;
+ %res = atomicrmw usub_cond ptr addrspace(1) %ptr, i64 %value seq_cst
+ ret i64 %res
+}
+
+define i64 @test_atomicrmw_usub_cond_i64_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, i64 %value) {
+; COMMON-LABEL: define i64 @test_atomicrmw_usub_cond_i64_global_system__amdgpu_no_fine_grained_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8
+; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
+; COMMON: atomicrmw.start:
+; COMMON-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT: [[TMP2:%.*]] = icmp uge i64 [[LOADED]], [[VALUE]]
+; COMMON-NEXT: [[TMP3:%.*]] = sub i64 [[LOADED]], [[VALUE]]
+; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i64 [[TMP3]], i64 [[LOADED]]
+; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] seq_cst seq_cst, align 8
+; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
+; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP4]], 0
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON: atomicrmw.end:
+; COMMON-NEXT: ret i64 [[NEWLOADED]]
+;
+ %res = atomicrmw usub_cond ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.fine.grained.memory !0
+ ret i64 %res
+}
+
+define i64 @test_atomicrmw_usub_cond_i64_global_system__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i64 %value) {
+; COMMON-LABEL: define i64 @test_atomicrmw_usub_cond_i64_global_system__amdgpu_no_remote_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8
+; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
+; COMMON: atomicrmw.start:
+; COMMON-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT: [[TMP2:%.*]] = icmp uge i64 [[LOADED]], [[VALUE]]
+; COMMON-NEXT: [[TMP3:%.*]] = sub i64 [[LOADED]], [[VALUE]]
+; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i64 [[TMP3]], i64 [[LOADED]]
+; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] seq_cst seq_cst, align 8
+; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
+; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP4]], 0
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON: atomicrmw.end:
+; COMMON-NEXT: ret i64 [[NEWLOADED]]
+;
+ %res = atomicrmw usub_cond ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.remote.memory !0
+ ret i64 %res
+}
+
+define i64 @test_atomicrmw_usub_cond_i64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i64 %value) {
+; COMMON-LABEL: define i64 @test_atomicrmw_usub_cond_i64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8
+; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
+; COMMON: atomicrmw.start:
+; COMMON-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT: [[TMP2:%.*]] = icmp uge i64 [[LOADED]], [[VALUE]]
+; COMMON-NEXT: [[TMP3:%.*]] = sub i64 [[LOADED]], [[VALUE]]
+; COMMON-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i64 [[TMP3]], i64 [[LOADED]]
+; COMMON-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] seq_cst seq_cst, align 8
+; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1
+; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP4]], 0
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON: atomicrmw.end:
+; COMMON-NEXT: ret i64 [[NEWLOADED]]
+;
+ %res = atomicrmw usub_cond ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
+ ret i64 %res
+}
+
+;---------------------------------------------------------------------
+; atomicrmw usub_sat
+;---------------------------------------------------------------------
+
+; expansion is necessary, operation not supported over PCIe
+define i64 @test_atomicrmw_usub_sat_i64_global_system(ptr addrspace(1) %ptr, i64 %value) {
+; COMMON-LABEL: define i64 @test_atomicrmw_usub_sat_i64_global_system(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8
+; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
+; COMMON: atomicrmw.start:
+; COMMON-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT: [[NEW:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[LOADED]], i64 [[VALUE]])
+; COMMON-NEXT: [[TMP2:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] seq_cst seq_cst, align 8
+; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP2]], 1
+; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP2]], 0
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON: atomicrmw.end:
+; COMMON-NEXT: ret i64 [[NEWLOADED]]
+;
+ %res = atomicrmw usub_sat ptr addrspace(1) %ptr, i64 %value seq_cst
+ ret i64 %res
+}
+
+define i64 @test_atomicrmw_usub_sat_i64_global_system__amdgpu_no_fine_grained_memory(ptr addrspace(1) %ptr, i64 %value) {
+; COMMON-LABEL: define i64 @test_atomicrmw_usub_sat_i64_global_system__amdgpu_no_fine_grained_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8
+; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
+; COMMON: atomicrmw.start:
+; COMMON-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT: [[NEW:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[LOADED]], i64 [[VALUE]])
+; COMMON-NEXT: [[TMP2:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] seq_cst seq_cst, align 8
+; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP2]], 1
+; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP2]], 0
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON: atomicrmw.end:
+; COMMON-NEXT: ret i64 [[NEWLOADED]]
+;
+ %res = atomicrmw usub_sat ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.fine.grained.memory !0
+ ret i64 %res
+}
+
+define i64 @test_atomicrmw_usub_sat_i64_global_system__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i64 %value) {
+; COMMON-LABEL: define i64 @test_atomicrmw_usub_sat_i64_global_system__amdgpu_no_remote_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8
+; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
+; COMMON: atomicrmw.start:
+; COMMON-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT: [[NEW:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[LOADED]], i64 [[VALUE]])
+; COMMON-NEXT: [[TMP2:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] seq_cst seq_cst, align 8
+; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP2]], 1
+; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP2]], 0
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON: atomicrmw.end:
+; COMMON-NEXT: ret i64 [[NEWLOADED]]
+;
+ %res = atomicrmw usub_sat ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.remote.memory !0
+ ret i64 %res
+}
+
+define i64 @test_atomicrmw_usub_sat_i64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(1) %ptr, i64 %value) {
+; COMMON-LABEL: define i64 @test_atomicrmw_usub_sat_i64_global_system__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(
+; COMMON-SAME: ptr addrspace(1) [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] {
+; COMMON-NEXT: [[TMP1:%.*]] = load i64, ptr addrspace(1) [[PTR]], align 8
+; COMMON-NEXT: br label [[ATOMICRMW_START:%.*]]
+; COMMON: atomicrmw.start:
+; COMMON-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; COMMON-NEXT: [[NEW:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[LOADED]], i64 [[VALUE]])
+; COMMON-NEXT: [[TMP2:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i64 [[LOADED]], i64 [[NEW]] seq_cst seq_cst, align 8
+; COMMON-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP2]], 1
+; COMMON-NEXT: [[NEWLOADED]] = extractvalue { i64, i1 } [[TMP2]], 0
+; COMMON-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; COMMON: atomicrmw.end:
+; COMMON-NEXT: ret i64 [[NEWLOADED]]
+;
+ %res = atomicrmw usub_sat ptr addrspace(1) %ptr, i64 %value seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
+ ret i64 %res
+}
+
!0 = !{}
;.
; GFX803: [[META0]] = !{}
diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i8-system.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i8-system.ll
index 088371f461ec1..6c0ee795eafde 100644
--- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i8-system.ll
+++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i8-system.ll
@@ -814,3 +814,371 @@ define i8 @test_atomicrmw_dec_i8_flat_system_align4(ptr %ptr, i8 %value) {
%res = atomicrmw udec_wrap ptr %ptr, i8 %value seq_cst, align 4
ret i8 %res
}
+
+define i8 @test_atomicrmw_usub_cond_i8_global_system(ptr addrspace(1) %ptr, i8 %value) {
+; CHECK-LABEL: @test_atomicrmw_usub_cond_i8_global_system(
+; CHECK-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4)
+; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64
+; CHECK-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
+; CHECK-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
+; CHECK-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
+; CHECK-NEXT: [[MASK:%.*]] = shl i32 255, [[SHIFTAMT]]
+; CHECK-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
+; CHECK: atomicrmw.start:
+; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
+; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
+; CHECK-NEXT: [[TMP4:%.*]] = icmp uge i8 [[EXTRACTED]], [[VALUE:%.*]]
+; CHECK-NEXT: [[TMP5:%.*]] = sub i8 [[EXTRACTED]], [[VALUE]]
+; CHECK-NEXT: [[NEW:%.*]] = select i1 [[TMP4]], i8 [[TMP5]], i8 [[EXTRACTED]]
+; CHECK-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32
+; CHECK-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
+; CHECK-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
+; CHECK-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
+; CHECK-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK: atomicrmw.end:
+; CHECK-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
+; CHECK-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8
+; CHECK-NEXT: ret i8 [[EXTRACTED3]]
+;
+ %res = atomicrmw usub_cond ptr addrspace(1) %ptr, i8 %value seq_cst
+ ret i8 %res
+}
+
+define i8 @test_atomicrmw_usub_cond_i8_global_system_align2(ptr addrspace(1) %ptr, i8 %value) {
+; CHECK-LABEL: @test_atomicrmw_usub_cond_i8_global_system_align2(
+; CHECK-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4)
+; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64
+; CHECK-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
+; CHECK-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
+; CHECK-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
+; CHECK-NEXT: [[MASK:%.*]] = shl i32 255, [[SHIFTAMT]]
+; CHECK-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
+; CHECK: atomicrmw.start:
+; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
+; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
+; CHECK-NEXT: [[TMP4:%.*]] = icmp uge i8 [[EXTRACTED]], [[VALUE:%.*]]
+; CHECK-NEXT: [[TMP5:%.*]] = sub i8 [[EXTRACTED]], [[VALUE]]
+; CHECK-NEXT: [[NEW:%.*]] = select i1 [[TMP4]], i8 [[TMP5]], i8 [[EXTRACTED]]
+; CHECK-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32
+; CHECK-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
+; CHECK-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
+; CHECK-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
+; CHECK-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK: atomicrmw.end:
+; CHECK-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
+; CHECK-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8
+; CHECK-NEXT: ret i8 [[EXTRACTED3]]
+;
+ %res = atomicrmw usub_cond ptr addrspace(1) %ptr, i8 %value seq_cst, align 2
+ ret i8 %res
+}
+
+define i8 @test_atomicrmw_usub_cond_i8_global_system_align4(ptr addrspace(1) %ptr, i8 %value) {
+; CHECK-LABEL: @test_atomicrmw_usub_cond_i8_global_system_align4(
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR:%.*]], align 4
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
+; CHECK: atomicrmw.start:
+; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[LOADED]] to i8
+; CHECK-NEXT: [[TMP2:%.*]] = icmp uge i8 [[EXTRACTED]], [[VALUE:%.*]]
+; CHECK-NEXT: [[TMP3:%.*]] = sub i8 [[EXTRACTED]], [[VALUE]]
+; CHECK-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i8 [[TMP3]], i8 [[EXTRACTED]]
+; CHECK-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32
+; CHECK-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], -256
+; CHECK-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[EXTENDED]]
+; CHECK-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK: atomicrmw.end:
+; CHECK-NEXT: [[EXTRACTED1:%.*]] = trunc i32 [[NEWLOADED]] to i8
+; CHECK-NEXT: ret i8 [[EXTRACTED1]]
+;
+ %res = atomicrmw usub_cond ptr addrspace(1) %ptr, i8 %value seq_cst, align 4
+ ret i8 %res
+}
+
+define i8 @test_atomicrmw_usub_cond_i8_flat_system(ptr %ptr, i8 %value) {
+; CHECK-LABEL: @test_atomicrmw_usub_cond_i8_flat_system(
+; CHECK-NEXT: [[ALIGNEDADDR:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[PTR:%.*]], i64 -4)
+; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
+; CHECK-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
+; CHECK-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
+; CHECK-NEXT: [[MASK:%.*]] = shl i32 255, [[SHIFTAMT]]
+; CHECK-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[ALIGNEDADDR]], align 4
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
+; CHECK: atomicrmw.start:
+; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
+; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
+; CHECK-NEXT: [[TMP4:%.*]] = icmp uge i8 [[EXTRACTED]], [[VALUE:%.*]]
+; CHECK-NEXT: [[TMP5:%.*]] = sub i8 [[EXTRACTED]], [[VALUE]]
+; CHECK-NEXT: [[NEW:%.*]] = select i1 [[TMP4]], i8 [[TMP5]], i8 [[EXTRACTED]]
+; CHECK-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32
+; CHECK-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
+; CHECK-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
+; CHECK-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
+; CHECK-NEXT: [[TMP6:%.*]] = cmpxchg ptr [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK: atomicrmw.end:
+; CHECK-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
+; CHECK-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8
+; CHECK-NEXT: ret i8 [[EXTRACTED3]]
+;
+ %res = atomicrmw usub_cond ptr %ptr, i8 %value seq_cst
+ ret i8 %res
+}
+
+define i8 @test_atomicrmw_usub_cond_i8_flat_system_align2(ptr %ptr, i8 %value) {
+; CHECK-LABEL: @test_atomicrmw_usub_cond_i8_flat_system_align2(
+; CHECK-NEXT: [[ALIGNEDADDR:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[PTR:%.*]], i64 -4)
+; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
+; CHECK-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
+; CHECK-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
+; CHECK-NEXT: [[MASK:%.*]] = shl i32 255, [[SHIFTAMT]]
+; CHECK-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[ALIGNEDADDR]], align 4
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
+; CHECK: atomicrmw.start:
+; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
+; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
+; CHECK-NEXT: [[TMP4:%.*]] = icmp uge i8 [[EXTRACTED]], [[VALUE:%.*]]
+; CHECK-NEXT: [[TMP5:%.*]] = sub i8 [[EXTRACTED]], [[VALUE]]
+; CHECK-NEXT: [[NEW:%.*]] = select i1 [[TMP4]], i8 [[TMP5]], i8 [[EXTRACTED]]
+; CHECK-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32
+; CHECK-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
+; CHECK-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
+; CHECK-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
+; CHECK-NEXT: [[TMP6:%.*]] = cmpxchg ptr [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK: atomicrmw.end:
+; CHECK-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
+; CHECK-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8
+; CHECK-NEXT: ret i8 [[EXTRACTED3]]
+;
+ %res = atomicrmw usub_cond ptr %ptr, i8 %value seq_cst, align 2
+ ret i8 %res
+}
+
+define i8 @test_atomicrmw_usub_cond_i8_flat_system_align4(ptr %ptr, i8 %value) {
+; CHECK-LABEL: @test_atomicrmw_usub_cond_i8_flat_system_align4(
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[PTR:%.*]], align 4
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
+; CHECK: atomicrmw.start:
+; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[LOADED]] to i8
+; CHECK-NEXT: [[TMP2:%.*]] = icmp uge i8 [[EXTRACTED]], [[VALUE:%.*]]
+; CHECK-NEXT: [[TMP3:%.*]] = sub i8 [[EXTRACTED]], [[VALUE]]
+; CHECK-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i8 [[TMP3]], i8 [[EXTRACTED]]
+; CHECK-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32
+; CHECK-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], -256
+; CHECK-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[EXTENDED]]
+; CHECK-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK: atomicrmw.end:
+; CHECK-NEXT: [[EXTRACTED1:%.*]] = trunc i32 [[NEWLOADED]] to i8
+; CHECK-NEXT: ret i8 [[EXTRACTED1]]
+;
+ %res = atomicrmw usub_cond ptr %ptr, i8 %value seq_cst, align 4
+ ret i8 %res
+}
+
+define i8 @test_atomicrmw_usub_sat_i8_global_system(ptr addrspace(1) %ptr, i8 %value) {
+; CHECK-LABEL: @test_atomicrmw_usub_sat_i8_global_system(
+; CHECK-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4)
+; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64
+; CHECK-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
+; CHECK-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
+; CHECK-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
+; CHECK-NEXT: [[MASK:%.*]] = shl i32 255, [[SHIFTAMT]]
+; CHECK-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
+; CHECK: atomicrmw.start:
+; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
+; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
+; CHECK-NEXT: [[NEW:%.*]] = call i8 @llvm.usub.sat.i8(i8 [[EXTRACTED]], i8 [[VALUE:%.*]])
+; CHECK-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32
+; CHECK-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
+; CHECK-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
+; CHECK-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
+; CHECK-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK: atomicrmw.end:
+; CHECK-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
+; CHECK-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8
+; CHECK-NEXT: ret i8 [[EXTRACTED3]]
+;
+ %res = atomicrmw usub_sat ptr addrspace(1) %ptr, i8 %value seq_cst
+ ret i8 %res
+}
+
+define i8 @test_atomicrmw_usub_sat_i8_global_system_align2(ptr addrspace(1) %ptr, i8 %value) {
+; CHECK-LABEL: @test_atomicrmw_usub_sat_i8_global_system_align2(
+; CHECK-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4)
+; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64
+; CHECK-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
+; CHECK-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
+; CHECK-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
+; CHECK-NEXT: [[MASK:%.*]] = shl i32 255, [[SHIFTAMT]]
+; CHECK-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
+; CHECK: atomicrmw.start:
+; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
+; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
+; CHECK-NEXT: [[NEW:%.*]] = call i8 @llvm.usub.sat.i8(i8 [[EXTRACTED]], i8 [[VALUE:%.*]])
+; CHECK-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32
+; CHECK-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
+; CHECK-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
+; CHECK-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
+; CHECK-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK: atomicrmw.end:
+; CHECK-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
+; CHECK-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8
+; CHECK-NEXT: ret i8 [[EXTRACTED3]]
+;
+ %res = atomicrmw usub_sat ptr addrspace(1) %ptr, i8 %value seq_cst, align 2
+ ret i8 %res
+}
+
+define i8 @test_atomicrmw_usub_sat_i8_global_system_align4(ptr addrspace(1) %ptr, i8 %value) {
+; CHECK-LABEL: @test_atomicrmw_usub_sat_i8_global_system_align4(
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR:%.*]], align 4
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
+; CHECK: atomicrmw.start:
+; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[LOADED]] to i8
+; CHECK-NEXT: [[NEW:%.*]] = call i8 @llvm.usub.sat.i8(i8 [[EXTRACTED]], i8 [[VALUE:%.*]])
+; CHECK-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32
+; CHECK-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], -256
+; CHECK-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[EXTENDED]]
+; CHECK-NEXT: [[TMP2:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP2]], 1
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP2]], 0
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK: atomicrmw.end:
+; CHECK-NEXT: [[EXTRACTED1:%.*]] = trunc i32 [[NEWLOADED]] to i8
+; CHECK-NEXT: ret i8 [[EXTRACTED1]]
+;
+ %res = atomicrmw usub_sat ptr addrspace(1) %ptr, i8 %value seq_cst, align 4
+ ret i8 %res
+}
+
+define i8 @test_atomicrmw_usub_sat_i8_flat_system(ptr %ptr, i8 %value) {
+; CHECK-LABEL: @test_atomicrmw_usub_sat_i8_flat_system(
+; CHECK-NEXT: [[ALIGNEDADDR:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[PTR:%.*]], i64 -4)
+; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
+; CHECK-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
+; CHECK-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
+; CHECK-NEXT: [[MASK:%.*]] = shl i32 255, [[SHIFTAMT]]
+; CHECK-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[ALIGNEDADDR]], align 4
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
+; CHECK: atomicrmw.start:
+; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
+; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
+; CHECK-NEXT: [[NEW:%.*]] = call i8 @llvm.usub.sat.i8(i8 [[EXTRACTED]], i8 [[VALUE:%.*]])
+; CHECK-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32
+; CHECK-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
+; CHECK-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
+; CHECK-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
+; CHECK-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK: atomicrmw.end:
+; CHECK-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
+; CHECK-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8
+; CHECK-NEXT: ret i8 [[EXTRACTED3]]
+;
+ %res = atomicrmw usub_sat ptr %ptr, i8 %value seq_cst
+ ret i8 %res
+}
+
+define i8 @test_atomicrmw_usub_sat_i8_flat_system_align2(ptr %ptr, i8 %value) {
+; CHECK-LABEL: @test_atomicrmw_usub_sat_i8_flat_system_align2(
+; CHECK-NEXT: [[ALIGNEDADDR:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[PTR:%.*]], i64 -4)
+; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[PTR]] to i64
+; CHECK-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
+; CHECK-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
+; CHECK-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
+; CHECK-NEXT: [[MASK:%.*]] = shl i32 255, [[SHIFTAMT]]
+; CHECK-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[ALIGNEDADDR]], align 4
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
+; CHECK: atomicrmw.start:
+; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
+; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
+; CHECK-NEXT: [[NEW:%.*]] = call i8 @llvm.usub.sat.i8(i8 [[EXTRACTED]], i8 [[VALUE:%.*]])
+; CHECK-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32
+; CHECK-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
+; CHECK-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
+; CHECK-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
+; CHECK-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK: atomicrmw.end:
+; CHECK-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
+; CHECK-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8
+; CHECK-NEXT: ret i8 [[EXTRACTED3]]
+;
+ %res = atomicrmw usub_sat ptr %ptr, i8 %value seq_cst, align 2
+ ret i8 %res
+}
+
+define i8 @test_atomicrmw_usub_sat_i8_flat_system_align4(ptr %ptr, i8 %value) {
+; CHECK-LABEL: @test_atomicrmw_usub_sat_i8_flat_system_align4(
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[PTR:%.*]], align 4
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
+; CHECK: atomicrmw.start:
+; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[LOADED]] to i8
+; CHECK-NEXT: [[NEW:%.*]] = call i8 @llvm.usub.sat.i8(i8 [[EXTRACTED]], i8 [[VALUE:%.*]])
+; CHECK-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32
+; CHECK-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], -256
+; CHECK-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[EXTENDED]]
+; CHECK-NEXT: [[TMP2:%.*]] = cmpxchg ptr [[PTR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP2]], 1
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP2]], 0
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK: atomicrmw.end:
+; CHECK-NEXT: [[EXTRACTED1:%.*]] = trunc i32 [[NEWLOADED]] to i8
+; CHECK-NEXT: ret i8 [[EXTRACTED1]]
+;
+ %res = atomicrmw usub_sat ptr %ptr, i8 %value seq_cst, align 4
+ ret i8 %res
+}
diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i8.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i8.ll
index 590ee63001615..42ace87e526b9 100644
--- a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i8.ll
+++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomic-i8.ll
@@ -1712,3 +1712,775 @@ define i8 @test_atomicrmw_add_i8_buffer_fat_agent_align4(ptr addrspace(7) %ptr,
%res = atomicrmw add ptr addrspace(7) %ptr, i8 %value syncscope("agent") seq_cst, align 4
ret i8 %res
}
+
+define i8 @test_atomicrmw_usub_cond_i8_global_agent(ptr addrspace(1) %ptr, i8 %value) {
+; GCN-LABEL: @test_atomicrmw_usub_cond_i8_global_agent(
+; GCN-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4)
+; GCN-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64
+; GCN-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
+; GCN-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
+; GCN-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
+; GCN-NEXT: [[MASK:%.*]] = shl i32 255, [[SHIFTAMT]]
+; GCN-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
+; GCN-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4
+; GCN-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GCN: atomicrmw.start:
+; GCN-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; GCN-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
+; GCN-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
+; GCN-NEXT: [[TMP4:%.*]] = icmp uge i8 [[EXTRACTED]], [[VALUE:%.*]]
+; GCN-NEXT: [[TMP5:%.*]] = sub i8 [[EXTRACTED]], [[VALUE]]
+; GCN-NEXT: [[NEW:%.*]] = select i1 [[TMP4]], i8 [[TMP5]], i8 [[EXTRACTED]]
+; GCN-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32
+; GCN-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
+; GCN-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
+; GCN-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
+; GCN-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4
+; GCN-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
+; GCN-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
+; GCN-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GCN: atomicrmw.end:
+; GCN-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
+; GCN-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8
+; GCN-NEXT: ret i8 [[EXTRACTED3]]
+;
+; R600-LABEL: @test_atomicrmw_usub_cond_i8_global_agent(
+; R600-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i32(ptr addrspace(1) [[PTR:%.*]], i32 -4)
+; R600-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i32
+; R600-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
+; R600-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
+; R600-NEXT: [[MASK:%.*]] = shl i32 255, [[TMP2]]
+; R600-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
+; R600-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4
+; R600-NEXT: br label [[ATOMICRMW_START:%.*]]
+; R600: atomicrmw.start:
+; R600-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; R600-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]]
+; R600-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
+; R600-NEXT: [[TMP4:%.*]] = icmp uge i8 [[EXTRACTED]], [[VALUE:%.*]]
+; R600-NEXT: [[TMP5:%.*]] = sub i8 [[EXTRACTED]], [[VALUE]]
+; R600-NEXT: [[NEW:%.*]] = select i1 [[TMP4]], i8 [[TMP5]], i8 [[EXTRACTED]]
+; R600-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32
+; R600-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]]
+; R600-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
+; R600-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
+; R600-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4
+; R600-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
+; R600-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
+; R600-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; R600: atomicrmw.end:
+; R600-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
+; R600-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8
+; R600-NEXT: ret i8 [[EXTRACTED3]]
+;
+ %res = atomicrmw usub_cond ptr addrspace(1) %ptr, i8 %value syncscope("agent") seq_cst
+ ret i8 %res
+ }
+
+define i8 @test_atomicrmw_usub_cond_i8_global_agent_align2(ptr addrspace(1) %ptr, i8 %value) {
+; GCN-LABEL: @test_atomicrmw_usub_cond_i8_global_agent_align2(
+; GCN-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4)
+; GCN-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64
+; GCN-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
+; GCN-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
+; GCN-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
+; GCN-NEXT: [[MASK:%.*]] = shl i32 255, [[SHIFTAMT]]
+; GCN-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
+; GCN-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4
+; GCN-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GCN: atomicrmw.start:
+; GCN-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; GCN-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
+; GCN-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
+; GCN-NEXT: [[TMP4:%.*]] = icmp uge i8 [[EXTRACTED]], [[VALUE:%.*]]
+; GCN-NEXT: [[TMP5:%.*]] = sub i8 [[EXTRACTED]], [[VALUE]]
+; GCN-NEXT: [[NEW:%.*]] = select i1 [[TMP4]], i8 [[TMP5]], i8 [[EXTRACTED]]
+; GCN-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32
+; GCN-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
+; GCN-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
+; GCN-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
+; GCN-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4
+; GCN-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
+; GCN-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
+; GCN-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GCN: atomicrmw.end:
+; GCN-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
+; GCN-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8
+; GCN-NEXT: ret i8 [[EXTRACTED3]]
+;
+; R600-LABEL: @test_atomicrmw_usub_cond_i8_global_agent_align2(
+; R600-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i32(ptr addrspace(1) [[PTR:%.*]], i32 -4)
+; R600-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i32
+; R600-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
+; R600-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
+; R600-NEXT: [[MASK:%.*]] = shl i32 255, [[TMP2]]
+; R600-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
+; R600-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4
+; R600-NEXT: br label [[ATOMICRMW_START:%.*]]
+; R600: atomicrmw.start:
+; R600-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; R600-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]]
+; R600-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
+; R600-NEXT: [[TMP4:%.*]] = icmp uge i8 [[EXTRACTED]], [[VALUE:%.*]]
+; R600-NEXT: [[TMP5:%.*]] = sub i8 [[EXTRACTED]], [[VALUE]]
+; R600-NEXT: [[NEW:%.*]] = select i1 [[TMP4]], i8 [[TMP5]], i8 [[EXTRACTED]]
+; R600-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32
+; R600-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]]
+; R600-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
+; R600-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
+; R600-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4
+; R600-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
+; R600-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
+; R600-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; R600: atomicrmw.end:
+; R600-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
+; R600-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8
+; R600-NEXT: ret i8 [[EXTRACTED3]]
+;
+ %res = atomicrmw usub_cond ptr addrspace(1) %ptr, i8 %value syncscope("agent") seq_cst, align 2
+ ret i8 %res
+ }
+
+define i8 @test_atomicrmw_usub_cond_i8_global_agent_align4(ptr addrspace(1) %ptr, i8 %value) {
+; CHECK-LABEL: @test_atomicrmw_usub_cond_i8_global_agent_align4(
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR:%.*]], align 4
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
+; CHECK: atomicrmw.start:
+; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[LOADED]] to i8
+; CHECK-NEXT: [[TMP2:%.*]] = icmp uge i8 [[EXTRACTED]], [[VALUE:%.*]]
+; CHECK-NEXT: [[TMP3:%.*]] = sub i8 [[EXTRACTED]], [[VALUE]]
+; CHECK-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i8 [[TMP3]], i8 [[EXTRACTED]]
+; CHECK-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32
+; CHECK-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], -256
+; CHECK-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[EXTENDED]]
+; CHECK-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK: atomicrmw.end:
+; CHECK-NEXT: [[EXTRACTED1:%.*]] = trunc i32 [[NEWLOADED]] to i8
+; CHECK-NEXT: ret i8 [[EXTRACTED1]]
+;
+ %res = atomicrmw usub_cond ptr addrspace(1) %ptr, i8 %value syncscope("agent") seq_cst, align 4
+ ret i8 %res
+ }
+
+define i8 @test_atomicrmw_usub_cond_i8_local(ptr addrspace(3) %ptr, i8 %value) {
+; CHECK-LABEL: @test_atomicrmw_usub_cond_i8_local(
+; CHECK-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4)
+; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i32
+; CHECK-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
+; CHECK-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
+; CHECK-NEXT: [[MASK:%.*]] = shl i32 255, [[TMP2]]
+; CHECK-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(3) [[ALIGNEDADDR]], align 4
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
+; CHECK: atomicrmw.start:
+; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]]
+; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
+; CHECK-NEXT: [[TMP4:%.*]] = icmp uge i8 [[EXTRACTED]], [[VALUE:%.*]]
+; CHECK-NEXT: [[TMP5:%.*]] = sub i8 [[EXTRACTED]], [[VALUE]]
+; CHECK-NEXT: [[NEW:%.*]] = select i1 [[TMP4]], i8 [[TMP5]], i8 [[EXTRACTED]]
+; CHECK-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32
+; CHECK-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]]
+; CHECK-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
+; CHECK-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
+; CHECK-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(3) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK: atomicrmw.end:
+; CHECK-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
+; CHECK-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8
+; CHECK-NEXT: ret i8 [[EXTRACTED3]]
+;
+ %res = atomicrmw usub_cond ptr addrspace(3) %ptr, i8 %value seq_cst
+ ret i8 %res
+ }
+
+define i8 @test_atomicrmw_usub_cond_i8_local_align2(ptr addrspace(3) %ptr, i8 %value) {
+; CHECK-LABEL: @test_atomicrmw_usub_cond_i8_local_align2(
+; CHECK-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4)
+; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i32
+; CHECK-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
+; CHECK-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
+; CHECK-NEXT: [[MASK:%.*]] = shl i32 255, [[TMP2]]
+; CHECK-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(3) [[ALIGNEDADDR]], align 4
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
+; CHECK: atomicrmw.start:
+; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]]
+; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
+; CHECK-NEXT: [[TMP4:%.*]] = icmp uge i8 [[EXTRACTED]], [[VALUE:%.*]]
+; CHECK-NEXT: [[TMP5:%.*]] = sub i8 [[EXTRACTED]], [[VALUE]]
+; CHECK-NEXT: [[NEW:%.*]] = select i1 [[TMP4]], i8 [[TMP5]], i8 [[EXTRACTED]]
+; CHECK-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32
+; CHECK-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]]
+; CHECK-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
+; CHECK-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
+; CHECK-NEXT: [[TMP6:%.*]] = cmpxchg ptr addrspace(3) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK: atomicrmw.end:
+; CHECK-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
+; CHECK-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8
+; CHECK-NEXT: ret i8 [[EXTRACTED3]]
+;
+ %res = atomicrmw usub_cond ptr addrspace(3) %ptr, i8 %value seq_cst, align 2
+ ret i8 %res
+ }
+
+define i8 @test_atomicrmw_usub_cond_i8_local_align4(ptr addrspace(3) %ptr, i8 %value) {
+; CHECK-LABEL: @test_atomicrmw_usub_cond_i8_local_align4(
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(3) [[PTR:%.*]], align 4
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
+; CHECK: atomicrmw.start:
+; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[LOADED]] to i8
+; CHECK-NEXT: [[TMP2:%.*]] = icmp uge i8 [[EXTRACTED]], [[VALUE:%.*]]
+; CHECK-NEXT: [[TMP3:%.*]] = sub i8 [[EXTRACTED]], [[VALUE]]
+; CHECK-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i8 [[TMP3]], i8 [[EXTRACTED]]
+; CHECK-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32
+; CHECK-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], -256
+; CHECK-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[EXTENDED]]
+; CHECK-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK: atomicrmw.end:
+; CHECK-NEXT: [[EXTRACTED1:%.*]] = trunc i32 [[NEWLOADED]] to i8
+; CHECK-NEXT: ret i8 [[EXTRACTED1]]
+;
+ %res = atomicrmw usub_cond ptr addrspace(3) %ptr, i8 %value seq_cst, align 4
+ ret i8 %res
+ }
+
+define i8 @test_atomicrmw_usub_cond_i8_flat_agent(ptr %ptr, i8 %value) {
+; GCN-LABEL: @test_atomicrmw_usub_cond_i8_flat_agent(
+; GCN-NEXT: [[ALIGNEDADDR:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[PTR:%.*]], i64 -4)
+; GCN-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[PTR]] to i64
+; GCN-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
+; GCN-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
+; GCN-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
+; GCN-NEXT: [[MASK:%.*]] = shl i32 255, [[SHIFTAMT]]
+; GCN-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
+; GCN-NEXT: [[TMP3:%.*]] = load i32, ptr [[ALIGNEDADDR]], align 4
+; GCN-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GCN: atomicrmw.start:
+; GCN-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; GCN-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
+; GCN-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
+; GCN-NEXT: [[TMP4:%.*]] = icmp uge i8 [[EXTRACTED]], [[VALUE:%.*]]
+; GCN-NEXT: [[TMP5:%.*]] = sub i8 [[EXTRACTED]], [[VALUE]]
+; GCN-NEXT: [[NEW:%.*]] = select i1 [[TMP4]], i8 [[TMP5]], i8 [[EXTRACTED]]
+; GCN-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32
+; GCN-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
+; GCN-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
+; GCN-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
+; GCN-NEXT: [[TMP6:%.*]] = cmpxchg ptr [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4
+; GCN-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
+; GCN-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
+; GCN-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GCN: atomicrmw.end:
+; GCN-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
+; GCN-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8
+; GCN-NEXT: ret i8 [[EXTRACTED3]]
+;
+; R600-LABEL: @test_atomicrmw_usub_cond_i8_flat_agent(
+; R600-NEXT: [[ALIGNEDADDR:%.*]] = call ptr @llvm.ptrmask.p0.i32(ptr [[PTR:%.*]], i32 -4)
+; R600-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[PTR]] to i32
+; R600-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
+; R600-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
+; R600-NEXT: [[MASK:%.*]] = shl i32 255, [[TMP2]]
+; R600-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
+; R600-NEXT: [[TMP3:%.*]] = load i32, ptr [[ALIGNEDADDR]], align 4
+; R600-NEXT: br label [[ATOMICRMW_START:%.*]]
+; R600: atomicrmw.start:
+; R600-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; R600-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]]
+; R600-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
+; R600-NEXT: [[TMP4:%.*]] = icmp uge i8 [[EXTRACTED]], [[VALUE:%.*]]
+; R600-NEXT: [[TMP5:%.*]] = sub i8 [[EXTRACTED]], [[VALUE]]
+; R600-NEXT: [[NEW:%.*]] = select i1 [[TMP4]], i8 [[TMP5]], i8 [[EXTRACTED]]
+; R600-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32
+; R600-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]]
+; R600-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
+; R600-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
+; R600-NEXT: [[TMP6:%.*]] = cmpxchg ptr [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4
+; R600-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
+; R600-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
+; R600-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; R600: atomicrmw.end:
+; R600-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
+; R600-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8
+; R600-NEXT: ret i8 [[EXTRACTED3]]
+;
+ %res = atomicrmw usub_cond ptr %ptr, i8 %value syncscope("agent") seq_cst
+ ret i8 %res
+ }
+
+define i8 @test_atomicrmw_usub_cond_i8_flat_agent_align2(ptr %ptr, i8 %value) {
+; GCN-LABEL: @test_atomicrmw_usub_cond_i8_flat_agent_align2(
+; GCN-NEXT: [[ALIGNEDADDR:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[PTR:%.*]], i64 -4)
+; GCN-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[PTR]] to i64
+; GCN-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
+; GCN-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
+; GCN-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
+; GCN-NEXT: [[MASK:%.*]] = shl i32 255, [[SHIFTAMT]]
+; GCN-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
+; GCN-NEXT: [[TMP3:%.*]] = load i32, ptr [[ALIGNEDADDR]], align 4
+; GCN-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GCN: atomicrmw.start:
+; GCN-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; GCN-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
+; GCN-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
+; GCN-NEXT: [[TMP4:%.*]] = icmp uge i8 [[EXTRACTED]], [[VALUE:%.*]]
+; GCN-NEXT: [[TMP5:%.*]] = sub i8 [[EXTRACTED]], [[VALUE]]
+; GCN-NEXT: [[NEW:%.*]] = select i1 [[TMP4]], i8 [[TMP5]], i8 [[EXTRACTED]]
+; GCN-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32
+; GCN-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
+; GCN-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
+; GCN-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
+; GCN-NEXT: [[TMP6:%.*]] = cmpxchg ptr [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4
+; GCN-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
+; GCN-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
+; GCN-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GCN: atomicrmw.end:
+; GCN-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
+; GCN-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8
+; GCN-NEXT: ret i8 [[EXTRACTED3]]
+;
+; R600-LABEL: @test_atomicrmw_usub_cond_i8_flat_agent_align2(
+; R600-NEXT: [[ALIGNEDADDR:%.*]] = call ptr @llvm.ptrmask.p0.i32(ptr [[PTR:%.*]], i32 -4)
+; R600-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[PTR]] to i32
+; R600-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
+; R600-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
+; R600-NEXT: [[MASK:%.*]] = shl i32 255, [[TMP2]]
+; R600-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
+; R600-NEXT: [[TMP3:%.*]] = load i32, ptr [[ALIGNEDADDR]], align 4
+; R600-NEXT: br label [[ATOMICRMW_START:%.*]]
+; R600: atomicrmw.start:
+; R600-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; R600-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]]
+; R600-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
+; R600-NEXT: [[TMP4:%.*]] = icmp uge i8 [[EXTRACTED]], [[VALUE:%.*]]
+; R600-NEXT: [[TMP5:%.*]] = sub i8 [[EXTRACTED]], [[VALUE]]
+; R600-NEXT: [[NEW:%.*]] = select i1 [[TMP4]], i8 [[TMP5]], i8 [[EXTRACTED]]
+; R600-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32
+; R600-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]]
+; R600-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
+; R600-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
+; R600-NEXT: [[TMP6:%.*]] = cmpxchg ptr [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4
+; R600-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP6]], 1
+; R600-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP6]], 0
+; R600-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; R600: atomicrmw.end:
+; R600-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
+; R600-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8
+; R600-NEXT: ret i8 [[EXTRACTED3]]
+;
+ %res = atomicrmw usub_cond ptr %ptr, i8 %value syncscope("agent") seq_cst, align 2
+ ret i8 %res
+ }
+
+define i8 @test_atomicrmw_usub_cond_i8_flat_agent_align4(ptr %ptr, i8 %value) {
+; CHECK-LABEL: @test_atomicrmw_usub_cond_i8_flat_agent_align4(
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[PTR:%.*]], align 4
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
+; CHECK: atomicrmw.start:
+; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[LOADED]] to i8
+; CHECK-NEXT: [[TMP2:%.*]] = icmp uge i8 [[EXTRACTED]], [[VALUE:%.*]]
+; CHECK-NEXT: [[TMP3:%.*]] = sub i8 [[EXTRACTED]], [[VALUE]]
+; CHECK-NEXT: [[NEW:%.*]] = select i1 [[TMP2]], i8 [[TMP3]], i8 [[EXTRACTED]]
+; CHECK-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32
+; CHECK-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], -256
+; CHECK-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[EXTENDED]]
+; CHECK-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK: atomicrmw.end:
+; CHECK-NEXT: [[EXTRACTED1:%.*]] = trunc i32 [[NEWLOADED]] to i8
+; CHECK-NEXT: ret i8 [[EXTRACTED1]]
+;
+ %res = atomicrmw usub_cond ptr %ptr, i8 %value syncscope("agent") seq_cst, align 4
+ ret i8 %res
+ }
+
+define i8 @test_atomicrmw_usub_sat_i8_global_agent(ptr addrspace(1) %ptr, i8 %value) {
+; GCN-LABEL: @test_atomicrmw_usub_sat_i8_global_agent(
+; GCN-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4)
+; GCN-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64
+; GCN-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
+; GCN-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
+; GCN-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
+; GCN-NEXT: [[MASK:%.*]] = shl i32 255, [[SHIFTAMT]]
+; GCN-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
+; GCN-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4
+; GCN-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GCN: atomicrmw.start:
+; GCN-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; GCN-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
+; GCN-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
+; GCN-NEXT: [[NEW:%.*]] = call i8 @llvm.usub.sat.i8(i8 [[EXTRACTED]], i8 [[VALUE:%.*]])
+; GCN-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32
+; GCN-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
+; GCN-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
+; GCN-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
+; GCN-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4
+; GCN-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GCN-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GCN-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GCN: atomicrmw.end:
+; GCN-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
+; GCN-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8
+; GCN-NEXT: ret i8 [[EXTRACTED3]]
+;
+; R600-LABEL: @test_atomicrmw_usub_sat_i8_global_agent(
+; R600-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i32(ptr addrspace(1) [[PTR:%.*]], i32 -4)
+; R600-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i32
+; R600-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
+; R600-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
+; R600-NEXT: [[MASK:%.*]] = shl i32 255, [[TMP2]]
+; R600-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
+; R600-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4
+; R600-NEXT: br label [[ATOMICRMW_START:%.*]]
+; R600: atomicrmw.start:
+; R600-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; R600-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]]
+; R600-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
+; R600-NEXT: [[NEW:%.*]] = call i8 @llvm.usub.sat.i8(i8 [[EXTRACTED]], i8 [[VALUE:%.*]])
+; R600-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32
+; R600-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]]
+; R600-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
+; R600-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
+; R600-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4
+; R600-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; R600-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0
+; R600-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; R600: atomicrmw.end:
+; R600-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
+; R600-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8
+; R600-NEXT: ret i8 [[EXTRACTED3]]
+;
+ %res = atomicrmw usub_sat ptr addrspace(1) %ptr, i8 %value syncscope("agent") seq_cst
+ ret i8 %res
+ }
+
+define i8 @test_atomicrmw_usub_sat_i8_global_agent_align2(ptr addrspace(1) %ptr, i8 %value) {
+; GCN-LABEL: @test_atomicrmw_usub_sat_i8_global_agent_align2(
+; GCN-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i64(ptr addrspace(1) [[PTR:%.*]], i64 -4)
+; GCN-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i64
+; GCN-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
+; GCN-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
+; GCN-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
+; GCN-NEXT: [[MASK:%.*]] = shl i32 255, [[SHIFTAMT]]
+; GCN-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
+; GCN-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4
+; GCN-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GCN: atomicrmw.start:
+; GCN-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; GCN-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
+; GCN-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
+; GCN-NEXT: [[NEW:%.*]] = call i8 @llvm.usub.sat.i8(i8 [[EXTRACTED]], i8 [[VALUE:%.*]])
+; GCN-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32
+; GCN-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
+; GCN-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
+; GCN-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
+; GCN-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4
+; GCN-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GCN-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GCN-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GCN: atomicrmw.end:
+; GCN-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
+; GCN-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8
+; GCN-NEXT: ret i8 [[EXTRACTED3]]
+;
+; R600-LABEL: @test_atomicrmw_usub_sat_i8_global_agent_align2(
+; R600-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(1) @llvm.ptrmask.p1.i32(ptr addrspace(1) [[PTR:%.*]], i32 -4)
+; R600-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[PTR]] to i32
+; R600-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
+; R600-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
+; R600-NEXT: [[MASK:%.*]] = shl i32 255, [[TMP2]]
+; R600-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
+; R600-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(1) [[ALIGNEDADDR]], align 4
+; R600-NEXT: br label [[ATOMICRMW_START:%.*]]
+; R600: atomicrmw.start:
+; R600-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; R600-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]]
+; R600-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
+; R600-NEXT: [[NEW:%.*]] = call i8 @llvm.usub.sat.i8(i8 [[EXTRACTED]], i8 [[VALUE:%.*]])
+; R600-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32
+; R600-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]]
+; R600-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
+; R600-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
+; R600-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(1) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4
+; R600-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; R600-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0
+; R600-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; R600: atomicrmw.end:
+; R600-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
+; R600-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8
+; R600-NEXT: ret i8 [[EXTRACTED3]]
+;
+ %res = atomicrmw usub_sat ptr addrspace(1) %ptr, i8 %value syncscope("agent") seq_cst, align 2
+ ret i8 %res
+ }
+
+define i8 @test_atomicrmw_usub_sat_i8_global_agent_align4(ptr addrspace(1) %ptr, i8 %value) {
+; CHECK-LABEL: @test_atomicrmw_usub_sat_i8_global_agent_align4(
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(1) [[PTR:%.*]], align 4
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
+; CHECK: atomicrmw.start:
+; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[LOADED]] to i8
+; CHECK-NEXT: [[NEW:%.*]] = call i8 @llvm.usub.sat.i8(i8 [[EXTRACTED]], i8 [[VALUE:%.*]])
+; CHECK-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32
+; CHECK-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], -256
+; CHECK-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[EXTENDED]]
+; CHECK-NEXT: [[TMP2:%.*]] = cmpxchg ptr addrspace(1) [[PTR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP2]], 1
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP2]], 0
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK: atomicrmw.end:
+; CHECK-NEXT: [[EXTRACTED1:%.*]] = trunc i32 [[NEWLOADED]] to i8
+; CHECK-NEXT: ret i8 [[EXTRACTED1]]
+;
+ %res = atomicrmw usub_sat ptr addrspace(1) %ptr, i8 %value syncscope("agent") seq_cst, align 4
+ ret i8 %res
+ }
+
+define i8 @test_atomicrmw_usub_sat_i8_local(ptr addrspace(3) %ptr, i8 %value) {
+; CHECK-LABEL: @test_atomicrmw_usub_sat_i8_local(
+; CHECK-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4)
+; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i32
+; CHECK-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
+; CHECK-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
+; CHECK-NEXT: [[MASK:%.*]] = shl i32 255, [[TMP2]]
+; CHECK-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(3) [[ALIGNEDADDR]], align 4
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
+; CHECK: atomicrmw.start:
+; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]]
+; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
+; CHECK-NEXT: [[NEW:%.*]] = call i8 @llvm.usub.sat.i8(i8 [[EXTRACTED]], i8 [[VALUE:%.*]])
+; CHECK-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32
+; CHECK-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]]
+; CHECK-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
+; CHECK-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
+; CHECK-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK: atomicrmw.end:
+; CHECK-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
+; CHECK-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8
+; CHECK-NEXT: ret i8 [[EXTRACTED3]]
+;
+ %res = atomicrmw usub_sat ptr addrspace(3) %ptr, i8 %value seq_cst
+ ret i8 %res
+ }
+
+define i8 @test_atomicrmw_usub_sat_i8_local_align2(ptr addrspace(3) %ptr, i8 %value) {
+; CHECK-LABEL: @test_atomicrmw_usub_sat_i8_local_align2(
+; CHECK-NEXT: [[ALIGNEDADDR:%.*]] = call ptr addrspace(3) @llvm.ptrmask.p3.i32(ptr addrspace(3) [[PTR:%.*]], i32 -4)
+; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[PTR]] to i32
+; CHECK-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
+; CHECK-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
+; CHECK-NEXT: [[MASK:%.*]] = shl i32 255, [[TMP2]]
+; CHECK-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(3) [[ALIGNEDADDR]], align 4
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
+; CHECK: atomicrmw.start:
+; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; CHECK-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]]
+; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
+; CHECK-NEXT: [[NEW:%.*]] = call i8 @llvm.usub.sat.i8(i8 [[EXTRACTED]], i8 [[VALUE:%.*]])
+; CHECK-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32
+; CHECK-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]]
+; CHECK-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
+; CHECK-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
+; CHECK-NEXT: [[TMP4:%.*]] = cmpxchg ptr addrspace(3) [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK: atomicrmw.end:
+; CHECK-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
+; CHECK-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8
+; CHECK-NEXT: ret i8 [[EXTRACTED3]]
+;
+ %res = atomicrmw usub_sat ptr addrspace(3) %ptr, i8 %value seq_cst, align 2
+ ret i8 %res
+ }
+
+define i8 @test_atomicrmw_usub_sat_i8_local_align4(ptr addrspace(3) %ptr, i8 %value) {
+; CHECK-LABEL: @test_atomicrmw_usub_sat_i8_local_align4(
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(3) [[PTR:%.*]], align 4
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
+; CHECK: atomicrmw.start:
+; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[LOADED]] to i8
+; CHECK-NEXT: [[NEW:%.*]] = call i8 @llvm.usub.sat.i8(i8 [[EXTRACTED]], i8 [[VALUE:%.*]])
+; CHECK-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32
+; CHECK-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], -256
+; CHECK-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[EXTENDED]]
+; CHECK-NEXT: [[TMP2:%.*]] = cmpxchg ptr addrspace(3) [[PTR]], i32 [[LOADED]], i32 [[INSERTED]] seq_cst seq_cst, align 4
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP2]], 1
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP2]], 0
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK: atomicrmw.end:
+; CHECK-NEXT: [[EXTRACTED1:%.*]] = trunc i32 [[NEWLOADED]] to i8
+; CHECK-NEXT: ret i8 [[EXTRACTED1]]
+;
+ %res = atomicrmw usub_sat ptr addrspace(3) %ptr, i8 %value seq_cst, align 4
+ ret i8 %res
+ }
+
+define i8 @test_atomicrmw_usub_sat_i8_flat_agent(ptr %ptr, i8 %value) {
+; GCN-LABEL: @test_atomicrmw_usub_sat_i8_flat_agent(
+; GCN-NEXT: [[ALIGNEDADDR:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[PTR:%.*]], i64 -4)
+; GCN-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[PTR]] to i64
+; GCN-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
+; GCN-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
+; GCN-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
+; GCN-NEXT: [[MASK:%.*]] = shl i32 255, [[SHIFTAMT]]
+; GCN-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
+; GCN-NEXT: [[TMP3:%.*]] = load i32, ptr [[ALIGNEDADDR]], align 4
+; GCN-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GCN: atomicrmw.start:
+; GCN-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; GCN-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
+; GCN-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
+; GCN-NEXT: [[NEW:%.*]] = call i8 @llvm.usub.sat.i8(i8 [[EXTRACTED]], i8 [[VALUE:%.*]])
+; GCN-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32
+; GCN-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
+; GCN-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
+; GCN-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
+; GCN-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4
+; GCN-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GCN-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GCN-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GCN: atomicrmw.end:
+; GCN-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
+; GCN-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8
+; GCN-NEXT: ret i8 [[EXTRACTED3]]
+;
+; R600-LABEL: @test_atomicrmw_usub_sat_i8_flat_agent(
+; R600-NEXT: [[ALIGNEDADDR:%.*]] = call ptr @llvm.ptrmask.p0.i32(ptr [[PTR:%.*]], i32 -4)
+; R600-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[PTR]] to i32
+; R600-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
+; R600-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
+; R600-NEXT: [[MASK:%.*]] = shl i32 255, [[TMP2]]
+; R600-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
+; R600-NEXT: [[TMP3:%.*]] = load i32, ptr [[ALIGNEDADDR]], align 4
+; R600-NEXT: br label [[ATOMICRMW_START:%.*]]
+; R600: atomicrmw.start:
+; R600-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; R600-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]]
+; R600-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
+; R600-NEXT: [[NEW:%.*]] = call i8 @llvm.usub.sat.i8(i8 [[EXTRACTED]], i8 [[VALUE:%.*]])
+; R600-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32
+; R600-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]]
+; R600-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
+; R600-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
+; R600-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4
+; R600-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; R600-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0
+; R600-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; R600: atomicrmw.end:
+; R600-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
+; R600-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8
+; R600-NEXT: ret i8 [[EXTRACTED3]]
+;
+ %res = atomicrmw usub_sat ptr %ptr, i8 %value syncscope("agent") seq_cst
+ ret i8 %res
+ }
+
+define i8 @test_atomicrmw_usub_sat_i8_flat_agent_align2(ptr %ptr, i8 %value) {
+; GCN-LABEL: @test_atomicrmw_usub_sat_i8_flat_agent_align2(
+; GCN-NEXT: [[ALIGNEDADDR:%.*]] = call ptr @llvm.ptrmask.p0.i64(ptr [[PTR:%.*]], i64 -4)
+; GCN-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[PTR]] to i64
+; GCN-NEXT: [[PTRLSB:%.*]] = and i64 [[TMP1]], 3
+; GCN-NEXT: [[TMP2:%.*]] = shl i64 [[PTRLSB]], 3
+; GCN-NEXT: [[SHIFTAMT:%.*]] = trunc i64 [[TMP2]] to i32
+; GCN-NEXT: [[MASK:%.*]] = shl i32 255, [[SHIFTAMT]]
+; GCN-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
+; GCN-NEXT: [[TMP3:%.*]] = load i32, ptr [[ALIGNEDADDR]], align 4
+; GCN-NEXT: br label [[ATOMICRMW_START:%.*]]
+; GCN: atomicrmw.start:
+; GCN-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; GCN-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[SHIFTAMT]]
+; GCN-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
+; GCN-NEXT: [[NEW:%.*]] = call i8 @llvm.usub.sat.i8(i8 [[EXTRACTED]], i8 [[VALUE:%.*]])
+; GCN-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32
+; GCN-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[SHIFTAMT]]
+; GCN-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
+; GCN-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
+; GCN-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4
+; GCN-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; GCN-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0
+; GCN-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; GCN: atomicrmw.end:
+; GCN-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[SHIFTAMT]]
+; GCN-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8
+; GCN-NEXT: ret i8 [[EXTRACTED3]]
+;
+; R600-LABEL: @test_atomicrmw_usub_sat_i8_flat_agent_align2(
+; R600-NEXT: [[ALIGNEDADDR:%.*]] = call ptr @llvm.ptrmask.p0.i32(ptr [[PTR:%.*]], i32 -4)
+; R600-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[PTR]] to i32
+; R600-NEXT: [[PTRLSB:%.*]] = and i32 [[TMP1]], 3
+; R600-NEXT: [[TMP2:%.*]] = shl i32 [[PTRLSB]], 3
+; R600-NEXT: [[MASK:%.*]] = shl i32 255, [[TMP2]]
+; R600-NEXT: [[INV_MASK:%.*]] = xor i32 [[MASK]], -1
+; R600-NEXT: [[TMP3:%.*]] = load i32, ptr [[ALIGNEDADDR]], align 4
+; R600-NEXT: br label [[ATOMICRMW_START:%.*]]
+; R600: atomicrmw.start:
+; R600-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP3]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; R600-NEXT: [[SHIFTED:%.*]] = lshr i32 [[LOADED]], [[TMP2]]
+; R600-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[SHIFTED]] to i8
+; R600-NEXT: [[NEW:%.*]] = call i8 @llvm.usub.sat.i8(i8 [[EXTRACTED]], i8 [[VALUE:%.*]])
+; R600-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32
+; R600-NEXT: [[SHIFTED1:%.*]] = shl nuw i32 [[EXTENDED]], [[TMP2]]
+; R600-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], [[INV_MASK]]
+; R600-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[SHIFTED1]]
+; R600-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[ALIGNEDADDR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4
+; R600-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+; R600-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP4]], 0
+; R600-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; R600: atomicrmw.end:
+; R600-NEXT: [[SHIFTED2:%.*]] = lshr i32 [[NEWLOADED]], [[TMP2]]
+; R600-NEXT: [[EXTRACTED3:%.*]] = trunc i32 [[SHIFTED2]] to i8
+; R600-NEXT: ret i8 [[EXTRACTED3]]
+;
+ %res = atomicrmw usub_sat ptr %ptr, i8 %value syncscope("agent") seq_cst, align 2
+ ret i8 %res
+ }
+
+define i8 @test_atomicrmw_usub_sat_i8_flat_agent_align4(ptr %ptr, i8 %value) {
+; CHECK-LABEL: @test_atomicrmw_usub_sat_i8_flat_agent_align4(
+; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[PTR:%.*]], align 4
+; CHECK-NEXT: br label [[ATOMICRMW_START:%.*]]
+; CHECK: atomicrmw.start:
+; CHECK-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[NEWLOADED:%.*]], [[ATOMICRMW_START]] ]
+; CHECK-NEXT: [[EXTRACTED:%.*]] = trunc i32 [[LOADED]] to i8
+; CHECK-NEXT: [[NEW:%.*]] = call i8 @llvm.usub.sat.i8(i8 [[EXTRACTED]], i8 [[VALUE:%.*]])
+; CHECK-NEXT: [[EXTENDED:%.*]] = zext i8 [[NEW]] to i32
+; CHECK-NEXT: [[UNMASKED:%.*]] = and i32 [[LOADED]], -256
+; CHECK-NEXT: [[INSERTED:%.*]] = or i32 [[UNMASKED]], [[EXTENDED]]
+; CHECK-NEXT: [[TMP2:%.*]] = cmpxchg ptr [[PTR]], i32 [[LOADED]], i32 [[INSERTED]] syncscope("agent") seq_cst seq_cst, align 4
+; CHECK-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP2]], 1
+; CHECK-NEXT: [[NEWLOADED]] = extractvalue { i32, i1 } [[TMP2]], 0
+; CHECK-NEXT: br i1 [[SUCCESS]], label [[ATOMICRMW_END:%.*]], label [[ATOMICRMW_START]]
+; CHECK: atomicrmw.end:
+; CHECK-NEXT: [[EXTRACTED1:%.*]] = trunc i32 [[NEWLOADED]] to i8
+; CHECK-NEXT: ret i8 [[EXTRACTED1]]
+;
+ %res = atomicrmw usub_sat ptr %ptr, i8 %value syncscope("agent") seq_cst, align 4
+ ret i8 %res
+}
>From 59616ea5ab5153c65276e982c921faaf2a85a765 Mon Sep 17 00:00:00 2001
From: Andrew Jenner <Andrew.Jenner at amd.com>
Date: Thu, 22 May 2025 10:56:27 -0400
Subject: [PATCH 2/4] Don't use HasAtomicCSubNoRtnInsts for cond_sub
instructions.
---
llvm/lib/Target/AMDGPU/DSInstructions.td | 17 ++++-----
llvm/lib/Target/AMDGPU/FLATInstructions.td | 6 ++--
.../CodeGen/AMDGPU/atomicrmw_usub_cond.ll | 36 +++++++++----------
3 files changed, 27 insertions(+), 32 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td
index 1198051de793c..979fdc8632330 100644
--- a/llvm/lib/Target/AMDGPU/DSInstructions.td
+++ b/llvm/lib/Target/AMDGPU/DSInstructions.td
@@ -1138,27 +1138,24 @@ multiclass DSAtomicRetNoRetPatCondSub_mc<DS_Pseudo inst, DS_Pseudo noRetInst,
let OtherPredicates = [LDSRequiresM0Init] in {
def : DSAtomicRetPat<inst, vt,
!cast<PatFrag>(frag#"_local_m0_"#vt)>;
- let OtherPredicates = [HasAtomicCSubNoRtnInsts] in
- def : DSAtomicRetPat<noRetInst, vt,
- !cast<PatFrag>(frag#"_local_m0_noret_"#vt), /* complexity */ 1>;
+ def : DSAtomicRetPat<noRetInst, vt,
+ !cast<PatFrag>(frag#"_local_m0_noret_"#vt), /* complexity */ 1>;
}
let OtherPredicates = [NotLDSRequiresM0Init] in {
def : DSAtomicRetPat<!cast<DS_Pseudo>(!cast<string>(inst)#"_gfx9"), vt,
!cast<PatFrag>(frag#"_local_"#vt)>;
- let OtherPredicates = [HasAtomicCSubNoRtnInsts] in
- def : DSAtomicRetPat<!cast<DS_Pseudo>(!cast<string>(noRetInst)#"_gfx9"), vt,
- !cast<PatFrag>(frag#"_local_noret_"#vt), /* complexity */ 1>;
+ def : DSAtomicRetPat<!cast<DS_Pseudo>(!cast<string>(noRetInst)#"_gfx9"), vt,
+ !cast<PatFrag>(frag#"_local_noret_"#vt), /* complexity */ 1>;
}
let OtherPredicates = [HasGDS] in {
def : DSAtomicRetPat<inst, vt,
!cast<PatFrag>(frag#"_region_m0_"#vt),
/* complexity */ 0, /* gds */ 1>;
- let OtherPredicates = [HasAtomicCSubNoRtnInsts] in
- def : DSAtomicRetPat<noRetInst, vt,
- !cast<PatFrag>(frag#"_region_m0_noret_"#vt),
- /* complexity */ 1, /* gds */ 1>;
+ def : DSAtomicRetPat<noRetInst, vt,
+ !cast<PatFrag>(frag#"_region_m0_noret_"#vt),
+ /* complexity */ 1, /* gds */ 1>;
}
}
diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td
index b3b76f99f3571..708f6b5529cb6 100644
--- a/llvm/lib/Target/AMDGPU/FLATInstructions.td
+++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -1822,7 +1822,7 @@ defm : FlatStorePats <FLAT_STORE_SHORT, store_flat, i16>;
let OtherPredicates = [isGFX12Plus] in
defm : FlatAtomicRtnPatWithAddrSpace<"FLAT_ATOMIC_COND_SUB_U32", "int_amdgcn_atomic_cond_sub_u32", "flat_addrspace", i32>;
-let OtherPredicates = [isGFX12Plus, HasAtomicCSubNoRtnInsts] in
+let OtherPredicates = [isGFX12Plus] in
defm : FlatAtomicNoRtnPatWithAddrSpace<"FLAT_ATOMIC_COND_SUB_U32", "int_amdgcn_atomic_cond_sub_u32", "flat_addrspace", i32>;
let OtherPredicates = [HasD16LoadStore] in {
@@ -1990,9 +1990,7 @@ defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_XOR_X2", "atomic_load_xor_global", i
let SubtargetPredicate = isGFX12Plus in {
defm : GlobalFLATAtomicPatsRtn <"GLOBAL_ATOMIC_COND_SUB_U32", "atomic_load_usub_cond_global", i32>;
-
- let OtherPredicates = [HasAtomicCSubNoRtnInsts] in
- defm : GlobalFLATAtomicPatsNoRtn <"GLOBAL_ATOMIC_COND_SUB_U32", "atomic_load_usub_cond_global", i32>;
+ defm : GlobalFLATAtomicPatsNoRtn <"GLOBAL_ATOMIC_COND_SUB_U32", "atomic_load_usub_cond_global", i32>;
}
let OtherPredicates = [isGFX12PlusNot12_50] in
diff --git a/llvm/test/CodeGen/AMDGPU/atomicrmw_usub_cond.ll b/llvm/test/CodeGen/AMDGPU/atomicrmw_usub_cond.ll
index f897e2817ec66..f3db599a06dcd 100644
--- a/llvm/test/CodeGen/AMDGPU/atomicrmw_usub_cond.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomicrmw_usub_cond.ll
@@ -115,8 +115,8 @@ define amdgpu_kernel void @global_atomic_usub_cond_no_rtn_u32(ptr addrspace(1) %
; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX12-SDAG-NEXT: global_wb scope:SCOPE_SYS
; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
-; GFX12-SDAG-NEXT: global_atomic_cond_sub_u32 v0, v0, v1, s[0:1] offset:-16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT: global_atomic_cond_sub_u32 v0, v1, s[0:1] offset:-16 scope:SCOPE_SYS
+; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
; GFX12-SDAG-NEXT: global_inv scope:SCOPE_SYS
; GFX12-SDAG-NEXT: s_endpgm
;
@@ -127,8 +127,8 @@ define amdgpu_kernel void @global_atomic_usub_cond_no_rtn_u32(ptr addrspace(1) %
; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
; GFX12-GISEL-NEXT: global_wb scope:SCOPE_SYS
; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
-; GFX12-GISEL-NEXT: global_atomic_cond_sub_u32 v0, v1, v0, s[0:1] offset:-16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT: global_atomic_cond_sub_u32 v1, v0, s[0:1] offset:-16 scope:SCOPE_SYS
+; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
; GFX12-GISEL-NEXT: global_inv scope:SCOPE_SYS
; GFX12-GISEL-NEXT: s_endpgm
entry:
@@ -212,7 +212,7 @@ define amdgpu_kernel void @ds_usub_cond_no_rtn_u32(ptr addrspace(3) %addr, i32 %
; GFX12-SDAG-NEXT: s_add_co_i32 s0, s0, -16
; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v0, s0
-; GFX12-SDAG-NEXT: ds_cond_sub_rtn_u32 v0, v0, v1
+; GFX12-SDAG-NEXT: ds_cond_sub_u32 v0, v1
; GFX12-SDAG-NEXT: s_wait_dscnt 0x0
; GFX12-SDAG-NEXT: global_inv scope:SCOPE_SE
; GFX12-SDAG-NEXT: s_endpgm
@@ -224,7 +224,7 @@ define amdgpu_kernel void @ds_usub_cond_no_rtn_u32(ptr addrspace(3) %addr, i32 %
; GFX12-GISEL-NEXT: s_add_co_u32 s0, s0, -16
; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v0, s0
-; GFX12-GISEL-NEXT: ds_cond_sub_rtn_u32 v0, v0, v1
+; GFX12-GISEL-NEXT: ds_cond_sub_u32 v0, v1
; GFX12-GISEL-NEXT: s_wait_dscnt 0x0
; GFX12-GISEL-NEXT: global_inv scope:SCOPE_SE
; GFX12-GISEL-NEXT: s_endpgm
@@ -369,8 +369,8 @@ define void @global_atomic_usub_cond_nortn(ptr addrspace(1) %ptr, i32 %data) {
; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
-; GFX12-SDAG-NEXT: global_atomic_cond_sub_u32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT: global_atomic_cond_sub_u32 v[0:1], v2, off scope:SCOPE_DEV
+; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
; GFX12-SDAG-NEXT: global_inv scope:SCOPE_DEV
; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
@@ -383,8 +383,8 @@ define void @global_atomic_usub_cond_nortn(ptr addrspace(1) %ptr, i32 %data) {
; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
-; GFX12-GISEL-NEXT: global_atomic_cond_sub_u32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT: global_atomic_cond_sub_u32 v[0:1], v2, off scope:SCOPE_DEV
+; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
; GFX12-GISEL-NEXT: global_inv scope:SCOPE_DEV
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -401,8 +401,8 @@ define void @global_atomic_usub_cond_offset_nortn(ptr addrspace(1) %ptr, i32 %da
; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
-; GFX12-SDAG-NEXT: global_atomic_cond_sub_u32 v0, v[0:1], v2, off offset:4096 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT: global_atomic_cond_sub_u32 v[0:1], v2, off offset:4096 scope:SCOPE_DEV
+; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
; GFX12-SDAG-NEXT: global_inv scope:SCOPE_DEV
; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
@@ -415,8 +415,8 @@ define void @global_atomic_usub_cond_offset_nortn(ptr addrspace(1) %ptr, i32 %da
; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
-; GFX12-GISEL-NEXT: global_atomic_cond_sub_u32 v0, v[0:1], v2, off offset:4096 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT: global_atomic_cond_sub_u32 v[0:1], v2, off offset:4096 scope:SCOPE_DEV
+; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
; GFX12-GISEL-NEXT: global_inv scope:SCOPE_DEV
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -459,8 +459,8 @@ define amdgpu_kernel void @global_atomic_usub_cond_sgpr_base_offset_nortn(ptr ad
; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX12-SDAG-NEXT: global_atomic_cond_sub_u32 v0, v0, v1, s[0:1] offset:4096 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT: global_atomic_cond_sub_u32 v0, v1, s[0:1] offset:4096 scope:SCOPE_DEV
+; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
; GFX12-SDAG-NEXT: global_inv scope:SCOPE_DEV
; GFX12-SDAG-NEXT: s_endpgm
;
@@ -469,8 +469,8 @@ define amdgpu_kernel void @global_atomic_usub_cond_sgpr_base_offset_nortn(ptr ad
; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
-; GFX12-GISEL-NEXT: global_atomic_cond_sub_u32 v0, v1, v0, s[0:1] offset:4096 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
-; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT: global_atomic_cond_sub_u32 v1, v0, s[0:1] offset:4096 scope:SCOPE_DEV
+; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
; GFX12-GISEL-NEXT: global_inv scope:SCOPE_DEV
; GFX12-GISEL-NEXT: s_endpgm
%gep = getelementptr i32, ptr addrspace(1) %ptr, i64 1024
>From 72fdd50aabd2e21ba1bbe167afe2d6698be18809 Mon Sep 17 00:00:00 2001
From: Andrew Jenner <Andrew.Jenner at amd.com>
Date: Thu, 24 Jul 2025 08:10:14 -0400
Subject: [PATCH 3/4] Review feedback.
---
llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 6 +-
llvm/lib/Target/AMDGPU/AMDGPUGISel.td | 1 +
.../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 15 +
.../AMDGPU/AMDGPULowerBufferFatPointers.cpp | 4 +-
.../Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 4 +
llvm/lib/Target/AMDGPU/BUFInstructions.td | 2 +-
llvm/lib/Target/AMDGPU/DSInstructions.td | 28 +-
llvm/lib/Target/AMDGPU/FLATInstructions.td | 3 +-
llvm/lib/Target/AMDGPU/R600ISelLowering.cpp | 10 +-
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 29 +-
llvm/lib/Target/AMDGPU/SIInstructions.td | 1 +
.../Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp | 2 +
.../llvm.amdgcn.global.atomic.csub.ll | 83 +-
.../CodeGen/AMDGPU/atomicrmw_usub_cond.ll | 36 +-
.../test/CodeGen/AMDGPU/atomicrmw_usub_sat.ll | 2311 ++++++++++++++++-
llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll | 40 +-
.../buffer-fat-pointer-atomicrmw-usub_cond.ll | 115 +
.../buffer-fat-pointer-atomicrmw-usub_sat.ll | 70 +
.../CodeGen/AMDGPU/test_isel_single_lane.ll | 16 +-
19 files changed, 2662 insertions(+), 114 deletions(-)
create mode 100644 llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-usub_cond.ll
create mode 100644 llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-usub_sat.ll
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index a6555ef77ab28..8e180269453f3 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -1588,6 +1588,7 @@ def int_amdgcn_raw_buffer_atomic_xor : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_inc : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_dec : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_cond_sub_u32 : AMDGPURawBufferAtomic;
+def int_amdgcn_raw_buffer_atomic_sub_clamp_u32 : AMDGPURawBufferAtomic;
def int_amdgcn_raw_buffer_atomic_cmpswap : Intrinsic<
[llvm_anyint_ty],
[LLVMMatchType<0>, // src(VGPR)
@@ -1624,9 +1625,8 @@ def int_amdgcn_raw_ptr_buffer_atomic_or : AMDGPURawPtrBufferAtomic;
def int_amdgcn_raw_ptr_buffer_atomic_xor : AMDGPURawPtrBufferAtomic;
def int_amdgcn_raw_ptr_buffer_atomic_inc : AMDGPURawPtrBufferAtomic;
def int_amdgcn_raw_ptr_buffer_atomic_dec : AMDGPURawPtrBufferAtomic;
-def int_amdgcn_raw_ptr_buffer_atomic_usub_cond : AMDGPURawPtrBufferAtomic;
-def int_amdgcn_raw_ptr_buffer_atomic_usub_sat : AMDGPURawPtrBufferAtomic;
def int_amdgcn_raw_ptr_buffer_atomic_cond_sub_u32 : AMDGPURawPtrBufferAtomic;
+def int_amdgcn_raw_ptr_buffer_atomic_sub_clamp_u32 : AMDGPURawPtrBufferAtomic;
def int_amdgcn_raw_ptr_buffer_atomic_cmpswap : Intrinsic<
[llvm_anyint_ty],
[LLVMMatchType<0>, // src(VGPR)
@@ -1668,6 +1668,7 @@ def int_amdgcn_struct_buffer_atomic_xor : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_inc : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_dec : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_cond_sub_u32 : AMDGPUStructBufferAtomic;
+def int_amdgcn_struct_buffer_atomic_sub_clamp_u32 : AMDGPUStructBufferAtomic;
def int_amdgcn_struct_buffer_atomic_cmpswap : Intrinsic<
[llvm_anyint_ty],
[LLVMMatchType<0>, // src(VGPR)
@@ -1704,6 +1705,7 @@ def int_amdgcn_struct_ptr_buffer_atomic_xor : AMDGPUStructPtrBufferAtomic;
def int_amdgcn_struct_ptr_buffer_atomic_inc : AMDGPUStructPtrBufferAtomic;
def int_amdgcn_struct_ptr_buffer_atomic_dec : AMDGPUStructPtrBufferAtomic;
def int_amdgcn_struct_ptr_buffer_atomic_cond_sub_u32 : AMDGPUStructPtrBufferAtomic;
+def int_amdgcn_struct_ptr_buffer_atomic_sub_clamp_u32 : AMDGPUStructPtrBufferAtomic;
def int_amdgcn_struct_ptr_buffer_atomic_cmpswap : Intrinsic<
[llvm_anyint_ty],
[LLVMMatchType<0>, // src(VGPR)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
index 7c5a06a46d0b1..ba20dfc8514d2 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
@@ -310,6 +310,7 @@ def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_FMIN, SIbuffer_atomic_fmin>;
def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_FMAX, SIbuffer_atomic_fmax>;
def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_CMPSWAP, SIbuffer_atomic_cmpswap>;
def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_COND_SUB_U32, SIbuffer_atomic_cond_sub_u32>;
+def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_SUB_CLAMP_U32, SIbuffer_atomic_csub>;
def : GINodeEquiv<G_AMDGPU_S_BUFFER_LOAD, SIsbuffer_load>;
def : GINodeEquiv<G_AMDGPU_S_BUFFER_LOAD_SBYTE, SIsbuffer_load_byte>;
def : GINodeEquiv<G_AMDGPU_S_BUFFER_LOAD_UBYTE, SIsbuffer_load_ubyte>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 040cdf1d3b775..1fffbea851418 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -6294,8 +6294,15 @@ static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX;
+ case Intrinsic::amdgcn_raw_buffer_atomic_sub_clamp_u32:
+ case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub_clamp_u32:
+ case Intrinsic::amdgcn_struct_buffer_atomic_sub_clamp_u32:
+ case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub_clamp_u32:
+ return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB_CLAMP_U32;
case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
+ case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cond_sub_u32:
case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
+ case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cond_sub_u32:
return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_COND_SUB_U32;
default:
llvm_unreachable("unhandled atomic opcode");
@@ -7637,6 +7644,14 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmax:
case Intrinsic::amdgcn_struct_buffer_atomic_fmax:
case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fmax:
+ case Intrinsic::amdgcn_raw_buffer_atomic_sub_clamp_u32:
+ case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub_clamp_u32:
+ case Intrinsic::amdgcn_struct_buffer_atomic_sub_clamp_u32:
+ case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub_clamp_u32:
+ case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
+ case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cond_sub_u32:
+ case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
+ case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cond_sub_u32:
case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
index 7d5a390a66162..b91200df0da6f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
@@ -1743,10 +1743,10 @@ Value *SplitPtrStructs::handleMemoryInst(Instruction *I, Value *Arg, Value *Ptr,
IID = Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin;
break;
case AtomicRMWInst::USubCond:
- IID = Intrinsic::amdgcn_raw_ptr_buffer_atomic_usub_cond;
+ IID = Intrinsic::amdgcn_raw_ptr_buffer_atomic_cond_sub_u32;
break;
case AtomicRMWInst::USubSat:
- IID = Intrinsic::amdgcn_raw_ptr_buffer_atomic_usub_sat;
+ IID = Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub_clamp_u32;
break;
case AtomicRMWInst::FSub: {
reportFatalUsageError(
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 5276c96433b53..ddfb0a117fd49 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -3124,6 +3124,8 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR:
case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC:
case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC:
+ case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB_CLAMP_U32:
+ case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_COND_SUB_U32:
case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD:
case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN:
case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: {
@@ -4462,6 +4464,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR:
case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC:
case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC:
+ case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB_CLAMP_U32:
+ case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_COND_SUB_U32:
case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD:
case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN:
case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: {
diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td
index f99e71637f70f..8ad438f078577 100644
--- a/llvm/lib/Target/AMDGPU/BUFInstructions.td
+++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td
@@ -1109,7 +1109,7 @@ defm BUFFER_ATOMIC_DEC_X2 : MUBUF_Pseudo_Atomics <
let OtherPredicates = [HasGFX10_BEncoding] in {
defm BUFFER_ATOMIC_CSUB : MUBUF_Pseudo_Atomics <
- "buffer_atomic_csub", VGPR_32, i32, int_amdgcn_global_atomic_csub
+ "buffer_atomic_csub", VGPR_32, i32
>;
}
diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td
index 979fdc8632330..fc2be4bb5e8fa 100644
--- a/llvm/lib/Target/AMDGPU/DSInstructions.td
+++ b/llvm/lib/Target/AMDGPU/DSInstructions.td
@@ -1133,32 +1133,6 @@ multiclass DSAtomicRetNoRetPat_mc<DS_Pseudo inst, DS_Pseudo noRetInst,
}
}
-multiclass DSAtomicRetNoRetPatCondSub_mc<DS_Pseudo inst, DS_Pseudo noRetInst,
- ValueType vt, string frag> {
- let OtherPredicates = [LDSRequiresM0Init] in {
- def : DSAtomicRetPat<inst, vt,
- !cast<PatFrag>(frag#"_local_m0_"#vt)>;
- def : DSAtomicRetPat<noRetInst, vt,
- !cast<PatFrag>(frag#"_local_m0_noret_"#vt), /* complexity */ 1>;
- }
-
- let OtherPredicates = [NotLDSRequiresM0Init] in {
- def : DSAtomicRetPat<!cast<DS_Pseudo>(!cast<string>(inst)#"_gfx9"), vt,
- !cast<PatFrag>(frag#"_local_"#vt)>;
- def : DSAtomicRetPat<!cast<DS_Pseudo>(!cast<string>(noRetInst)#"_gfx9"), vt,
- !cast<PatFrag>(frag#"_local_noret_"#vt), /* complexity */ 1>;
- }
-
- let OtherPredicates = [HasGDS] in {
- def : DSAtomicRetPat<inst, vt,
- !cast<PatFrag>(frag#"_region_m0_"#vt),
- /* complexity */ 0, /* gds */ 1>;
- def : DSAtomicRetPat<noRetInst, vt,
- !cast<PatFrag>(frag#"_region_m0_noret_"#vt),
- /* complexity */ 1, /* gds */ 1>;
- }
-}
-
let SubtargetPredicate = isGFX6GFX7GFX8GFX9GFX10 in {
// Caution, the order of src and cmp is the *opposite* of the BUFFER_ATOMIC_CMPSWAP opcode.
class DSAtomicCmpXChgSwapped<DS_Pseudo inst, ValueType vt, PatFrag frag,
@@ -1242,7 +1216,7 @@ defm : DSAtomicRetNoRetPat_mc<DS_PK_ADD_RTN_BF16, DS_PK_ADD_BF16, v2bf16, "atomi
let SubtargetPredicate = isGFX12Plus in {
-defm : DSAtomicRetNoRetPatCondSub_mc<DS_COND_SUB_RTN_U32, DS_COND_SUB_U32, i32, "atomic_load_usub_cond">;
+defm : DSAtomicRetNoRetPat_mc<DS_COND_SUB_RTN_U32, DS_COND_SUB_U32, i32, "atomic_load_usub_cond">;
defm : DSAtomicRetNoRetPat_mc<DS_SUB_CLAMP_RTN_U32, DS_SUB_CLAMP_U32, i32, "atomic_load_usub_sat">;
diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td
index 708f6b5529cb6..b87230a097161 100644
--- a/llvm/lib/Target/AMDGPU/FLATInstructions.td
+++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -1809,8 +1809,7 @@ defm : FlatAtomicPat <"FLAT_ATOMIC_MAX_F64", "atomic_load_fmax_"#as, f64>;
let SubtargetPredicate = isGFX12Plus in {
defm : FlatAtomicRtnPat<"FLAT_ATOMIC_COND_SUB_U32", "atomic_load_usub_cond_" #as, i32 >;
- let OtherPredicates = [HasAtomicCSubNoRtnInsts] in
- defm : FlatAtomicNoRtnPat<"FLAT_ATOMIC_COND_SUB_U32", "atomic_load_usub_cond_"#as, i32>;
+ defm : FlatAtomicNoRtnPat<"FLAT_ATOMIC_COND_SUB_U32", "atomic_load_usub_cond_"#as, i32>;
}
} // end foreach as
diff --git a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
index e452e206dad2d..52d51315301e5 100644
--- a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
@@ -2185,20 +2185,14 @@ R600TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *RMW) const {
case AtomicRMWInst::FSub:
case AtomicRMWInst::FMax:
case AtomicRMWInst::FMin:
+ case AtomicRMWInst::USubCond:
+ case AtomicRMWInst::USubSat:
return AtomicExpansionKind::CmpXChg;
case AtomicRMWInst::UIncWrap:
case AtomicRMWInst::UDecWrap:
// FIXME: Cayman at least appears to have instructions for this, but the
// instruction defintions appear to be missing.
return AtomicExpansionKind::CmpXChg;
- case AtomicRMWInst::USubCond:
- case AtomicRMWInst::USubSat:
- if (auto *IntTy = dyn_cast<IntegerType>(RMW->getType())) {
- unsigned Size = IntTy->getBitWidth();
- if (Size == 32)
- return AtomicExpansionKind::None;
- }
- return AtomicExpansionKind::CmpXChg;
case AtomicRMWInst::Xchg: {
const DataLayout &DL = RMW->getFunction()->getDataLayout();
unsigned ValSize = DL.getTypeSizeInBits(RMW->getType());
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 9aac5fc2152a3..a340dd2e99ffb 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -9768,9 +9768,6 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
case Intrinsic::amdgcn_raw_buffer_atomic_dec:
case Intrinsic::amdgcn_raw_ptr_buffer_atomic_dec:
return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
- case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
- return lowerRawBufferAtomicIntrin(Op, DAG,
- AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
case Intrinsic::amdgcn_struct_buffer_atomic_swap:
case Intrinsic::amdgcn_struct_ptr_buffer_atomic_swap:
return lowerStructBufferAtomicIntrin(Op, DAG,
@@ -9812,10 +9809,21 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
case Intrinsic::amdgcn_struct_buffer_atomic_dec:
case Intrinsic::amdgcn_struct_ptr_buffer_atomic_dec:
return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_DEC);
+ case Intrinsic::amdgcn_raw_buffer_atomic_sub_clamp_u32:
+ case Intrinsic::amdgcn_raw_ptr_buffer_atomic_sub_clamp_u32:
+ return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_CSUB);
+ case Intrinsic::amdgcn_struct_buffer_atomic_sub_clamp_u32:
+ case Intrinsic::amdgcn_struct_ptr_buffer_atomic_sub_clamp_u32:
+ return lowerStructBufferAtomicIntrin(Op, DAG,
+ AMDGPUISD::BUFFER_ATOMIC_CSUB);
+ case Intrinsic::amdgcn_raw_buffer_atomic_cond_sub_u32:
+ case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cond_sub_u32:
+ return lowerRawBufferAtomicIntrin(Op, DAG,
+ AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
case Intrinsic::amdgcn_struct_buffer_atomic_cond_sub_u32:
+ case Intrinsic::amdgcn_struct_ptr_buffer_atomic_cond_sub_u32:
return lowerStructBufferAtomicIntrin(Op, DAG,
AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32);
-
case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
case Intrinsic::amdgcn_raw_ptr_buffer_atomic_cmpswap: {
SDValue Rsrc = bufferRsrcPtrToVector(Op.getOperand(4), DAG);
@@ -17438,10 +17446,10 @@ static bool isV2BF16(Type *Ty) {
}
/// \return true if atomicrmw integer ops work for the type.
-static bool isAtomicRMWLegalIntTy(Type *Ty, bool Allow64 = true) {
+static bool isAtomicRMWLegalIntTy(Type *Ty) {
if (auto *IT = dyn_cast<IntegerType>(Ty)) {
unsigned BW = IT->getBitWidth();
- return BW == 32 || (BW == 64 && Allow64);
+ return BW == 32 || BW == 64;
}
return false;
@@ -17494,7 +17502,14 @@ static bool globalMemoryFPAtomicIsLegal(const GCNSubtarget &Subtarget,
/// \return Action to perform on AtomicRMWInsts for integer operations.
static TargetLowering::AtomicExpansionKind
atomicSupportedIfLegalIntType(const AtomicRMWInst *RMW, bool Allow64 = true) {
- return isAtomicRMWLegalIntTy(RMW->getType(), Allow64)
+ if (!Allow64) {
+ if (auto *IT = dyn_cast<IntegerType>(RMW->getType())) {
+ if (IT->getBitWidth() == 32)
+ return TargetLowering::AtomicExpansionKind::None;
+ }
+ return TargetLowering::AtomicExpansionKind::CmpXChg;
+ }
+ return isAtomicRMWLegalIntTy(RMW->getType())
? TargetLowering::AtomicExpansionKind::None
: TargetLowering::AtomicExpansionKind::CmpXChg;
}
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index d05be8f95c618..65d30b823fc6f 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -4305,6 +4305,7 @@ def G_AMDGPU_BUFFER_ATOMIC_SMAX : BufferAtomicGenericInstruction;
def G_AMDGPU_BUFFER_ATOMIC_UMAX : BufferAtomicGenericInstruction;
def G_AMDGPU_BUFFER_ATOMIC_AND : BufferAtomicGenericInstruction;
def G_AMDGPU_BUFFER_ATOMIC_COND_SUB_U32 : BufferAtomicGenericInstruction;
+def G_AMDGPU_BUFFER_ATOMIC_SUB_CLAMP_U32 : BufferAtomicGenericInstruction;
def G_AMDGPU_BUFFER_ATOMIC_OR : BufferAtomicGenericInstruction;
def G_AMDGPU_BUFFER_ATOMIC_XOR : BufferAtomicGenericInstruction;
def G_AMDGPU_BUFFER_ATOMIC_INC : BufferAtomicGenericInstruction;
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index b5b3cc97569ed..94bb5da1345a4 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -728,6 +728,8 @@ bool isGenericAtomic(unsigned Opc) {
Opc == AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN ||
Opc == AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX ||
Opc == AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP ||
+ Opc == AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB_CLAMP_U32 ||
+ Opc == AMDGPU::G_AMDGPU_BUFFER_ATOMIC_COND_SUB_U32 ||
Opc == AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG;
}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll
index 70bfb2ed70ebd..16f293e622a1e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.global.atomic.csub.ll
@@ -8,15 +8,21 @@ define i32 @global_atomic_csub(ptr addrspace(1) %ptr, i32 %data) {
; GFX10-LABEL: global_atomic_csub:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: global_atomic_csub v0, v[0:1], v2, off glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: global_atomic_csub:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: global_atomic_csub_u32 v0, v[0:1], v2, off glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: global_atomic_csub:
@@ -26,10 +32,14 @@ define i32 @global_atomic_csub(ptr addrspace(1) %ptr, i32 %data) {
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_atomic_sub_clamp_u32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN
+; GFX12-NEXT: global_wb scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: global_atomic_sub_clamp_u32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: s_setpc_b64 s[30:31]
- %ret = call i32 @llvm.amdgcn.global.atomic.csub.p1(ptr addrspace(1) %ptr, i32 %data)
+ %ret = atomicrmw usub_sat ptr addrspace(1) %ptr, i32 %data seq_cst
ret i32 %ret
}
@@ -39,8 +49,11 @@ define i32 @global_atomic_csub_offset(ptr addrspace(1) %ptr, i32 %data) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0
; GFX10-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: global_atomic_csub v0, v[0:1], v2, off glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: global_atomic_csub_offset:
@@ -49,8 +62,11 @@ define i32 @global_atomic_csub_offset(ptr addrspace(1) %ptr, i32 %data) {
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: global_atomic_csub_u32 v0, v[0:1], v2, off glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: global_atomic_csub_offset:
@@ -60,11 +76,15 @@ define i32 @global_atomic_csub_offset(ptr addrspace(1) %ptr, i32 %data) {
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_atomic_sub_clamp_u32 v0, v[0:1], v2, off offset:4096 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: global_wb scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: global_atomic_sub_clamp_u32 v0, v[0:1], v2, off offset:4096 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i32, ptr addrspace(1) %ptr, i64 1024
- %ret = call i32 @llvm.amdgcn.global.atomic.csub.p1(ptr addrspace(1) %gep, i32 %data)
+ %ret = atomicrmw usub_sat ptr addrspace(1) %gep, i32 %data seq_cst
ret i32 %ret
}
@@ -72,15 +92,21 @@ define void @global_atomic_csub_nortn(ptr addrspace(1) %ptr, i32 %data) {
; GFX10-LABEL: global_atomic_csub_nortn:
; GFX10: ; %bb.0:
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: global_atomic_csub v0, v[0:1], v2, off glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: global_atomic_csub_nortn:
; GFX11: ; %bb.0:
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: global_atomic_csub_u32 v0, v[0:1], v2, off glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: global_atomic_csub_nortn:
@@ -90,10 +116,14 @@ define void @global_atomic_csub_nortn(ptr addrspace(1) %ptr, i32 %data) {
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_atomic_sub_clamp_u32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN
+; GFX12-NEXT: global_wb scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: global_atomic_sub_clamp_u32 v0, v[0:1], v2, off th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: s_setpc_b64 s[30:31]
- %ret = call i32 @llvm.amdgcn.global.atomic.csub.p1(ptr addrspace(1) %ptr, i32 %data)
+ %ret = atomicrmw usub_sat ptr addrspace(1) %ptr, i32 %data seq_cst
ret void
}
@@ -103,8 +133,11 @@ define void @global_atomic_csub_offset_nortn(ptr addrspace(1) %ptr, i32 %data) {
; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0
; GFX10-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX10-NEXT: s_waitcnt_vscnt null, 0x0
; GFX10-NEXT: global_atomic_csub v0, v[0:1], v2, off glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: global_atomic_csub_offset_nortn:
@@ -113,8 +146,11 @@ define void @global_atomic_csub_offset_nortn(ptr addrspace(1) %ptr, i32 %data) {
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0x1000, v0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
; GFX11-NEXT: global_atomic_csub_u32 v0, v[0:1], v2, off glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_setpc_b64 s[30:31]
;
; GFX12-LABEL: global_atomic_csub_offset_nortn:
@@ -124,11 +160,15 @@ define void @global_atomic_csub_offset_nortn(ptr addrspace(1) %ptr, i32 %data) {
; GFX12-NEXT: s_wait_samplecnt 0x0
; GFX12-NEXT: s_wait_bvhcnt 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_atomic_sub_clamp_u32 v0, v[0:1], v2, off offset:4096 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: global_wb scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: global_atomic_sub_clamp_u32 v0, v[0:1], v2, off offset:4096 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: s_setpc_b64 s[30:31]
%gep = getelementptr i32, ptr addrspace(1) %ptr, i64 1024
- %ret = call i32 @llvm.amdgcn.global.atomic.csub.p1(ptr addrspace(1) %gep, i32 %data)
+ %ret = atomicrmw usub_sat ptr addrspace(1) %gep, i32 %data seq_cst
ret void
}
@@ -143,6 +183,8 @@ define amdgpu_kernel void @global_atomic_csub_sgpr_base_offset(ptr addrspace(1)
; GFX10-NEXT: v_mov_b32_e32 v0, s2
; GFX10-NEXT: global_atomic_csub v0, v1, v0, s[0:1] glc
; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: global_store_dword v[0:1], v0, off
; GFX10-NEXT: s_endpgm
;
@@ -155,6 +197,8 @@ define amdgpu_kernel void @global_atomic_csub_sgpr_base_offset(ptr addrspace(1)
; GFX11-NEXT: v_dual_mov_b32 v1, 0x1000 :: v_dual_mov_b32 v0, s2
; GFX11-NEXT: global_atomic_csub_u32 v0, v1, v0, s[0:1] glc
; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: global_store_b32 v[0:1], v0, off
; GFX11-NEXT: s_endpgm
;
@@ -163,12 +207,15 @@ define amdgpu_kernel void @global_atomic_csub_sgpr_base_offset(ptr addrspace(1)
; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
-; GFX12-NEXT: global_atomic_sub_clamp_u32 v0, v1, v0, s[0:1] offset:4096 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: global_wb scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: global_atomic_sub_clamp_u32 v0, v1, v0, s[0:1] offset:4096 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: global_store_b32 v[0:1], v0, off
; GFX12-NEXT: s_endpgm
%gep = getelementptr i32, ptr addrspace(1) %ptr, i64 1024
- %ret = call i32 @llvm.amdgcn.global.atomic.csub.p1(ptr addrspace(1) %gep, i32 %data)
+ %ret = atomicrmw usub_sat ptr addrspace(1) %gep, i32 %data seq_cst
store i32 %ret, ptr addrspace(1) poison
ret void
}
@@ -183,6 +230,9 @@ define amdgpu_kernel void @global_atomic_csub_sgpr_base_offset_nortn(ptr addrspa
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_mov_b32_e32 v0, s2
; GFX10-NEXT: global_atomic_csub v0, v1, v0, s[0:1] glc
+; GFX10-NEXT: s_waitcnt vmcnt(0)
+; GFX10-NEXT: buffer_gl1_inv
+; GFX10-NEXT: buffer_gl0_inv
; GFX10-NEXT: s_endpgm
;
; GFX11-LABEL: global_atomic_csub_sgpr_base_offset_nortn:
@@ -193,6 +243,9 @@ define amdgpu_kernel void @global_atomic_csub_sgpr_base_offset_nortn(ptr addrspa
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: v_dual_mov_b32 v1, 0x1000 :: v_dual_mov_b32 v0, s2
; GFX11-NEXT: global_atomic_csub_u32 v0, v1, v0, s[0:1] glc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
; GFX11-NEXT: s_endpgm
;
; GFX12-LABEL: global_atomic_csub_sgpr_base_offset_nortn:
@@ -200,14 +253,16 @@ define amdgpu_kernel void @global_atomic_csub_sgpr_base_offset_nortn(ptr addrspa
; GFX12-NEXT: s_load_b96 s[0:2], s[4:5], 0x0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
-; GFX12-NEXT: global_atomic_sub_clamp_u32 v0, v1, v0, s[0:1] offset:4096 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: global_wb scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: global_atomic_sub_clamp_u32 v0, v1, v0, s[0:1] offset:4096 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_inv scope:SCOPE_SYS
; GFX12-NEXT: s_endpgm
%gep = getelementptr i32, ptr addrspace(1) %ptr, i64 1024
- %ret = call i32 @llvm.amdgcn.global.atomic.csub.p1(ptr addrspace(1) %gep, i32 %data)
+ %ret = atomicrmw usub_sat ptr addrspace(1) %gep, i32 %data seq_cst
ret void
}
-declare i32 @llvm.amdgcn.global.atomic.csub.p1(ptr addrspace(1) nocapture, i32) #1
-
attributes #0 = { nounwind willreturn }
attributes #1 = { argmemonly nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/atomicrmw_usub_cond.ll b/llvm/test/CodeGen/AMDGPU/atomicrmw_usub_cond.ll
index f3db599a06dcd..5122ddc94ce22 100644
--- a/llvm/test/CodeGen/AMDGPU/atomicrmw_usub_cond.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomicrmw_usub_cond.ll
@@ -7,12 +7,11 @@ define amdgpu_kernel void @flat_atomic_usub_cond_no_rtn_u32(ptr %addr, i32 %in)
; GFX12-SDAG: ; %bb.0: ; %entry
; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s2
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX12-SDAG-NEXT: global_wb scope:SCOPE_SYS
; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
-; GFX12-SDAG-NEXT: flat_atomic_cond_sub_u32 v0, v[0:1], v2 offset:-16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: flat_atomic_cond_sub_u32 v0, v1, s[0:1] offset:-16 scope:SCOPE_SYS
+; GFX12-SDAG-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-SDAG-NEXT: global_inv scope:SCOPE_SYS
; GFX12-SDAG-NEXT: s_endpgm
;
@@ -20,12 +19,11 @@ define amdgpu_kernel void @flat_atomic_usub_cond_no_rtn_u32(ptr %addr, i32 %in)
; GFX12-GISEL: ; %bb.0: ; %entry
; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v1, s1
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
; GFX12-GISEL-NEXT: global_wb scope:SCOPE_SYS
; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
-; GFX12-GISEL-NEXT: flat_atomic_cond_sub_u32 v0, v[0:1], v2 offset:-16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
-; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT: flat_atomic_cond_sub_u32 v1, v0, s[0:1] offset:-16 scope:SCOPE_SYS
+; GFX12-GISEL-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-GISEL-NEXT: global_inv scope:SCOPE_SYS
; GFX12-GISEL-NEXT: s_endpgm
entry:
@@ -39,11 +37,10 @@ define amdgpu_kernel void @flat_atomic_usub_cond_no_rtn_u32_forced(ptr %addr, i3
; GFX12-SDAG: ; %bb.0: ; %entry
; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s2
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX12-SDAG-NEXT: global_wb scope:SCOPE_SYS
; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
-; GFX12-SDAG-NEXT: flat_atomic_cond_sub_u32 v[0:1], v2 offset:-16 scope:SCOPE_SYS
+; GFX12-SDAG-NEXT: flat_atomic_cond_sub_u32 v0, v1, s[0:1] offset:-16 scope:SCOPE_SYS
; GFX12-SDAG-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-SDAG-NEXT: global_inv scope:SCOPE_SYS
; GFX12-SDAG-NEXT: s_endpgm
@@ -52,11 +49,10 @@ define amdgpu_kernel void @flat_atomic_usub_cond_no_rtn_u32_forced(ptr %addr, i3
; GFX12-GISEL: ; %bb.0: ; %entry
; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v1, s1
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
; GFX12-GISEL-NEXT: global_wb scope:SCOPE_SYS
; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
-; GFX12-GISEL-NEXT: flat_atomic_cond_sub_u32 v[0:1], v2 offset:-16 scope:SCOPE_SYS
+; GFX12-GISEL-NEXT: flat_atomic_cond_sub_u32 v1, v0, s[0:1] offset:-16 scope:SCOPE_SYS
; GFX12-GISEL-NEXT: s_wait_storecnt_dscnt 0x0
; GFX12-GISEL-NEXT: global_inv scope:SCOPE_SYS
; GFX12-GISEL-NEXT: s_endpgm
@@ -69,15 +65,14 @@ entry:
define amdgpu_kernel void @flat_atomic_usub_cond_rtn_u32(ptr %addr, i32 %in, ptr %use) {
; GFX12-SDAG-LABEL: flat_atomic_usub_cond_rtn_u32:
; GFX12-SDAG: ; %bb.0: ; %entry
-; GFX12-SDAG-NEXT: s_clause 0x1
; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0
; GFX12-SDAG-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s2
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2
; GFX12-SDAG-NEXT: global_wb scope:SCOPE_SYS
; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
-; GFX12-SDAG-NEXT: flat_atomic_cond_sub_u32 v2, v[0:1], v2 offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-SDAG-NEXT: flat_atomic_cond_sub_u32 v2, v0, v1, s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-SDAG-NEXT: global_inv scope:SCOPE_SYS
; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
@@ -90,11 +85,10 @@ define amdgpu_kernel void @flat_atomic_usub_cond_rtn_u32(ptr %addr, i32 %in, ptr
; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
; GFX12-GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v1, s1
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
; GFX12-GISEL-NEXT: global_wb scope:SCOPE_SYS
; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
-; GFX12-GISEL-NEXT: flat_atomic_cond_sub_u32 v2, v[0:1], v2 offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
+; GFX12-GISEL-NEXT: flat_atomic_cond_sub_u32 v2, v1, v0, s[0:1] offset:16 th:TH_ATOMIC_RETURN scope:SCOPE_SYS
; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX12-GISEL-NEXT: global_inv scope:SCOPE_SYS
; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
diff --git a/llvm/test/CodeGen/AMDGPU/atomicrmw_usub_sat.ll b/llvm/test/CodeGen/AMDGPU/atomicrmw_usub_sat.ll
index 2e788d8eca50e..a5e400c0f45b4 100644
--- a/llvm/test/CodeGen/AMDGPU/atomicrmw_usub_sat.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomicrmw_usub_sat.ll
@@ -1,12 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 < %s | FileCheck %s -check-prefix=GFX10-GISEL
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1031 < %s | FileCheck %s -check-prefix=GFX10-GISEL
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck %s -check-prefix=GFX11-GISEL
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck %s -check-prefix=GFX12-GISEL
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 < %s | FileCheck %s -check-prefix=GFX10-SDAG
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1031 < %s | FileCheck %s -check-prefix=GFX10-SDAG
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck %s -check-prefix=GFX11-SDAG
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck %s -check-prefix=GFX12-SDAG
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 < %s | FileCheck %s -check-prefix=GFX10-GISEL
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck %s -check-prefix=GFX11-GISEL
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck %s -check-prefix=GFX12-GISEL
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 < %s | FileCheck %s -check-prefix=GFX10-SDAG
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck %s -check-prefix=GFX11-SDAG
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck %s -check-prefix=GFX12-SDAG
define i32 @global_atomic_usub_sat(ptr addrspace(1) %ptr, i32 %data) {
; GFX10-GISEL-LABEL: global_atomic_usub_sat:
@@ -483,5 +481,2302 @@ define amdgpu_kernel void @global_atomic_usub_sat_sgpr_base_offset_nortn(ptr add
ret void
}
+define i16 @global_atomic_usub_sat_16(ptr addrspace(1) %ptr, i16 %data) {
+; GFX10-GISEL-LABEL: global_atomic_usub_sat_16:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT: global_load_dword v3, v[0:1], off
+; GFX10-GISEL-NEXT: s_mov_b32 s4, 0
+; GFX10-GISEL-NEXT: .LBB6_1: ; %atomicrmw.start
+; GFX10-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-GISEL-NEXT: v_sub_nc_u16 v3, v4, v2 clamp
+; GFX10-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX10-GISEL-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
+; GFX10-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-GISEL-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
+; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX10-GISEL-NEXT: buffer_gl1_inv
+; GFX10-GISEL-NEXT: buffer_gl0_inv
+; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX10-GISEL-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX10-GISEL-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
+; GFX10-GISEL-NEXT: s_cbranch_execnz .LBB6_1
+; GFX10-GISEL-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX10-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, v3
+; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: global_atomic_usub_sat_16:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT: global_load_b32 v3, v[0:1], off
+; GFX11-GISEL-NEXT: s_mov_b32 s0, 0
+; GFX11-GISEL-NEXT: .LBB6_1: ; %atomicrmw.start
+; GFX11-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_sub_nc_u16 v3.l, v4.l, v2.l clamp
+; GFX11-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
+; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-GISEL-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT: buffer_gl1_inv
+; GFX11-GISEL-NEXT: buffer_gl0_inv
+; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-GISEL-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-GISEL-NEXT: s_cbranch_execnz .LBB6_1
+; GFX11-GISEL-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, v3
+; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-LABEL: global_atomic_usub_sat_16:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT: s_wait_expcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
+; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: global_load_b32 v3, v[0:1], off
+; GFX12-GISEL-NEXT: s_mov_b32 s0, 0
+; GFX12-GISEL-NEXT: .LBB6_1: ; %atomicrmw.start
+; GFX12-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-GISEL-NEXT: v_sub_nc_u16 v3, v4, v2 clamp
+; GFX12-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-GISEL-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
+; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
+; GFX12-GISEL-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
+; GFX12-GISEL-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
+; GFX12-GISEL-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-GISEL-NEXT: s_cbranch_execnz .LBB6_1
+; GFX12-GISEL-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: global_atomic_usub_sat_16:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT: global_load_dword v3, v[0:1], off
+; GFX10-SDAG-NEXT: s_mov_b32 s4, 0
+; GFX10-SDAG-NEXT: .LBB6_1: ; %atomicrmw.start
+; GFX10-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-SDAG-NEXT: v_sub_nc_u16 v3, v4, v2 clamp
+; GFX10-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX10-SDAG-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
+; GFX10-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-SDAG-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
+; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX10-SDAG-NEXT: buffer_gl1_inv
+; GFX10-SDAG-NEXT: buffer_gl0_inv
+; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX10-SDAG-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX10-SDAG-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
+; GFX10-SDAG-NEXT: s_cbranch_execnz .LBB6_1
+; GFX10-SDAG-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX10-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, v3
+; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: global_atomic_usub_sat_16:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: global_load_b32 v3, v[0:1], off
+; GFX11-SDAG-NEXT: s_mov_b32 s0, 0
+; GFX11-SDAG-NEXT: .LBB6_1: ; %atomicrmw.start
+; GFX11-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_sub_nc_u16 v3.l, v4.l, v2.l clamp
+; GFX11-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
+; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT: buffer_gl1_inv
+; GFX11-SDAG-NEXT: buffer_gl0_inv
+; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-SDAG-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-SDAG-NEXT: s_cbranch_execnz .LBB6_1
+; GFX11-SDAG-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-SDAG-NEXT: v_mov_b16_e32 v0.l, v3.l
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-LABEL: global_atomic_usub_sat_16:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: global_load_b32 v3, v[0:1], off
+; GFX12-SDAG-NEXT: s_mov_b32 s0, 0
+; GFX12-SDAG-NEXT: .LBB6_1: ; %atomicrmw.start
+; GFX12-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-NEXT: v_sub_nc_u16 v3, v4, v2 clamp
+; GFX12-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
+; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
+; GFX12-SDAG-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
+; GFX12-SDAG-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
+; GFX12-SDAG-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-SDAG-NEXT: s_cbranch_execnz .LBB6_1
+; GFX12-SDAG-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
+ %ret = atomicrmw usub_sat ptr addrspace(1) %ptr, i16 %data syncscope("agent") seq_cst, align 4
+ ret i16 %ret
+}
+
+define i16 @global_atomic_usub_sat_offset_16(ptr addrspace(1) %ptr, i16 %data) {
+; GFX10-GISEL-LABEL: global_atomic_usub_sat_offset_16:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_add_co_u32 v3, vcc_lo, 0x800, v0
+; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo
+; GFX10-GISEL-NEXT: s_mov_b32 s4, 0
+; GFX10-GISEL-NEXT: global_load_dword v0, v[3:4], off
+; GFX10-GISEL-NEXT: .LBB7_1: ; %atomicrmw.start
+; GFX10-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, v0
+; GFX10-GISEL-NEXT: v_sub_nc_u16 v0, v1, v2 clamp
+; GFX10-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX10-GISEL-NEXT: v_and_or_b32 v0, 0xffff0000, v1, v0
+; GFX10-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-GISEL-NEXT: global_atomic_cmpswap v0, v[3:4], v[0:1], off glc
+; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX10-GISEL-NEXT: buffer_gl1_inv
+; GFX10-GISEL-NEXT: buffer_gl0_inv
+; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX10-GISEL-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX10-GISEL-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
+; GFX10-GISEL-NEXT: s_cbranch_execnz .LBB7_1
+; GFX10-GISEL-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX10-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: global_atomic_usub_sat_offset_16:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT: global_load_b32 v3, v[0:1], off offset:2048
+; GFX11-GISEL-NEXT: s_mov_b32 s0, 0
+; GFX11-GISEL-NEXT: .LBB7_1: ; %atomicrmw.start
+; GFX11-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_sub_nc_u16 v3.l, v4.l, v2.l clamp
+; GFX11-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
+; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-GISEL-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2048 glc
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT: buffer_gl1_inv
+; GFX11-GISEL-NEXT: buffer_gl0_inv
+; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-GISEL-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-GISEL-NEXT: s_cbranch_execnz .LBB7_1
+; GFX11-GISEL-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, v3
+; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-LABEL: global_atomic_usub_sat_offset_16:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT: s_wait_expcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
+; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: global_load_b32 v3, v[0:1], off offset:2048
+; GFX12-GISEL-NEXT: s_mov_b32 s0, 0
+; GFX12-GISEL-NEXT: .LBB7_1: ; %atomicrmw.start
+; GFX12-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-GISEL-NEXT: v_sub_nc_u16 v3, v4, v2 clamp
+; GFX12-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-GISEL-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
+; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
+; GFX12-GISEL-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
+; GFX12-GISEL-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
+; GFX12-GISEL-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-GISEL-NEXT: s_cbranch_execnz .LBB7_1
+; GFX12-GISEL-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: global_atomic_usub_sat_offset_16:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_add_co_u32 v3, vcc_lo, 0x800, v0
+; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v4, null, 0, v1, vcc_lo
+; GFX10-SDAG-NEXT: s_mov_b32 s4, 0
+; GFX10-SDAG-NEXT: global_load_dword v0, v[3:4], off
+; GFX10-SDAG-NEXT: .LBB7_1: ; %atomicrmw.start
+; GFX10-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, v0
+; GFX10-SDAG-NEXT: v_sub_nc_u16 v0, v1, v2 clamp
+; GFX10-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX10-SDAG-NEXT: v_and_or_b32 v0, 0xffff0000, v1, v0
+; GFX10-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-SDAG-NEXT: global_atomic_cmpswap v0, v[3:4], v[0:1], off glc
+; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX10-SDAG-NEXT: buffer_gl1_inv
+; GFX10-SDAG-NEXT: buffer_gl0_inv
+; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX10-SDAG-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX10-SDAG-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
+; GFX10-SDAG-NEXT: s_cbranch_execnz .LBB7_1
+; GFX10-SDAG-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX10-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: global_atomic_usub_sat_offset_16:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: global_load_b32 v3, v[0:1], off offset:2048
+; GFX11-SDAG-NEXT: s_mov_b32 s0, 0
+; GFX11-SDAG-NEXT: .LBB7_1: ; %atomicrmw.start
+; GFX11-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_sub_nc_u16 v3.l, v4.l, v2.l clamp
+; GFX11-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
+; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2048 glc
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT: buffer_gl1_inv
+; GFX11-SDAG-NEXT: buffer_gl0_inv
+; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-SDAG-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-SDAG-NEXT: s_cbranch_execnz .LBB7_1
+; GFX11-SDAG-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-SDAG-NEXT: v_mov_b16_e32 v0.l, v3.l
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-LABEL: global_atomic_usub_sat_offset_16:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: global_load_b32 v3, v[0:1], off offset:2048
+; GFX12-SDAG-NEXT: s_mov_b32 s0, 0
+; GFX12-SDAG-NEXT: .LBB7_1: ; %atomicrmw.start
+; GFX12-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-NEXT: v_sub_nc_u16 v3, v4, v2 clamp
+; GFX12-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
+; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
+; GFX12-SDAG-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
+; GFX12-SDAG-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
+; GFX12-SDAG-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-SDAG-NEXT: s_cbranch_execnz .LBB7_1
+; GFX12-SDAG-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
+ %gep = getelementptr i16, ptr addrspace(1) %ptr, i64 1024
+ %ret = atomicrmw usub_sat ptr addrspace(1) %gep, i16 %data syncscope("agent") seq_cst, align 4
+ ret i16 %ret
+}
+
+define void @global_atomic_usub_sat_nortn_16(ptr addrspace(1) %ptr, i16 %data) {
+; GFX10-GISEL-LABEL: global_atomic_usub_sat_nortn_16:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT: global_load_dword v4, v[0:1], off
+; GFX10-GISEL-NEXT: s_mov_b32 s4, 0
+; GFX10-GISEL-NEXT: .LBB8_1: ; %atomicrmw.start
+; GFX10-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX10-GISEL-NEXT: v_sub_nc_u16 v3, v4, v2 clamp
+; GFX10-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX10-GISEL-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
+; GFX10-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-GISEL-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
+; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX10-GISEL-NEXT: buffer_gl1_inv
+; GFX10-GISEL-NEXT: buffer_gl0_inv
+; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-GISEL-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX10-GISEL-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
+; GFX10-GISEL-NEXT: s_cbranch_execnz .LBB8_1
+; GFX10-GISEL-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX10-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: global_atomic_usub_sat_nortn_16:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT: global_load_b32 v4, v[0:1], off
+; GFX11-GISEL-NEXT: s_mov_b32 s0, 0
+; GFX11-GISEL-NEXT: .LBB8_1: ; %atomicrmw.start
+; GFX11-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT: v_sub_nc_u16 v3.l, v4.l, v2.l clamp
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-GISEL-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
+; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-GISEL-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT: buffer_gl1_inv
+; GFX11-GISEL-NEXT: buffer_gl0_inv
+; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-GISEL-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-GISEL-NEXT: s_cbranch_execnz .LBB8_1
+; GFX11-GISEL-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-LABEL: global_atomic_usub_sat_nortn_16:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT: s_wait_expcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
+; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: global_load_b32 v4, v[0:1], off
+; GFX12-GISEL-NEXT: s_mov_b32 s0, 0
+; GFX12-GISEL-NEXT: .LBB8_1: ; %atomicrmw.start
+; GFX12-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT: v_sub_nc_u16 v3, v4, v2 clamp
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX12-GISEL-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
+; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
+; GFX12-GISEL-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
+; GFX12-GISEL-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
+; GFX12-GISEL-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-GISEL-NEXT: s_cbranch_execnz .LBB8_1
+; GFX12-GISEL-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: global_atomic_usub_sat_nortn_16:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT: global_load_dword v4, v[0:1], off
+; GFX10-SDAG-NEXT: s_mov_b32 s4, 0
+; GFX10-SDAG-NEXT: .LBB8_1: ; %atomicrmw.start
+; GFX10-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX10-SDAG-NEXT: v_sub_nc_u16 v3, v4, v2 clamp
+; GFX10-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX10-SDAG-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
+; GFX10-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-SDAG-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
+; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX10-SDAG-NEXT: buffer_gl1_inv
+; GFX10-SDAG-NEXT: buffer_gl0_inv
+; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-SDAG-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX10-SDAG-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
+; GFX10-SDAG-NEXT: s_cbranch_execnz .LBB8_1
+; GFX10-SDAG-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX10-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: global_atomic_usub_sat_nortn_16:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: global_load_b32 v4, v[0:1], off
+; GFX11-SDAG-NEXT: s_mov_b32 s0, 0
+; GFX11-SDAG-NEXT: .LBB8_1: ; %atomicrmw.start
+; GFX11-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT: v_sub_nc_u16 v3.l, v4.l, v2.l clamp
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-SDAG-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
+; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT: buffer_gl1_inv
+; GFX11-SDAG-NEXT: buffer_gl0_inv
+; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-SDAG-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-SDAG-NEXT: s_cbranch_execnz .LBB8_1
+; GFX11-SDAG-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-LABEL: global_atomic_usub_sat_nortn_16:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: global_load_b32 v4, v[0:1], off
+; GFX12-SDAG-NEXT: s_mov_b32 s0, 0
+; GFX12-SDAG-NEXT: .LBB8_1: ; %atomicrmw.start
+; GFX12-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT: v_sub_nc_u16 v3, v4, v2 clamp
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX12-SDAG-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
+; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
+; GFX12-SDAG-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
+; GFX12-SDAG-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
+; GFX12-SDAG-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-SDAG-NEXT: s_cbranch_execnz .LBB8_1
+; GFX12-SDAG-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
+ %ret = atomicrmw usub_sat ptr addrspace(1) %ptr, i16 %data syncscope("agent") seq_cst, align 4
+ ret void
+}
+
+define void @global_atomic_usub_sat_offset_nortn_16(ptr addrspace(1) %ptr, i16 %data) {
+; GFX10-GISEL-LABEL: global_atomic_usub_sat_offset_nortn_16:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_add_co_u32 v0, vcc_lo, 0x800, v0
+; GFX10-GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX10-GISEL-NEXT: s_mov_b32 s4, 0
+; GFX10-GISEL-NEXT: global_load_dword v4, v[0:1], off
+; GFX10-GISEL-NEXT: .LBB9_1: ; %atomicrmw.start
+; GFX10-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX10-GISEL-NEXT: v_sub_nc_u16 v3, v4, v2 clamp
+; GFX10-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX10-GISEL-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
+; GFX10-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-GISEL-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
+; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX10-GISEL-NEXT: buffer_gl1_inv
+; GFX10-GISEL-NEXT: buffer_gl0_inv
+; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-GISEL-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX10-GISEL-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
+; GFX10-GISEL-NEXT: s_cbranch_execnz .LBB9_1
+; GFX10-GISEL-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX10-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: global_atomic_usub_sat_offset_nortn_16:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT: global_load_b32 v4, v[0:1], off offset:2048
+; GFX11-GISEL-NEXT: s_mov_b32 s0, 0
+; GFX11-GISEL-NEXT: .LBB9_1: ; %atomicrmw.start
+; GFX11-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT: v_sub_nc_u16 v3.l, v4.l, v2.l clamp
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-GISEL-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
+; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-GISEL-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2048 glc
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT: buffer_gl1_inv
+; GFX11-GISEL-NEXT: buffer_gl0_inv
+; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-GISEL-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-GISEL-NEXT: s_cbranch_execnz .LBB9_1
+; GFX11-GISEL-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-LABEL: global_atomic_usub_sat_offset_nortn_16:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT: s_wait_expcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
+; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: global_load_b32 v4, v[0:1], off offset:2048
+; GFX12-GISEL-NEXT: s_mov_b32 s0, 0
+; GFX12-GISEL-NEXT: .LBB9_1: ; %atomicrmw.start
+; GFX12-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT: v_sub_nc_u16 v3, v4, v2 clamp
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-GISEL-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX12-GISEL-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
+; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
+; GFX12-GISEL-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
+; GFX12-GISEL-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
+; GFX12-GISEL-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-GISEL-NEXT: s_cbranch_execnz .LBB9_1
+; GFX12-GISEL-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: global_atomic_usub_sat_offset_nortn_16:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_add_co_u32 v0, vcc_lo, 0x800, v0
+; GFX10-SDAG-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
+; GFX10-SDAG-NEXT: s_mov_b32 s4, 0
+; GFX10-SDAG-NEXT: global_load_dword v4, v[0:1], off
+; GFX10-SDAG-NEXT: .LBB9_1: ; %atomicrmw.start
+; GFX10-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX10-SDAG-NEXT: v_sub_nc_u16 v3, v4, v2 clamp
+; GFX10-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX10-SDAG-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
+; GFX10-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-SDAG-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
+; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX10-SDAG-NEXT: buffer_gl1_inv
+; GFX10-SDAG-NEXT: buffer_gl0_inv
+; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-SDAG-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX10-SDAG-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
+; GFX10-SDAG-NEXT: s_cbranch_execnz .LBB9_1
+; GFX10-SDAG-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX10-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: global_atomic_usub_sat_offset_nortn_16:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: global_load_b32 v4, v[0:1], off offset:2048
+; GFX11-SDAG-NEXT: s_mov_b32 s0, 0
+; GFX11-SDAG-NEXT: .LBB9_1: ; %atomicrmw.start
+; GFX11-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT: v_sub_nc_u16 v3.l, v4.l, v2.l clamp
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-SDAG-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
+; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2048 glc
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT: buffer_gl1_inv
+; GFX11-SDAG-NEXT: buffer_gl0_inv
+; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-SDAG-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-SDAG-NEXT: s_cbranch_execnz .LBB9_1
+; GFX11-SDAG-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-LABEL: global_atomic_usub_sat_offset_nortn_16:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: global_load_b32 v4, v[0:1], off offset:2048
+; GFX12-SDAG-NEXT: s_mov_b32 s0, 0
+; GFX12-SDAG-NEXT: .LBB9_1: ; %atomicrmw.start
+; GFX12-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT: v_sub_nc_u16 v3, v4, v2 clamp
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX12-SDAG-NEXT: v_and_or_b32 v3, 0xffff0000, v4, v3
+; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
+; GFX12-SDAG-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
+; GFX12-SDAG-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
+; GFX12-SDAG-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-SDAG-NEXT: s_cbranch_execnz .LBB9_1
+; GFX12-SDAG-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
+ %gep = getelementptr i16, ptr addrspace(1) %ptr, i64 1024
+ %ret = atomicrmw usub_sat ptr addrspace(1) %gep, i16 %data syncscope("agent") seq_cst, align 4
+ ret void
+}
+
+define amdgpu_kernel void @global_atomic_usub_sat_sgpr_base_offset_16(ptr addrspace(1) %ptr, i16 %data) {
+; GFX10-GISEL-LABEL: global_atomic_usub_sat_sgpr_base_offset_16:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_clause 0x1
+; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX10-GISEL-NEXT: s_load_dword s2, s[8:9], 0x8
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, 0x800
+; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-GISEL-NEXT: s_load_dword s3, s[0:1], 0x800
+; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; GFX10-GISEL-NEXT: s_mov_b32 s3, 0
+; GFX10-GISEL-NEXT: .LBB10_1: ; %atomicrmw.start
+; GFX10-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, v1
+; GFX10-GISEL-NEXT: v_sub_nc_u16 v1, v2, s2 clamp
+; GFX10-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX10-GISEL-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1
+; GFX10-GISEL-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc
+; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX10-GISEL-NEXT: buffer_gl1_inv
+; GFX10-GISEL-NEXT: buffer_gl0_inv
+; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-GISEL-NEXT: s_or_b32 s3, vcc_lo, s3
+; GFX10-GISEL-NEXT: s_andn2_b32 exec_lo, exec_lo, s3
+; GFX10-GISEL-NEXT: s_cbranch_execnz .LBB10_1
+; GFX10-GISEL-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX10-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s3
+; GFX10-GISEL-NEXT: global_store_short v[0:1], v1, off
+; GFX10-GISEL-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: global_atomic_usub_sat_sgpr_base_offset_16:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_clause 0x1
+; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-GISEL-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: s_load_b32 s3, s[0:1], 0x800
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s3
+; GFX11-GISEL-NEXT: s_mov_b32 s3, 0
+; GFX11-GISEL-NEXT: .LBB10_1: ; %atomicrmw.start
+; GFX11-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, v1
+; GFX11-GISEL-NEXT: v_sub_nc_u16 v1.l, v2.l, s2 clamp
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-GISEL-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1
+; GFX11-GISEL-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:2048 glc
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT: buffer_gl1_inv
+; GFX11-GISEL-NEXT: buffer_gl0_inv
+; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-GISEL-NEXT: s_or_b32 s3, vcc_lo, s3
+; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT: s_and_not1_b32 exec_lo, exec_lo, s3
+; GFX11-GISEL-NEXT: s_cbranch_execnz .LBB10_1
+; GFX11-GISEL-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s3
+; GFX11-GISEL-NEXT: global_store_b16 v[0:1], v1, off
+; GFX11-GISEL-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: global_atomic_usub_sat_sgpr_base_offset_16:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: s_load_b32 s3, s[0:1], 0x800
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s3
+; GFX12-GISEL-NEXT: s_mov_b32 s3, 0
+; GFX12-GISEL-NEXT: .LBB10_1: ; %atomicrmw.start
+; GFX12-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, v1
+; GFX12-GISEL-NEXT: v_sub_nc_u16 v1, v2, s2 clamp
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX12-GISEL-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1
+; GFX12-GISEL-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
+; GFX12-GISEL-NEXT: s_or_b32 s3, vcc_lo, s3
+; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
+; GFX12-GISEL-NEXT: s_and_not1_b32 exec_lo, exec_lo, s3
+; GFX12-GISEL-NEXT: s_cbranch_execnz .LBB10_1
+; GFX12-GISEL-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s3
+; GFX12-GISEL-NEXT: global_store_b16 v[0:1], v1, off
+; GFX12-GISEL-NEXT: s_endpgm
+;
+; GFX10-SDAG-LABEL: global_atomic_usub_sat_sgpr_base_offset_16:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_clause 0x1
+; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX10-SDAG-NEXT: s_load_dword s2, s[8:9], 0x8
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT: s_load_dword s3, s[0:1], 0x800
+; GFX10-SDAG-NEXT: s_add_u32 s0, s0, 0x800
+; GFX10-SDAG-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; GFX10-SDAG-NEXT: s_mov_b32 s3, 0
+; GFX10-SDAG-NEXT: .LBB10_1: ; %atomicrmw.start
+; GFX10-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, v1
+; GFX10-SDAG-NEXT: v_sub_nc_u16 v1, v2, s2 clamp
+; GFX10-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX10-SDAG-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1
+; GFX10-SDAG-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[0:1] glc
+; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX10-SDAG-NEXT: buffer_gl1_inv
+; GFX10-SDAG-NEXT: buffer_gl0_inv
+; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-SDAG-NEXT: s_or_b32 s3, vcc_lo, s3
+; GFX10-SDAG-NEXT: s_andn2_b32 exec_lo, exec_lo, s3
+; GFX10-SDAG-NEXT: s_cbranch_execnz .LBB10_1
+; GFX10-SDAG-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX10-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s3
+; GFX10-SDAG-NEXT: global_store_short v[0:1], v1, off
+; GFX10-SDAG-NEXT: s_endpgm
+;
+; GFX11-SDAG-LABEL: global_atomic_usub_sat_sgpr_base_offset_16:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_clause 0x1
+; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-SDAG-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: s_load_b32 s3, s[0:1], 0x800
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-SDAG-NEXT: s_mov_b32 s3, 0
+; GFX11-SDAG-NEXT: .LBB10_1: ; %atomicrmw.start
+; GFX11-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, v1
+; GFX11-SDAG-NEXT: v_sub_nc_u16 v1.l, v2.l, s2 clamp
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-SDAG-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1
+; GFX11-SDAG-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:2048 glc
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT: buffer_gl1_inv
+; GFX11-SDAG-NEXT: buffer_gl0_inv
+; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-SDAG-NEXT: s_or_b32 s3, vcc_lo, s3
+; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT: s_and_not1_b32 exec_lo, exec_lo, s3
+; GFX11-SDAG-NEXT: s_cbranch_execnz .LBB10_1
+; GFX11-SDAG-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s3
+; GFX11-SDAG-NEXT: global_store_b16 v[0:1], v1, off
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX12-SDAG-LABEL: global_atomic_usub_sat_sgpr_base_offset_16:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x0
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: s_load_b32 s3, s[0:1], 0x800
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-SDAG-NEXT: s_mov_b32 s3, 0
+; GFX12-SDAG-NEXT: .LBB10_1: ; %atomicrmw.start
+; GFX12-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, v1
+; GFX12-SDAG-NEXT: v_sub_nc_u16 v1, v2, s2 clamp
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX12-SDAG-NEXT: v_and_or_b32 v1, 0xffff0000, v2, v1
+; GFX12-SDAG-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
+; GFX12-SDAG-NEXT: s_or_b32 s3, vcc_lo, s3
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
+; GFX12-SDAG-NEXT: s_and_not1_b32 exec_lo, exec_lo, s3
+; GFX12-SDAG-NEXT: s_cbranch_execnz .LBB10_1
+; GFX12-SDAG-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s3
+; GFX12-SDAG-NEXT: global_store_b16 v[0:1], v1, off
+; GFX12-SDAG-NEXT: s_endpgm
+ %gep = getelementptr i16, ptr addrspace(1) %ptr, i64 1024
+ %ret = atomicrmw usub_sat ptr addrspace(1) %gep, i16 %data syncscope("agent") seq_cst, align 4
+ store i16 %ret, ptr addrspace(1) undef
+ ret void
+}
+
+define amdgpu_kernel void @global_atomic_usub_sat_sgpr_base_offset_nortn_16(ptr addrspace(1) %ptr, i16 %data) {
+; GFX10-GISEL-LABEL: global_atomic_usub_sat_sgpr_base_offset_nortn_16:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_clause 0x1
+; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX10-GISEL-NEXT: s_load_dword s2, s[8:9], 0x8
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0x800
+; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-GISEL-NEXT: s_load_dword s3, s[0:1], 0x800
+; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; GFX10-GISEL-NEXT: s_mov_b32 s3, 0
+; GFX10-GISEL-NEXT: .LBB11_1: ; %atomicrmw.start
+; GFX10-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX10-GISEL-NEXT: v_sub_nc_u16 v0, v1, s2 clamp
+; GFX10-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX10-GISEL-NEXT: v_and_or_b32 v0, 0xffff0000, v1, v0
+; GFX10-GISEL-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX10-GISEL-NEXT: buffer_gl1_inv
+; GFX10-GISEL-NEXT: buffer_gl0_inv
+; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, v0
+; GFX10-GISEL-NEXT: s_or_b32 s3, vcc_lo, s3
+; GFX10-GISEL-NEXT: s_andn2_b32 exec_lo, exec_lo, s3
+; GFX10-GISEL-NEXT: s_cbranch_execnz .LBB11_1
+; GFX10-GISEL-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX10-GISEL-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: global_atomic_usub_sat_sgpr_base_offset_nortn_16:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_clause 0x1
+; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-GISEL-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: s_load_b32 s3, s[0:1], 0x800
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX11-GISEL-NEXT: s_mov_b32 s3, 0
+; GFX11-GISEL-NEXT: .LBB11_1: ; %atomicrmw.start
+; GFX11-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_sub_nc_u16 v0.l, v1.l, s2 clamp
+; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_and_or_b32 v0, 0xffff0000, v1, v0
+; GFX11-GISEL-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:2048 glc
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT: buffer_gl1_inv
+; GFX11-GISEL-NEXT: buffer_gl0_inv
+; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, v0
+; GFX11-GISEL-NEXT: s_or_b32 s3, vcc_lo, s3
+; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT: s_and_not1_b32 exec_lo, exec_lo, s3
+; GFX11-GISEL-NEXT: s_cbranch_execnz .LBB11_1
+; GFX11-GISEL-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-GISEL-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: global_atomic_usub_sat_sgpr_base_offset_nortn_16:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: s_load_b32 s3, s[0:1], 0x800
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3
+; GFX12-GISEL-NEXT: s_mov_b32 s3, 0
+; GFX12-GISEL-NEXT: .LBB11_1: ; %atomicrmw.start
+; GFX12-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-GISEL-NEXT: v_sub_nc_u16 v0, v1, s2 clamp
+; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-GISEL-NEXT: v_and_or_b32 v0, 0xffff0000, v1, v0
+; GFX12-GISEL-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, v0
+; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
+; GFX12-GISEL-NEXT: s_or_b32 s3, vcc_lo, s3
+; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
+; GFX12-GISEL-NEXT: s_and_not1_b32 exec_lo, exec_lo, s3
+; GFX12-GISEL-NEXT: s_cbranch_execnz .LBB11_1
+; GFX12-GISEL-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-GISEL-NEXT: s_endpgm
+;
+; GFX10-SDAG-LABEL: global_atomic_usub_sat_sgpr_base_offset_nortn_16:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_clause 0x1
+; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX10-SDAG-NEXT: s_load_dword s2, s[8:9], 0x8
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT: s_load_dword s3, s[0:1], 0x800
+; GFX10-SDAG-NEXT: s_add_u32 s0, s0, 0x800
+; GFX10-SDAG-NEXT: s_addc_u32 s1, s1, 0
+; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; GFX10-SDAG-NEXT: s_mov_b32 s3, 0
+; GFX10-SDAG-NEXT: .LBB11_1: ; %atomicrmw.start
+; GFX10-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX10-SDAG-NEXT: v_sub_nc_u16 v0, v1, s2 clamp
+; GFX10-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX10-SDAG-NEXT: v_and_or_b32 v0, 0xffff0000, v1, v0
+; GFX10-SDAG-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] glc
+; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX10-SDAG-NEXT: buffer_gl1_inv
+; GFX10-SDAG-NEXT: buffer_gl0_inv
+; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, v0
+; GFX10-SDAG-NEXT: s_or_b32 s3, vcc_lo, s3
+; GFX10-SDAG-NEXT: s_andn2_b32 exec_lo, exec_lo, s3
+; GFX10-SDAG-NEXT: s_cbranch_execnz .LBB11_1
+; GFX10-SDAG-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX10-SDAG-NEXT: s_endpgm
+;
+; GFX11-SDAG-LABEL: global_atomic_usub_sat_sgpr_base_offset_nortn_16:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_clause 0x1
+; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-SDAG-NEXT: s_load_b32 s2, s[4:5], 0x8
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: s_load_b32 s3, s[0:1], 0x800
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-SDAG-NEXT: s_mov_b32 s3, 0
+; GFX11-SDAG-NEXT: .LBB11_1: ; %atomicrmw.start
+; GFX11-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_sub_nc_u16 v0.l, v1.l, s2 clamp
+; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_and_or_b32 v0, 0xffff0000, v1, v0
+; GFX11-SDAG-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:2048 glc
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT: buffer_gl1_inv
+; GFX11-SDAG-NEXT: buffer_gl0_inv
+; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, v0
+; GFX11-SDAG-NEXT: s_or_b32 s3, vcc_lo, s3
+; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT: s_and_not1_b32 exec_lo, exec_lo, s3
+; GFX11-SDAG-NEXT: s_cbranch_execnz .LBB11_1
+; GFX11-SDAG-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX12-SDAG-LABEL: global_atomic_usub_sat_sgpr_base_offset_nortn_16:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x0
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: s_load_b32 s3, s[0:1], 0x800
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-SDAG-NEXT: s_mov_b32 s3, 0
+; GFX12-SDAG-NEXT: .LBB11_1: ; %atomicrmw.start
+; GFX12-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-NEXT: v_sub_nc_u16 v0, v1, s2 clamp
+; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-NEXT: v_and_or_b32 v0, 0xffff0000, v1, v0
+; GFX12-SDAG-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, v0
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
+; GFX12-SDAG-NEXT: s_or_b32 s3, vcc_lo, s3
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
+; GFX12-SDAG-NEXT: s_and_not1_b32 exec_lo, exec_lo, s3
+; GFX12-SDAG-NEXT: s_cbranch_execnz .LBB11_1
+; GFX12-SDAG-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-SDAG-NEXT: s_endpgm
+ %gep = getelementptr i16, ptr addrspace(1) %ptr, i64 1024
+ %ret = atomicrmw usub_sat ptr addrspace(1) %gep, i16 %data syncscope("agent") seq_cst, align 4
+ ret void
+}
+
+
+define i8 @global_atomic_usub_sat_8(ptr addrspace(1) %ptr, i8 %data) {
+; GFX10-GISEL-LABEL: global_atomic_usub_sat_8:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT: global_load_dword v3, v[0:1], off
+; GFX10-GISEL-NEXT: v_lshlrev_b16 v2, 8, v2
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v4, 0xff
+; GFX10-GISEL-NEXT: s_mov_b32 s4, 0
+; GFX10-GISEL-NEXT: .LBB12_1: ; %atomicrmw.start
+; GFX10-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v6, v3
+; GFX10-GISEL-NEXT: v_lshlrev_b16 v3, 8, v6
+; GFX10-GISEL-NEXT: v_sub_nc_u16 v3, v3, v2 clamp
+; GFX10-GISEL-NEXT: v_and_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX10-GISEL-NEXT: v_and_or_b32 v5, 0xffffff00, v6, v3
+; GFX10-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-GISEL-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off glc
+; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX10-GISEL-NEXT: buffer_gl1_inv
+; GFX10-GISEL-NEXT: buffer_gl0_inv
+; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX10-GISEL-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX10-GISEL-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
+; GFX10-GISEL-NEXT: s_cbranch_execnz .LBB12_1
+; GFX10-GISEL-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX10-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, v3
+; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: global_atomic_usub_sat_8:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT: global_load_b32 v3, v[0:1], off
+; GFX11-GISEL-NEXT: v_lshlrev_b16 v2.l, 8, v2.l
+; GFX11-GISEL-NEXT: s_mov_b32 s0, 0
+; GFX11-GISEL-NEXT: .p2align 6
+; GFX11-GISEL-NEXT: .LBB12_1: ; %atomicrmw.start
+; GFX11-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_lshlrev_b16 v2.h, 8, v4.l
+; GFX11-GISEL-NEXT: v_sub_nc_u16 v2.h, v2.h, v2.l clamp
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_lshrrev_b16 v3.l, 8, v2.h
+; GFX11-GISEL-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_and_or_b32 v3, 0xffffff00, v4, v3
+; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-GISEL-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT: buffer_gl1_inv
+; GFX11-GISEL-NEXT: buffer_gl0_inv
+; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-GISEL-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-GISEL-NEXT: s_cbranch_execnz .LBB12_1
+; GFX11-GISEL-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, v3
+; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-LABEL: global_atomic_usub_sat_8:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT: s_wait_expcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
+; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: global_load_b32 v3, v[0:1], off
+; GFX12-GISEL-NEXT: v_lshlrev_b16 v2, 8, v2
+; GFX12-GISEL-NEXT: s_mov_b32 s0, 0
+; GFX12-GISEL-NEXT: .LBB12_1: ; %atomicrmw.start
+; GFX12-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-GISEL-NEXT: v_lshlrev_b16 v3, 8, v4
+; GFX12-GISEL-NEXT: v_sub_nc_u16 v3, v3, v2 clamp
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-GISEL-NEXT: v_lshrrev_b16 v3, 8, v3
+; GFX12-GISEL-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-GISEL-NEXT: v_and_or_b32 v3, 0xffffff00, v4, v3
+; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
+; GFX12-GISEL-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
+; GFX12-GISEL-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
+; GFX12-GISEL-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-GISEL-NEXT: s_cbranch_execnz .LBB12_1
+; GFX12-GISEL-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: global_atomic_usub_sat_8:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT: global_load_dword v3, v[0:1], off
+; GFX10-SDAG-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX10-SDAG-NEXT: s_mov_b32 s4, 0
+; GFX10-SDAG-NEXT: .LBB12_1: ; %atomicrmw.start
+; GFX10-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-SDAG-NEXT: v_and_b32_e32 v3, 0xff, v4
+; GFX10-SDAG-NEXT: v_sub_nc_u16 v3, v3, v2 clamp
+; GFX10-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX10-SDAG-NEXT: v_and_or_b32 v3, 0xffffff00, v4, v3
+; GFX10-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-SDAG-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off glc
+; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX10-SDAG-NEXT: buffer_gl1_inv
+; GFX10-SDAG-NEXT: buffer_gl0_inv
+; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX10-SDAG-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX10-SDAG-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
+; GFX10-SDAG-NEXT: s_cbranch_execnz .LBB12_1
+; GFX10-SDAG-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX10-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, v3
+; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: global_atomic_usub_sat_8:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: global_load_b32 v3, v[0:1], off
+; GFX11-SDAG-NEXT: v_and_b16 v2.l, 0xff, v2.l
+; GFX11-SDAG-NEXT: s_mov_b32 s0, 0
+; GFX11-SDAG-NEXT: .p2align 6
+; GFX11-SDAG-NEXT: .LBB12_1: ; %atomicrmw.start
+; GFX11-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_and_b16 v2.h, 0xff, v4.l
+; GFX11-SDAG-NEXT: v_sub_nc_u16 v3.l, v2.h, v2.l clamp
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-SDAG-NEXT: v_and_or_b32 v3, 0xffffff00, v4, v3
+; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT: buffer_gl1_inv
+; GFX11-SDAG-NEXT: buffer_gl0_inv
+; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-SDAG-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-SDAG-NEXT: s_cbranch_execnz .LBB12_1
+; GFX11-SDAG-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-SDAG-NEXT: v_mov_b16_e32 v0.l, v3.l
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-LABEL: global_atomic_usub_sat_8:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: global_load_b32 v3, v[0:1], off
+; GFX12-SDAG-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX12-SDAG-NEXT: s_mov_b32 s0, 0
+; GFX12-SDAG-NEXT: .LBB12_1: ; %atomicrmw.start
+; GFX12-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-NEXT: v_and_b32_e32 v3, 0xff, v4
+; GFX12-SDAG-NEXT: v_sub_nc_u16 v3, v3, v2 clamp
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX12-SDAG-NEXT: v_and_or_b32 v3, 0xffffff00, v4, v3
+; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
+; GFX12-SDAG-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
+; GFX12-SDAG-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
+; GFX12-SDAG-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-SDAG-NEXT: s_cbranch_execnz .LBB12_1
+; GFX12-SDAG-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
+ %ret = atomicrmw usub_sat ptr addrspace(1) %ptr, i8 %data syncscope("agent") seq_cst, align 4
+ ret i8 %ret
+}
+
+define i8 @global_atomic_usub_sat_offset_8(ptr addrspace(1) %ptr, i8 %data) {
+; GFX10-GISEL-LABEL: global_atomic_usub_sat_offset_8:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT: global_load_dword v3, v[0:1], off offset:1024
+; GFX10-GISEL-NEXT: v_lshlrev_b16 v2, 8, v2
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v4, 0xff
+; GFX10-GISEL-NEXT: s_mov_b32 s4, 0
+; GFX10-GISEL-NEXT: .LBB13_1: ; %atomicrmw.start
+; GFX10-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v6, v3
+; GFX10-GISEL-NEXT: v_lshlrev_b16 v3, 8, v6
+; GFX10-GISEL-NEXT: v_sub_nc_u16 v3, v3, v2 clamp
+; GFX10-GISEL-NEXT: v_and_b32_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX10-GISEL-NEXT: v_and_or_b32 v5, 0xffffff00, v6, v3
+; GFX10-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-GISEL-NEXT: global_atomic_cmpswap v3, v[0:1], v[5:6], off offset:1024 glc
+; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX10-GISEL-NEXT: buffer_gl1_inv
+; GFX10-GISEL-NEXT: buffer_gl0_inv
+; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v6
+; GFX10-GISEL-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX10-GISEL-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
+; GFX10-GISEL-NEXT: s_cbranch_execnz .LBB13_1
+; GFX10-GISEL-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX10-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, v3
+; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: global_atomic_usub_sat_offset_8:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT: global_load_b32 v3, v[0:1], off offset:1024
+; GFX11-GISEL-NEXT: v_lshlrev_b16 v2.l, 8, v2.l
+; GFX11-GISEL-NEXT: s_mov_b32 s0, 0
+; GFX11-GISEL-NEXT: .p2align 6
+; GFX11-GISEL-NEXT: .LBB13_1: ; %atomicrmw.start
+; GFX11-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_lshlrev_b16 v2.h, 8, v4.l
+; GFX11-GISEL-NEXT: v_sub_nc_u16 v2.h, v2.h, v2.l clamp
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_lshrrev_b16 v3.l, 8, v2.h
+; GFX11-GISEL-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_and_or_b32 v3, 0xffffff00, v4, v3
+; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-GISEL-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:1024 glc
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT: buffer_gl1_inv
+; GFX11-GISEL-NEXT: buffer_gl0_inv
+; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-GISEL-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-GISEL-NEXT: s_cbranch_execnz .LBB13_1
+; GFX11-GISEL-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, v3
+; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-LABEL: global_atomic_usub_sat_offset_8:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT: s_wait_expcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
+; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: global_load_b32 v3, v[0:1], off offset:1024
+; GFX12-GISEL-NEXT: v_lshlrev_b16 v2, 8, v2
+; GFX12-GISEL-NEXT: s_mov_b32 s0, 0
+; GFX12-GISEL-NEXT: .LBB13_1: ; %atomicrmw.start
+; GFX12-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-GISEL-NEXT: v_lshlrev_b16 v3, 8, v4
+; GFX12-GISEL-NEXT: v_sub_nc_u16 v3, v3, v2 clamp
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-GISEL-NEXT: v_lshrrev_b16 v3, 8, v3
+; GFX12-GISEL-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-GISEL-NEXT: v_and_or_b32 v3, 0xffffff00, v4, v3
+; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
+; GFX12-GISEL-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:1024 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
+; GFX12-GISEL-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
+; GFX12-GISEL-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-GISEL-NEXT: s_cbranch_execnz .LBB13_1
+; GFX12-GISEL-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: global_atomic_usub_sat_offset_8:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT: global_load_dword v3, v[0:1], off offset:1024
+; GFX10-SDAG-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX10-SDAG-NEXT: s_mov_b32 s4, 0
+; GFX10-SDAG-NEXT: .LBB13_1: ; %atomicrmw.start
+; GFX10-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v4, v3
+; GFX10-SDAG-NEXT: v_and_b32_e32 v3, 0xff, v4
+; GFX10-SDAG-NEXT: v_sub_nc_u16 v3, v3, v2 clamp
+; GFX10-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX10-SDAG-NEXT: v_and_or_b32 v3, 0xffffff00, v4, v3
+; GFX10-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-SDAG-NEXT: global_atomic_cmpswap v3, v[0:1], v[3:4], off offset:1024 glc
+; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX10-SDAG-NEXT: buffer_gl1_inv
+; GFX10-SDAG-NEXT: buffer_gl0_inv
+; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX10-SDAG-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX10-SDAG-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
+; GFX10-SDAG-NEXT: s_cbranch_execnz .LBB13_1
+; GFX10-SDAG-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX10-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, v3
+; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: global_atomic_usub_sat_offset_8:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: global_load_b32 v3, v[0:1], off offset:1024
+; GFX11-SDAG-NEXT: v_and_b16 v2.l, 0xff, v2.l
+; GFX11-SDAG-NEXT: s_mov_b32 s0, 0
+; GFX11-SDAG-NEXT: .p2align 6
+; GFX11-SDAG-NEXT: .LBB13_1: ; %atomicrmw.start
+; GFX11-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_and_b16 v2.h, 0xff, v4.l
+; GFX11-SDAG-NEXT: v_sub_nc_u16 v3.l, v2.h, v2.l clamp
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-SDAG-NEXT: v_and_or_b32 v3, 0xffffff00, v4, v3
+; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:1024 glc
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT: buffer_gl1_inv
+; GFX11-SDAG-NEXT: buffer_gl0_inv
+; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-SDAG-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-SDAG-NEXT: s_cbranch_execnz .LBB13_1
+; GFX11-SDAG-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-SDAG-NEXT: v_mov_b16_e32 v0.l, v3.l
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-LABEL: global_atomic_usub_sat_offset_8:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: global_load_b32 v3, v[0:1], off offset:1024
+; GFX12-SDAG-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX12-SDAG-NEXT: s_mov_b32 s0, 0
+; GFX12-SDAG-NEXT: .LBB13_1: ; %atomicrmw.start
+; GFX12-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v4, v3
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-NEXT: v_and_b32_e32 v3, 0xff, v4
+; GFX12-SDAG-NEXT: v_sub_nc_u16 v3, v3, v2 clamp
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX12-SDAG-NEXT: v_and_or_b32 v3, 0xffffff00, v4, v3
+; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
+; GFX12-SDAG-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:1024 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
+; GFX12-SDAG-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
+; GFX12-SDAG-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-SDAG-NEXT: s_cbranch_execnz .LBB13_1
+; GFX12-SDAG-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, v3
+; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
+ %gep = getelementptr i8, ptr addrspace(1) %ptr, i64 1024
+ %ret = atomicrmw usub_sat ptr addrspace(1) %gep, i8 %data syncscope("agent") seq_cst, align 4
+ ret i8 %ret
+}
+
+define void @global_atomic_usub_sat_nortn_8(ptr addrspace(1) %ptr, i8 %data) {
+; GFX10-GISEL-LABEL: global_atomic_usub_sat_nortn_8:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT: global_load_dword v3, v[0:1], off
+; GFX10-GISEL-NEXT: v_lshlrev_b16 v4, 8, v2
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v5, 0xff
+; GFX10-GISEL-NEXT: s_mov_b32 s4, 0
+; GFX10-GISEL-NEXT: .LBB14_1: ; %atomicrmw.start
+; GFX10-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX10-GISEL-NEXT: v_lshlrev_b16 v2, 8, v3
+; GFX10-GISEL-NEXT: v_sub_nc_u16 v2, v2, v4 clamp
+; GFX10-GISEL-NEXT: v_and_b32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX10-GISEL-NEXT: v_and_or_b32 v2, 0xffffff00, v3, v2
+; GFX10-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-GISEL-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
+; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX10-GISEL-NEXT: buffer_gl1_inv
+; GFX10-GISEL-NEXT: buffer_gl0_inv
+; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, v2
+; GFX10-GISEL-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX10-GISEL-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
+; GFX10-GISEL-NEXT: s_cbranch_execnz .LBB14_1
+; GFX10-GISEL-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX10-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: global_atomic_usub_sat_nortn_8:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT: global_load_b32 v4, v[0:1], off
+; GFX11-GISEL-NEXT: v_lshlrev_b16 v2.l, 8, v2.l
+; GFX11-GISEL-NEXT: s_mov_b32 s0, 0
+; GFX11-GISEL-NEXT: .p2align 6
+; GFX11-GISEL-NEXT: .LBB14_1: ; %atomicrmw.start
+; GFX11-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT: v_lshlrev_b16 v2.h, 8, v4.l
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_sub_nc_u16 v2.h, v2.h, v2.l clamp
+; GFX11-GISEL-NEXT: v_lshrrev_b16 v3.l, 8, v2.h
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GFX11-GISEL-NEXT: v_and_or_b32 v3, 0xffffff00, v4, v3
+; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-GISEL-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT: buffer_gl1_inv
+; GFX11-GISEL-NEXT: buffer_gl0_inv
+; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-GISEL-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-GISEL-NEXT: s_cbranch_execnz .LBB14_1
+; GFX11-GISEL-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-LABEL: global_atomic_usub_sat_nortn_8:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT: s_wait_expcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
+; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: global_load_b32 v3, v[0:1], off
+; GFX12-GISEL-NEXT: v_lshlrev_b16 v4, 8, v2
+; GFX12-GISEL-NEXT: s_mov_b32 s0, 0
+; GFX12-GISEL-NEXT: .LBB14_1: ; %atomicrmw.start
+; GFX12-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT: v_lshlrev_b16 v2, 8, v3
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-GISEL-NEXT: v_sub_nc_u16 v2, v2, v4 clamp
+; GFX12-GISEL-NEXT: v_lshrrev_b16 v2, 8, v2
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-GISEL-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX12-GISEL-NEXT: v_and_or_b32 v2, 0xffffff00, v3, v2
+; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
+; GFX12-GISEL-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
+; GFX12-GISEL-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
+; GFX12-GISEL-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-GISEL-NEXT: s_cbranch_execnz .LBB14_1
+; GFX12-GISEL-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: global_atomic_usub_sat_nortn_8:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT: global_load_dword v3, v[0:1], off
+; GFX10-SDAG-NEXT: v_and_b32_e32 v4, 0xff, v2
+; GFX10-SDAG-NEXT: s_mov_b32 s4, 0
+; GFX10-SDAG-NEXT: .LBB14_1: ; %atomicrmw.start
+; GFX10-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX10-SDAG-NEXT: v_and_b32_e32 v2, 0xff, v3
+; GFX10-SDAG-NEXT: v_sub_nc_u16 v2, v2, v4 clamp
+; GFX10-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX10-SDAG-NEXT: v_and_or_b32 v2, 0xffffff00, v3, v2
+; GFX10-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-SDAG-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off glc
+; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX10-SDAG-NEXT: buffer_gl1_inv
+; GFX10-SDAG-NEXT: buffer_gl0_inv
+; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v3, v2
+; GFX10-SDAG-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX10-SDAG-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
+; GFX10-SDAG-NEXT: s_cbranch_execnz .LBB14_1
+; GFX10-SDAG-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX10-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: global_atomic_usub_sat_nortn_8:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: global_load_b32 v4, v[0:1], off
+; GFX11-SDAG-NEXT: v_and_b16 v2.l, 0xff, v2.l
+; GFX11-SDAG-NEXT: s_mov_b32 s0, 0
+; GFX11-SDAG-NEXT: .p2align 6
+; GFX11-SDAG-NEXT: .LBB14_1: ; %atomicrmw.start
+; GFX11-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT: v_and_b16 v2.h, 0xff, v4.l
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_sub_nc_u16 v3.l, v2.h, v2.l clamp
+; GFX11-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_and_or_b32 v3, 0xffffff00, v4, v3
+; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off glc
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT: buffer_gl1_inv
+; GFX11-SDAG-NEXT: buffer_gl0_inv
+; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-SDAG-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-SDAG-NEXT: s_cbranch_execnz .LBB14_1
+; GFX11-SDAG-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-LABEL: global_atomic_usub_sat_nortn_8:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: global_load_b32 v3, v[0:1], off
+; GFX12-SDAG-NEXT: v_and_b32_e32 v4, 0xff, v2
+; GFX12-SDAG-NEXT: s_mov_b32 s0, 0
+; GFX12-SDAG-NEXT: .LBB14_1: ; %atomicrmw.start
+; GFX12-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT: v_and_b32_e32 v2, 0xff, v3
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-NEXT: v_sub_nc_u16 v2, v2, v4 clamp
+; GFX12-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-NEXT: v_and_or_b32 v2, 0xffffff00, v3, v2
+; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
+; GFX12-SDAG-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
+; GFX12-SDAG-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
+; GFX12-SDAG-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-SDAG-NEXT: s_cbranch_execnz .LBB14_1
+; GFX12-SDAG-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
+ %ret = atomicrmw usub_sat ptr addrspace(1) %ptr, i8 %data syncscope("agent") seq_cst, align 4
+ ret void
+}
+
+define void @global_atomic_usub_sat_offset_nortn_8(ptr addrspace(1) %ptr, i8 %data) {
+; GFX10-GISEL-LABEL: global_atomic_usub_sat_offset_nortn_8:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-GISEL-NEXT: global_load_dword v3, v[0:1], off offset:1024
+; GFX10-GISEL-NEXT: v_lshlrev_b16 v4, 8, v2
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v5, 0xff
+; GFX10-GISEL-NEXT: s_mov_b32 s4, 0
+; GFX10-GISEL-NEXT: .LBB15_1: ; %atomicrmw.start
+; GFX10-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX10-GISEL-NEXT: v_lshlrev_b16 v2, 8, v3
+; GFX10-GISEL-NEXT: v_sub_nc_u16 v2, v2, v4 clamp
+; GFX10-GISEL-NEXT: v_and_b32_sdwa v2, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX10-GISEL-NEXT: v_and_or_b32 v2, 0xffffff00, v3, v2
+; GFX10-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-GISEL-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:1024 glc
+; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX10-GISEL-NEXT: buffer_gl1_inv
+; GFX10-GISEL-NEXT: buffer_gl0_inv
+; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, v2
+; GFX10-GISEL-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX10-GISEL-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
+; GFX10-GISEL-NEXT: s_cbranch_execnz .LBB15_1
+; GFX10-GISEL-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX10-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-GISEL-LABEL: global_atomic_usub_sat_offset_nortn_8:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-GISEL-NEXT: global_load_b32 v4, v[0:1], off offset:1024
+; GFX11-GISEL-NEXT: v_lshlrev_b16 v2.l, 8, v2.l
+; GFX11-GISEL-NEXT: s_mov_b32 s0, 0
+; GFX11-GISEL-NEXT: .p2align 6
+; GFX11-GISEL-NEXT: .LBB15_1: ; %atomicrmw.start
+; GFX11-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT: v_lshlrev_b16 v2.h, 8, v4.l
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_sub_nc_u16 v2.h, v2.h, v2.l clamp
+; GFX11-GISEL-NEXT: v_lshrrev_b16 v3.l, 8, v2.h
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_and_b32_e32 v3, 0xff, v3
+; GFX11-GISEL-NEXT: v_and_or_b32 v3, 0xffffff00, v4, v3
+; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-GISEL-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:1024 glc
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT: buffer_gl1_inv
+; GFX11-GISEL-NEXT: buffer_gl0_inv
+; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-GISEL-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-GISEL-NEXT: s_cbranch_execnz .LBB15_1
+; GFX11-GISEL-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-GISEL-LABEL: global_atomic_usub_sat_offset_nortn_8:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-GISEL-NEXT: s_wait_expcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0
+; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: global_load_b32 v3, v[0:1], off offset:1024
+; GFX12-GISEL-NEXT: v_lshlrev_b16 v4, 8, v2
+; GFX12-GISEL-NEXT: s_mov_b32 s0, 0
+; GFX12-GISEL-NEXT: .LBB15_1: ; %atomicrmw.start
+; GFX12-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT: v_lshlrev_b16 v2, 8, v3
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-GISEL-NEXT: v_sub_nc_u16 v2, v2, v4 clamp
+; GFX12-GISEL-NEXT: v_lshrrev_b16 v2, 8, v2
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-GISEL-NEXT: v_and_b32_e32 v2, 0xff, v2
+; GFX12-GISEL-NEXT: v_and_or_b32 v2, 0xffffff00, v3, v2
+; GFX12-GISEL-NEXT: s_wait_storecnt 0x0
+; GFX12-GISEL-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:1024 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
+; GFX12-GISEL-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
+; GFX12-GISEL-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-GISEL-NEXT: s_cbranch_execnz .LBB15_1
+; GFX12-GISEL-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX10-SDAG-LABEL: global_atomic_usub_sat_offset_nortn_8:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX10-SDAG-NEXT: global_load_dword v3, v[0:1], off offset:1024
+; GFX10-SDAG-NEXT: v_and_b32_e32 v4, 0xff, v2
+; GFX10-SDAG-NEXT: s_mov_b32 s4, 0
+; GFX10-SDAG-NEXT: .LBB15_1: ; %atomicrmw.start
+; GFX10-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX10-SDAG-NEXT: v_and_b32_e32 v2, 0xff, v3
+; GFX10-SDAG-NEXT: v_sub_nc_u16 v2, v2, v4 clamp
+; GFX10-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX10-SDAG-NEXT: v_and_or_b32 v2, 0xffffff00, v3, v2
+; GFX10-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX10-SDAG-NEXT: global_atomic_cmpswap v2, v[0:1], v[2:3], off offset:1024 glc
+; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX10-SDAG-NEXT: buffer_gl1_inv
+; GFX10-SDAG-NEXT: buffer_gl0_inv
+; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v3, v2
+; GFX10-SDAG-NEXT: s_or_b32 s4, vcc_lo, s4
+; GFX10-SDAG-NEXT: s_andn2_b32 exec_lo, exec_lo, s4
+; GFX10-SDAG-NEXT: s_cbranch_execnz .LBB15_1
+; GFX10-SDAG-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX10-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s4
+; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-SDAG-LABEL: global_atomic_usub_sat_offset_nortn_8:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-SDAG-NEXT: global_load_b32 v4, v[0:1], off offset:1024
+; GFX11-SDAG-NEXT: v_and_b16 v2.l, 0xff, v2.l
+; GFX11-SDAG-NEXT: s_mov_b32 s0, 0
+; GFX11-SDAG-NEXT: .p2align 6
+; GFX11-SDAG-NEXT: .LBB15_1: ; %atomicrmw.start
+; GFX11-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT: v_and_b16 v2.h, 0xff, v4.l
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_sub_nc_u16 v3.l, v2.h, v2.l clamp
+; GFX11-SDAG-NEXT: v_and_b32_e32 v3, 0xffff, v3
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_and_or_b32 v3, 0xffffff00, v4, v3
+; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-SDAG-NEXT: global_atomic_cmpswap_b32 v3, v[0:1], v[3:4], off offset:1024 glc
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT: buffer_gl1_inv
+; GFX11-SDAG-NEXT: buffer_gl0_inv
+; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v3, v4
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v4, v3
+; GFX11-SDAG-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX11-SDAG-NEXT: s_cbranch_execnz .LBB15_1
+; GFX11-SDAG-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX11-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-SDAG-LABEL: global_atomic_usub_sat_offset_nortn_8:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-SDAG-NEXT: s_wait_expcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0
+; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: global_load_b32 v3, v[0:1], off offset:1024
+; GFX12-SDAG-NEXT: v_and_b32_e32 v4, 0xff, v2
+; GFX12-SDAG-NEXT: s_mov_b32 s0, 0
+; GFX12-SDAG-NEXT: .LBB15_1: ; %atomicrmw.start
+; GFX12-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT: v_and_b32_e32 v2, 0xff, v3
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-NEXT: v_sub_nc_u16 v2, v2, v4 clamp
+; GFX12-SDAG-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-NEXT: v_and_or_b32 v2, 0xffffff00, v3, v2
+; GFX12-SDAG-NEXT: s_wait_storecnt 0x0
+; GFX12-SDAG-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], v[2:3], off offset:1024 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v3, v2
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
+; GFX12-SDAG-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
+; GFX12-SDAG-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX12-SDAG-NEXT: s_cbranch_execnz .LBB15_1
+; GFX12-SDAG-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s0
+; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31]
+ %gep = getelementptr i8, ptr addrspace(1) %ptr, i64 1024
+ %ret = atomicrmw usub_sat ptr addrspace(1) %gep, i8 %data syncscope("agent") seq_cst, align 4
+ ret void
+}
+
+define amdgpu_kernel void @global_atomic_usub_sat_sgpr_base_offset_8(ptr addrspace(1) %ptr, i8 %data) {
+; GFX10-GISEL-LABEL: global_atomic_usub_sat_sgpr_base_offset_8:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_clause 0x1
+; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX10-GISEL-NEXT: s_load_dword s3, s[8:9], 0x8
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, 0xff
+; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-GISEL-NEXT: s_load_dword s2, s[0:1], 0x400
+; GFX10-GISEL-NEXT: s_lshl_b32 s3, s3, 8
+; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX10-GISEL-NEXT: s_mov_b32 s2, 0
+; GFX10-GISEL-NEXT: .LBB16_1: ; %atomicrmw.start
+; GFX10-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, v2
+; GFX10-GISEL-NEXT: v_lshlrev_b16 v2, 8, v3
+; GFX10-GISEL-NEXT: v_sub_nc_u16 v2, v2, s3 clamp
+; GFX10-GISEL-NEXT: v_and_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX10-GISEL-NEXT: v_and_or_b32 v2, 0xffffff00, v3, v2
+; GFX10-GISEL-NEXT: global_atomic_cmpswap v2, v0, v[2:3], s[0:1] offset:1024 glc
+; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX10-GISEL-NEXT: buffer_gl1_inv
+; GFX10-GISEL-NEXT: buffer_gl0_inv
+; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX10-GISEL-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX10-GISEL-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX10-GISEL-NEXT: s_cbranch_execnz .LBB16_1
+; GFX10-GISEL-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX10-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s2
+; GFX10-GISEL-NEXT: global_store_byte v[0:1], v2, off
+; GFX10-GISEL-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: global_atomic_usub_sat_sgpr_base_offset_8:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_clause 0x1
+; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-GISEL-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x400
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-GISEL-NEXT: s_lshl_b32 s3, s3, 8
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-GISEL-NEXT: s_mov_b32 s2, 0
+; GFX11-GISEL-NEXT: .p2align 6
+; GFX11-GISEL-NEXT: .LBB16_1: ; %atomicrmw.start
+; GFX11-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, v1
+; GFX11-GISEL-NEXT: v_lshlrev_b16 v1.l, 8, v2.l
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_sub_nc_u16 v1.l, v1.l, s3 clamp
+; GFX11-GISEL-NEXT: v_lshrrev_b16 v1.l, 8, v1.l
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX11-GISEL-NEXT: v_and_or_b32 v1, 0xffffff00, v2, v1
+; GFX11-GISEL-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:1024 glc
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT: buffer_gl1_inv
+; GFX11-GISEL-NEXT: buffer_gl0_inv
+; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-GISEL-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX11-GISEL-NEXT: s_cbranch_execnz .LBB16_1
+; GFX11-GISEL-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s2
+; GFX11-GISEL-NEXT: global_store_b8 v[0:1], v1, off
+; GFX11-GISEL-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: global_atomic_usub_sat_sgpr_base_offset_8:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: s_load_b32 s3, s[0:1], 0x400
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-GISEL-NEXT: s_lshl_b32 s2, s2, 8
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-GISEL-NEXT: s_mov_b32 s3, 0
+; GFX12-GISEL-NEXT: .LBB16_1: ; %atomicrmw.start
+; GFX12-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, v1
+; GFX12-GISEL-NEXT: v_lshlrev_b16 v1, 8, v2
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-GISEL-NEXT: v_sub_nc_u16 v1, v1, s2 clamp
+; GFX12-GISEL-NEXT: v_lshrrev_b16 v1, 8, v1
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-GISEL-NEXT: v_and_b32_e32 v1, 0xff, v1
+; GFX12-GISEL-NEXT: v_and_or_b32 v1, 0xffffff00, v2, v1
+; GFX12-GISEL-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:1024 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
+; GFX12-GISEL-NEXT: s_or_b32 s3, vcc_lo, s3
+; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
+; GFX12-GISEL-NEXT: s_and_not1_b32 exec_lo, exec_lo, s3
+; GFX12-GISEL-NEXT: s_cbranch_execnz .LBB16_1
+; GFX12-GISEL-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s3
+; GFX12-GISEL-NEXT: global_store_b8 v[0:1], v1, off
+; GFX12-GISEL-NEXT: s_endpgm
+;
+; GFX10-SDAG-LABEL: global_atomic_usub_sat_sgpr_base_offset_8:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_clause 0x1
+; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX10-SDAG-NEXT: s_load_dword s3, s[8:9], 0x8
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT: s_load_dword s2, s[0:1], 0x400
+; GFX10-SDAG-NEXT: s_and_b32 s3, s3, 0xff
+; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s2
+; GFX10-SDAG-NEXT: s_mov_b32 s2, 0
+; GFX10-SDAG-NEXT: .LBB16_1: ; %atomicrmw.start
+; GFX10-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, v1
+; GFX10-SDAG-NEXT: v_and_b32_e32 v1, 0xff, v2
+; GFX10-SDAG-NEXT: v_sub_nc_u16 v1, v1, s3 clamp
+; GFX10-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX10-SDAG-NEXT: v_and_or_b32 v1, 0xffffff00, v2, v1
+; GFX10-SDAG-NEXT: global_atomic_cmpswap v1, v0, v[1:2], s[0:1] offset:1024 glc
+; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX10-SDAG-NEXT: buffer_gl1_inv
+; GFX10-SDAG-NEXT: buffer_gl0_inv
+; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX10-SDAG-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX10-SDAG-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX10-SDAG-NEXT: s_cbranch_execnz .LBB16_1
+; GFX10-SDAG-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX10-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s2
+; GFX10-SDAG-NEXT: global_store_byte v[0:1], v1, off
+; GFX10-SDAG-NEXT: s_endpgm
+;
+; GFX11-SDAG-LABEL: global_atomic_usub_sat_sgpr_base_offset_8:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_clause 0x1
+; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-SDAG-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x400
+; GFX11-SDAG-NEXT: s_and_b32 s3, s3, 0xff
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-SDAG-NEXT: s_mov_b32 s2, 0
+; GFX11-SDAG-NEXT: .p2align 6
+; GFX11-SDAG-NEXT: .LBB16_1: ; %atomicrmw.start
+; GFX11-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, v1
+; GFX11-SDAG-NEXT: v_and_b16 v1.l, 0xff, v2.l
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_sub_nc_u16 v1.l, v1.l, s3 clamp
+; GFX11-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_and_or_b32 v1, 0xffffff00, v2, v1
+; GFX11-SDAG-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:1024 glc
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT: buffer_gl1_inv
+; GFX11-SDAG-NEXT: buffer_gl0_inv
+; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX11-SDAG-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX11-SDAG-NEXT: s_cbranch_execnz .LBB16_1
+; GFX11-SDAG-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s2
+; GFX11-SDAG-NEXT: global_store_b8 v[0:1], v1, off
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX12-SDAG-LABEL: global_atomic_usub_sat_sgpr_base_offset_8:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x0
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: s_load_b32 s3, s[0:1], 0x400
+; GFX12-SDAG-NEXT: s_and_b32 s2, s2, 0xff
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-SDAG-NEXT: s_mov_b32 s3, 0
+; GFX12-SDAG-NEXT: .LBB16_1: ; %atomicrmw.start
+; GFX12-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, v1
+; GFX12-SDAG-NEXT: v_and_b32_e32 v1, 0xff, v2
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-NEXT: v_sub_nc_u16 v1, v1, s2 clamp
+; GFX12-SDAG-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-SDAG-NEXT: v_and_or_b32 v1, 0xffffff00, v2, v1
+; GFX12-SDAG-NEXT: global_atomic_cmpswap_b32 v1, v0, v[1:2], s[0:1] offset:1024 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v2
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
+; GFX12-SDAG-NEXT: s_or_b32 s3, vcc_lo, s3
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
+; GFX12-SDAG-NEXT: s_and_not1_b32 exec_lo, exec_lo, s3
+; GFX12-SDAG-NEXT: s_cbranch_execnz .LBB16_1
+; GFX12-SDAG-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-SDAG-NEXT: s_or_b32 exec_lo, exec_lo, s3
+; GFX12-SDAG-NEXT: global_store_b8 v[0:1], v1, off
+; GFX12-SDAG-NEXT: s_endpgm
+ %gep = getelementptr i8, ptr addrspace(1) %ptr, i64 1024
+ %ret = atomicrmw usub_sat ptr addrspace(1) %gep, i8 %data syncscope("agent") seq_cst, align 4
+ store i8 %ret, ptr addrspace(1) undef
+ ret void
+}
+
+define amdgpu_kernel void @global_atomic_usub_sat_sgpr_base_offset_nortn_8(ptr addrspace(1) %ptr, i8 %data) {
+; GFX10-GISEL-LABEL: global_atomic_usub_sat_sgpr_base_offset_nortn_8:
+; GFX10-GISEL: ; %bb.0:
+; GFX10-GISEL-NEXT: s_clause 0x1
+; GFX10-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX10-GISEL-NEXT: s_load_dword s3, s[8:9], 0x8
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v3, 0xff
+; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-GISEL-NEXT: s_load_dword s2, s[0:1], 0x400
+; GFX10-GISEL-NEXT: s_lshl_b32 s3, s3, 8
+; GFX10-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX10-GISEL-NEXT: s_mov_b32 s2, 0
+; GFX10-GISEL-NEXT: .LBB17_1: ; %atomicrmw.start
+; GFX10-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX10-GISEL-NEXT: v_lshlrev_b16 v0, 8, v1
+; GFX10-GISEL-NEXT: v_sub_nc_u16 v0, v0, s3 clamp
+; GFX10-GISEL-NEXT: v_and_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX10-GISEL-NEXT: v_and_or_b32 v0, 0xffffff00, v1, v0
+; GFX10-GISEL-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:1024 glc
+; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX10-GISEL-NEXT: buffer_gl1_inv
+; GFX10-GISEL-NEXT: buffer_gl0_inv
+; GFX10-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX10-GISEL-NEXT: v_mov_b32_e32 v1, v0
+; GFX10-GISEL-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX10-GISEL-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX10-GISEL-NEXT: s_cbranch_execnz .LBB17_1
+; GFX10-GISEL-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX10-GISEL-NEXT: s_endpgm
+;
+; GFX11-GISEL-LABEL: global_atomic_usub_sat_sgpr_base_offset_nortn_8:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_clause 0x1
+; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-GISEL-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: s_load_b32 s2, s[0:1], 0x400
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-GISEL-NEXT: s_lshl_b32 s3, s3, 8
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-GISEL-NEXT: s_mov_b32 s2, 0
+; GFX11-GISEL-NEXT: .p2align 6
+; GFX11-GISEL-NEXT: .LBB17_1: ; %atomicrmw.start
+; GFX11-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_lshlrev_b16 v0.l, 8, v1.l
+; GFX11-GISEL-NEXT: v_sub_nc_u16 v0.l, v0.l, s3 clamp
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_lshrrev_b16 v0.l, 8, v0.l
+; GFX11-GISEL-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-GISEL-NEXT: v_and_or_b32 v0, 0xffffff00, v1, v0
+; GFX11-GISEL-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:1024 glc
+; GFX11-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX11-GISEL-NEXT: buffer_gl1_inv
+; GFX11-GISEL-NEXT: buffer_gl0_inv
+; GFX11-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, v0
+; GFX11-GISEL-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-GISEL-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX11-GISEL-NEXT: s_cbranch_execnz .LBB17_1
+; GFX11-GISEL-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-GISEL-NEXT: s_endpgm
+;
+; GFX12-GISEL-LABEL: global_atomic_usub_sat_sgpr_base_offset_nortn_8:
+; GFX12-GISEL: ; %bb.0:
+; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x0
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: s_load_b32 s3, s[0:1], 0x400
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-GISEL-NEXT: s_lshl_b32 s2, s2, 8
+; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-GISEL-NEXT: s_mov_b32 s3, 0
+; GFX12-GISEL-NEXT: .LBB17_1: ; %atomicrmw.start
+; GFX12-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-GISEL-NEXT: v_lshlrev_b16 v0, 8, v1
+; GFX12-GISEL-NEXT: v_sub_nc_u16 v0, v0, s2 clamp
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-GISEL-NEXT: v_lshrrev_b16 v0, 8, v0
+; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0xff, v0
+; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-GISEL-NEXT: v_and_or_b32 v0, 0xffffff00, v1, v0
+; GFX12-GISEL-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:1024 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-GISEL-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, v0
+; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
+; GFX12-GISEL-NEXT: s_or_b32 s3, vcc_lo, s3
+; GFX12-GISEL-NEXT: s_wait_alu 0xfffe
+; GFX12-GISEL-NEXT: s_and_not1_b32 exec_lo, exec_lo, s3
+; GFX12-GISEL-NEXT: s_cbranch_execnz .LBB17_1
+; GFX12-GISEL-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-GISEL-NEXT: s_endpgm
+;
+; GFX10-SDAG-LABEL: global_atomic_usub_sat_sgpr_base_offset_nortn_8:
+; GFX10-SDAG: ; %bb.0:
+; GFX10-SDAG-NEXT: s_clause 0x1
+; GFX10-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX10-SDAG-NEXT: s_load_dword s3, s[8:9], 0x8
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT: s_load_dword s2, s[0:1], 0x400
+; GFX10-SDAG-NEXT: s_and_b32 s3, s3, 0xff
+; GFX10-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, s2
+; GFX10-SDAG-NEXT: s_mov_b32 s2, 0
+; GFX10-SDAG-NEXT: .LBB17_1: ; %atomicrmw.start
+; GFX10-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX10-SDAG-NEXT: v_and_b32_e32 v0, 0xff, v1
+; GFX10-SDAG-NEXT: v_sub_nc_u16 v0, v0, s3 clamp
+; GFX10-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX10-SDAG-NEXT: v_and_or_b32 v0, 0xffffff00, v1, v0
+; GFX10-SDAG-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:1024 glc
+; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX10-SDAG-NEXT: buffer_gl1_inv
+; GFX10-SDAG-NEXT: buffer_gl0_inv
+; GFX10-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX10-SDAG-NEXT: v_mov_b32_e32 v1, v0
+; GFX10-SDAG-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX10-SDAG-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX10-SDAG-NEXT: s_cbranch_execnz .LBB17_1
+; GFX10-SDAG-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX10-SDAG-NEXT: s_endpgm
+;
+; GFX11-SDAG-LABEL: global_atomic_usub_sat_sgpr_base_offset_nortn_8:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_clause 0x1
+; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x0
+; GFX11-SDAG-NEXT: s_load_b32 s3, s[4:5], 0x8
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: s_load_b32 s2, s[0:1], 0x400
+; GFX11-SDAG-NEXT: s_and_b32 s3, s3, 0xff
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s2
+; GFX11-SDAG-NEXT: s_mov_b32 s2, 0
+; GFX11-SDAG-NEXT: .p2align 6
+; GFX11-SDAG-NEXT: .LBB17_1: ; %atomicrmw.start
+; GFX11-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_and_b16 v0.l, 0xff, v1.l
+; GFX11-SDAG-NEXT: v_sub_nc_u16 v0.l, v0.l, s3 clamp
+; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX11-SDAG-NEXT: v_and_or_b32 v0, 0xffffff00, v1, v0
+; GFX11-SDAG-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:1024 glc
+; GFX11-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX11-SDAG-NEXT: buffer_gl1_inv
+; GFX11-SDAG-NEXT: buffer_gl0_inv
+; GFX11-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, v0
+; GFX11-SDAG-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX11-SDAG-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX11-SDAG-NEXT: s_cbranch_execnz .LBB17_1
+; GFX11-SDAG-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX11-SDAG-NEXT: s_endpgm
+;
+; GFX12-SDAG-LABEL: global_atomic_usub_sat_sgpr_base_offset_nortn_8:
+; GFX12-SDAG: ; %bb.0:
+; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x0
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: s_load_b32 s3, s[0:1], 0x400
+; GFX12-SDAG-NEXT: s_and_b32 s2, s2, 0xff
+; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s3
+; GFX12-SDAG-NEXT: s_mov_b32 s3, 0
+; GFX12-SDAG-NEXT: .LBB17_1: ; %atomicrmw.start
+; GFX12-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0xff, v1
+; GFX12-SDAG-NEXT: v_sub_nc_u16 v0, v0, s2 clamp
+; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX12-SDAG-NEXT: v_and_or_b32 v0, 0xffffff00, v1, v0
+; GFX12-SDAG-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:1024 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
+; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, v0
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
+; GFX12-SDAG-NEXT: s_or_b32 s3, vcc_lo, s3
+; GFX12-SDAG-NEXT: s_wait_alu 0xfffe
+; GFX12-SDAG-NEXT: s_and_not1_b32 exec_lo, exec_lo, s3
+; GFX12-SDAG-NEXT: s_cbranch_execnz .LBB17_1
+; GFX12-SDAG-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX12-SDAG-NEXT: s_endpgm
+ %gep = getelementptr i8, ptr addrspace(1) %ptr, i64 1024
+ %ret = atomicrmw usub_sat ptr addrspace(1) %gep, i8 %data syncscope("agent") seq_cst, align 4
+ ret void
+}
+
attributes #0 = { nounwind willreturn }
attributes #1 = { argmemonly nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll b/llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll
index 887f489d504f2..0dc2b95799cd8 100644
--- a/llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomics_cond_sub.ll
@@ -13,7 +13,7 @@ define amdgpu_kernel void @flat_atomic_cond_sub_no_rtn_u32(ptr %addr, i32 %in) {
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s2
-; GFX12-SDAG-NEXT: flat_atomic_cond_sub_u32 v0, v[0:1], v2 offset:-16 th:TH_ATOMIC_RETURN
+; GFX12-SDAG-NEXT: flat_atomic_cond_sub_u32 v[0:1], v2 offset:-16
; GFX12-SDAG-NEXT: s_endpgm
;
; GFX12-GISEL-LABEL: flat_atomic_cond_sub_no_rtn_u32:
@@ -22,7 +22,7 @@ define amdgpu_kernel void @flat_atomic_cond_sub_no_rtn_u32(ptr %addr, i32 %in) {
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v1, s1
-; GFX12-GISEL-NEXT: flat_atomic_cond_sub_u32 v0, v[0:1], v2 offset:-16 th:TH_ATOMIC_RETURN
+; GFX12-GISEL-NEXT: flat_atomic_cond_sub_u32 v[0:1], v2 offset:-16
; GFX12-GISEL-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr %addr, i32 -4
@@ -94,16 +94,18 @@ define amdgpu_kernel void @global_atomic_cond_sub_no_rtn_u32(ptr addrspace(1) %a
; GFX12-SDAG: ; %bb.0: ; %entry
; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX12-SDAG-NEXT: global_atomic_cond_sub_u32 v0, v0, v1, s[0:1] offset:-16 th:TH_ATOMIC_RETURN
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s2
+; GFX12-SDAG-NEXT: flat_atomic_cond_sub_u32 v[0:1], v2 offset:-16
; GFX12-SDAG-NEXT: s_endpgm
;
; GFX12-GISEL-LABEL: global_atomic_cond_sub_no_rtn_u32:
; GFX12-GISEL: ; %bb.0: ; %entry
; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
-; GFX12-GISEL-NEXT: global_atomic_cond_sub_u32 v0, v1, v0, s[0:1] offset:-16 th:TH_ATOMIC_RETURN
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v1, s1
+; GFX12-GISEL-NEXT: flat_atomic_cond_sub_u32 v[0:1], v2 offset:-16
; GFX12-GISEL-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr addrspace(1) %addr, i32 -4
@@ -116,16 +118,18 @@ define amdgpu_kernel void @global_atomic_cond_sub_no_rtn_u32_forced(ptr addrspac
; GFX12-SDAG: ; %bb.0: ; %entry
; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX12-SDAG-NEXT: global_atomic_cond_sub_u32 v0, v1, s[0:1] offset:-16
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s2
+; GFX12-SDAG-NEXT: flat_atomic_cond_sub_u32 v[0:1], v2 offset:-16
; GFX12-SDAG-NEXT: s_endpgm
;
; GFX12-GISEL-LABEL: global_atomic_cond_sub_no_rtn_u32_forced:
; GFX12-GISEL: ; %bb.0: ; %entry
; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
-; GFX12-GISEL-NEXT: global_atomic_cond_sub_u32 v1, v0, s[0:1] offset:-16
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v1, s1
+; GFX12-GISEL-NEXT: flat_atomic_cond_sub_u32 v[0:1], v2 offset:-16
; GFX12-GISEL-NEXT: s_endpgm
entry:
%gep = getelementptr i32, ptr addrspace(1) %addr, i32 -4
@@ -136,14 +140,16 @@ entry:
define amdgpu_kernel void @global_atomic_cond_sub_rtn_u32(ptr addrspace(1) %addr, i32 %in, ptr addrspace(1) %use) {
; GFX12-SDAG-LABEL: global_atomic_cond_sub_rtn_u32:
; GFX12-SDAG: ; %bb.0: ; %entry
+; GFX12-SDAG-NEXT: s_clause 0x1
; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
-; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 0
; GFX12-SDAG-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, s2
-; GFX12-SDAG-NEXT: global_atomic_cond_sub_u32 v1, v0, v1, s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s2
+; GFX12-SDAG-NEXT: flat_atomic_cond_sub_u32 v0, v[0:1], v2 offset:16 th:TH_ATOMIC_RETURN
+; GFX12-SDAG-NEXT: v_mov_b32_e32 v1, 0
; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[4:5]
+; GFX12-SDAG-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX12-SDAG-NEXT: s_endpgm
;
; GFX12-GISEL-LABEL: global_atomic_cond_sub_rtn_u32:
@@ -152,8 +158,10 @@ define amdgpu_kernel void @global_atomic_cond_sub_rtn_u32(ptr addrspace(1) %addr
; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24
; GFX12-GISEL-NEXT: s_load_b64 s[4:5], s[4:5], 0x34
; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
-; GFX12-GISEL-NEXT: global_atomic_cond_sub_u32 v0, v1, v0, s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v1, s1
+; GFX12-GISEL-NEXT: flat_atomic_cond_sub_u32 v0, v[0:1], v2 offset:16 th:TH_ATOMIC_RETURN
+; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[4:5]
; GFX12-GISEL-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-usub_cond.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-usub_cond.ll
new file mode 100644
index 0000000000000..f5ed60abfdf9e
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-usub_cond.ll
@@ -0,0 +1,115 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12 %s
+
+; --------------------------------------------------------------------
+; i32
+; --------------------------------------------------------------------
+
+define i32 @buffer_fat_ptr_agent_atomic_usub_sat_ret_u32__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, i32 %val) #0 {
+; GFX12-LABEL: buffer_fat_ptr_agent_atomic_usub_sat_ret_u32__offset__amdgpu_no_fine_grained_memory:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_mov_b32_e32 v1, s16
+; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: buffer_atomic_sub_clamp_u32 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %gep = getelementptr i32, ptr addrspace(7) %ptr, i32 256
+ %result = atomicrmw usub_sat ptr addrspace(7) %gep, i32 %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+ ret i32 %result
+}
+
+define void @buffer_fat_ptr_agent_atomic_usub_sat_noret_u32__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, i32 %val) #0 {
+; GFX12-LABEL: buffer_fat_ptr_agent_atomic_usub_sat_noret_u32__offset__amdgpu_no_fine_grained_memory:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_mov_b32_e32 v1, s16
+; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: buffer_atomic_sub_clamp_u32 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %gep = getelementptr i32, ptr addrspace(7) %ptr, i32 256
+ %unused = atomicrmw usub_sat ptr addrspace(7) %gep, i32 %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+ ret void
+}
+
+define i32 @buffer_fat_ptr_agent_atomic_usub_sat_ret_u32__offset__amdgpu_no_remote_memory(ptr addrspace(7) inreg %ptr, i32 %val) #0 {
+; GFX12-LABEL: buffer_fat_ptr_agent_atomic_usub_sat_ret_u32__offset__amdgpu_no_remote_memory:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_mov_b32_e32 v1, s16
+; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: buffer_atomic_sub_clamp_u32 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %gep = getelementptr i32, ptr addrspace(7) %ptr, i32 256
+ %result = atomicrmw usub_sat ptr addrspace(7) %gep, i32 %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
+ ret i32 %result
+}
+
+define i32 @buffer_fat_ptr_agent_atomic_usub_sat_ret_u32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(7) inreg %ptr, i32 %val) #0 {
+; GFX12-LABEL: buffer_fat_ptr_agent_atomic_usub_sat_ret_u32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_mov_b32_e32 v1, s16
+; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: buffer_atomic_sub_clamp_u32 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %gep = getelementptr i32, ptr addrspace(7) %ptr, i32 256
+ %result = atomicrmw usub_sat ptr addrspace(7) %gep, i32 %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
+ ret i32 %result
+}
+
+; --------------------------------------------------------------------
+; misc
+; --------------------------------------------------------------------
+
+define i32 @buffer_fat_ptr_system_atomic_usub_sat_ret_u32__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, i32 %val) #0 {
+; GFX12-LABEL: buffer_fat_ptr_system_atomic_usub_sat_ret_u32__offset__amdgpu_no_fine_grained_memory:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_mov_b32_e32 v1, s16
+; GFX12-NEXT: global_wb scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: buffer_atomic_sub_clamp_u32 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_inv scope:SCOPE_SYS
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+ %gep = getelementptr i32, ptr addrspace(7) %ptr, i32 256
+ %result = atomicrmw usub_sat ptr addrspace(7) %gep, i32 %val seq_cst, !amdgpu.no.fine.grained.memory !0
+ ret i32 %result
+}
+
+attributes #0 = { nounwind }
+
+!0 = !{}
diff --git a/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-usub_sat.ll b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-usub_sat.ll
new file mode 100644
index 0000000000000..8340cba2b0447
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/buffer-fat-pointer-atomicrmw-usub_sat.ll
@@ -0,0 +1,70 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12 %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s
+
+; --------------------------------------------------------------------
+; i32
+; --------------------------------------------------------------------
+
+define i32 @buffer_fat_ptr_agent_atomic_usub_sat_ret_u32__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, i32 %val) #0 {
+; GFX12-LABEL: buffer_fat_ptr_agent_atomic_usub_sat_ret_u32__offset__amdgpu_no_fine_grained_memory:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: s_wait_expcnt 0x0
+; GFX12-NEXT: s_wait_samplecnt 0x0
+; GFX12-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: v_mov_b32_e32 v1, s16
+; GFX12-NEXT: s_wait_storecnt 0x0
+; GFX12-NEXT: buffer_atomic_sub_clamp_u32 v0, v1, s[0:3], null offen offset:1024 th:TH_ATOMIC_RETURN
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_inv scope:SCOPE_DEV
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: buffer_fat_ptr_agent_atomic_usub_sat_ret_u32__offset__amdgpu_no_fine_grained_memory:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v1, s16
+; GFX11-NEXT: s_waitcnt_vscnt null, 0x0
+; GFX11-NEXT: buffer_atomic_csub_u32 v0, v1, s[0:3], 0 offen offset:1024 glc
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: buffer_gl1_inv
+; GFX11-NEXT: buffer_gl0_inv
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %gep = getelementptr i32, ptr addrspace(7) %ptr, i32 256
+ %result = atomicrmw usub_sat ptr addrspace(7) %gep, i32 %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+ ret i32 %result
+}
+
+;define void @buffer_fat_ptr_agent_atomic_usub_sat_noret_u32__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, i32 %val) #0 {
+; %gep = getelementptr i32, ptr addrspace(7) %ptr, i32 256
+; %unused = atomicrmw usub_sat ptr addrspace(7) %gep, i32 %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0
+; ret void
+;}
+
+;define i32 @buffer_fat_ptr_agent_atomic_usub_sat_ret_u32__offset__amdgpu_no_remote_memory(ptr addrspace(7) inreg %ptr, i32 %val) #0 {
+; %gep = getelementptr i32, ptr addrspace(7) %ptr, i32 256
+; %result = atomicrmw usub_sat ptr addrspace(7) %gep, i32 %val syncscope("agent") seq_cst, !amdgpu.no.remote.memory !0
+; ret i32 %result
+;}
+
+;define i32 @buffer_fat_ptr_agent_atomic_usub_sat_ret_u32__offset__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory(ptr addrspace(7) inreg %ptr, i32 %val) #0 {
+; %gep = getelementptr i32, ptr addrspace(7) %ptr, i32 256
+; %result = atomicrmw usub_sat ptr addrspace(7) %gep, i32 %val syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0, !amdgpu.no.remote.memory !0
+; ret i32 %result
+;}
+
+; --------------------------------------------------------------------
+; misc
+; --------------------------------------------------------------------
+
+;define i32 @buffer_fat_ptr_system_atomic_usub_sat_ret_u32__offset__amdgpu_no_fine_grained_memory(ptr addrspace(7) inreg %ptr, i32 %val) #0 {
+; %gep = getelementptr i32, ptr addrspace(7) %ptr, i32 256
+; %result = atomicrmw usub_sat ptr addrspace(7) %gep, i32 %val seq_cst, !amdgpu.no.fine.grained.memory !0
+; ret i32 %result
+;}
+
+attributes #0 = { nounwind }
+
+!0 = !{}
diff --git a/llvm/test/CodeGen/AMDGPU/test_isel_single_lane.ll b/llvm/test/CodeGen/AMDGPU/test_isel_single_lane.ll
index 726e35d6651d0..46c3b3faf4f17 100644
--- a/llvm/test/CodeGen/AMDGPU/test_isel_single_lane.ll
+++ b/llvm/test/CodeGen/AMDGPU/test_isel_single_lane.ll
@@ -10,19 +10,23 @@ define amdgpu_kernel void @test_isel_single_lane(ptr addrspace(1) %in, ptr addrs
; GCN-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
; GCN-NEXT: s_wait_kmcnt 0x0
; GCN-NEXT: s_load_b32 s4, s[0:1], 0x58
+; GCN-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
; GCN-NEXT: s_wait_kmcnt 0x0
-; GCN-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s4
-; GCN-NEXT: global_atomic_cond_sub_u32 v1, v0, v1, s[0:1] offset:16 th:TH_ATOMIC_RETURN
+; GCN-NEXT: v_mov_b32_e32 v2, s4
+; GCN-NEXT: flat_atomic_cond_sub_u32 v0, v[0:1], v2 offset:16 th:TH_ATOMIC_RETURN
; GCN-NEXT: s_wait_loadcnt 0x0
-; GCN-NEXT: v_readfirstlane_b32 s0, v1
+; GCN-NEXT: v_readfirstlane_b32 s0, v0
+; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: s_addk_co_i32 s0, 0xf4
-; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GCN-NEXT: s_wait_alu 0xfffe
; GCN-NEXT: s_lshl_b32 s1, s0, 4
+; GCN-NEXT: s_wait_alu 0xfffe
; GCN-NEXT: s_mul_i32 s0, s0, s1
-; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GCN-NEXT: s_wait_alu 0xfffe
; GCN-NEXT: s_lshl_b32 s0, s0, 12
+; GCN-NEXT: s_wait_alu 0xfffe
; GCN-NEXT: s_sub_co_i32 s0, s1, s0
-; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GCN-NEXT: s_wait_alu 0xfffe
; GCN-NEXT: v_mov_b32_e32 v1, s0
; GCN-NEXT: global_store_b32 v0, v1, s[2:3]
; GCN-NEXT: s_endpgm
>From 49d4bea61cbd6fe3100f50629738159d34c84d2a Mon Sep 17 00:00:00 2001
From: Andrew Jenner <Andrew.Jenner at amd.com>
Date: Thu, 24 Jul 2025 10:19:34 -0400
Subject: [PATCH 4/4] Fix formatting.
---
llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
index b91200df0da6f..cb0ca3a6df104 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerBufferFatPointers.cpp
@@ -1773,8 +1773,9 @@ Value *SplitPtrStructs::handleMemoryInst(Instruction *I, Value *Arg, Value *Ptr,
break;
case AtomicRMWInst::UIncWrap:
case AtomicRMWInst::UDecWrap:
- reportFatalUsageError("wrapping increment/decrement not supported for "
- "buffer resources and should've been expanded away");
+ reportFatalUsageError(
+ "wrapping increment/decrement not supported for "
+ "buffer resources and should've been expanded away");
break;
case AtomicRMWInst::BAD_BINOP:
llvm_unreachable("Not sure how we got a bad binop");
More information about the llvm-commits
mailing list