[llvm] AMDGPU: Fix mis-selecting saddr flat atomics on gfx9 (PR #156860)
Matt Arsenault via llvm-commits
llvm-commits at lists.llvm.org
Thu Sep 4 07:04:54 PDT 2025
https://github.com/arsenm updated https://github.com/llvm/llvm-project/pull/156860
>From 522784faba184a61d218789e466ab8cfc07823ab Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Thu, 4 Sep 2025 16:45:42 +0900
Subject: [PATCH] AMDGPU: Fix mis-selecting saddr flat atomics on gfx9
This would select the pseudo and then crash when the MC instruction
was used. I believe this has been broken since 9912ccb0b4d17.
---
llvm/lib/Target/AMDGPU/AMDGPU.td | 18 +-
llvm/lib/Target/AMDGPU/FLATInstructions.td | 139 +-
.../test/CodeGen/AMDGPU/flat-saddr-atomics.ll | 7751 ++++++++++++++++-
3 files changed, 7832 insertions(+), 76 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 0e0b84f7e3374..a366db1c580ba 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -68,13 +68,15 @@ def FeatureFlatInstOffsets : SubtargetFeature<"flat-inst-offsets",
def FeatureFlatGlobalInsts : SubtargetFeature<"flat-global-insts",
"FlatGlobalInsts",
"true",
- "Have global_* flat memory instructions"
+ "Have global_* flat memory instructions",
+ [FeatureFlatAddressSpace]
>;
def FeatureFlatScratchInsts : SubtargetFeature<"flat-scratch-insts",
"FlatScratchInsts",
"true",
- "Have scratch_* flat memory instructions"
+ "Have scratch_* flat memory instructions",
+ [FeatureFlatAddressSpace]
>;
def FeatureScalarFlatScratchInsts : SubtargetFeature<"scalar-flat-scratch-insts",
@@ -92,7 +94,8 @@ def FeatureEnableFlatScratch : SubtargetFeature<"enable-flat-scratch",
def FeatureFlatGVSMode : SubtargetFeature<"flat-gvs-mode",
"FlatGVSMode",
"true",
- "Have GVS addressing mode with flat_* instructions"
+ "Have GVS addressing mode with flat_* instructions",
+ [FeatureFlatAddressSpace]
>;
def FeatureAddNoCarryInsts : SubtargetFeature<"add-no-carry-insts",
@@ -934,13 +937,15 @@ def FeatureAtomicFMinFMaxF64GlobalInsts : SubtargetFeature<"atomic-fmin-fmax-glo
def FeatureAtomicFMinFMaxF32FlatInsts : SubtargetFeature<"atomic-fmin-fmax-flat-f32",
"HasAtomicFMinFMaxF32FlatInsts",
"true",
- "Has flat memory instructions for atomicrmw fmin/fmax for float"
+ "Has flat memory instructions for atomicrmw fmin/fmax for float",
+ [FeatureFlatAddressSpace]
>;
def FeatureAtomicFMinFMaxF64FlatInsts : SubtargetFeature<"atomic-fmin-fmax-flat-f64",
"HasAtomicFMinFMaxF64FlatInsts",
"true",
- "Has flat memory instructions for atomicrmw fmin/fmax for double"
+ "Has flat memory instructions for atomicrmw fmin/fmax for double",
+ [FeatureFlatAddressSpace]
>;
def FeatureAtomicFaddNoRtnInsts : SubtargetFeature<"atomic-fadd-no-rtn-insts",
@@ -992,7 +997,8 @@ def FeatureFlatAtomicFaddF32Inst
: SubtargetFeature<"flat-atomic-fadd-f32-inst",
"HasFlatAtomicFaddF32Inst",
"true",
- "Has flat_atomic_add_f32 instruction"
+ "Has flat_atomic_add_f32 instruction",
+ [FeatureFlatAddressSpace]
>;
def FeatureFlatBufferGlobalAtomicFaddF64Inst
diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td
index 1617f7954a5ee..0ac5f3d50f1b5 100644
--- a/llvm/lib/Target/AMDGPU/FLATInstructions.td
+++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -297,7 +297,7 @@ multiclass FLAT_Flat_Store_Pseudo_t16<string opName> {
multiclass FLAT_Global_Load_Pseudo<string opName, RegisterOperand regClass = AVLdSt_32,
bit HasTiedInput = 0> {
- let is_flat_global = 1 in {
+ let is_flat_global = 1, SubtargetPredicate = HasFlatGlobalInsts in {
def "" : FLAT_Load_Pseudo<opName, regClass, HasTiedInput, 1>,
GlobalSaddrTable<0, opName>;
def _SADDR : FLAT_Load_Pseudo<opName, regClass, HasTiedInput, 1, 1>,
@@ -347,7 +347,7 @@ multiclass FLAT_Global_Load_AddTid_Pseudo<string opName, RegisterOperand regClas
}
multiclass FLAT_Global_Store_Pseudo<string opName, RegisterOperand regClass = AVLdSt_32> {
- let is_flat_global = 1 in {
+ let is_flat_global = 1, SubtargetPredicate = HasFlatGlobalInsts in {
def "" : FLAT_Store_Pseudo<opName, regClass, 1>,
GlobalSaddrTable<0, opName>;
def _SADDR : FLAT_Store_Pseudo<opName, regClass, 1, 1>,
@@ -1043,8 +1043,12 @@ let SubtargetPredicate = HasFlatAtomicFaddF32Inst in {
let SubtargetPredicate = isGFX12Plus in {
defm FLAT_ATOMIC_CSUB_U32 : FLAT_Atomic_Pseudo <"flat_atomic_csub_u32", VGPROp_32, i32>;
- defm FLAT_ATOMIC_COND_SUB_U32 : FLAT_Atomic_Pseudo <"flat_atomic_cond_sub_u32", VGPROp_32, i32>;
-} // End SubtargetPredicate = isGFX12Plus
+ defm FLAT_ATOMIC_COND_SUB_U32 : FLAT_Atomic_Pseudo_RTN<"flat_atomic_cond_sub_u32", VGPROp_32, i32>;
+}
+
+let SubtargetPredicate = HasAtomicCSubNoRtnInsts in {
+ defm FLAT_ATOMIC_COND_SUB_U32 : FLAT_Atomic_Pseudo_NO_RTN<"flat_atomic_cond_sub_u32", VGPROp_32, i32>;
+}
defm GLOBAL_LOAD_UBYTE : FLAT_Global_Load_Pseudo <"global_load_ubyte">;
defm GLOBAL_LOAD_SBYTE : FLAT_Global_Load_Pseudo <"global_load_sbyte">;
@@ -1296,19 +1300,19 @@ let SubtargetPredicate = isGFX10Plus in {
FLAT_Global_Atomic_Pseudo<"global_atomic_fcmpswap_x2", AVLdSt_64, f64, v2f64, AVLdSt_128>;
} // End SubtargetPredicate = isGFX10Plus
-let OtherPredicates = [HasAtomicFaddNoRtnInsts] in
+let SubtargetPredicate = HasAtomicFaddNoRtnInsts in
defm GLOBAL_ATOMIC_ADD_F32 : FLAT_Global_Atomic_Pseudo_NO_RTN <
"global_atomic_add_f32", AVLdSt_32, f32
>;
-let OtherPredicates = [HasAtomicBufferGlobalPkAddF16NoRtnInsts] in
+let SubtargetPredicate = HasAtomicBufferGlobalPkAddF16NoRtnInsts in
defm GLOBAL_ATOMIC_PK_ADD_F16 : FLAT_Global_Atomic_Pseudo_NO_RTN <
"global_atomic_pk_add_f16", AVLdSt_32, v2f16
>;
-let OtherPredicates = [HasAtomicFaddRtnInsts] in
+let SubtargetPredicate = HasAtomicFaddRtnInsts in
defm GLOBAL_ATOMIC_ADD_F32 : FLAT_Global_Atomic_Pseudo_RTN <
"global_atomic_add_f32", AVLdSt_32, f32
>;
-let OtherPredicates = [HasAtomicBufferGlobalPkAddF16Insts] in
+let SubtargetPredicate = HasAtomicBufferGlobalPkAddF16Insts in
defm GLOBAL_ATOMIC_PK_ADD_F16 : FLAT_Global_Atomic_Pseudo_RTN <
"global_atomic_pk_add_f16", AVLdSt_32, v2f16
>;
@@ -1442,8 +1446,10 @@ class FlatStoreSaddrPat <FLAT_Pseudo inst, SDPatternOperator node,
class FlatAtomicSaddrPat <FLAT_Pseudo inst, SDPatternOperator node, ComplexPattern pat,
ValueType vt, ValueType data_vt = vt> : GCNPat <
(vt (node (pat (i64 SReg_64:$saddr), (i32 VGPR_32:$voffset), i32:$offset, CPol:$cpol), data_vt:$data)),
- (inst $voffset, getVregSrcForVT<data_vt>.ret:$data, $saddr, $offset, $cpol)
->;
+ (inst $voffset, getVregSrcForVT<data_vt>.ret:$data, $saddr, $offset, $cpol)> {
+ let SubtargetPredicate = inst.SubtargetPredicate;
+ let OtherPredicates = inst.OtherPredicates;
+}
class GlobalAtomicNoRtnSaddrPat <FLAT_Pseudo inst, SDPatternOperator node,
ValueType vt> : GCNPat <
@@ -1469,19 +1475,24 @@ class FlatStoreSignedAtomicPat <FLAT_Pseudo inst, SDPatternOperator node,
(inst $vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)
>;
-multiclass FlatAtomicNoRtnPatBase <string inst, string node, ValueType vt,
+multiclass FlatAtomicNoRtnPatBase <string base_inst_name, string node, ValueType vt,
ValueType data_vt = vt> {
-
+ defvar inst = !cast<FLAT_Pseudo>(base_inst_name);
+ defvar inst_saddr = !cast<FLAT_Pseudo>(inst#"_SADDR");
defvar noRtnNode = !cast<PatFrags>(node);
let AddedComplexity = 1 in
def : GCNPat <(vt (noRtnNode (FlatOffset i64:$vaddr, i32:$offset), data_vt:$data)),
- (!cast<FLAT_Pseudo>(inst) VReg_64:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)>;
+ (inst VReg_64:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)> {
+ let SubtargetPredicate = inst.SubtargetPredicate;
+ let OtherPredicates = inst.OtherPredicates;
+ }
- def : FlatAtomicSaddrPat<!cast<FLAT_Pseudo>(inst#"_SADDR"), !cast<SDPatternOperator>(node),
+ def : FlatAtomicSaddrPat<inst_saddr, !cast<SDPatternOperator>(node),
GlobalSAddr, vt, data_vt> {
let AddedComplexity = 9;
- let SubtargetPredicate = HasFlatGVSMode;
+ let SubtargetPredicate = inst_saddr.SubtargetPredicate;
+ let OtherPredicates = inst_saddr.OtherPredicates;
}
}
@@ -1494,17 +1505,22 @@ multiclass FlatAtomicNoRtnPat <string inst, string node, ValueType vt,
FlatAtomicNoRtnPatBase<inst, node # "_noret" # !if(isIntr, "", "_"#vt), vt, data_vt>;
-multiclass FlatAtomicRtnPatBase <string inst, string node, ValueType vt,
+multiclass FlatAtomicRtnPatBase <string inst_name, string node, ValueType vt,
ValueType data_vt = vt> {
-
+ defvar inst = !cast<FLAT_Pseudo>(inst_name#"_RTN");
+ defvar inst_saddr = !cast<FLAT_Pseudo>(inst_name#"_SADDR_RTN");
defvar rtnNode = !cast<SDPatternOperator>(node);
def : GCNPat <(vt (rtnNode (FlatOffset i64:$vaddr, i32:$offset), data_vt:$data)),
- (!cast<FLAT_Pseudo>(inst#"_RTN") VReg_64:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)>;
+ (inst VReg_64:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)> {
+ let SubtargetPredicate = inst.SubtargetPredicate;
+ let OtherPredicates = inst.OtherPredicates;
+ }
- def : FlatAtomicSaddrPat<!cast<FLAT_Pseudo>(inst#"_SADDR_RTN"), rtnNode, GlobalSAddrGLC, vt, data_vt> {
+ def : FlatAtomicSaddrPat<inst_saddr, rtnNode, GlobalSAddrGLC, vt, data_vt> {
let AddedComplexity = 8;
- let SubtargetPredicate = HasFlatGVSMode;
+ let SubtargetPredicate = inst_saddr.SubtargetPredicate;
+ let OtherPredicates = inst_saddr.OtherPredicates;
}
}
@@ -1540,8 +1556,10 @@ multiclass FlatAtomicIntrPat <string inst, string node, ValueType vt,
class FlatSignedAtomicPatBase <FLAT_Pseudo inst, SDPatternOperator node,
ValueType vt, ValueType data_vt = vt> : GCNPat <
(vt (node (GlobalOffset i64:$vaddr, i32:$offset), data_vt:$data)),
- (inst VReg_64:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)
->;
+ (inst VReg_64:$vaddr, getVregSrcForVT<data_vt>.ret:$data, $offset)> {
+ let SubtargetPredicate = inst.SubtargetPredicate;
+ let OtherPredicates = inst.OtherPredicates;
+}
multiclass FlatSignedAtomicPat <string inst, string node, ValueType vt,
ValueType data_vt = vt, int complexity = 0,
@@ -1651,30 +1669,42 @@ multiclass GlobalStoreLDSPats<FLAT_Pseudo inst, SDPatternOperator node> {
multiclass GlobalFLATLoadPats<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> {
def : FlatLoadSignedPat <inst, node, vt> {
let AddedComplexity = 10;
+ let SubtargetPredicate = inst.SubtargetPredicate;
+ let OtherPredicates = inst.OtherPredicates;
}
def : FlatLoadSaddrPat<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> {
let AddedComplexity = 11;
+ let SubtargetPredicate = inst.SubtargetPredicate;
+ let OtherPredicates = inst.OtherPredicates;
}
}
multiclass GlobalFLATLoadPats_M0<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> {
def : FlatLoadSignedPat_M0 <inst, node, vt> {
let AddedComplexity = 10;
+ let SubtargetPredicate = inst.SubtargetPredicate;
+ let OtherPredicates = inst.OtherPredicates;
}
def : GlobalLoadSaddrPat_M0<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> {
let AddedComplexity = 11;
+ let SubtargetPredicate = inst.SubtargetPredicate;
+ let OtherPredicates = inst.OtherPredicates;
}
}
multiclass GlobalFLATLoadPats_CPOL<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> {
def : FlatLoadSignedPat_CPOL<inst, node, vt> {
let AddedComplexity = 10;
+ let SubtargetPredicate = inst.SubtargetPredicate;
+ let OtherPredicates = inst.OtherPredicates;
}
def : GlobalLoadSaddrPat_CPOL<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> {
let AddedComplexity = 11;
+ let SubtargetPredicate = inst.SubtargetPredicate;
+ let OtherPredicates = inst.OtherPredicates;
}
}
@@ -1701,10 +1731,14 @@ multiclass GlobalFLATLoadPats_D16_t16<string inst, SDPatternOperator node, Value
multiclass GlobalFLATStorePats<FLAT_Pseudo inst, SDPatternOperator node,
ValueType vt> {
def : FlatStoreSignedPat <inst, node, vt> {
+ let SubtargetPredicate = inst.SubtargetPredicate;
+ let OtherPredicates = inst.OtherPredicates;
let AddedComplexity = 10;
}
def : FlatStoreSaddrPat<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> {
+ let SubtargetPredicate = inst.SubtargetPredicate;
+ let OtherPredicates = inst.OtherPredicates;
let AddedComplexity = 11;
}
}
@@ -1849,7 +1883,9 @@ multiclass ScratchFLATLoadPats_D16_t16<string inst, SDPatternOperator node, Valu
}
multiclass FlatLoadPats<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> {
- def : FlatLoadPat <inst, node, vt>;
+ def : FlatLoadPat <inst, node, vt> {
+ let OtherPredicates = [HasFlatAddressSpace];
+ }
def : FlatLoadSaddrPat<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> {
let AddedComplexity = 9;
@@ -1876,7 +1912,9 @@ multiclass FlatLoadPats_D16_t16<FLAT_Pseudo inst, SDPatternOperator node, ValueT
}
multiclass FlatStorePats<FLAT_Pseudo inst, SDPatternOperator node, ValueType vt> {
- def : FlatStorePat <inst, node, vt>;
+ def : FlatStorePat <inst, node, vt> {
+ let OtherPredicates = [HasFlatAddressSpace];
+ }
def : FlatStoreSaddrPat<!cast<FLAT_Pseudo>(!cast<string>(inst)#"_SADDR"), node, vt> {
let AddedComplexity = 9;
@@ -1893,8 +1931,6 @@ multiclass FlatStorePats_t16<FLAT_Pseudo inst, SDPatternOperator node, ValueType
}
}
-let OtherPredicates = [HasFlatAddressSpace] in {
-
defm : FlatLoadPats <FLAT_LOAD_UBYTE, atomic_load_aext_8_flat, i32>;
defm : FlatLoadPats <FLAT_LOAD_UBYTE, atomic_load_zext_8_flat, i32>;
defm : FlatLoadPats <FLAT_LOAD_USHORT, atomic_load_aext_16_flat, i32>;
@@ -2018,12 +2054,7 @@ defm : FlatAtomicPat <"FLAT_ATOMIC_MAX_F64", "atomic_load_fmax_"#as, f64>;
defm : FlatStorePats <FLAT_STORE_BYTE, truncstorei8_flat, i16>;
defm : FlatStorePats <FLAT_STORE_SHORT, store_flat, i16>;
-} // End OtherPredicates = [HasFlatAddressSpace]
-
-let OtherPredicates = [isGFX12Plus] in
defm : FlatAtomicRtnPatWithAddrSpace<"FLAT_ATOMIC_COND_SUB_U32", "int_amdgcn_atomic_cond_sub_u32", "flat_addrspace", i32>;
-
-let OtherPredicates = [isGFX12Plus, HasAtomicCSubNoRtnInsts] in
defm : FlatAtomicNoRtnPatWithAddrSpace<"FLAT_ATOMIC_COND_SUB_U32", "int_amdgcn_atomic_cond_sub_u32", "flat_addrspace", i32>;
let OtherPredicates = [HasD16LoadStore] in {
@@ -2048,8 +2079,6 @@ defm : FlatLoadPats_D16 <FLAT_LOAD_SHORT_D16, load_d16_lo_flat, v2i16>;
defm : FlatLoadPats_D16 <FLAT_LOAD_SHORT_D16, load_d16_lo_flat, v2f16>;
}
-let OtherPredicates = [HasFlatGlobalInsts] in {
-
defm : GlobalFLATLoadPats <GLOBAL_LOAD_UBYTE, atomic_load_aext_8_global, i32>;
defm : GlobalFLATLoadPats <GLOBAL_LOAD_UBYTE, atomic_load_zext_8_global, i32>;
defm : GlobalFLATLoadPats <GLOBAL_LOAD_USHORT, atomic_load_aext_16_global, i32>;
@@ -2063,7 +2092,7 @@ defm : GlobalFLATLoadPats <GLOBAL_LOAD_USHORT, zextloadi16_global, i32>;
defm : GlobalFLATLoadPats <GLOBAL_LOAD_SSHORT, sextloadi16_global, i32>;
foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
-let OtherPredicates = [HasFlatGlobalInsts], True16Predicate = p in {
+let True16Predicate = p in {
defm : GlobalFLATLoadPats <GLOBAL_LOAD_UBYTE, extloadi8_global, i16>;
defm : GlobalFLATLoadPats <GLOBAL_LOAD_UBYTE, zextloadi8_global, i16>;
defm : GlobalFLATLoadPats <GLOBAL_LOAD_SBYTE, sextloadi8_global, i16>;
@@ -2077,7 +2106,7 @@ defm : GlobalFLATLoadPats <GLOBAL_LOAD_USHORT, atomic_load_nonext_16_global, i16
defm : GlobalFLATLoadPats <GLOBAL_LOAD_USHORT, atomic_load_zext_16_global, i16>;
}
-let OtherPredicates = [HasFlatGlobalInsts, D16PreservesUnusedBits], True16Predicate = UseRealTrue16Insts in {
+let OtherPredicates = [D16PreservesUnusedBits], True16Predicate = UseRealTrue16Insts in {
defm : GlobalFLATLoadPats_D16_t16<"GLOBAL_LOAD_UBYTE_D16", extloadi8_global, i16>;
defm : GlobalFLATLoadPats_D16_t16<"GLOBAL_LOAD_UBYTE_D16", zextloadi8_global, i16>;
defm : GlobalFLATLoadPats_D16_t16<"GLOBAL_LOAD_SBYTE_D16", sextloadi8_global, i16>;
@@ -2174,7 +2203,7 @@ defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_CMPSWAP", "AMDGPUatomic_cmp_swap_glo
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_XOR", "atomic_load_xor_global", i32>;
defm : GlobalFLATAtomicPatsRtn <"GLOBAL_ATOMIC_CSUB", "int_amdgcn_global_atomic_csub", i32, i32, /* isIntr */ 1>;
-let OtherPredicates = [HasAtomicCSubNoRtnInsts] in
+let SubtargetPredicate = HasAtomicCSubNoRtnInsts in
defm : GlobalFLATAtomicPatsNoRtn <"GLOBAL_ATOMIC_CSUB", "int_amdgcn_global_atomic_csub", i32, i32, /* isIntr */ 1>;
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_ADD_X2", "atomic_load_add_global", i64>;
@@ -2194,7 +2223,7 @@ defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_XOR_X2", "atomic_load_xor_global", i
let SubtargetPredicate = isGFX12Plus in {
defm : GlobalFLATAtomicPatsRtnWithAddrSpace <"GLOBAL_ATOMIC_COND_SUB_U32", "int_amdgcn_atomic_cond_sub_u32", "global_addrspace", i32>;
- let OtherPredicates = [HasAtomicCSubNoRtnInsts] in
+ let SubtargetPredicate = HasAtomicCSubNoRtnInsts in
defm : GlobalFLATAtomicPatsNoRtnWithAddrSpace <"GLOBAL_ATOMIC_COND_SUB_U32", "int_amdgcn_atomic_cond_sub_u32", "global_addrspace", i32>;
}
@@ -2249,62 +2278,38 @@ let OtherPredicates = [isGFX1250Plus] in {
defm : GlobalStoreLDSPats <GLOBAL_STORE_ASYNC_FROM_LDS_B128, int_amdgcn_global_store_async_from_lds_b128>;
}
-let SubtargetPredicate = HasAtomicFMinFMaxF32GlobalInsts, OtherPredicates = [HasFlatGlobalInsts] in {
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMIN", "atomic_load_fmin_global", f32>;
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_FMAX", "atomic_load_fmax_global", f32>;
-}
-
-let SubtargetPredicate = HasAtomicFMinFMaxF32FlatInsts in {
defm : FlatAtomicPat <"FLAT_ATOMIC_FMIN", "atomic_load_fmin_flat", f32>;
defm : FlatAtomicPat <"FLAT_ATOMIC_FMAX", "atomic_load_fmax_flat", f32>;
-}
-let OtherPredicates = [isGFX12Only] in {
- // FIXME: Remove these intrinsics
- defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_FMIN", "int_amdgcn_global_atomic_fmin_num", f32>;
- defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_FMAX", "int_amdgcn_global_atomic_fmax_num", f32>;
- defm : FlatAtomicIntrPat <"FLAT_ATOMIC_FMIN", "int_amdgcn_flat_atomic_fmin_num", f32>;
- defm : FlatAtomicIntrPat <"FLAT_ATOMIC_FMAX", "int_amdgcn_flat_atomic_fmax_num", f32>;
+// FIXME: Remove these intrinsics
+let SubtargetPredicate = isGFX12Only in {
+defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_FMIN", "int_amdgcn_global_atomic_fmin_num", f32>;
+defm : GlobalFLATAtomicIntrPats <"GLOBAL_ATOMIC_FMAX", "int_amdgcn_global_atomic_fmax_num", f32>;
+defm : FlatAtomicIntrPat <"FLAT_ATOMIC_FMIN", "int_amdgcn_flat_atomic_fmin_num", f32>;
+defm : FlatAtomicIntrPat <"FLAT_ATOMIC_FMAX", "int_amdgcn_flat_atomic_fmax_num", f32>;
}
-let OtherPredicates = [HasAtomicFaddNoRtnInsts] in {
defm : GlobalFLATAtomicPatsNoRtn <"GLOBAL_ATOMIC_ADD_F32", "atomic_load_fadd_global", f32>;
-}
-let OtherPredicates = [HasAtomicBufferGlobalPkAddF16NoRtnInsts] in {
defm : GlobalFLATAtomicPatsNoRtn <"GLOBAL_ATOMIC_PK_ADD_F16", "atomic_load_fadd_global", v2f16>;
-}
-let OtherPredicates = [HasAtomicFaddRtnInsts] in {
defm : GlobalFLATAtomicPatsRtn <"GLOBAL_ATOMIC_ADD_F32", "atomic_load_fadd_global", f32>;
-}
-let OtherPredicates = [HasAtomicBufferGlobalPkAddF16Insts] in {
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_PK_ADD_F16", "atomic_load_fadd_global", v2f16>;
-}
-let SubtargetPredicate = HasAtomicFMinFMaxF64GlobalInsts, OtherPredicates = [HasFlatGlobalInsts] in {
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_MIN_F64", "atomic_load_fmin_global", f64>;
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_MAX_F64", "atomic_load_fmax_global", f64>;
-}
-let OtherPredicates = [HasFlatBufferGlobalAtomicFaddF64Inst] in {
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_ADD_F64", "atomic_load_fadd_global", f64>;
defm : FlatAtomicPat <"FLAT_ATOMIC_ADD_F64", "atomic_load_fadd_flat", f64>;
-}
-let OtherPredicates = [HasFlatAtomicFaddF32Inst] in {
defm : FlatAtomicPat <"FLAT_ATOMIC_ADD_F32", "atomic_load_fadd_flat", f32>;
-}
-
-let OtherPredicates = [HasAtomicFlatPkAdd16Insts] in {
defm : FlatAtomicPat <"FLAT_ATOMIC_PK_ADD_F16", "atomic_load_fadd_flat", v2f16>;
defm : FlatAtomicPat <"FLAT_ATOMIC_PK_ADD_BF16", "atomic_load_fadd_flat", v2bf16>;
-}
-let OtherPredicates = [HasAtomicGlobalPkAddBF16Inst] in
defm : GlobalFLATAtomicPats <"GLOBAL_ATOMIC_PK_ADD_BF16", "atomic_load_fadd_global", v2bf16>;
-} // End OtherPredicates = [HasFlatGlobalInsts], AddedComplexity = 10
let OtherPredicates = [HasFlatScratchInsts, EnableFlatScratch] in {
diff --git a/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll b/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll
index 004d3c0c1cf53..3dedf008c917e 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll
@@ -1,8 +1,11 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-SDAG %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-GISEL %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-SDAG %s
+; RUN: llc -global-isel -global-isel-abort=2 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-GISEL %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx950 < %s | FileCheck -check-prefixes=GFX950,GFX950-SDAG %s
+; RUN: llc -global-isel -global-isel-abort=2 -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx950 < %s | FileCheck -check-prefixes=GFX950,GFX950-GISEL %s
-; Test using saddr addressing mode of flat_* atomic instructions.
+; Test using saddr addressing mode of flat_* atomic instructions. Make
+; sure these are not incorrectly selected before gfx1250.
define amdgpu_ps void @flat_xchg_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, i32 %data) {
; GFX1250-LABEL: flat_xchg_saddr_i32_nortn:
@@ -11,6 +14,29 @@ define amdgpu_ps void @flat_xchg_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-NEXT: s_endpgm
+;
+; GFX950-SDAG-LABEL: flat_xchg_saddr_i32_nortn:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
+; GFX950-SDAG-NEXT: buffer_wbl2 sc1
+; GFX950-SDAG-NEXT: flat_atomic_swap v[0:1], v2
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT: buffer_inv sc1
+; GFX950-SDAG-NEXT: s_endpgm
+;
+; GFX950-GISEL-LABEL: flat_xchg_saddr_i32_nortn:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX950-GISEL-NEXT: buffer_wbl2 sc1
+; GFX950-GISEL-NEXT: flat_atomic_swap v[2:3], v1
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT: buffer_inv sc1
+; GFX950-GISEL-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
%unused = atomicrmw xchg ptr %gep0, i32 %data syncscope("agent") seq_cst
@@ -25,6 +51,29 @@ define amdgpu_ps void @flat_xchg_saddr_i32_nortn_offset_2047(ptr inreg %sbase, i
; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-NEXT: s_endpgm
+;
+; GFX950-SDAG-LABEL: flat_xchg_saddr_i32_nortn_offset_2047:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
+; GFX950-SDAG-NEXT: buffer_wbl2 sc1
+; GFX950-SDAG-NEXT: flat_atomic_swap v[0:1], v2 offset:2047
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT: buffer_inv sc1
+; GFX950-SDAG-NEXT: s_endpgm
+;
+; GFX950-GISEL-LABEL: flat_xchg_saddr_i32_nortn_offset_2047:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX950-GISEL-NEXT: buffer_wbl2 sc1
+; GFX950-GISEL-NEXT: flat_atomic_swap v[2:3], v1 offset:2047
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT: buffer_inv sc1
+; GFX950-GISEL-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr %gep0, i64 2047
@@ -40,6 +89,35 @@ define amdgpu_ps void @flat_xchg_saddr_i32_nortn_offset_neg2048(ptr inreg %sbase
; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-NEXT: s_endpgm
+;
+; GFX950-SDAG-LABEL: flat_xchg_saddr_i32_nortn_offset_neg2048:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
+; GFX950-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
+; GFX950-SDAG-NEXT: s_nop 1
+; GFX950-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX950-SDAG-NEXT: buffer_wbl2 sc1
+; GFX950-SDAG-NEXT: flat_atomic_swap v[0:1], v2
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT: buffer_inv sc1
+; GFX950-SDAG-NEXT: s_endpgm
+;
+; GFX950-GISEL-LABEL: flat_xchg_saddr_i32_nortn_offset_neg2048:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 0xfffff800, v0
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc
+; GFX950-GISEL-NEXT: buffer_wbl2 sc1
+; GFX950-GISEL-NEXT: flat_atomic_swap v[2:3], v1
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT: buffer_inv sc1
+; GFX950-GISEL-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr %gep0, i64 -2048
@@ -55,6 +133,29 @@ define amdgpu_ps float @flat_xchg_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset,
; GFX1250-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: ; return to shader part epilog
+;
+; GFX950-SDAG-LABEL: flat_xchg_saddr_i32_rtn:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
+; GFX950-SDAG-NEXT: buffer_wbl2 sc1
+; GFX950-SDAG-NEXT: flat_atomic_swap v0, v[0:1], v2 sc0
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT: buffer_inv sc1
+; GFX950-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX950-GISEL-LABEL: flat_xchg_saddr_i32_rtn:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX950-GISEL-NEXT: buffer_wbl2 sc1
+; GFX950-GISEL-NEXT: flat_atomic_swap v0, v[2:3], v1 sc0
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT: buffer_inv sc1
+; GFX950-GISEL-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
%rtn = atomicrmw xchg ptr %gep0, i32 %data syncscope("agent") seq_cst
@@ -70,6 +171,29 @@ define amdgpu_ps float @flat_xchg_saddr_i32_rtn_2048(ptr inreg %sbase, i32 %voff
; GFX1250-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: ; return to shader part epilog
+;
+; GFX950-SDAG-LABEL: flat_xchg_saddr_i32_rtn_2048:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
+; GFX950-SDAG-NEXT: buffer_wbl2 sc1
+; GFX950-SDAG-NEXT: flat_atomic_swap v0, v[0:1], v2 offset:2048 sc0
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT: buffer_inv sc1
+; GFX950-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX950-GISEL-LABEL: flat_xchg_saddr_i32_rtn_2048:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX950-GISEL-NEXT: buffer_wbl2 sc1
+; GFX950-GISEL-NEXT: flat_atomic_swap v0, v[2:3], v1 offset:2048 sc0
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT: buffer_inv sc1
+; GFX950-GISEL-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr %gep0, i64 2048
@@ -86,6 +210,35 @@ define amdgpu_ps float @flat_xchg_saddr_i32_rtn_neg2048(ptr inreg %sbase, i32 %v
; GFX1250-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: ; return to shader part epilog
+;
+; GFX950-SDAG-LABEL: flat_xchg_saddr_i32_rtn_neg2048:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
+; GFX950-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffff800, v0
+; GFX950-SDAG-NEXT: s_nop 1
+; GFX950-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX950-SDAG-NEXT: buffer_wbl2 sc1
+; GFX950-SDAG-NEXT: flat_atomic_swap v0, v[0:1], v2 sc0
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT: buffer_inv sc1
+; GFX950-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX950-GISEL-LABEL: flat_xchg_saddr_i32_rtn_neg2048:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 0xfffff800, v0
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc
+; GFX950-GISEL-NEXT: buffer_wbl2 sc1
+; GFX950-GISEL-NEXT: flat_atomic_swap v0, v[2:3], v1 sc0
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT: buffer_inv sc1
+; GFX950-GISEL-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr %gep0, i64 -2048
@@ -128,6 +281,33 @@ define amdgpu_ps float @flat_xchg_saddr_uniform_ptr_in_vgprs_rtn(i32 %voffset, i
; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: ; return to shader part epilog
+;
+; GFX950-SDAG-LABEL: flat_xchg_saddr_uniform_ptr_in_vgprs_rtn:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX950-SDAG-NEXT: ds_read_b64 v[4:5], v1
+; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[4:5], 0, v[0:1]
+; GFX950-SDAG-NEXT: buffer_wbl2 sc1
+; GFX950-SDAG-NEXT: flat_atomic_swap v0, v[0:1], v2 sc0
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT: buffer_inv sc1
+; GFX950-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX950-GISEL-LABEL: flat_xchg_saddr_uniform_ptr_in_vgprs_rtn:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX950-GISEL-NEXT: ds_read_b64 v[2:3], v2
+; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX950-GISEL-NEXT: buffer_wbl2 sc1
+; GFX950-GISEL-NEXT: flat_atomic_swap v0, v[2:3], v1 sc0
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT: buffer_inv sc1
+; GFX950-GISEL-NEXT: ; return to shader part epilog
%sbase = load ptr, ptr addrspace(3) @ptr.in.lds
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
@@ -164,6 +344,33 @@ define amdgpu_ps float @flat_xchg_saddr_uniform_ptr_in_vgprs_rtn_immoffset(i32 %
; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: ; return to shader part epilog
+;
+; GFX950-SDAG-LABEL: flat_xchg_saddr_uniform_ptr_in_vgprs_rtn_immoffset:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX950-SDAG-NEXT: ds_read_b64 v[4:5], v1
+; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[4:5], 0, v[0:1]
+; GFX950-SDAG-NEXT: buffer_wbl2 sc1
+; GFX950-SDAG-NEXT: flat_atomic_swap v0, v[0:1], v2 offset:42 sc0
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT: buffer_inv sc1
+; GFX950-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX950-GISEL-LABEL: flat_xchg_saddr_uniform_ptr_in_vgprs_rtn_immoffset:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX950-GISEL-NEXT: ds_read_b64 v[2:3], v2
+; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX950-GISEL-NEXT: buffer_wbl2 sc1
+; GFX950-GISEL-NEXT: flat_atomic_swap v0, v[2:3], v1 offset:42 sc0
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT: buffer_inv sc1
+; GFX950-GISEL-NEXT: ; return to shader part epilog
%sbase = load ptr, ptr addrspace(3) @ptr.in.lds
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
@@ -199,6 +406,33 @@ define amdgpu_ps void @flat_xchg_saddr_uniform_ptr_in_vgprs_nortn(i32 %voffset,
; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-GISEL-NEXT: s_endpgm
+;
+; GFX950-SDAG-LABEL: flat_xchg_saddr_uniform_ptr_in_vgprs_nortn:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX950-SDAG-NEXT: ds_read_b64 v[4:5], v1
+; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[4:5], 0, v[0:1]
+; GFX950-SDAG-NEXT: buffer_wbl2 sc1
+; GFX950-SDAG-NEXT: flat_atomic_swap v[0:1], v2
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT: buffer_inv sc1
+; GFX950-SDAG-NEXT: s_endpgm
+;
+; GFX950-GISEL-LABEL: flat_xchg_saddr_uniform_ptr_in_vgprs_nortn:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX950-GISEL-NEXT: ds_read_b64 v[2:3], v2
+; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX950-GISEL-NEXT: buffer_wbl2 sc1
+; GFX950-GISEL-NEXT: flat_atomic_swap v[2:3], v1
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT: buffer_inv sc1
+; GFX950-GISEL-NEXT: s_endpgm
%sbase = load ptr, ptr addrspace(3) @ptr.in.lds
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
@@ -232,6 +466,33 @@ define amdgpu_ps void @flat_xchg_saddr_uniform_ptr_in_vgprs_nortn_immoffset(i32
; GFX1250-GISEL-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-GISEL-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-GISEL-NEXT: s_endpgm
+;
+; GFX950-SDAG-LABEL: flat_xchg_saddr_uniform_ptr_in_vgprs_nortn_immoffset:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX950-SDAG-NEXT: ds_read_b64 v[4:5], v1
+; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[4:5], 0, v[0:1]
+; GFX950-SDAG-NEXT: buffer_wbl2 sc1
+; GFX950-SDAG-NEXT: flat_atomic_swap v[0:1], v2 offset:42
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT: buffer_inv sc1
+; GFX950-SDAG-NEXT: s_endpgm
+;
+; GFX950-GISEL-LABEL: flat_xchg_saddr_uniform_ptr_in_vgprs_nortn_immoffset:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX950-GISEL-NEXT: ds_read_b64 v[2:3], v2
+; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX950-GISEL-NEXT: buffer_wbl2 sc1
+; GFX950-GISEL-NEXT: flat_atomic_swap v[2:3], v1 offset:42
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT: buffer_inv sc1
+; GFX950-GISEL-NEXT: s_endpgm
%sbase = load ptr, ptr addrspace(3) @ptr.in.lds
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
@@ -338,6 +599,86 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn(ptr inreg %sbase, i32 %vof
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: s_branch .LBB10_5
; GFX1250-GISEL-NEXT: .LBB10_5:
+;
+; GFX950-SDAG-LABEL: flat_xchg_saddr_i64_rtn:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1]
+; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
+; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5
+; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB10_3
+; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow
+; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB10_4
+; GFX950-SDAG-NEXT: .LBB10_2: ; %atomicrmw.phi
+; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-SDAG-NEXT: s_branch .LBB10_5
+; GFX950-SDAG-NEXT: .LBB10_3: ; %atomicrmw.global
+; GFX950-SDAG-NEXT: buffer_wbl2 sc1
+; GFX950-SDAG-NEXT: flat_atomic_swap_x2 v[0:1], v[4:5], v[2:3] sc0
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT: buffer_inv sc1
+; GFX950-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execz .LBB10_2
+; GFX950-SDAG-NEXT: .LBB10_4: ; %atomicrmw.private
+; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; GFX950-SDAG-NEXT: s_nop 1
+; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
+; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off
+; GFX950-SDAG-NEXT: s_nop 0
+; GFX950-SDAG-NEXT: scratch_store_dwordx2 v4, v[2:3], off
+; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX950-SDAG-NEXT: s_branch .LBB10_5
+; GFX950-SDAG-NEXT: .LBB10_5:
+;
+; GFX950-GISEL-LABEL: flat_xchg_saddr_i64_rtn:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0
+; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
+; GFX950-GISEL-NEXT: s_nop 0
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1
+; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3
+; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB10_3
+; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow
+; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB10_4
+; GFX950-GISEL-NEXT: .LBB10_2: ; %atomicrmw.phi
+; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-GISEL-NEXT: s_branch .LBB10_5
+; GFX950-GISEL-NEXT: .LBB10_3: ; %atomicrmw.global
+; GFX950-GISEL-NEXT: buffer_wbl2 sc1
+; GFX950-GISEL-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[4:5] sc0
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT: buffer_inv sc1
+; GFX950-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execz .LBB10_2
+; GFX950-GISEL-NEXT: .LBB10_4: ; %atomicrmw.private
+; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc
+; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v2, off
+; GFX950-GISEL-NEXT: s_nop 0
+; GFX950-GISEL-NEXT: scratch_store_dwordx2 v2, v[4:5], off
+; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX950-GISEL-NEXT: s_branch .LBB10_5
+; GFX950-GISEL-NEXT: .LBB10_5:
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
%rtn = atomicrmw xchg ptr %gep0, i64 %data syncscope("agent") seq_cst
@@ -441,6 +782,92 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn_neg128(ptr inreg %sbase, i
; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
; GFX1250-GISEL-NEXT: s_branch .LBB11_5
; GFX1250-GISEL-NEXT: .LBB11_5:
+;
+; GFX950-SDAG-LABEL: flat_xchg_saddr_i64_rtn_neg128:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX950-SDAG-NEXT: s_movk_i32 s0, 0xff80
+; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
+; GFX950-SDAG-NEXT: s_mov_b32 s1, -1
+; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
+; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
+; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5
+; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB11_3
+; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow
+; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB11_4
+; GFX950-SDAG-NEXT: .LBB11_2: ; %atomicrmw.phi
+; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-SDAG-NEXT: s_branch .LBB11_5
+; GFX950-SDAG-NEXT: .LBB11_3: ; %atomicrmw.global
+; GFX950-SDAG-NEXT: buffer_wbl2 sc1
+; GFX950-SDAG-NEXT: flat_atomic_swap_x2 v[0:1], v[4:5], v[2:3] sc0
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT: buffer_inv sc1
+; GFX950-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execz .LBB11_2
+; GFX950-SDAG-NEXT: .LBB11_4: ; %atomicrmw.private
+; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; GFX950-SDAG-NEXT: s_nop 1
+; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
+; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off
+; GFX950-SDAG-NEXT: s_nop 0
+; GFX950-SDAG-NEXT: scratch_store_dwordx2 v4, v[2:3], off
+; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX950-SDAG-NEXT: s_branch .LBB11_5
+; GFX950-SDAG-NEXT: .LBB11_5:
+;
+; GFX950-GISEL-LABEL: flat_xchg_saddr_i64_rtn_neg128:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1
+; GFX950-GISEL-NEXT: s_nop 0
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 0xffffff80, v0
+; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
+; GFX950-GISEL-NEXT: s_nop 0
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v1, vcc
+; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3
+; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB11_3
+; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow
+; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB11_4
+; GFX950-GISEL-NEXT: .LBB11_2: ; %atomicrmw.phi
+; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-GISEL-NEXT: s_branch .LBB11_5
+; GFX950-GISEL-NEXT: .LBB11_3: ; %atomicrmw.global
+; GFX950-GISEL-NEXT: buffer_wbl2 sc1
+; GFX950-GISEL-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[4:5] sc0
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT: buffer_inv sc1
+; GFX950-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execz .LBB11_2
+; GFX950-GISEL-NEXT: .LBB11_4: ; %atomicrmw.private
+; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc
+; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v2, off
+; GFX950-GISEL-NEXT: s_nop 0
+; GFX950-GISEL-NEXT: scratch_store_dwordx2 v2, v[4:5], off
+; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX950-GISEL-NEXT: s_branch .LBB11_5
+; GFX950-GISEL-NEXT: .LBB11_5:
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128
@@ -522,6 +949,72 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_store_b64 v0, v[4:5], off scope:SCOPE_SE
; GFX1250-GISEL-NEXT: s_endpgm
+;
+; GFX950-SDAG-LABEL: flat_xchg_saddr_i64_nortn:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
+; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
+; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
+; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB12_3
+; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow
+; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB12_4
+; GFX950-SDAG-NEXT: .LBB12_2: ; %atomicrmw.phi
+; GFX950-SDAG-NEXT: s_endpgm
+; GFX950-SDAG-NEXT: .LBB12_3: ; %atomicrmw.global
+; GFX950-SDAG-NEXT: buffer_wbl2 sc1
+; GFX950-SDAG-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3]
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT: buffer_inv sc1
+; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execz .LBB12_2
+; GFX950-SDAG-NEXT: .LBB12_4: ; %atomicrmw.private
+; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GFX950-SDAG-NEXT: s_nop 1
+; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
+; GFX950-SDAG-NEXT: scratch_store_dwordx2 v0, v[2:3], off
+; GFX950-SDAG-NEXT: s_endpgm
+;
+; GFX950-GISEL-LABEL: flat_xchg_saddr_i64_nortn:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1
+; GFX950-GISEL-NEXT: s_nop 0
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
+; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
+; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
+; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB12_3
+; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow
+; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB12_4
+; GFX950-GISEL-NEXT: .LBB12_2: ; %atomicrmw.phi
+; GFX950-GISEL-NEXT: s_endpgm
+; GFX950-GISEL-NEXT: .LBB12_3: ; %atomicrmw.global
+; GFX950-GISEL-NEXT: buffer_wbl2 sc1
+; GFX950-GISEL-NEXT: flat_atomic_swap_x2 v[0:1], v[4:5]
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT: buffer_inv sc1
+; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execz .LBB12_2
+; GFX950-GISEL-NEXT: .LBB12_4: ; %atomicrmw.private
+; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
+; GFX950-GISEL-NEXT: scratch_store_dwordx2 v0, v[4:5], off
+; GFX950-GISEL-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
%unused = atomicrmw xchg ptr %gep0, i64 %data syncscope("agent") seq_cst
@@ -607,6 +1100,78 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v
; GFX1250-GISEL-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_store_b64 v0, v[4:5], off scope:SCOPE_SE
; GFX1250-GISEL-NEXT: s_endpgm
+;
+; GFX950-SDAG-LABEL: flat_xchg_saddr_i64_nortn_neg128:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX950-SDAG-NEXT: s_movk_i32 s0, 0xff80
+; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
+; GFX950-SDAG-NEXT: s_mov_b32 s1, -1
+; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
+; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
+; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
+; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB13_3
+; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow
+; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB13_4
+; GFX950-SDAG-NEXT: .LBB13_2: ; %atomicrmw.phi
+; GFX950-SDAG-NEXT: s_endpgm
+; GFX950-SDAG-NEXT: .LBB13_3: ; %atomicrmw.global
+; GFX950-SDAG-NEXT: buffer_wbl2 sc1
+; GFX950-SDAG-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3]
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT: buffer_inv sc1
+; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execz .LBB13_2
+; GFX950-SDAG-NEXT: .LBB13_4: ; %atomicrmw.private
+; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GFX950-SDAG-NEXT: s_nop 1
+; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
+; GFX950-SDAG-NEXT: scratch_store_dwordx2 v0, v[2:3], off
+; GFX950-SDAG-NEXT: s_endpgm
+;
+; GFX950-GISEL-LABEL: flat_xchg_saddr_i64_nortn_neg128:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1
+; GFX950-GISEL-NEXT: s_nop 0
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0
+; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
+; GFX950-GISEL-NEXT: s_nop 0
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
+; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB13_3
+; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow
+; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB13_4
+; GFX950-GISEL-NEXT: .LBB13_2: ; %atomicrmw.phi
+; GFX950-GISEL-NEXT: s_endpgm
+; GFX950-GISEL-NEXT: .LBB13_3: ; %atomicrmw.global
+; GFX950-GISEL-NEXT: buffer_wbl2 sc1
+; GFX950-GISEL-NEXT: flat_atomic_swap_x2 v[0:1], v[4:5]
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT: buffer_inv sc1
+; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execz .LBB13_2
+; GFX950-GISEL-NEXT: .LBB13_4: ; %atomicrmw.private
+; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
+; GFX950-GISEL-NEXT: scratch_store_dwordx2 v0, v[4:5], off
+; GFX950-GISEL-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128
@@ -626,6 +1191,29 @@ define amdgpu_ps float @flat_add_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i
; GFX1250-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: ; return to shader part epilog
+;
+; GFX950-SDAG-LABEL: flat_add_saddr_i32_rtn:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
+; GFX950-SDAG-NEXT: buffer_wbl2 sc1
+; GFX950-SDAG-NEXT: flat_atomic_add v0, v[0:1], v2 sc0
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT: buffer_inv sc1
+; GFX950-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX950-GISEL-LABEL: flat_add_saddr_i32_rtn:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX950-GISEL-NEXT: buffer_wbl2 sc1
+; GFX950-GISEL-NEXT: flat_atomic_add v0, v[2:3], v1 sc0
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT: buffer_inv sc1
+; GFX950-GISEL-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
%rtn = atomicrmw add ptr %gep0, i32 %data syncscope("agent") seq_cst
@@ -641,6 +1229,35 @@ define amdgpu_ps float @flat_add_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %vof
; GFX1250-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: ; return to shader part epilog
+;
+; GFX950-SDAG-LABEL: flat_add_saddr_i32_rtn_neg128:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
+; GFX950-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0
+; GFX950-SDAG-NEXT: s_nop 1
+; GFX950-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX950-SDAG-NEXT: buffer_wbl2 sc1
+; GFX950-SDAG-NEXT: flat_atomic_add v0, v[0:1], v2 sc0
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT: buffer_inv sc1
+; GFX950-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX950-GISEL-LABEL: flat_add_saddr_i32_rtn_neg128:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 0xffffff80, v0
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc
+; GFX950-GISEL-NEXT: buffer_wbl2 sc1
+; GFX950-GISEL-NEXT: flat_atomic_add v0, v[2:3], v1 sc0
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT: buffer_inv sc1
+; GFX950-GISEL-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128
@@ -656,6 +1273,29 @@ define amdgpu_ps void @flat_add_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-NEXT: s_endpgm
+;
+; GFX950-SDAG-LABEL: flat_add_saddr_i32_nortn:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
+; GFX950-SDAG-NEXT: buffer_wbl2 sc1
+; GFX950-SDAG-NEXT: flat_atomic_add v[0:1], v2
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT: buffer_inv sc1
+; GFX950-SDAG-NEXT: s_endpgm
+;
+; GFX950-GISEL-LABEL: flat_add_saddr_i32_nortn:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX950-GISEL-NEXT: buffer_wbl2 sc1
+; GFX950-GISEL-NEXT: flat_atomic_add v[2:3], v1
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT: buffer_inv sc1
+; GFX950-GISEL-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
%unused = atomicrmw add ptr %gep0, i32 %data syncscope("agent") seq_cst
@@ -669,6 +1309,35 @@ define amdgpu_ps void @flat_add_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-NEXT: s_endpgm
+;
+; GFX950-SDAG-LABEL: flat_add_saddr_i32_nortn_neg128:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
+; GFX950-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0
+; GFX950-SDAG-NEXT: s_nop 1
+; GFX950-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX950-SDAG-NEXT: buffer_wbl2 sc1
+; GFX950-SDAG-NEXT: flat_atomic_add v[0:1], v2
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT: buffer_inv sc1
+; GFX950-SDAG-NEXT: s_endpgm
+;
+; GFX950-GISEL-LABEL: flat_add_saddr_i32_nortn_neg128:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 0xffffff80, v0
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc
+; GFX950-GISEL-NEXT: buffer_wbl2 sc1
+; GFX950-GISEL-NEXT: flat_atomic_add v[2:3], v1
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT: buffer_inv sc1
+; GFX950-GISEL-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128
@@ -766,6 +1435,90 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-GISEL-NEXT: s_branch .LBB18_5
; GFX1250-GISEL-NEXT: .LBB18_5:
+;
+; GFX950-SDAG-LABEL: flat_add_saddr_i64_rtn:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1]
+; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
+; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5
+; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB18_3
+; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow
+; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB18_4
+; GFX950-SDAG-NEXT: .LBB18_2: ; %atomicrmw.phi
+; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-SDAG-NEXT: s_branch .LBB18_5
+; GFX950-SDAG-NEXT: .LBB18_3: ; %atomicrmw.global
+; GFX950-SDAG-NEXT: buffer_wbl2 sc1
+; GFX950-SDAG-NEXT: flat_atomic_add_x2 v[0:1], v[4:5], v[2:3] sc0
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT: buffer_inv sc1
+; GFX950-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execz .LBB18_2
+; GFX950-SDAG-NEXT: .LBB18_4: ; %atomicrmw.private
+; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; GFX950-SDAG-NEXT: s_nop 1
+; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
+; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX950-SDAG-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, v[2:3]
+; GFX950-SDAG-NEXT: scratch_store_dwordx2 v4, v[2:3], off
+; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX950-SDAG-NEXT: s_branch .LBB18_5
+; GFX950-SDAG-NEXT: .LBB18_5:
+;
+; GFX950-GISEL-LABEL: flat_add_saddr_i64_rtn:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0
+; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
+; GFX950-GISEL-NEXT: s_nop 0
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1
+; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3
+; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB18_3
+; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow
+; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB18_4
+; GFX950-GISEL-NEXT: .LBB18_2: ; %atomicrmw.phi
+; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-GISEL-NEXT: s_branch .LBB18_5
+; GFX950-GISEL-NEXT: .LBB18_3: ; %atomicrmw.global
+; GFX950-GISEL-NEXT: buffer_wbl2 sc1
+; GFX950-GISEL-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[4:5] sc0
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT: buffer_inv sc1
+; GFX950-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4
+; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execz .LBB18_2
+; GFX950-GISEL-NEXT: .LBB18_4: ; %atomicrmw.private
+; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc
+; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v6, off
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v0, v4
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, v1, v5, vcc
+; GFX950-GISEL-NEXT: scratch_store_dwordx2 v6, v[2:3], off
+; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX950-GISEL-NEXT: s_branch .LBB18_5
+; GFX950-GISEL-NEXT: .LBB18_5:
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
%rtn = atomicrmw add ptr %gep0, i64 %data syncscope("agent") seq_cst
@@ -869,6 +1622,96 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-GISEL-NEXT: s_branch .LBB19_5
; GFX1250-GISEL-NEXT: .LBB19_5:
+;
+; GFX950-SDAG-LABEL: flat_add_saddr_i64_rtn_neg128:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX950-SDAG-NEXT: s_movk_i32 s0, 0xff80
+; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
+; GFX950-SDAG-NEXT: s_mov_b32 s1, -1
+; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
+; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
+; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5
+; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB19_3
+; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow
+; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB19_4
+; GFX950-SDAG-NEXT: .LBB19_2: ; %atomicrmw.phi
+; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-SDAG-NEXT: s_branch .LBB19_5
+; GFX950-SDAG-NEXT: .LBB19_3: ; %atomicrmw.global
+; GFX950-SDAG-NEXT: buffer_wbl2 sc1
+; GFX950-SDAG-NEXT: flat_atomic_add_x2 v[0:1], v[4:5], v[2:3] sc0
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT: buffer_inv sc1
+; GFX950-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execz .LBB19_2
+; GFX950-SDAG-NEXT: .LBB19_4: ; %atomicrmw.private
+; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; GFX950-SDAG-NEXT: s_nop 1
+; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
+; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX950-SDAG-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, v[2:3]
+; GFX950-SDAG-NEXT: scratch_store_dwordx2 v4, v[2:3], off
+; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX950-SDAG-NEXT: s_branch .LBB19_5
+; GFX950-SDAG-NEXT: .LBB19_5:
+;
+; GFX950-GISEL-LABEL: flat_add_saddr_i64_rtn_neg128:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1
+; GFX950-GISEL-NEXT: s_nop 0
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 0xffffff80, v0
+; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
+; GFX950-GISEL-NEXT: s_nop 0
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v1, vcc
+; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3
+; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB19_3
+; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow
+; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB19_4
+; GFX950-GISEL-NEXT: .LBB19_2: ; %atomicrmw.phi
+; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-GISEL-NEXT: s_branch .LBB19_5
+; GFX950-GISEL-NEXT: .LBB19_3: ; %atomicrmw.global
+; GFX950-GISEL-NEXT: buffer_wbl2 sc1
+; GFX950-GISEL-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[4:5] sc0
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT: buffer_inv sc1
+; GFX950-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4
+; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execz .LBB19_2
+; GFX950-GISEL-NEXT: .LBB19_4: ; %atomicrmw.private
+; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc
+; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v6, off
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v0, v4
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, v1, v5, vcc
+; GFX950-GISEL-NEXT: scratch_store_dwordx2 v6, v[2:3], off
+; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX950-GISEL-NEXT: s_branch .LBB19_5
+; GFX950-GISEL-NEXT: .LBB19_5:
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128
@@ -956,6 +1799,80 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-GISEL-NEXT: v_add_nc_u64_e32 v[0:1], v[0:1], v[4:5]
; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[0:1], off scope:SCOPE_SE
; GFX1250-GISEL-NEXT: s_endpgm
+;
+; GFX950-SDAG-LABEL: flat_add_saddr_i64_nortn:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
+; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
+; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
+; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB20_3
+; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow
+; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB20_4
+; GFX950-SDAG-NEXT: .LBB20_2: ; %atomicrmw.phi
+; GFX950-SDAG-NEXT: s_endpgm
+; GFX950-SDAG-NEXT: .LBB20_3: ; %atomicrmw.global
+; GFX950-SDAG-NEXT: buffer_wbl2 sc1
+; GFX950-SDAG-NEXT: flat_atomic_add_x2 v[0:1], v[2:3]
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT: buffer_inv sc1
+; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execz .LBB20_2
+; GFX950-SDAG-NEXT: .LBB20_4: ; %atomicrmw.private
+; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GFX950-SDAG-NEXT: s_nop 1
+; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
+; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
+; GFX950-SDAG-NEXT: scratch_store_dwordx2 v4, v[0:1], off
+; GFX950-SDAG-NEXT: s_endpgm
+;
+; GFX950-GISEL-LABEL: flat_add_saddr_i64_nortn:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1
+; GFX950-GISEL-NEXT: s_nop 0
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
+; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
+; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
+; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB20_3
+; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow
+; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB20_4
+; GFX950-GISEL-NEXT: .LBB20_2: ; %atomicrmw.phi
+; GFX950-GISEL-NEXT: s_endpgm
+; GFX950-GISEL-NEXT: .LBB20_3: ; %atomicrmw.global
+; GFX950-GISEL-NEXT: buffer_wbl2 sc1
+; GFX950-GISEL-NEXT: flat_atomic_add_x2 v[0:1], v[4:5]
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT: buffer_inv sc1
+; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4
+; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execz .LBB20_2
+; GFX950-GISEL-NEXT: .LBB20_4: ; %atomicrmw.private
+; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc
+; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v2, off
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v5, vcc
+; GFX950-GISEL-NEXT: scratch_store_dwordx2 v2, v[0:1], off
+; GFX950-GISEL-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
%unused = atomicrmw add ptr %gep0, i64 %data syncscope("agent") seq_cst
@@ -1047,6 +1964,86 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-GISEL-NEXT: v_add_nc_u64_e32 v[0:1], v[0:1], v[4:5]
; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[0:1], off scope:SCOPE_SE
; GFX1250-GISEL-NEXT: s_endpgm
+;
+; GFX950-SDAG-LABEL: flat_add_saddr_i64_nortn_neg128:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX950-SDAG-NEXT: s_movk_i32 s0, 0xff80
+; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
+; GFX950-SDAG-NEXT: s_mov_b32 s1, -1
+; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
+; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
+; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
+; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB21_3
+; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow
+; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB21_4
+; GFX950-SDAG-NEXT: .LBB21_2: ; %atomicrmw.phi
+; GFX950-SDAG-NEXT: s_endpgm
+; GFX950-SDAG-NEXT: .LBB21_3: ; %atomicrmw.global
+; GFX950-SDAG-NEXT: buffer_wbl2 sc1
+; GFX950-SDAG-NEXT: flat_atomic_add_x2 v[0:1], v[2:3]
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT: buffer_inv sc1
+; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execz .LBB21_2
+; GFX950-SDAG-NEXT: .LBB21_4: ; %atomicrmw.private
+; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GFX950-SDAG-NEXT: s_nop 1
+; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
+; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
+; GFX950-SDAG-NEXT: scratch_store_dwordx2 v4, v[0:1], off
+; GFX950-SDAG-NEXT: s_endpgm
+;
+; GFX950-GISEL-LABEL: flat_add_saddr_i64_nortn_neg128:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1
+; GFX950-GISEL-NEXT: s_nop 0
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0
+; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
+; GFX950-GISEL-NEXT: s_nop 0
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
+; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB21_3
+; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow
+; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB21_4
+; GFX950-GISEL-NEXT: .LBB21_2: ; %atomicrmw.phi
+; GFX950-GISEL-NEXT: s_endpgm
+; GFX950-GISEL-NEXT: .LBB21_3: ; %atomicrmw.global
+; GFX950-GISEL-NEXT: buffer_wbl2 sc1
+; GFX950-GISEL-NEXT: flat_atomic_add_x2 v[0:1], v[4:5]
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT: buffer_inv sc1
+; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4
+; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execz .LBB21_2
+; GFX950-GISEL-NEXT: .LBB21_4: ; %atomicrmw.private
+; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc
+; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v2, off
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v5, vcc
+; GFX950-GISEL-NEXT: scratch_store_dwordx2 v2, v[0:1], off
+; GFX950-GISEL-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128
@@ -1066,6 +2063,29 @@ define amdgpu_ps float @flat_sub_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i
; GFX1250-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: ; return to shader part epilog
+;
+; GFX950-SDAG-LABEL: flat_sub_saddr_i32_rtn:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
+; GFX950-SDAG-NEXT: buffer_wbl2 sc1
+; GFX950-SDAG-NEXT: flat_atomic_sub v0, v[0:1], v2 sc0
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT: buffer_inv sc1
+; GFX950-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX950-GISEL-LABEL: flat_sub_saddr_i32_rtn:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX950-GISEL-NEXT: buffer_wbl2 sc1
+; GFX950-GISEL-NEXT: flat_atomic_sub v0, v[2:3], v1 sc0
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT: buffer_inv sc1
+; GFX950-GISEL-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
%rtn = atomicrmw sub ptr %gep0, i32 %data syncscope("agent") seq_cst
@@ -1081,6 +2101,35 @@ define amdgpu_ps float @flat_sub_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %vof
; GFX1250-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: ; return to shader part epilog
+;
+; GFX950-SDAG-LABEL: flat_sub_saddr_i32_rtn_neg128:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
+; GFX950-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0
+; GFX950-SDAG-NEXT: s_nop 1
+; GFX950-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX950-SDAG-NEXT: buffer_wbl2 sc1
+; GFX950-SDAG-NEXT: flat_atomic_sub v0, v[0:1], v2 sc0
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT: buffer_inv sc1
+; GFX950-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX950-GISEL-LABEL: flat_sub_saddr_i32_rtn_neg128:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 0xffffff80, v0
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc
+; GFX950-GISEL-NEXT: buffer_wbl2 sc1
+; GFX950-GISEL-NEXT: flat_atomic_sub v0, v[2:3], v1 sc0
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT: buffer_inv sc1
+; GFX950-GISEL-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128
@@ -1096,6 +2145,29 @@ define amdgpu_ps void @flat_sub_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-NEXT: s_endpgm
+;
+; GFX950-SDAG-LABEL: flat_sub_saddr_i32_nortn:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
+; GFX950-SDAG-NEXT: buffer_wbl2 sc1
+; GFX950-SDAG-NEXT: flat_atomic_sub v[0:1], v2
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT: buffer_inv sc1
+; GFX950-SDAG-NEXT: s_endpgm
+;
+; GFX950-GISEL-LABEL: flat_sub_saddr_i32_nortn:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX950-GISEL-NEXT: buffer_wbl2 sc1
+; GFX950-GISEL-NEXT: flat_atomic_sub v[2:3], v1
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT: buffer_inv sc1
+; GFX950-GISEL-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
%unused = atomicrmw sub ptr %gep0, i32 %data syncscope("agent") seq_cst
@@ -1109,6 +2181,35 @@ define amdgpu_ps void @flat_sub_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-NEXT: s_endpgm
+;
+; GFX950-SDAG-LABEL: flat_sub_saddr_i32_nortn_neg128:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
+; GFX950-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0
+; GFX950-SDAG-NEXT: s_nop 1
+; GFX950-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX950-SDAG-NEXT: buffer_wbl2 sc1
+; GFX950-SDAG-NEXT: flat_atomic_sub v[0:1], v2
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT: buffer_inv sc1
+; GFX950-SDAG-NEXT: s_endpgm
+;
+; GFX950-GISEL-LABEL: flat_sub_saddr_i32_nortn_neg128:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 0xffffff80, v0
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc
+; GFX950-GISEL-NEXT: buffer_wbl2 sc1
+; GFX950-GISEL-NEXT: flat_atomic_sub v[2:3], v1
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT: buffer_inv sc1
+; GFX950-GISEL-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128
@@ -1206,6 +2307,92 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-GISEL-NEXT: s_branch .LBB26_5
; GFX1250-GISEL-NEXT: .LBB26_5:
+;
+; GFX950-SDAG-LABEL: flat_sub_saddr_i64_rtn:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1]
+; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
+; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5
+; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB26_3
+; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow
+; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB26_4
+; GFX950-SDAG-NEXT: .LBB26_2: ; %atomicrmw.phi
+; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-SDAG-NEXT: s_branch .LBB26_5
+; GFX950-SDAG-NEXT: .LBB26_3: ; %atomicrmw.global
+; GFX950-SDAG-NEXT: buffer_wbl2 sc1
+; GFX950-SDAG-NEXT: flat_atomic_sub_x2 v[0:1], v[4:5], v[2:3] sc0
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT: buffer_inv sc1
+; GFX950-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2
+; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execz .LBB26_2
+; GFX950-SDAG-NEXT: .LBB26_4: ; %atomicrmw.private
+; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; GFX950-SDAG-NEXT: s_nop 1
+; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
+; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX950-SDAG-NEXT: v_sub_co_u32_e32 v2, vcc, v0, v2
+; GFX950-SDAG-NEXT: s_nop 1
+; GFX950-SDAG-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v3, vcc
+; GFX950-SDAG-NEXT: scratch_store_dwordx2 v4, v[2:3], off
+; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX950-SDAG-NEXT: s_branch .LBB26_5
+; GFX950-SDAG-NEXT: .LBB26_5:
+;
+; GFX950-GISEL-LABEL: flat_sub_saddr_i64_rtn:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0
+; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
+; GFX950-GISEL-NEXT: s_nop 0
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1
+; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3
+; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB26_3
+; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow
+; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB26_4
+; GFX950-GISEL-NEXT: .LBB26_2: ; %atomicrmw.phi
+; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-GISEL-NEXT: s_branch .LBB26_5
+; GFX950-GISEL-NEXT: .LBB26_3: ; %atomicrmw.global
+; GFX950-GISEL-NEXT: buffer_wbl2 sc1
+; GFX950-GISEL-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[4:5] sc0
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT: buffer_inv sc1
+; GFX950-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4
+; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execz .LBB26_2
+; GFX950-GISEL-NEXT: .LBB26_4: ; %atomicrmw.private
+; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc
+; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v6, off
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX950-GISEL-NEXT: v_sub_co_u32_e32 v2, vcc, v0, v4
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v5, vcc
+; GFX950-GISEL-NEXT: scratch_store_dwordx2 v6, v[2:3], off
+; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX950-GISEL-NEXT: s_branch .LBB26_5
+; GFX950-GISEL-NEXT: .LBB26_5:
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
%rtn = atomicrmw sub ptr %gep0, i64 %data syncscope("agent") seq_cst
@@ -1309,6 +2496,98 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-GISEL-NEXT: s_branch .LBB27_5
; GFX1250-GISEL-NEXT: .LBB27_5:
+;
+; GFX950-SDAG-LABEL: flat_sub_saddr_i64_rtn_neg128:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX950-SDAG-NEXT: s_movk_i32 s0, 0xff80
+; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
+; GFX950-SDAG-NEXT: s_mov_b32 s1, -1
+; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
+; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
+; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5
+; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB27_3
+; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow
+; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB27_4
+; GFX950-SDAG-NEXT: .LBB27_2: ; %atomicrmw.phi
+; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-SDAG-NEXT: s_branch .LBB27_5
+; GFX950-SDAG-NEXT: .LBB27_3: ; %atomicrmw.global
+; GFX950-SDAG-NEXT: buffer_wbl2 sc1
+; GFX950-SDAG-NEXT: flat_atomic_sub_x2 v[0:1], v[4:5], v[2:3] sc0
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT: buffer_inv sc1
+; GFX950-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2
+; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execz .LBB27_2
+; GFX950-SDAG-NEXT: .LBB27_4: ; %atomicrmw.private
+; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; GFX950-SDAG-NEXT: s_nop 1
+; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
+; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX950-SDAG-NEXT: v_sub_co_u32_e32 v2, vcc, v0, v2
+; GFX950-SDAG-NEXT: s_nop 1
+; GFX950-SDAG-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v3, vcc
+; GFX950-SDAG-NEXT: scratch_store_dwordx2 v4, v[2:3], off
+; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX950-SDAG-NEXT: s_branch .LBB27_5
+; GFX950-SDAG-NEXT: .LBB27_5:
+;
+; GFX950-GISEL-LABEL: flat_sub_saddr_i64_rtn_neg128:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1
+; GFX950-GISEL-NEXT: s_nop 0
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 0xffffff80, v0
+; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
+; GFX950-GISEL-NEXT: s_nop 0
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v1, vcc
+; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3
+; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB27_3
+; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow
+; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB27_4
+; GFX950-GISEL-NEXT: .LBB27_2: ; %atomicrmw.phi
+; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-GISEL-NEXT: s_branch .LBB27_5
+; GFX950-GISEL-NEXT: .LBB27_3: ; %atomicrmw.global
+; GFX950-GISEL-NEXT: buffer_wbl2 sc1
+; GFX950-GISEL-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[4:5] sc0
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT: buffer_inv sc1
+; GFX950-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4
+; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execz .LBB27_2
+; GFX950-GISEL-NEXT: .LBB27_4: ; %atomicrmw.private
+; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc
+; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v6, off
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX950-GISEL-NEXT: v_sub_co_u32_e32 v2, vcc, v0, v4
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v5, vcc
+; GFX950-GISEL-NEXT: scratch_store_dwordx2 v6, v[2:3], off
+; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX950-GISEL-NEXT: s_branch .LBB27_5
+; GFX950-GISEL-NEXT: .LBB27_5:
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128
@@ -1396,6 +2675,82 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-GISEL-NEXT: v_sub_nc_u64_e32 v[0:1], v[0:1], v[4:5]
; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[0:1], off scope:SCOPE_SE
; GFX1250-GISEL-NEXT: s_endpgm
+;
+; GFX950-SDAG-LABEL: flat_sub_saddr_i64_nortn:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
+; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
+; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
+; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB28_3
+; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow
+; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB28_4
+; GFX950-SDAG-NEXT: .LBB28_2: ; %atomicrmw.phi
+; GFX950-SDAG-NEXT: s_endpgm
+; GFX950-SDAG-NEXT: .LBB28_3: ; %atomicrmw.global
+; GFX950-SDAG-NEXT: buffer_wbl2 sc1
+; GFX950-SDAG-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3]
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT: buffer_inv sc1
+; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2
+; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execz .LBB28_2
+; GFX950-SDAG-NEXT: .LBB28_4: ; %atomicrmw.private
+; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GFX950-SDAG-NEXT: s_nop 1
+; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
+; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX950-SDAG-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v2
+; GFX950-SDAG-NEXT: s_nop 1
+; GFX950-SDAG-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX950-SDAG-NEXT: scratch_store_dwordx2 v4, v[0:1], off
+; GFX950-SDAG-NEXT: s_endpgm
+;
+; GFX950-GISEL-LABEL: flat_sub_saddr_i64_nortn:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1
+; GFX950-GISEL-NEXT: s_nop 0
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
+; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
+; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
+; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB28_3
+; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow
+; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB28_4
+; GFX950-GISEL-NEXT: .LBB28_2: ; %atomicrmw.phi
+; GFX950-GISEL-NEXT: s_endpgm
+; GFX950-GISEL-NEXT: .LBB28_3: ; %atomicrmw.global
+; GFX950-GISEL-NEXT: buffer_wbl2 sc1
+; GFX950-GISEL-NEXT: flat_atomic_sub_x2 v[0:1], v[4:5]
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT: buffer_inv sc1
+; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4
+; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execz .LBB28_2
+; GFX950-GISEL-NEXT: .LBB28_4: ; %atomicrmw.private
+; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc
+; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v2, off
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX950-GISEL-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v4
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v5, vcc
+; GFX950-GISEL-NEXT: scratch_store_dwordx2 v2, v[0:1], off
+; GFX950-GISEL-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
%unused = atomicrmw sub ptr %gep0, i64 %data syncscope("agent") seq_cst
@@ -1487,6 +2842,88 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-GISEL-NEXT: v_sub_nc_u64_e32 v[0:1], v[0:1], v[4:5]
; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[0:1], off scope:SCOPE_SE
; GFX1250-GISEL-NEXT: s_endpgm
+;
+; GFX950-SDAG-LABEL: flat_sub_saddr_i64_nortn_neg128:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX950-SDAG-NEXT: s_movk_i32 s0, 0xff80
+; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
+; GFX950-SDAG-NEXT: s_mov_b32 s1, -1
+; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
+; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
+; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
+; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB29_3
+; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow
+; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB29_4
+; GFX950-SDAG-NEXT: .LBB29_2: ; %atomicrmw.phi
+; GFX950-SDAG-NEXT: s_endpgm
+; GFX950-SDAG-NEXT: .LBB29_3: ; %atomicrmw.global
+; GFX950-SDAG-NEXT: buffer_wbl2 sc1
+; GFX950-SDAG-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3]
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT: buffer_inv sc1
+; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2
+; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execz .LBB29_2
+; GFX950-SDAG-NEXT: .LBB29_4: ; %atomicrmw.private
+; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GFX950-SDAG-NEXT: s_nop 1
+; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
+; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX950-SDAG-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v2
+; GFX950-SDAG-NEXT: s_nop 1
+; GFX950-SDAG-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX950-SDAG-NEXT: scratch_store_dwordx2 v4, v[0:1], off
+; GFX950-SDAG-NEXT: s_endpgm
+;
+; GFX950-GISEL-LABEL: flat_sub_saddr_i64_nortn_neg128:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1
+; GFX950-GISEL-NEXT: s_nop 0
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0
+; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
+; GFX950-GISEL-NEXT: s_nop 0
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
+; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB29_3
+; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow
+; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB29_4
+; GFX950-GISEL-NEXT: .LBB29_2: ; %atomicrmw.phi
+; GFX950-GISEL-NEXT: s_endpgm
+; GFX950-GISEL-NEXT: .LBB29_3: ; %atomicrmw.global
+; GFX950-GISEL-NEXT: buffer_wbl2 sc1
+; GFX950-GISEL-NEXT: flat_atomic_sub_x2 v[0:1], v[4:5]
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT: buffer_inv sc1
+; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4
+; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execz .LBB29_2
+; GFX950-GISEL-NEXT: .LBB29_4: ; %atomicrmw.private
+; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc
+; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v2, off
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX950-GISEL-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v4
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v5, vcc
+; GFX950-GISEL-NEXT: scratch_store_dwordx2 v2, v[0:1], off
+; GFX950-GISEL-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128
@@ -1506,6 +2943,29 @@ define amdgpu_ps float @flat_and_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i
; GFX1250-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: ; return to shader part epilog
+;
+; GFX950-SDAG-LABEL: flat_and_saddr_i32_rtn:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
+; GFX950-SDAG-NEXT: buffer_wbl2 sc1
+; GFX950-SDAG-NEXT: flat_atomic_and v0, v[0:1], v2 sc0
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT: buffer_inv sc1
+; GFX950-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX950-GISEL-LABEL: flat_and_saddr_i32_rtn:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX950-GISEL-NEXT: buffer_wbl2 sc1
+; GFX950-GISEL-NEXT: flat_atomic_and v0, v[2:3], v1 sc0
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT: buffer_inv sc1
+; GFX950-GISEL-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
%rtn = atomicrmw and ptr %gep0, i32 %data syncscope("agent") seq_cst
@@ -1521,6 +2981,35 @@ define amdgpu_ps float @flat_and_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %vof
; GFX1250-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: ; return to shader part epilog
+;
+; GFX950-SDAG-LABEL: flat_and_saddr_i32_rtn_neg128:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
+; GFX950-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0
+; GFX950-SDAG-NEXT: s_nop 1
+; GFX950-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX950-SDAG-NEXT: buffer_wbl2 sc1
+; GFX950-SDAG-NEXT: flat_atomic_and v0, v[0:1], v2 sc0
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT: buffer_inv sc1
+; GFX950-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX950-GISEL-LABEL: flat_and_saddr_i32_rtn_neg128:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 0xffffff80, v0
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc
+; GFX950-GISEL-NEXT: buffer_wbl2 sc1
+; GFX950-GISEL-NEXT: flat_atomic_and v0, v[2:3], v1 sc0
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT: buffer_inv sc1
+; GFX950-GISEL-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128
@@ -1536,6 +3025,29 @@ define amdgpu_ps void @flat_and_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-NEXT: s_endpgm
+;
+; GFX950-SDAG-LABEL: flat_and_saddr_i32_nortn:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
+; GFX950-SDAG-NEXT: buffer_wbl2 sc1
+; GFX950-SDAG-NEXT: flat_atomic_and v[0:1], v2
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT: buffer_inv sc1
+; GFX950-SDAG-NEXT: s_endpgm
+;
+; GFX950-GISEL-LABEL: flat_and_saddr_i32_nortn:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX950-GISEL-NEXT: buffer_wbl2 sc1
+; GFX950-GISEL-NEXT: flat_atomic_and v[2:3], v1
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT: buffer_inv sc1
+; GFX950-GISEL-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
%unused = atomicrmw and ptr %gep0, i32 %data syncscope("agent") seq_cst
@@ -1549,6 +3061,35 @@ define amdgpu_ps void @flat_and_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-NEXT: s_endpgm
+;
+; GFX950-SDAG-LABEL: flat_and_saddr_i32_nortn_neg128:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
+; GFX950-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0
+; GFX950-SDAG-NEXT: s_nop 1
+; GFX950-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX950-SDAG-NEXT: buffer_wbl2 sc1
+; GFX950-SDAG-NEXT: flat_atomic_and v[0:1], v2
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT: buffer_inv sc1
+; GFX950-SDAG-NEXT: s_endpgm
+;
+; GFX950-GISEL-LABEL: flat_and_saddr_i32_nortn_neg128:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 0xffffff80, v0
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc
+; GFX950-GISEL-NEXT: buffer_wbl2 sc1
+; GFX950-GISEL-NEXT: flat_atomic_and v[2:3], v1
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT: buffer_inv sc1
+; GFX950-GISEL-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128
@@ -1648,6 +3189,90 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-GISEL-NEXT: s_branch .LBB34_5
; GFX1250-GISEL-NEXT: .LBB34_5:
+;
+; GFX950-SDAG-LABEL: flat_and_saddr_i64_rtn:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1]
+; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
+; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5
+; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB34_3
+; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow
+; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB34_4
+; GFX950-SDAG-NEXT: .LBB34_2: ; %atomicrmw.phi
+; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-SDAG-NEXT: s_branch .LBB34_5
+; GFX950-SDAG-NEXT: .LBB34_3: ; %atomicrmw.global
+; GFX950-SDAG-NEXT: buffer_wbl2 sc1
+; GFX950-SDAG-NEXT: flat_atomic_and_x2 v[0:1], v[4:5], v[2:3] sc0
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT: buffer_inv sc1
+; GFX950-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX950-SDAG-NEXT: ; implicit-def: $vgpr3
+; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execz .LBB34_2
+; GFX950-SDAG-NEXT: .LBB34_4: ; %atomicrmw.private
+; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; GFX950-SDAG-NEXT: s_nop 1
+; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
+; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX950-SDAG-NEXT: v_and_b32_e32 v3, v1, v3
+; GFX950-SDAG-NEXT: v_and_b32_e32 v2, v0, v2
+; GFX950-SDAG-NEXT: scratch_store_dwordx2 v4, v[2:3], off
+; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX950-SDAG-NEXT: s_branch .LBB34_5
+; GFX950-SDAG-NEXT: .LBB34_5:
+;
+; GFX950-GISEL-LABEL: flat_and_saddr_i64_rtn:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0
+; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
+; GFX950-GISEL-NEXT: s_nop 0
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1
+; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3
+; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB34_3
+; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow
+; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB34_4
+; GFX950-GISEL-NEXT: .LBB34_2: ; %atomicrmw.phi
+; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-GISEL-NEXT: s_branch .LBB34_5
+; GFX950-GISEL-NEXT: .LBB34_3: ; %atomicrmw.global
+; GFX950-GISEL-NEXT: buffer_wbl2 sc1
+; GFX950-GISEL-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[4:5] sc0
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT: buffer_inv sc1
+; GFX950-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4
+; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execz .LBB34_2
+; GFX950-GISEL-NEXT: .LBB34_4: ; %atomicrmw.private
+; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc
+; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v6, off
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX950-GISEL-NEXT: v_and_b32_e32 v2, v0, v4
+; GFX950-GISEL-NEXT: v_and_b32_e32 v3, v1, v5
+; GFX950-GISEL-NEXT: scratch_store_dwordx2 v6, v[2:3], off
+; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX950-GISEL-NEXT: s_branch .LBB34_5
+; GFX950-GISEL-NEXT: .LBB34_5:
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
%rtn = atomicrmw and ptr %gep0, i64 %data syncscope("agent") seq_cst
@@ -1753,6 +3378,96 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-GISEL-NEXT: s_branch .LBB35_5
; GFX1250-GISEL-NEXT: .LBB35_5:
+;
+; GFX950-SDAG-LABEL: flat_and_saddr_i64_rtn_neg128:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX950-SDAG-NEXT: s_movk_i32 s0, 0xff80
+; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
+; GFX950-SDAG-NEXT: s_mov_b32 s1, -1
+; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
+; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
+; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5
+; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB35_3
+; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow
+; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB35_4
+; GFX950-SDAG-NEXT: .LBB35_2: ; %atomicrmw.phi
+; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-SDAG-NEXT: s_branch .LBB35_5
+; GFX950-SDAG-NEXT: .LBB35_3: ; %atomicrmw.global
+; GFX950-SDAG-NEXT: buffer_wbl2 sc1
+; GFX950-SDAG-NEXT: flat_atomic_and_x2 v[0:1], v[4:5], v[2:3] sc0
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT: buffer_inv sc1
+; GFX950-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX950-SDAG-NEXT: ; implicit-def: $vgpr3
+; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execz .LBB35_2
+; GFX950-SDAG-NEXT: .LBB35_4: ; %atomicrmw.private
+; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; GFX950-SDAG-NEXT: s_nop 1
+; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
+; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX950-SDAG-NEXT: v_and_b32_e32 v3, v1, v3
+; GFX950-SDAG-NEXT: v_and_b32_e32 v2, v0, v2
+; GFX950-SDAG-NEXT: scratch_store_dwordx2 v4, v[2:3], off
+; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX950-SDAG-NEXT: s_branch .LBB35_5
+; GFX950-SDAG-NEXT: .LBB35_5:
+;
+; GFX950-GISEL-LABEL: flat_and_saddr_i64_rtn_neg128:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1
+; GFX950-GISEL-NEXT: s_nop 0
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 0xffffff80, v0
+; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
+; GFX950-GISEL-NEXT: s_nop 0
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v1, vcc
+; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3
+; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB35_3
+; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow
+; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB35_4
+; GFX950-GISEL-NEXT: .LBB35_2: ; %atomicrmw.phi
+; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-GISEL-NEXT: s_branch .LBB35_5
+; GFX950-GISEL-NEXT: .LBB35_3: ; %atomicrmw.global
+; GFX950-GISEL-NEXT: buffer_wbl2 sc1
+; GFX950-GISEL-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[4:5] sc0
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT: buffer_inv sc1
+; GFX950-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4
+; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execz .LBB35_2
+; GFX950-GISEL-NEXT: .LBB35_4: ; %atomicrmw.private
+; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc
+; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v6, off
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX950-GISEL-NEXT: v_and_b32_e32 v2, v0, v4
+; GFX950-GISEL-NEXT: v_and_b32_e32 v3, v1, v5
+; GFX950-GISEL-NEXT: scratch_store_dwordx2 v6, v[2:3], off
+; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX950-GISEL-NEXT: s_branch .LBB35_5
+; GFX950-GISEL-NEXT: .LBB35_5:
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128
@@ -1842,6 +3557,80 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-GISEL-NEXT: v_and_b32_e32 v1, v1, v5
; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[0:1], off scope:SCOPE_SE
; GFX1250-GISEL-NEXT: s_endpgm
+;
+; GFX950-SDAG-LABEL: flat_and_saddr_i64_nortn:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
+; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
+; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
+; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB36_3
+; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow
+; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB36_4
+; GFX950-SDAG-NEXT: .LBB36_2: ; %atomicrmw.phi
+; GFX950-SDAG-NEXT: s_endpgm
+; GFX950-SDAG-NEXT: .LBB36_3: ; %atomicrmw.global
+; GFX950-SDAG-NEXT: buffer_wbl2 sc1
+; GFX950-SDAG-NEXT: flat_atomic_and_x2 v[0:1], v[2:3]
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT: buffer_inv sc1
+; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-SDAG-NEXT: ; implicit-def: $vgpr3
+; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execz .LBB36_2
+; GFX950-SDAG-NEXT: .LBB36_4: ; %atomicrmw.private
+; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GFX950-SDAG-NEXT: s_nop 1
+; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
+; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX950-SDAG-NEXT: v_and_b32_e32 v1, v1, v3
+; GFX950-SDAG-NEXT: v_and_b32_e32 v0, v0, v2
+; GFX950-SDAG-NEXT: scratch_store_dwordx2 v4, v[0:1], off
+; GFX950-SDAG-NEXT: s_endpgm
+;
+; GFX950-GISEL-LABEL: flat_and_saddr_i64_nortn:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1
+; GFX950-GISEL-NEXT: s_nop 0
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
+; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
+; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
+; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB36_3
+; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow
+; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB36_4
+; GFX950-GISEL-NEXT: .LBB36_2: ; %atomicrmw.phi
+; GFX950-GISEL-NEXT: s_endpgm
+; GFX950-GISEL-NEXT: .LBB36_3: ; %atomicrmw.global
+; GFX950-GISEL-NEXT: buffer_wbl2 sc1
+; GFX950-GISEL-NEXT: flat_atomic_and_x2 v[0:1], v[4:5]
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT: buffer_inv sc1
+; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4
+; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execz .LBB36_2
+; GFX950-GISEL-NEXT: .LBB36_4: ; %atomicrmw.private
+; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc
+; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v2, off
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX950-GISEL-NEXT: v_and_b32_e32 v0, v0, v4
+; GFX950-GISEL-NEXT: v_and_b32_e32 v1, v1, v5
+; GFX950-GISEL-NEXT: scratch_store_dwordx2 v2, v[0:1], off
+; GFX950-GISEL-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
%unused = atomicrmw and ptr %gep0, i64 %data syncscope("agent") seq_cst
@@ -1935,6 +3724,86 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-GISEL-NEXT: v_and_b32_e32 v1, v1, v5
; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[0:1], off scope:SCOPE_SE
; GFX1250-GISEL-NEXT: s_endpgm
+;
+; GFX950-SDAG-LABEL: flat_and_saddr_i64_nortn_neg128:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX950-SDAG-NEXT: s_movk_i32 s0, 0xff80
+; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
+; GFX950-SDAG-NEXT: s_mov_b32 s1, -1
+; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
+; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
+; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
+; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB37_3
+; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow
+; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB37_4
+; GFX950-SDAG-NEXT: .LBB37_2: ; %atomicrmw.phi
+; GFX950-SDAG-NEXT: s_endpgm
+; GFX950-SDAG-NEXT: .LBB37_3: ; %atomicrmw.global
+; GFX950-SDAG-NEXT: buffer_wbl2 sc1
+; GFX950-SDAG-NEXT: flat_atomic_and_x2 v[0:1], v[2:3]
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT: buffer_inv sc1
+; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-SDAG-NEXT: ; implicit-def: $vgpr3
+; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execz .LBB37_2
+; GFX950-SDAG-NEXT: .LBB37_4: ; %atomicrmw.private
+; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GFX950-SDAG-NEXT: s_nop 1
+; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
+; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX950-SDAG-NEXT: v_and_b32_e32 v1, v1, v3
+; GFX950-SDAG-NEXT: v_and_b32_e32 v0, v0, v2
+; GFX950-SDAG-NEXT: scratch_store_dwordx2 v4, v[0:1], off
+; GFX950-SDAG-NEXT: s_endpgm
+;
+; GFX950-GISEL-LABEL: flat_and_saddr_i64_nortn_neg128:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1
+; GFX950-GISEL-NEXT: s_nop 0
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0
+; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
+; GFX950-GISEL-NEXT: s_nop 0
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
+; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB37_3
+; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow
+; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB37_4
+; GFX950-GISEL-NEXT: .LBB37_2: ; %atomicrmw.phi
+; GFX950-GISEL-NEXT: s_endpgm
+; GFX950-GISEL-NEXT: .LBB37_3: ; %atomicrmw.global
+; GFX950-GISEL-NEXT: buffer_wbl2 sc1
+; GFX950-GISEL-NEXT: flat_atomic_and_x2 v[0:1], v[4:5]
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT: buffer_inv sc1
+; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4
+; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execz .LBB37_2
+; GFX950-GISEL-NEXT: .LBB37_4: ; %atomicrmw.private
+; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc
+; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v2, off
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX950-GISEL-NEXT: v_and_b32_e32 v0, v0, v4
+; GFX950-GISEL-NEXT: v_and_b32_e32 v1, v1, v5
+; GFX950-GISEL-NEXT: scratch_store_dwordx2 v2, v[0:1], off
+; GFX950-GISEL-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128
@@ -1954,6 +3823,29 @@ define amdgpu_ps float @flat_or_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i3
; GFX1250-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: ; return to shader part epilog
+;
+; GFX950-SDAG-LABEL: flat_or_saddr_i32_rtn:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
+; GFX950-SDAG-NEXT: buffer_wbl2 sc1
+; GFX950-SDAG-NEXT: flat_atomic_or v0, v[0:1], v2 sc0
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT: buffer_inv sc1
+; GFX950-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX950-GISEL-LABEL: flat_or_saddr_i32_rtn:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX950-GISEL-NEXT: buffer_wbl2 sc1
+; GFX950-GISEL-NEXT: flat_atomic_or v0, v[2:3], v1 sc0
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT: buffer_inv sc1
+; GFX950-GISEL-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
%rtn = atomicrmw or ptr %gep0, i32 %data syncscope("agent") seq_cst
@@ -1969,6 +3861,35 @@ define amdgpu_ps float @flat_or_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %voff
; GFX1250-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: ; return to shader part epilog
+;
+; GFX950-SDAG-LABEL: flat_or_saddr_i32_rtn_neg128:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
+; GFX950-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0
+; GFX950-SDAG-NEXT: s_nop 1
+; GFX950-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX950-SDAG-NEXT: buffer_wbl2 sc1
+; GFX950-SDAG-NEXT: flat_atomic_or v0, v[0:1], v2 sc0
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT: buffer_inv sc1
+; GFX950-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX950-GISEL-LABEL: flat_or_saddr_i32_rtn_neg128:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 0xffffff80, v0
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc
+; GFX950-GISEL-NEXT: buffer_wbl2 sc1
+; GFX950-GISEL-NEXT: flat_atomic_or v0, v[2:3], v1 sc0
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT: buffer_inv sc1
+; GFX950-GISEL-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128
@@ -1984,6 +3905,29 @@ define amdgpu_ps void @flat_or_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, i
; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-NEXT: s_endpgm
+;
+; GFX950-SDAG-LABEL: flat_or_saddr_i32_nortn:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
+; GFX950-SDAG-NEXT: buffer_wbl2 sc1
+; GFX950-SDAG-NEXT: flat_atomic_or v[0:1], v2
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT: buffer_inv sc1
+; GFX950-SDAG-NEXT: s_endpgm
+;
+; GFX950-GISEL-LABEL: flat_or_saddr_i32_nortn:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX950-GISEL-NEXT: buffer_wbl2 sc1
+; GFX950-GISEL-NEXT: flat_atomic_or v[2:3], v1
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT: buffer_inv sc1
+; GFX950-GISEL-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
%unused = atomicrmw or ptr %gep0, i32 %data syncscope("agent") seq_cst
@@ -1997,6 +3941,35 @@ define amdgpu_ps void @flat_or_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %vof
; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-NEXT: s_endpgm
+;
+; GFX950-SDAG-LABEL: flat_or_saddr_i32_nortn_neg128:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
+; GFX950-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0
+; GFX950-SDAG-NEXT: s_nop 1
+; GFX950-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX950-SDAG-NEXT: buffer_wbl2 sc1
+; GFX950-SDAG-NEXT: flat_atomic_or v[0:1], v2
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT: buffer_inv sc1
+; GFX950-SDAG-NEXT: s_endpgm
+;
+; GFX950-GISEL-LABEL: flat_or_saddr_i32_nortn_neg128:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 0xffffff80, v0
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc
+; GFX950-GISEL-NEXT: buffer_wbl2 sc1
+; GFX950-GISEL-NEXT: flat_atomic_or v[2:3], v1
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT: buffer_inv sc1
+; GFX950-GISEL-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128
@@ -2096,6 +4069,90 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn(ptr inreg %sbase, i32 %voffs
; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-GISEL-NEXT: s_branch .LBB42_5
; GFX1250-GISEL-NEXT: .LBB42_5:
+;
+; GFX950-SDAG-LABEL: flat_or_saddr_i64_rtn:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1]
+; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
+; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5
+; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB42_3
+; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow
+; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB42_4
+; GFX950-SDAG-NEXT: .LBB42_2: ; %atomicrmw.phi
+; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-SDAG-NEXT: s_branch .LBB42_5
+; GFX950-SDAG-NEXT: .LBB42_3: ; %atomicrmw.global
+; GFX950-SDAG-NEXT: buffer_wbl2 sc1
+; GFX950-SDAG-NEXT: flat_atomic_or_x2 v[0:1], v[4:5], v[2:3] sc0
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT: buffer_inv sc1
+; GFX950-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX950-SDAG-NEXT: ; implicit-def: $vgpr3
+; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execz .LBB42_2
+; GFX950-SDAG-NEXT: .LBB42_4: ; %atomicrmw.private
+; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; GFX950-SDAG-NEXT: s_nop 1
+; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
+; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX950-SDAG-NEXT: v_or_b32_e32 v3, v1, v3
+; GFX950-SDAG-NEXT: v_or_b32_e32 v2, v0, v2
+; GFX950-SDAG-NEXT: scratch_store_dwordx2 v4, v[2:3], off
+; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX950-SDAG-NEXT: s_branch .LBB42_5
+; GFX950-SDAG-NEXT: .LBB42_5:
+;
+; GFX950-GISEL-LABEL: flat_or_saddr_i64_rtn:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0
+; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
+; GFX950-GISEL-NEXT: s_nop 0
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1
+; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3
+; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB42_3
+; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow
+; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB42_4
+; GFX950-GISEL-NEXT: .LBB42_2: ; %atomicrmw.phi
+; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-GISEL-NEXT: s_branch .LBB42_5
+; GFX950-GISEL-NEXT: .LBB42_3: ; %atomicrmw.global
+; GFX950-GISEL-NEXT: buffer_wbl2 sc1
+; GFX950-GISEL-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[4:5] sc0
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT: buffer_inv sc1
+; GFX950-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4
+; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execz .LBB42_2
+; GFX950-GISEL-NEXT: .LBB42_4: ; %atomicrmw.private
+; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc
+; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v6, off
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX950-GISEL-NEXT: v_or_b32_e32 v2, v0, v4
+; GFX950-GISEL-NEXT: v_or_b32_e32 v3, v1, v5
+; GFX950-GISEL-NEXT: scratch_store_dwordx2 v6, v[2:3], off
+; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX950-GISEL-NEXT: s_branch .LBB42_5
+; GFX950-GISEL-NEXT: .LBB42_5:
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
%rtn = atomicrmw or ptr %gep0, i64 %data syncscope("agent") seq_cst
@@ -2201,6 +4258,96 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn_neg128(ptr inreg %sbase, i32
; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-GISEL-NEXT: s_branch .LBB43_5
; GFX1250-GISEL-NEXT: .LBB43_5:
+;
+; GFX950-SDAG-LABEL: flat_or_saddr_i64_rtn_neg128:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX950-SDAG-NEXT: s_movk_i32 s0, 0xff80
+; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
+; GFX950-SDAG-NEXT: s_mov_b32 s1, -1
+; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
+; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
+; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5
+; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB43_3
+; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow
+; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB43_4
+; GFX950-SDAG-NEXT: .LBB43_2: ; %atomicrmw.phi
+; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-SDAG-NEXT: s_branch .LBB43_5
+; GFX950-SDAG-NEXT: .LBB43_3: ; %atomicrmw.global
+; GFX950-SDAG-NEXT: buffer_wbl2 sc1
+; GFX950-SDAG-NEXT: flat_atomic_or_x2 v[0:1], v[4:5], v[2:3] sc0
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT: buffer_inv sc1
+; GFX950-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX950-SDAG-NEXT: ; implicit-def: $vgpr3
+; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execz .LBB43_2
+; GFX950-SDAG-NEXT: .LBB43_4: ; %atomicrmw.private
+; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; GFX950-SDAG-NEXT: s_nop 1
+; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
+; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX950-SDAG-NEXT: v_or_b32_e32 v3, v1, v3
+; GFX950-SDAG-NEXT: v_or_b32_e32 v2, v0, v2
+; GFX950-SDAG-NEXT: scratch_store_dwordx2 v4, v[2:3], off
+; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX950-SDAG-NEXT: s_branch .LBB43_5
+; GFX950-SDAG-NEXT: .LBB43_5:
+;
+; GFX950-GISEL-LABEL: flat_or_saddr_i64_rtn_neg128:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1
+; GFX950-GISEL-NEXT: s_nop 0
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 0xffffff80, v0
+; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
+; GFX950-GISEL-NEXT: s_nop 0
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v1, vcc
+; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3
+; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB43_3
+; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow
+; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB43_4
+; GFX950-GISEL-NEXT: .LBB43_2: ; %atomicrmw.phi
+; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-GISEL-NEXT: s_branch .LBB43_5
+; GFX950-GISEL-NEXT: .LBB43_3: ; %atomicrmw.global
+; GFX950-GISEL-NEXT: buffer_wbl2 sc1
+; GFX950-GISEL-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[4:5] sc0
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT: buffer_inv sc1
+; GFX950-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4
+; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execz .LBB43_2
+; GFX950-GISEL-NEXT: .LBB43_4: ; %atomicrmw.private
+; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc
+; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v6, off
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX950-GISEL-NEXT: v_or_b32_e32 v2, v0, v4
+; GFX950-GISEL-NEXT: v_or_b32_e32 v3, v1, v5
+; GFX950-GISEL-NEXT: scratch_store_dwordx2 v6, v[2:3], off
+; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX950-GISEL-NEXT: s_branch .LBB43_5
+; GFX950-GISEL-NEXT: .LBB43_5:
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128
@@ -2290,6 +4437,80 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, i
; GFX1250-GISEL-NEXT: v_or_b32_e32 v1, v1, v5
; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[0:1], off scope:SCOPE_SE
; GFX1250-GISEL-NEXT: s_endpgm
+;
+; GFX950-SDAG-LABEL: flat_or_saddr_i64_nortn:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
+; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
+; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
+; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB44_3
+; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow
+; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB44_4
+; GFX950-SDAG-NEXT: .LBB44_2: ; %atomicrmw.phi
+; GFX950-SDAG-NEXT: s_endpgm
+; GFX950-SDAG-NEXT: .LBB44_3: ; %atomicrmw.global
+; GFX950-SDAG-NEXT: buffer_wbl2 sc1
+; GFX950-SDAG-NEXT: flat_atomic_or_x2 v[0:1], v[2:3]
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT: buffer_inv sc1
+; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-SDAG-NEXT: ; implicit-def: $vgpr3
+; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execz .LBB44_2
+; GFX950-SDAG-NEXT: .LBB44_4: ; %atomicrmw.private
+; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GFX950-SDAG-NEXT: s_nop 1
+; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
+; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX950-SDAG-NEXT: v_or_b32_e32 v1, v1, v3
+; GFX950-SDAG-NEXT: v_or_b32_e32 v0, v0, v2
+; GFX950-SDAG-NEXT: scratch_store_dwordx2 v4, v[0:1], off
+; GFX950-SDAG-NEXT: s_endpgm
+;
+; GFX950-GISEL-LABEL: flat_or_saddr_i64_nortn:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1
+; GFX950-GISEL-NEXT: s_nop 0
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
+; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
+; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
+; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB44_3
+; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow
+; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB44_4
+; GFX950-GISEL-NEXT: .LBB44_2: ; %atomicrmw.phi
+; GFX950-GISEL-NEXT: s_endpgm
+; GFX950-GISEL-NEXT: .LBB44_3: ; %atomicrmw.global
+; GFX950-GISEL-NEXT: buffer_wbl2 sc1
+; GFX950-GISEL-NEXT: flat_atomic_or_x2 v[0:1], v[4:5]
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT: buffer_inv sc1
+; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4
+; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execz .LBB44_2
+; GFX950-GISEL-NEXT: .LBB44_4: ; %atomicrmw.private
+; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc
+; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v2, off
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX950-GISEL-NEXT: v_or_b32_e32 v0, v0, v4
+; GFX950-GISEL-NEXT: v_or_b32_e32 v1, v1, v5
+; GFX950-GISEL-NEXT: scratch_store_dwordx2 v2, v[0:1], off
+; GFX950-GISEL-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
%unused = atomicrmw or ptr %gep0, i64 %data syncscope("agent") seq_cst
@@ -2383,6 +4604,86 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vof
; GFX1250-GISEL-NEXT: v_or_b32_e32 v1, v1, v5
; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[0:1], off scope:SCOPE_SE
; GFX1250-GISEL-NEXT: s_endpgm
+;
+; GFX950-SDAG-LABEL: flat_or_saddr_i64_nortn_neg128:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX950-SDAG-NEXT: s_movk_i32 s0, 0xff80
+; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
+; GFX950-SDAG-NEXT: s_mov_b32 s1, -1
+; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
+; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
+; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
+; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB45_3
+; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow
+; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB45_4
+; GFX950-SDAG-NEXT: .LBB45_2: ; %atomicrmw.phi
+; GFX950-SDAG-NEXT: s_endpgm
+; GFX950-SDAG-NEXT: .LBB45_3: ; %atomicrmw.global
+; GFX950-SDAG-NEXT: buffer_wbl2 sc1
+; GFX950-SDAG-NEXT: flat_atomic_or_x2 v[0:1], v[2:3]
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT: buffer_inv sc1
+; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-SDAG-NEXT: ; implicit-def: $vgpr3
+; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execz .LBB45_2
+; GFX950-SDAG-NEXT: .LBB45_4: ; %atomicrmw.private
+; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GFX950-SDAG-NEXT: s_nop 1
+; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
+; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX950-SDAG-NEXT: v_or_b32_e32 v1, v1, v3
+; GFX950-SDAG-NEXT: v_or_b32_e32 v0, v0, v2
+; GFX950-SDAG-NEXT: scratch_store_dwordx2 v4, v[0:1], off
+; GFX950-SDAG-NEXT: s_endpgm
+;
+; GFX950-GISEL-LABEL: flat_or_saddr_i64_nortn_neg128:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1
+; GFX950-GISEL-NEXT: s_nop 0
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0
+; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
+; GFX950-GISEL-NEXT: s_nop 0
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
+; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB45_3
+; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow
+; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB45_4
+; GFX950-GISEL-NEXT: .LBB45_2: ; %atomicrmw.phi
+; GFX950-GISEL-NEXT: s_endpgm
+; GFX950-GISEL-NEXT: .LBB45_3: ; %atomicrmw.global
+; GFX950-GISEL-NEXT: buffer_wbl2 sc1
+; GFX950-GISEL-NEXT: flat_atomic_or_x2 v[0:1], v[4:5]
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT: buffer_inv sc1
+; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4
+; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execz .LBB45_2
+; GFX950-GISEL-NEXT: .LBB45_4: ; %atomicrmw.private
+; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc
+; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v2, off
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX950-GISEL-NEXT: v_or_b32_e32 v0, v0, v4
+; GFX950-GISEL-NEXT: v_or_b32_e32 v1, v1, v5
+; GFX950-GISEL-NEXT: scratch_store_dwordx2 v2, v[0:1], off
+; GFX950-GISEL-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128
@@ -2402,6 +4703,29 @@ define amdgpu_ps float @flat_xor_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i
; GFX1250-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: ; return to shader part epilog
+;
+; GFX950-SDAG-LABEL: flat_xor_saddr_i32_rtn:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
+; GFX950-SDAG-NEXT: buffer_wbl2 sc1
+; GFX950-SDAG-NEXT: flat_atomic_xor v0, v[0:1], v2 sc0
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT: buffer_inv sc1
+; GFX950-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX950-GISEL-LABEL: flat_xor_saddr_i32_rtn:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX950-GISEL-NEXT: buffer_wbl2 sc1
+; GFX950-GISEL-NEXT: flat_atomic_xor v0, v[2:3], v1 sc0
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT: buffer_inv sc1
+; GFX950-GISEL-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
%rtn = atomicrmw xor ptr %gep0, i32 %data syncscope("agent") seq_cst
@@ -2417,6 +4741,35 @@ define amdgpu_ps float @flat_xor_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %vof
; GFX1250-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: ; return to shader part epilog
+;
+; GFX950-SDAG-LABEL: flat_xor_saddr_i32_rtn_neg128:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
+; GFX950-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0
+; GFX950-SDAG-NEXT: s_nop 1
+; GFX950-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX950-SDAG-NEXT: buffer_wbl2 sc1
+; GFX950-SDAG-NEXT: flat_atomic_xor v0, v[0:1], v2 sc0
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT: buffer_inv sc1
+; GFX950-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX950-GISEL-LABEL: flat_xor_saddr_i32_rtn_neg128:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 0xffffff80, v0
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc
+; GFX950-GISEL-NEXT: buffer_wbl2 sc1
+; GFX950-GISEL-NEXT: flat_atomic_xor v0, v[2:3], v1 sc0
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT: buffer_inv sc1
+; GFX950-GISEL-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128
@@ -2432,6 +4785,29 @@ define amdgpu_ps void @flat_xor_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-NEXT: s_endpgm
+;
+; GFX950-SDAG-LABEL: flat_xor_saddr_i32_nortn:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
+; GFX950-SDAG-NEXT: buffer_wbl2 sc1
+; GFX950-SDAG-NEXT: flat_atomic_xor v[0:1], v2
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT: buffer_inv sc1
+; GFX950-SDAG-NEXT: s_endpgm
+;
+; GFX950-GISEL-LABEL: flat_xor_saddr_i32_nortn:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX950-GISEL-NEXT: buffer_wbl2 sc1
+; GFX950-GISEL-NEXT: flat_atomic_xor v[2:3], v1
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT: buffer_inv sc1
+; GFX950-GISEL-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
%unused = atomicrmw xor ptr %gep0, i32 %data syncscope("agent") seq_cst
@@ -2445,6 +4821,35 @@ define amdgpu_ps void @flat_xor_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_DEV
; GFX1250-NEXT: s_endpgm
+;
+; GFX950-SDAG-LABEL: flat_xor_saddr_i32_nortn_neg128:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
+; GFX950-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0
+; GFX950-SDAG-NEXT: s_nop 1
+; GFX950-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX950-SDAG-NEXT: buffer_wbl2 sc1
+; GFX950-SDAG-NEXT: flat_atomic_xor v[0:1], v2
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT: buffer_inv sc1
+; GFX950-SDAG-NEXT: s_endpgm
+;
+; GFX950-GISEL-LABEL: flat_xor_saddr_i32_nortn_neg128:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 0xffffff80, v0
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc
+; GFX950-GISEL-NEXT: buffer_wbl2 sc1
+; GFX950-GISEL-NEXT: flat_atomic_xor v[2:3], v1
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT: buffer_inv sc1
+; GFX950-GISEL-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128
@@ -2544,6 +4949,90 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-GISEL-NEXT: s_branch .LBB50_5
; GFX1250-GISEL-NEXT: .LBB50_5:
+;
+; GFX950-SDAG-LABEL: flat_xor_saddr_i64_rtn:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1]
+; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
+; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5
+; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB50_3
+; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow
+; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB50_4
+; GFX950-SDAG-NEXT: .LBB50_2: ; %atomicrmw.phi
+; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-SDAG-NEXT: s_branch .LBB50_5
+; GFX950-SDAG-NEXT: .LBB50_3: ; %atomicrmw.global
+; GFX950-SDAG-NEXT: buffer_wbl2 sc1
+; GFX950-SDAG-NEXT: flat_atomic_xor_x2 v[0:1], v[4:5], v[2:3] sc0
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT: buffer_inv sc1
+; GFX950-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX950-SDAG-NEXT: ; implicit-def: $vgpr3
+; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execz .LBB50_2
+; GFX950-SDAG-NEXT: .LBB50_4: ; %atomicrmw.private
+; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; GFX950-SDAG-NEXT: s_nop 1
+; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
+; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX950-SDAG-NEXT: v_xor_b32_e32 v3, v1, v3
+; GFX950-SDAG-NEXT: v_xor_b32_e32 v2, v0, v2
+; GFX950-SDAG-NEXT: scratch_store_dwordx2 v4, v[2:3], off
+; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX950-SDAG-NEXT: s_branch .LBB50_5
+; GFX950-SDAG-NEXT: .LBB50_5:
+;
+; GFX950-GISEL-LABEL: flat_xor_saddr_i64_rtn:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0
+; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
+; GFX950-GISEL-NEXT: s_nop 0
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1
+; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3
+; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB50_3
+; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow
+; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB50_4
+; GFX950-GISEL-NEXT: .LBB50_2: ; %atomicrmw.phi
+; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-GISEL-NEXT: s_branch .LBB50_5
+; GFX950-GISEL-NEXT: .LBB50_3: ; %atomicrmw.global
+; GFX950-GISEL-NEXT: buffer_wbl2 sc1
+; GFX950-GISEL-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[4:5] sc0
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT: buffer_inv sc1
+; GFX950-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4
+; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execz .LBB50_2
+; GFX950-GISEL-NEXT: .LBB50_4: ; %atomicrmw.private
+; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc
+; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v6, off
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX950-GISEL-NEXT: v_xor_b32_e32 v2, v0, v4
+; GFX950-GISEL-NEXT: v_xor_b32_e32 v3, v1, v5
+; GFX950-GISEL-NEXT: scratch_store_dwordx2 v6, v[2:3], off
+; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX950-GISEL-NEXT: s_branch .LBB50_5
+; GFX950-GISEL-NEXT: .LBB50_5:
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
%rtn = atomicrmw xor ptr %gep0, i64 %data syncscope("agent") seq_cst
@@ -2649,6 +5138,96 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-GISEL-NEXT: s_branch .LBB51_5
; GFX1250-GISEL-NEXT: .LBB51_5:
+;
+; GFX950-SDAG-LABEL: flat_xor_saddr_i64_rtn_neg128:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX950-SDAG-NEXT: s_movk_i32 s0, 0xff80
+; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
+; GFX950-SDAG-NEXT: s_mov_b32 s1, -1
+; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
+; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
+; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5
+; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB51_3
+; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow
+; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB51_4
+; GFX950-SDAG-NEXT: .LBB51_2: ; %atomicrmw.phi
+; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-SDAG-NEXT: s_branch .LBB51_5
+; GFX950-SDAG-NEXT: .LBB51_3: ; %atomicrmw.global
+; GFX950-SDAG-NEXT: buffer_wbl2 sc1
+; GFX950-SDAG-NEXT: flat_atomic_xor_x2 v[0:1], v[4:5], v[2:3] sc0
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT: buffer_inv sc1
+; GFX950-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX950-SDAG-NEXT: ; implicit-def: $vgpr3
+; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execz .LBB51_2
+; GFX950-SDAG-NEXT: .LBB51_4: ; %atomicrmw.private
+; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; GFX950-SDAG-NEXT: s_nop 1
+; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
+; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX950-SDAG-NEXT: v_xor_b32_e32 v3, v1, v3
+; GFX950-SDAG-NEXT: v_xor_b32_e32 v2, v0, v2
+; GFX950-SDAG-NEXT: scratch_store_dwordx2 v4, v[2:3], off
+; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX950-SDAG-NEXT: s_branch .LBB51_5
+; GFX950-SDAG-NEXT: .LBB51_5:
+;
+; GFX950-GISEL-LABEL: flat_xor_saddr_i64_rtn_neg128:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1
+; GFX950-GISEL-NEXT: s_nop 0
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 0xffffff80, v0
+; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
+; GFX950-GISEL-NEXT: s_nop 0
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v1, vcc
+; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3
+; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB51_3
+; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow
+; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB51_4
+; GFX950-GISEL-NEXT: .LBB51_2: ; %atomicrmw.phi
+; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-GISEL-NEXT: s_branch .LBB51_5
+; GFX950-GISEL-NEXT: .LBB51_3: ; %atomicrmw.global
+; GFX950-GISEL-NEXT: buffer_wbl2 sc1
+; GFX950-GISEL-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[4:5] sc0
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT: buffer_inv sc1
+; GFX950-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4
+; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execz .LBB51_2
+; GFX950-GISEL-NEXT: .LBB51_4: ; %atomicrmw.private
+; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc
+; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v6, off
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX950-GISEL-NEXT: v_xor_b32_e32 v2, v0, v4
+; GFX950-GISEL-NEXT: v_xor_b32_e32 v3, v1, v5
+; GFX950-GISEL-NEXT: scratch_store_dwordx2 v6, v[2:3], off
+; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX950-GISEL-NEXT: s_branch .LBB51_5
+; GFX950-GISEL-NEXT: .LBB51_5:
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128
@@ -2738,6 +5317,80 @@ define amdgpu_ps void @flat_xor_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v1, v5
; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[0:1], off scope:SCOPE_SE
; GFX1250-GISEL-NEXT: s_endpgm
+;
+; GFX950-SDAG-LABEL: flat_xor_saddr_i64_nortn:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
+; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
+; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
+; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB52_3
+; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow
+; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB52_4
+; GFX950-SDAG-NEXT: .LBB52_2: ; %atomicrmw.phi
+; GFX950-SDAG-NEXT: s_endpgm
+; GFX950-SDAG-NEXT: .LBB52_3: ; %atomicrmw.global
+; GFX950-SDAG-NEXT: buffer_wbl2 sc1
+; GFX950-SDAG-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3]
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT: buffer_inv sc1
+; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-SDAG-NEXT: ; implicit-def: $vgpr3
+; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execz .LBB52_2
+; GFX950-SDAG-NEXT: .LBB52_4: ; %atomicrmw.private
+; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GFX950-SDAG-NEXT: s_nop 1
+; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
+; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX950-SDAG-NEXT: v_xor_b32_e32 v1, v1, v3
+; GFX950-SDAG-NEXT: v_xor_b32_e32 v0, v0, v2
+; GFX950-SDAG-NEXT: scratch_store_dwordx2 v4, v[0:1], off
+; GFX950-SDAG-NEXT: s_endpgm
+;
+; GFX950-GISEL-LABEL: flat_xor_saddr_i64_nortn:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1
+; GFX950-GISEL-NEXT: s_nop 0
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
+; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
+; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
+; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB52_3
+; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow
+; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB52_4
+; GFX950-GISEL-NEXT: .LBB52_2: ; %atomicrmw.phi
+; GFX950-GISEL-NEXT: s_endpgm
+; GFX950-GISEL-NEXT: .LBB52_3: ; %atomicrmw.global
+; GFX950-GISEL-NEXT: buffer_wbl2 sc1
+; GFX950-GISEL-NEXT: flat_atomic_xor_x2 v[0:1], v[4:5]
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT: buffer_inv sc1
+; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4
+; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execz .LBB52_2
+; GFX950-GISEL-NEXT: .LBB52_4: ; %atomicrmw.private
+; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc
+; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v2, off
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX950-GISEL-NEXT: v_xor_b32_e32 v0, v0, v4
+; GFX950-GISEL-NEXT: v_xor_b32_e32 v1, v1, v5
+; GFX950-GISEL-NEXT: scratch_store_dwordx2 v2, v[0:1], off
+; GFX950-GISEL-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
%unused = atomicrmw xor ptr %gep0, i64 %data syncscope("agent") seq_cst
@@ -2831,6 +5484,86 @@ define amdgpu_ps void @flat_xor_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-GISEL-NEXT: v_xor_b32_e32 v1, v1, v5
; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[0:1], off scope:SCOPE_SE
; GFX1250-GISEL-NEXT: s_endpgm
+;
+; GFX950-SDAG-LABEL: flat_xor_saddr_i64_nortn_neg128:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX950-SDAG-NEXT: s_movk_i32 s0, 0xff80
+; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
+; GFX950-SDAG-NEXT: s_mov_b32 s1, -1
+; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
+; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
+; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
+; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB53_3
+; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow
+; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB53_4
+; GFX950-SDAG-NEXT: .LBB53_2: ; %atomicrmw.phi
+; GFX950-SDAG-NEXT: s_endpgm
+; GFX950-SDAG-NEXT: .LBB53_3: ; %atomicrmw.global
+; GFX950-SDAG-NEXT: buffer_wbl2 sc1
+; GFX950-SDAG-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3]
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT: buffer_inv sc1
+; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-SDAG-NEXT: ; implicit-def: $vgpr3
+; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execz .LBB53_2
+; GFX950-SDAG-NEXT: .LBB53_4: ; %atomicrmw.private
+; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GFX950-SDAG-NEXT: s_nop 1
+; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
+; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX950-SDAG-NEXT: v_xor_b32_e32 v1, v1, v3
+; GFX950-SDAG-NEXT: v_xor_b32_e32 v0, v0, v2
+; GFX950-SDAG-NEXT: scratch_store_dwordx2 v4, v[0:1], off
+; GFX950-SDAG-NEXT: s_endpgm
+;
+; GFX950-GISEL-LABEL: flat_xor_saddr_i64_nortn_neg128:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1
+; GFX950-GISEL-NEXT: s_nop 0
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0
+; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
+; GFX950-GISEL-NEXT: s_nop 0
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
+; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB53_3
+; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow
+; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB53_4
+; GFX950-GISEL-NEXT: .LBB53_2: ; %atomicrmw.phi
+; GFX950-GISEL-NEXT: s_endpgm
+; GFX950-GISEL-NEXT: .LBB53_3: ; %atomicrmw.global
+; GFX950-GISEL-NEXT: buffer_wbl2 sc1
+; GFX950-GISEL-NEXT: flat_atomic_xor_x2 v[0:1], v[4:5]
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT: buffer_inv sc1
+; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4
+; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execz .LBB53_2
+; GFX950-GISEL-NEXT: .LBB53_4: ; %atomicrmw.private
+; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc
+; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v2, off
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX950-GISEL-NEXT: v_xor_b32_e32 v0, v0, v4
+; GFX950-GISEL-NEXT: v_xor_b32_e32 v1, v1, v5
+; GFX950-GISEL-NEXT: scratch_store_dwordx2 v2, v[0:1], off
+; GFX950-GISEL-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128
@@ -2848,6 +5581,25 @@ define amdgpu_ps float @flat_max_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i
; GFX1250-NEXT: flat_atomic_max_i32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: ; return to shader part epilog
+;
+; GFX950-SDAG-LABEL: flat_max_saddr_i32_rtn:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
+; GFX950-SDAG-NEXT: flat_atomic_smax v0, v[0:1], v2 sc0
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX950-GISEL-LABEL: flat_max_saddr_i32_rtn:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX950-GISEL-NEXT: flat_atomic_smax v0, v[2:3], v1 sc0
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
%rtn = atomicrmw max ptr %gep0, i32 %data syncscope("workgroup") seq_cst
@@ -2861,6 +5613,31 @@ define amdgpu_ps float @flat_max_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %vof
; GFX1250-NEXT: flat_atomic_max_i32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: ; return to shader part epilog
+;
+; GFX950-SDAG-LABEL: flat_max_saddr_i32_rtn_neg128:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
+; GFX950-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0
+; GFX950-SDAG-NEXT: s_nop 1
+; GFX950-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX950-SDAG-NEXT: flat_atomic_smax v0, v[0:1], v2 sc0
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX950-GISEL-LABEL: flat_max_saddr_i32_rtn_neg128:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 0xffffff80, v0
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc
+; GFX950-GISEL-NEXT: flat_atomic_smax v0, v[2:3], v1 sc0
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128
@@ -2875,6 +5652,25 @@ define amdgpu_ps void @flat_max_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-NEXT: flat_atomic_max_i32 v0, v1, s[2:3]
; GFX1250-NEXT: s_wait_dscnt 0x0
; GFX1250-NEXT: s_endpgm
+;
+; GFX950-SDAG-LABEL: flat_max_saddr_i32_nortn:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
+; GFX950-SDAG-NEXT: flat_atomic_smax v[0:1], v2
+; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT: s_endpgm
+;
+; GFX950-GISEL-LABEL: flat_max_saddr_i32_nortn:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX950-GISEL-NEXT: flat_atomic_smax v[2:3], v1
+; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
%unused = atomicrmw max ptr %gep0, i32 %data syncscope("workgroup") seq_cst
@@ -2887,6 +5683,31 @@ define amdgpu_ps void @flat_max_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-NEXT: flat_atomic_max_i32 v0, v1, s[2:3] offset:-128
; GFX1250-NEXT: s_wait_dscnt 0x0
; GFX1250-NEXT: s_endpgm
+;
+; GFX950-SDAG-LABEL: flat_max_saddr_i32_nortn_neg128:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
+; GFX950-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0
+; GFX950-SDAG-NEXT: s_nop 1
+; GFX950-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX950-SDAG-NEXT: flat_atomic_smax v[0:1], v2
+; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT: s_endpgm
+;
+; GFX950-GISEL-LABEL: flat_max_saddr_i32_nortn_neg128:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 0xffffff80, v0
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc
+; GFX950-GISEL-NEXT: flat_atomic_smax v[2:3], v1
+; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128
@@ -2986,6 +5807,92 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-GISEL-NEXT: s_branch .LBB58_5
; GFX1250-GISEL-NEXT: .LBB58_5:
+;
+; GFX950-SDAG-LABEL: flat_max_saddr_i64_rtn:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1]
+; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
+; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5
+; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB58_3
+; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow
+; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB58_4
+; GFX950-SDAG-NEXT: .LBB58_2: ; %atomicrmw.phi
+; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX950-SDAG-NEXT: s_branch .LBB58_5
+; GFX950-SDAG-NEXT: .LBB58_3: ; %atomicrmw.global
+; GFX950-SDAG-NEXT: flat_atomic_smax_x2 v[0:1], v[4:5], v[2:3] sc0
+; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execz .LBB58_2
+; GFX950-SDAG-NEXT: .LBB58_4: ; %atomicrmw.private
+; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; GFX950-SDAG-NEXT: s_nop 1
+; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
+; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX950-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[0:1], v[2:3]
+; GFX950-SDAG-NEXT: s_nop 1
+; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
+; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
+; GFX950-SDAG-NEXT: scratch_store_dwordx2 v4, v[2:3], off
+; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX950-SDAG-NEXT: s_branch .LBB58_5
+; GFX950-SDAG-NEXT: .LBB58_5:
+;
+; GFX950-GISEL-LABEL: flat_max_saddr_i64_rtn:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0
+; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
+; GFX950-GISEL-NEXT: s_nop 0
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1
+; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3
+; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB58_3
+; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow
+; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB58_4
+; GFX950-GISEL-NEXT: .LBB58_2: ; %atomicrmw.phi
+; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX950-GISEL-NEXT: s_branch .LBB58_5
+; GFX950-GISEL-NEXT: .LBB58_3: ; %atomicrmw.global
+; GFX950-GISEL-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3], v[4:5] sc0
+; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execz .LBB58_2
+; GFX950-GISEL-NEXT: .LBB58_4: ; %atomicrmw.private
+; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc
+; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v6, off
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX950-GISEL-NEXT: v_cmp_gt_i64_e32 vcc, v[0:1], v[4:5]
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v0, vcc
+; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v1, vcc
+; GFX950-GISEL-NEXT: scratch_store_dwordx2 v6, v[2:3], off
+; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX950-GISEL-NEXT: s_branch .LBB58_5
+; GFX950-GISEL-NEXT: .LBB58_5:
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
%rtn = atomicrmw max ptr %gep0, i64 %data syncscope("workgroup") seq_cst
@@ -3091,6 +5998,98 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-GISEL-NEXT: s_branch .LBB59_5
; GFX1250-GISEL-NEXT: .LBB59_5:
+;
+; GFX950-SDAG-LABEL: flat_max_saddr_i64_rtn_neg128:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX950-SDAG-NEXT: s_movk_i32 s0, 0xff80
+; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
+; GFX950-SDAG-NEXT: s_mov_b32 s1, -1
+; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
+; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
+; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5
+; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB59_3
+; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow
+; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB59_4
+; GFX950-SDAG-NEXT: .LBB59_2: ; %atomicrmw.phi
+; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX950-SDAG-NEXT: s_branch .LBB59_5
+; GFX950-SDAG-NEXT: .LBB59_3: ; %atomicrmw.global
+; GFX950-SDAG-NEXT: flat_atomic_smax_x2 v[0:1], v[4:5], v[2:3] sc0
+; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execz .LBB59_2
+; GFX950-SDAG-NEXT: .LBB59_4: ; %atomicrmw.private
+; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; GFX950-SDAG-NEXT: s_nop 1
+; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
+; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX950-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[0:1], v[2:3]
+; GFX950-SDAG-NEXT: s_nop 1
+; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
+; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
+; GFX950-SDAG-NEXT: scratch_store_dwordx2 v4, v[2:3], off
+; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX950-SDAG-NEXT: s_branch .LBB59_5
+; GFX950-SDAG-NEXT: .LBB59_5:
+;
+; GFX950-GISEL-LABEL: flat_max_saddr_i64_rtn_neg128:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1
+; GFX950-GISEL-NEXT: s_nop 0
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 0xffffff80, v0
+; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
+; GFX950-GISEL-NEXT: s_nop 0
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v1, vcc
+; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3
+; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB59_3
+; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow
+; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB59_4
+; GFX950-GISEL-NEXT: .LBB59_2: ; %atomicrmw.phi
+; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX950-GISEL-NEXT: s_branch .LBB59_5
+; GFX950-GISEL-NEXT: .LBB59_3: ; %atomicrmw.global
+; GFX950-GISEL-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3], v[4:5] sc0
+; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execz .LBB59_2
+; GFX950-GISEL-NEXT: .LBB59_4: ; %atomicrmw.private
+; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc
+; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v6, off
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX950-GISEL-NEXT: v_cmp_gt_i64_e32 vcc, v[0:1], v[4:5]
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v0, vcc
+; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v1, vcc
+; GFX950-GISEL-NEXT: scratch_store_dwordx2 v6, v[2:3], off
+; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX950-GISEL-NEXT: s_branch .LBB59_5
+; GFX950-GISEL-NEXT: .LBB59_5:
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128
@@ -3176,6 +6175,80 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-GISEL-NEXT: v_max_i64 v[0:1], v[0:1], v[4:5]
; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[0:1], off scope:SCOPE_SE
; GFX1250-GISEL-NEXT: s_endpgm
+;
+; GFX950-SDAG-LABEL: flat_max_saddr_i64_nortn:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
+; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
+; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
+; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB60_3
+; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow
+; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB60_4
+; GFX950-SDAG-NEXT: .LBB60_2: ; %atomicrmw.phi
+; GFX950-SDAG-NEXT: s_endpgm
+; GFX950-SDAG-NEXT: .LBB60_3: ; %atomicrmw.global
+; GFX950-SDAG-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3]
+; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execz .LBB60_2
+; GFX950-SDAG-NEXT: .LBB60_4: ; %atomicrmw.private
+; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GFX950-SDAG-NEXT: s_nop 1
+; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
+; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX950-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[0:1], v[2:3]
+; GFX950-SDAG-NEXT: s_nop 1
+; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX950-SDAG-NEXT: scratch_store_dwordx2 v4, v[0:1], off
+; GFX950-SDAG-NEXT: s_endpgm
+;
+; GFX950-GISEL-LABEL: flat_max_saddr_i64_nortn:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1
+; GFX950-GISEL-NEXT: s_nop 0
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
+; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
+; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
+; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB60_3
+; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow
+; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB60_4
+; GFX950-GISEL-NEXT: .LBB60_2: ; %atomicrmw.phi
+; GFX950-GISEL-NEXT: s_endpgm
+; GFX950-GISEL-NEXT: .LBB60_3: ; %atomicrmw.global
+; GFX950-GISEL-NEXT: flat_atomic_smax_x2 v[0:1], v[4:5]
+; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execz .LBB60_2
+; GFX950-GISEL-NEXT: .LBB60_4: ; %atomicrmw.private
+; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc
+; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v2, off
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX950-GISEL-NEXT: v_cmp_gt_i64_e32 vcc, v[0:1], v[4:5]
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX950-GISEL-NEXT: scratch_store_dwordx2 v2, v[0:1], off
+; GFX950-GISEL-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
%unused = atomicrmw max ptr %gep0, i64 %data syncscope("workgroup") seq_cst
@@ -3265,6 +6338,86 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-GISEL-NEXT: v_max_i64 v[0:1], v[0:1], v[4:5]
; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[0:1], off scope:SCOPE_SE
; GFX1250-GISEL-NEXT: s_endpgm
+;
+; GFX950-SDAG-LABEL: flat_max_saddr_i64_nortn_neg128:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX950-SDAG-NEXT: s_movk_i32 s0, 0xff80
+; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
+; GFX950-SDAG-NEXT: s_mov_b32 s1, -1
+; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
+; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
+; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
+; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB61_3
+; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow
+; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB61_4
+; GFX950-SDAG-NEXT: .LBB61_2: ; %atomicrmw.phi
+; GFX950-SDAG-NEXT: s_endpgm
+; GFX950-SDAG-NEXT: .LBB61_3: ; %atomicrmw.global
+; GFX950-SDAG-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3]
+; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execz .LBB61_2
+; GFX950-SDAG-NEXT: .LBB61_4: ; %atomicrmw.private
+; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GFX950-SDAG-NEXT: s_nop 1
+; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
+; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX950-SDAG-NEXT: v_cmp_gt_i64_e32 vcc, v[0:1], v[2:3]
+; GFX950-SDAG-NEXT: s_nop 1
+; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX950-SDAG-NEXT: scratch_store_dwordx2 v4, v[0:1], off
+; GFX950-SDAG-NEXT: s_endpgm
+;
+; GFX950-GISEL-LABEL: flat_max_saddr_i64_nortn_neg128:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1
+; GFX950-GISEL-NEXT: s_nop 0
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0
+; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
+; GFX950-GISEL-NEXT: s_nop 0
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
+; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB61_3
+; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow
+; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB61_4
+; GFX950-GISEL-NEXT: .LBB61_2: ; %atomicrmw.phi
+; GFX950-GISEL-NEXT: s_endpgm
+; GFX950-GISEL-NEXT: .LBB61_3: ; %atomicrmw.global
+; GFX950-GISEL-NEXT: flat_atomic_smax_x2 v[0:1], v[4:5]
+; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execz .LBB61_2
+; GFX950-GISEL-NEXT: .LBB61_4: ; %atomicrmw.private
+; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc
+; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v2, off
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX950-GISEL-NEXT: v_cmp_gt_i64_e32 vcc, v[0:1], v[4:5]
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX950-GISEL-NEXT: scratch_store_dwordx2 v2, v[0:1], off
+; GFX950-GISEL-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128
@@ -3282,6 +6435,25 @@ define amdgpu_ps float @flat_min_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i
; GFX1250-NEXT: flat_atomic_min_i32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: ; return to shader part epilog
+;
+; GFX950-SDAG-LABEL: flat_min_saddr_i32_rtn:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
+; GFX950-SDAG-NEXT: flat_atomic_smin v0, v[0:1], v2 sc0
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX950-GISEL-LABEL: flat_min_saddr_i32_rtn:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX950-GISEL-NEXT: flat_atomic_smin v0, v[2:3], v1 sc0
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
%rtn = atomicrmw min ptr %gep0, i32 %data syncscope("workgroup") seq_cst
@@ -3295,6 +6467,31 @@ define amdgpu_ps float @flat_min_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %vof
; GFX1250-NEXT: flat_atomic_min_i32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: ; return to shader part epilog
+;
+; GFX950-SDAG-LABEL: flat_min_saddr_i32_rtn_neg128:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
+; GFX950-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0
+; GFX950-SDAG-NEXT: s_nop 1
+; GFX950-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX950-SDAG-NEXT: flat_atomic_smin v0, v[0:1], v2 sc0
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX950-GISEL-LABEL: flat_min_saddr_i32_rtn_neg128:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 0xffffff80, v0
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc
+; GFX950-GISEL-NEXT: flat_atomic_smin v0, v[2:3], v1 sc0
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128
@@ -3309,6 +6506,25 @@ define amdgpu_ps void @flat_min_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-NEXT: flat_atomic_min_i32 v0, v1, s[2:3]
; GFX1250-NEXT: s_wait_dscnt 0x0
; GFX1250-NEXT: s_endpgm
+;
+; GFX950-SDAG-LABEL: flat_min_saddr_i32_nortn:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
+; GFX950-SDAG-NEXT: flat_atomic_smin v[0:1], v2
+; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT: s_endpgm
+;
+; GFX950-GISEL-LABEL: flat_min_saddr_i32_nortn:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX950-GISEL-NEXT: flat_atomic_smin v[2:3], v1
+; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
%unused = atomicrmw min ptr %gep0, i32 %data syncscope("workgroup") seq_cst
@@ -3321,6 +6537,31 @@ define amdgpu_ps void @flat_min_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-NEXT: flat_atomic_min_i32 v0, v1, s[2:3] offset:-128
; GFX1250-NEXT: s_wait_dscnt 0x0
; GFX1250-NEXT: s_endpgm
+;
+; GFX950-SDAG-LABEL: flat_min_saddr_i32_nortn_neg128:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
+; GFX950-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0
+; GFX950-SDAG-NEXT: s_nop 1
+; GFX950-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX950-SDAG-NEXT: flat_atomic_smin v[0:1], v2
+; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT: s_endpgm
+;
+; GFX950-GISEL-LABEL: flat_min_saddr_i32_nortn_neg128:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 0xffffff80, v0
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc
+; GFX950-GISEL-NEXT: flat_atomic_smin v[2:3], v1
+; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128
@@ -3420,6 +6661,92 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-GISEL-NEXT: s_branch .LBB66_5
; GFX1250-GISEL-NEXT: .LBB66_5:
+;
+; GFX950-SDAG-LABEL: flat_min_saddr_i64_rtn:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1]
+; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
+; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5
+; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB66_3
+; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow
+; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB66_4
+; GFX950-SDAG-NEXT: .LBB66_2: ; %atomicrmw.phi
+; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX950-SDAG-NEXT: s_branch .LBB66_5
+; GFX950-SDAG-NEXT: .LBB66_3: ; %atomicrmw.global
+; GFX950-SDAG-NEXT: flat_atomic_smin_x2 v[0:1], v[4:5], v[2:3] sc0
+; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execz .LBB66_2
+; GFX950-SDAG-NEXT: .LBB66_4: ; %atomicrmw.private
+; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; GFX950-SDAG-NEXT: s_nop 1
+; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
+; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX950-SDAG-NEXT: v_cmp_le_i64_e32 vcc, v[0:1], v[2:3]
+; GFX950-SDAG-NEXT: s_nop 1
+; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
+; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
+; GFX950-SDAG-NEXT: scratch_store_dwordx2 v4, v[2:3], off
+; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX950-SDAG-NEXT: s_branch .LBB66_5
+; GFX950-SDAG-NEXT: .LBB66_5:
+;
+; GFX950-GISEL-LABEL: flat_min_saddr_i64_rtn:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0
+; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
+; GFX950-GISEL-NEXT: s_nop 0
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1
+; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3
+; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB66_3
+; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow
+; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB66_4
+; GFX950-GISEL-NEXT: .LBB66_2: ; %atomicrmw.phi
+; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX950-GISEL-NEXT: s_branch .LBB66_5
+; GFX950-GISEL-NEXT: .LBB66_3: ; %atomicrmw.global
+; GFX950-GISEL-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3], v[4:5] sc0
+; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execz .LBB66_2
+; GFX950-GISEL-NEXT: .LBB66_4: ; %atomicrmw.private
+; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc
+; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v6, off
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX950-GISEL-NEXT: v_cmp_lt_i64_e32 vcc, v[0:1], v[4:5]
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v0, vcc
+; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v1, vcc
+; GFX950-GISEL-NEXT: scratch_store_dwordx2 v6, v[2:3], off
+; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX950-GISEL-NEXT: s_branch .LBB66_5
+; GFX950-GISEL-NEXT: .LBB66_5:
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
%rtn = atomicrmw min ptr %gep0, i64 %data syncscope("workgroup") seq_cst
@@ -3525,6 +6852,98 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-GISEL-NEXT: s_branch .LBB67_5
; GFX1250-GISEL-NEXT: .LBB67_5:
+;
+; GFX950-SDAG-LABEL: flat_min_saddr_i64_rtn_neg128:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX950-SDAG-NEXT: s_movk_i32 s0, 0xff80
+; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
+; GFX950-SDAG-NEXT: s_mov_b32 s1, -1
+; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
+; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
+; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5
+; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB67_3
+; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow
+; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB67_4
+; GFX950-SDAG-NEXT: .LBB67_2: ; %atomicrmw.phi
+; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX950-SDAG-NEXT: s_branch .LBB67_5
+; GFX950-SDAG-NEXT: .LBB67_3: ; %atomicrmw.global
+; GFX950-SDAG-NEXT: flat_atomic_smin_x2 v[0:1], v[4:5], v[2:3] sc0
+; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execz .LBB67_2
+; GFX950-SDAG-NEXT: .LBB67_4: ; %atomicrmw.private
+; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; GFX950-SDAG-NEXT: s_nop 1
+; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
+; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX950-SDAG-NEXT: v_cmp_le_i64_e32 vcc, v[0:1], v[2:3]
+; GFX950-SDAG-NEXT: s_nop 1
+; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
+; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
+; GFX950-SDAG-NEXT: scratch_store_dwordx2 v4, v[2:3], off
+; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX950-SDAG-NEXT: s_branch .LBB67_5
+; GFX950-SDAG-NEXT: .LBB67_5:
+;
+; GFX950-GISEL-LABEL: flat_min_saddr_i64_rtn_neg128:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1
+; GFX950-GISEL-NEXT: s_nop 0
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 0xffffff80, v0
+; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
+; GFX950-GISEL-NEXT: s_nop 0
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v1, vcc
+; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3
+; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB67_3
+; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow
+; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB67_4
+; GFX950-GISEL-NEXT: .LBB67_2: ; %atomicrmw.phi
+; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX950-GISEL-NEXT: s_branch .LBB67_5
+; GFX950-GISEL-NEXT: .LBB67_3: ; %atomicrmw.global
+; GFX950-GISEL-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3], v[4:5] sc0
+; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execz .LBB67_2
+; GFX950-GISEL-NEXT: .LBB67_4: ; %atomicrmw.private
+; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc
+; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v6, off
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX950-GISEL-NEXT: v_cmp_lt_i64_e32 vcc, v[0:1], v[4:5]
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v0, vcc
+; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v1, vcc
+; GFX950-GISEL-NEXT: scratch_store_dwordx2 v6, v[2:3], off
+; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX950-GISEL-NEXT: s_branch .LBB67_5
+; GFX950-GISEL-NEXT: .LBB67_5:
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128
@@ -3610,6 +7029,80 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-GISEL-NEXT: v_min_i64 v[0:1], v[0:1], v[4:5]
; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[0:1], off scope:SCOPE_SE
; GFX1250-GISEL-NEXT: s_endpgm
+;
+; GFX950-SDAG-LABEL: flat_min_saddr_i64_nortn:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
+; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
+; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
+; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB68_3
+; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow
+; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB68_4
+; GFX950-SDAG-NEXT: .LBB68_2: ; %atomicrmw.phi
+; GFX950-SDAG-NEXT: s_endpgm
+; GFX950-SDAG-NEXT: .LBB68_3: ; %atomicrmw.global
+; GFX950-SDAG-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3]
+; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execz .LBB68_2
+; GFX950-SDAG-NEXT: .LBB68_4: ; %atomicrmw.private
+; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GFX950-SDAG-NEXT: s_nop 1
+; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
+; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX950-SDAG-NEXT: v_cmp_le_i64_e32 vcc, v[0:1], v[2:3]
+; GFX950-SDAG-NEXT: s_nop 1
+; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX950-SDAG-NEXT: scratch_store_dwordx2 v4, v[0:1], off
+; GFX950-SDAG-NEXT: s_endpgm
+;
+; GFX950-GISEL-LABEL: flat_min_saddr_i64_nortn:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1
+; GFX950-GISEL-NEXT: s_nop 0
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
+; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
+; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
+; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB68_3
+; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow
+; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB68_4
+; GFX950-GISEL-NEXT: .LBB68_2: ; %atomicrmw.phi
+; GFX950-GISEL-NEXT: s_endpgm
+; GFX950-GISEL-NEXT: .LBB68_3: ; %atomicrmw.global
+; GFX950-GISEL-NEXT: flat_atomic_smin_x2 v[0:1], v[4:5]
+; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execz .LBB68_2
+; GFX950-GISEL-NEXT: .LBB68_4: ; %atomicrmw.private
+; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc
+; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v2, off
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX950-GISEL-NEXT: v_cmp_lt_i64_e32 vcc, v[0:1], v[4:5]
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX950-GISEL-NEXT: scratch_store_dwordx2 v2, v[0:1], off
+; GFX950-GISEL-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
%unused = atomicrmw min ptr %gep0, i64 %data syncscope("workgroup") seq_cst
@@ -3699,6 +7192,86 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-GISEL-NEXT: v_min_i64 v[0:1], v[0:1], v[4:5]
; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[0:1], off scope:SCOPE_SE
; GFX1250-GISEL-NEXT: s_endpgm
+;
+; GFX950-SDAG-LABEL: flat_min_saddr_i64_nortn_neg128:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX950-SDAG-NEXT: s_movk_i32 s0, 0xff80
+; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
+; GFX950-SDAG-NEXT: s_mov_b32 s1, -1
+; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
+; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
+; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
+; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB69_3
+; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow
+; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB69_4
+; GFX950-SDAG-NEXT: .LBB69_2: ; %atomicrmw.phi
+; GFX950-SDAG-NEXT: s_endpgm
+; GFX950-SDAG-NEXT: .LBB69_3: ; %atomicrmw.global
+; GFX950-SDAG-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3]
+; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execz .LBB69_2
+; GFX950-SDAG-NEXT: .LBB69_4: ; %atomicrmw.private
+; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GFX950-SDAG-NEXT: s_nop 1
+; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
+; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX950-SDAG-NEXT: v_cmp_le_i64_e32 vcc, v[0:1], v[2:3]
+; GFX950-SDAG-NEXT: s_nop 1
+; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX950-SDAG-NEXT: scratch_store_dwordx2 v4, v[0:1], off
+; GFX950-SDAG-NEXT: s_endpgm
+;
+; GFX950-GISEL-LABEL: flat_min_saddr_i64_nortn_neg128:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1
+; GFX950-GISEL-NEXT: s_nop 0
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0
+; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
+; GFX950-GISEL-NEXT: s_nop 0
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
+; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB69_3
+; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow
+; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB69_4
+; GFX950-GISEL-NEXT: .LBB69_2: ; %atomicrmw.phi
+; GFX950-GISEL-NEXT: s_endpgm
+; GFX950-GISEL-NEXT: .LBB69_3: ; %atomicrmw.global
+; GFX950-GISEL-NEXT: flat_atomic_smin_x2 v[0:1], v[4:5]
+; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execz .LBB69_2
+; GFX950-GISEL-NEXT: .LBB69_4: ; %atomicrmw.private
+; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc
+; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v2, off
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX950-GISEL-NEXT: v_cmp_lt_i64_e32 vcc, v[0:1], v[4:5]
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX950-GISEL-NEXT: scratch_store_dwordx2 v2, v[0:1], off
+; GFX950-GISEL-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128
@@ -3716,6 +7289,25 @@ define amdgpu_ps float @flat_umax_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset,
; GFX1250-NEXT: flat_atomic_max_u32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: ; return to shader part epilog
+;
+; GFX950-SDAG-LABEL: flat_umax_saddr_i32_rtn:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
+; GFX950-SDAG-NEXT: flat_atomic_umax v0, v[0:1], v2 sc0
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX950-GISEL-LABEL: flat_umax_saddr_i32_rtn:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX950-GISEL-NEXT: flat_atomic_umax v0, v[2:3], v1 sc0
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
%rtn = atomicrmw umax ptr %gep0, i32 %data syncscope("workgroup") seq_cst
@@ -3729,6 +7321,31 @@ define amdgpu_ps float @flat_umax_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-NEXT: flat_atomic_max_u32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: ; return to shader part epilog
+;
+; GFX950-SDAG-LABEL: flat_umax_saddr_i32_rtn_neg128:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
+; GFX950-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0
+; GFX950-SDAG-NEXT: s_nop 1
+; GFX950-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX950-SDAG-NEXT: flat_atomic_umax v0, v[0:1], v2 sc0
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX950-GISEL-LABEL: flat_umax_saddr_i32_rtn_neg128:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 0xffffff80, v0
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc
+; GFX950-GISEL-NEXT: flat_atomic_umax v0, v[2:3], v1 sc0
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128
@@ -3743,6 +7360,25 @@ define amdgpu_ps void @flat_umax_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-NEXT: flat_atomic_max_u32 v0, v1, s[2:3]
; GFX1250-NEXT: s_wait_dscnt 0x0
; GFX1250-NEXT: s_endpgm
+;
+; GFX950-SDAG-LABEL: flat_umax_saddr_i32_nortn:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
+; GFX950-SDAG-NEXT: flat_atomic_umax v[0:1], v2
+; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT: s_endpgm
+;
+; GFX950-GISEL-LABEL: flat_umax_saddr_i32_nortn:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX950-GISEL-NEXT: flat_atomic_umax v[2:3], v1
+; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
%unused = atomicrmw umax ptr %gep0, i32 %data syncscope("workgroup") seq_cst
@@ -3755,6 +7391,31 @@ define amdgpu_ps void @flat_umax_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %v
; GFX1250-NEXT: flat_atomic_max_u32 v0, v1, s[2:3] offset:-128
; GFX1250-NEXT: s_wait_dscnt 0x0
; GFX1250-NEXT: s_endpgm
+;
+; GFX950-SDAG-LABEL: flat_umax_saddr_i32_nortn_neg128:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
+; GFX950-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0
+; GFX950-SDAG-NEXT: s_nop 1
+; GFX950-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX950-SDAG-NEXT: flat_atomic_umax v[0:1], v2
+; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT: s_endpgm
+;
+; GFX950-GISEL-LABEL: flat_umax_saddr_i32_nortn_neg128:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 0xffffff80, v0
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc
+; GFX950-GISEL-NEXT: flat_atomic_umax v[2:3], v1
+; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128
@@ -3854,6 +7515,92 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn(ptr inreg %sbase, i32 %vof
; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-GISEL-NEXT: s_branch .LBB74_5
; GFX1250-GISEL-NEXT: .LBB74_5:
+;
+; GFX950-SDAG-LABEL: flat_umax_saddr_i64_rtn:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1]
+; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
+; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5
+; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB74_3
+; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow
+; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB74_4
+; GFX950-SDAG-NEXT: .LBB74_2: ; %atomicrmw.phi
+; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX950-SDAG-NEXT: s_branch .LBB74_5
+; GFX950-SDAG-NEXT: .LBB74_3: ; %atomicrmw.global
+; GFX950-SDAG-NEXT: flat_atomic_umax_x2 v[0:1], v[4:5], v[2:3] sc0
+; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execz .LBB74_2
+; GFX950-SDAG-NEXT: .LBB74_4: ; %atomicrmw.private
+; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; GFX950-SDAG-NEXT: s_nop 1
+; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
+; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX950-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3]
+; GFX950-SDAG-NEXT: s_nop 1
+; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
+; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
+; GFX950-SDAG-NEXT: scratch_store_dwordx2 v4, v[2:3], off
+; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX950-SDAG-NEXT: s_branch .LBB74_5
+; GFX950-SDAG-NEXT: .LBB74_5:
+;
+; GFX950-GISEL-LABEL: flat_umax_saddr_i64_rtn:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0
+; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
+; GFX950-GISEL-NEXT: s_nop 0
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1
+; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3
+; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB74_3
+; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow
+; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB74_4
+; GFX950-GISEL-NEXT: .LBB74_2: ; %atomicrmw.phi
+; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX950-GISEL-NEXT: s_branch .LBB74_5
+; GFX950-GISEL-NEXT: .LBB74_3: ; %atomicrmw.global
+; GFX950-GISEL-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3], v[4:5] sc0
+; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execz .LBB74_2
+; GFX950-GISEL-NEXT: .LBB74_4: ; %atomicrmw.private
+; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc
+; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v6, off
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX950-GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[4:5]
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v0, vcc
+; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v1, vcc
+; GFX950-GISEL-NEXT: scratch_store_dwordx2 v6, v[2:3], off
+; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX950-GISEL-NEXT: s_branch .LBB74_5
+; GFX950-GISEL-NEXT: .LBB74_5:
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
%rtn = atomicrmw umax ptr %gep0, i64 %data syncscope("workgroup") seq_cst
@@ -3959,6 +7706,98 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn_neg128(ptr inreg %sbase, i
; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-GISEL-NEXT: s_branch .LBB75_5
; GFX1250-GISEL-NEXT: .LBB75_5:
+;
+; GFX950-SDAG-LABEL: flat_umax_saddr_i64_rtn_neg128:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX950-SDAG-NEXT: s_movk_i32 s0, 0xff80
+; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
+; GFX950-SDAG-NEXT: s_mov_b32 s1, -1
+; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
+; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
+; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5
+; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB75_3
+; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow
+; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB75_4
+; GFX950-SDAG-NEXT: .LBB75_2: ; %atomicrmw.phi
+; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX950-SDAG-NEXT: s_branch .LBB75_5
+; GFX950-SDAG-NEXT: .LBB75_3: ; %atomicrmw.global
+; GFX950-SDAG-NEXT: flat_atomic_umax_x2 v[0:1], v[4:5], v[2:3] sc0
+; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execz .LBB75_2
+; GFX950-SDAG-NEXT: .LBB75_4: ; %atomicrmw.private
+; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; GFX950-SDAG-NEXT: s_nop 1
+; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
+; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX950-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3]
+; GFX950-SDAG-NEXT: s_nop 1
+; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
+; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
+; GFX950-SDAG-NEXT: scratch_store_dwordx2 v4, v[2:3], off
+; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX950-SDAG-NEXT: s_branch .LBB75_5
+; GFX950-SDAG-NEXT: .LBB75_5:
+;
+; GFX950-GISEL-LABEL: flat_umax_saddr_i64_rtn_neg128:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1
+; GFX950-GISEL-NEXT: s_nop 0
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 0xffffff80, v0
+; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
+; GFX950-GISEL-NEXT: s_nop 0
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v1, vcc
+; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3
+; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB75_3
+; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow
+; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB75_4
+; GFX950-GISEL-NEXT: .LBB75_2: ; %atomicrmw.phi
+; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX950-GISEL-NEXT: s_branch .LBB75_5
+; GFX950-GISEL-NEXT: .LBB75_3: ; %atomicrmw.global
+; GFX950-GISEL-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3], v[4:5] sc0
+; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execz .LBB75_2
+; GFX950-GISEL-NEXT: .LBB75_4: ; %atomicrmw.private
+; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc
+; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v6, off
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX950-GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[4:5]
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v0, vcc
+; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v1, vcc
+; GFX950-GISEL-NEXT: scratch_store_dwordx2 v6, v[2:3], off
+; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX950-GISEL-NEXT: s_branch .LBB75_5
+; GFX950-GISEL-NEXT: .LBB75_5:
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128
@@ -4044,6 +7883,80 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-GISEL-NEXT: v_max_u64 v[0:1], v[0:1], v[4:5]
; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[0:1], off scope:SCOPE_SE
; GFX1250-GISEL-NEXT: s_endpgm
+;
+; GFX950-SDAG-LABEL: flat_umax_saddr_i64_nortn:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
+; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
+; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
+; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB76_3
+; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow
+; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB76_4
+; GFX950-SDAG-NEXT: .LBB76_2: ; %atomicrmw.phi
+; GFX950-SDAG-NEXT: s_endpgm
+; GFX950-SDAG-NEXT: .LBB76_3: ; %atomicrmw.global
+; GFX950-SDAG-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3]
+; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execz .LBB76_2
+; GFX950-SDAG-NEXT: .LBB76_4: ; %atomicrmw.private
+; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GFX950-SDAG-NEXT: s_nop 1
+; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
+; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX950-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3]
+; GFX950-SDAG-NEXT: s_nop 1
+; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX950-SDAG-NEXT: scratch_store_dwordx2 v4, v[0:1], off
+; GFX950-SDAG-NEXT: s_endpgm
+;
+; GFX950-GISEL-LABEL: flat_umax_saddr_i64_nortn:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1
+; GFX950-GISEL-NEXT: s_nop 0
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
+; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
+; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
+; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB76_3
+; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow
+; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB76_4
+; GFX950-GISEL-NEXT: .LBB76_2: ; %atomicrmw.phi
+; GFX950-GISEL-NEXT: s_endpgm
+; GFX950-GISEL-NEXT: .LBB76_3: ; %atomicrmw.global
+; GFX950-GISEL-NEXT: flat_atomic_umax_x2 v[0:1], v[4:5]
+; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execz .LBB76_2
+; GFX950-GISEL-NEXT: .LBB76_4: ; %atomicrmw.private
+; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc
+; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v2, off
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX950-GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[4:5]
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX950-GISEL-NEXT: scratch_store_dwordx2 v2, v[0:1], off
+; GFX950-GISEL-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
%unused = atomicrmw umax ptr %gep0, i64 %data syncscope("workgroup") seq_cst
@@ -4133,6 +8046,86 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v
; GFX1250-GISEL-NEXT: v_max_u64 v[0:1], v[0:1], v[4:5]
; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[0:1], off scope:SCOPE_SE
; GFX1250-GISEL-NEXT: s_endpgm
+;
+; GFX950-SDAG-LABEL: flat_umax_saddr_i64_nortn_neg128:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX950-SDAG-NEXT: s_movk_i32 s0, 0xff80
+; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
+; GFX950-SDAG-NEXT: s_mov_b32 s1, -1
+; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
+; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
+; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
+; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB77_3
+; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow
+; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB77_4
+; GFX950-SDAG-NEXT: .LBB77_2: ; %atomicrmw.phi
+; GFX950-SDAG-NEXT: s_endpgm
+; GFX950-SDAG-NEXT: .LBB77_3: ; %atomicrmw.global
+; GFX950-SDAG-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3]
+; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execz .LBB77_2
+; GFX950-SDAG-NEXT: .LBB77_4: ; %atomicrmw.private
+; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GFX950-SDAG-NEXT: s_nop 1
+; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
+; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX950-SDAG-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3]
+; GFX950-SDAG-NEXT: s_nop 1
+; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX950-SDAG-NEXT: scratch_store_dwordx2 v4, v[0:1], off
+; GFX950-SDAG-NEXT: s_endpgm
+;
+; GFX950-GISEL-LABEL: flat_umax_saddr_i64_nortn_neg128:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1
+; GFX950-GISEL-NEXT: s_nop 0
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0
+; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
+; GFX950-GISEL-NEXT: s_nop 0
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
+; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB77_3
+; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow
+; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB77_4
+; GFX950-GISEL-NEXT: .LBB77_2: ; %atomicrmw.phi
+; GFX950-GISEL-NEXT: s_endpgm
+; GFX950-GISEL-NEXT: .LBB77_3: ; %atomicrmw.global
+; GFX950-GISEL-NEXT: flat_atomic_umax_x2 v[0:1], v[4:5]
+; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execz .LBB77_2
+; GFX950-GISEL-NEXT: .LBB77_4: ; %atomicrmw.private
+; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc
+; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v2, off
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX950-GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[4:5]
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX950-GISEL-NEXT: scratch_store_dwordx2 v2, v[0:1], off
+; GFX950-GISEL-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128
@@ -4150,6 +8143,25 @@ define amdgpu_ps float @flat_umin_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset,
; GFX1250-NEXT: flat_atomic_min_u32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: ; return to shader part epilog
+;
+; GFX950-SDAG-LABEL: flat_umin_saddr_i32_rtn:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
+; GFX950-SDAG-NEXT: flat_atomic_umin v0, v[0:1], v2 sc0
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX950-GISEL-LABEL: flat_umin_saddr_i32_rtn:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX950-GISEL-NEXT: flat_atomic_umin v0, v[2:3], v1 sc0
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
%rtn = atomicrmw umin ptr %gep0, i32 %data syncscope("workgroup") seq_cst
@@ -4163,6 +8175,31 @@ define amdgpu_ps float @flat_umin_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-NEXT: flat_atomic_min_u32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: ; return to shader part epilog
+;
+; GFX950-SDAG-LABEL: flat_umin_saddr_i32_rtn_neg128:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
+; GFX950-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0
+; GFX950-SDAG-NEXT: s_nop 1
+; GFX950-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX950-SDAG-NEXT: flat_atomic_umin v0, v[0:1], v2 sc0
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX950-GISEL-LABEL: flat_umin_saddr_i32_rtn_neg128:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 0xffffff80, v0
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc
+; GFX950-GISEL-NEXT: flat_atomic_umin v0, v[2:3], v1 sc0
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128
@@ -4177,6 +8214,25 @@ define amdgpu_ps void @flat_umin_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-NEXT: flat_atomic_min_u32 v0, v1, s[2:3]
; GFX1250-NEXT: s_wait_dscnt 0x0
; GFX1250-NEXT: s_endpgm
+;
+; GFX950-SDAG-LABEL: flat_umin_saddr_i32_nortn:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
+; GFX950-SDAG-NEXT: flat_atomic_umin v[0:1], v2
+; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT: s_endpgm
+;
+; GFX950-GISEL-LABEL: flat_umin_saddr_i32_nortn:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX950-GISEL-NEXT: flat_atomic_umin v[2:3], v1
+; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
%unused = atomicrmw umin ptr %gep0, i32 %data syncscope("workgroup") seq_cst
@@ -4189,6 +8245,31 @@ define amdgpu_ps void @flat_umin_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %v
; GFX1250-NEXT: flat_atomic_min_u32 v0, v1, s[2:3] offset:-128
; GFX1250-NEXT: s_wait_dscnt 0x0
; GFX1250-NEXT: s_endpgm
+;
+; GFX950-SDAG-LABEL: flat_umin_saddr_i32_nortn_neg128:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
+; GFX950-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0
+; GFX950-SDAG-NEXT: s_nop 1
+; GFX950-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX950-SDAG-NEXT: flat_atomic_umin v[0:1], v2
+; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT: s_endpgm
+;
+; GFX950-GISEL-LABEL: flat_umin_saddr_i32_nortn_neg128:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 0xffffff80, v0
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc
+; GFX950-GISEL-NEXT: flat_atomic_umin v[2:3], v1
+; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128
@@ -4288,6 +8369,92 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn(ptr inreg %sbase, i32 %vof
; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-GISEL-NEXT: s_branch .LBB82_5
; GFX1250-GISEL-NEXT: .LBB82_5:
+;
+; GFX950-SDAG-LABEL: flat_umin_saddr_i64_rtn:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1]
+; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
+; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5
+; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB82_3
+; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow
+; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB82_4
+; GFX950-SDAG-NEXT: .LBB82_2: ; %atomicrmw.phi
+; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX950-SDAG-NEXT: s_branch .LBB82_5
+; GFX950-SDAG-NEXT: .LBB82_3: ; %atomicrmw.global
+; GFX950-SDAG-NEXT: flat_atomic_umin_x2 v[0:1], v[4:5], v[2:3] sc0
+; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execz .LBB82_2
+; GFX950-SDAG-NEXT: .LBB82_4: ; %atomicrmw.private
+; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; GFX950-SDAG-NEXT: s_nop 1
+; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
+; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX950-SDAG-NEXT: v_cmp_le_u64_e32 vcc, v[0:1], v[2:3]
+; GFX950-SDAG-NEXT: s_nop 1
+; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
+; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
+; GFX950-SDAG-NEXT: scratch_store_dwordx2 v4, v[2:3], off
+; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX950-SDAG-NEXT: s_branch .LBB82_5
+; GFX950-SDAG-NEXT: .LBB82_5:
+;
+; GFX950-GISEL-LABEL: flat_umin_saddr_i64_rtn:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0
+; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
+; GFX950-GISEL-NEXT: s_nop 0
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1
+; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3
+; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB82_3
+; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow
+; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB82_4
+; GFX950-GISEL-NEXT: .LBB82_2: ; %atomicrmw.phi
+; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX950-GISEL-NEXT: s_branch .LBB82_5
+; GFX950-GISEL-NEXT: .LBB82_3: ; %atomicrmw.global
+; GFX950-GISEL-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3], v[4:5] sc0
+; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execz .LBB82_2
+; GFX950-GISEL-NEXT: .LBB82_4: ; %atomicrmw.private
+; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc
+; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v6, off
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX950-GISEL-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[4:5]
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v0, vcc
+; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v1, vcc
+; GFX950-GISEL-NEXT: scratch_store_dwordx2 v6, v[2:3], off
+; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX950-GISEL-NEXT: s_branch .LBB82_5
+; GFX950-GISEL-NEXT: .LBB82_5:
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
%rtn = atomicrmw umin ptr %gep0, i64 %data syncscope("workgroup") seq_cst
@@ -4393,6 +8560,98 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn_neg128(ptr inreg %sbase, i
; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-GISEL-NEXT: s_branch .LBB83_5
; GFX1250-GISEL-NEXT: .LBB83_5:
+;
+; GFX950-SDAG-LABEL: flat_umin_saddr_i64_rtn_neg128:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX950-SDAG-NEXT: s_movk_i32 s0, 0xff80
+; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
+; GFX950-SDAG-NEXT: s_mov_b32 s1, -1
+; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
+; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
+; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5
+; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB83_3
+; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow
+; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB83_4
+; GFX950-SDAG-NEXT: .LBB83_2: ; %atomicrmw.phi
+; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX950-SDAG-NEXT: s_branch .LBB83_5
+; GFX950-SDAG-NEXT: .LBB83_3: ; %atomicrmw.global
+; GFX950-SDAG-NEXT: flat_atomic_umin_x2 v[0:1], v[4:5], v[2:3] sc0
+; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execz .LBB83_2
+; GFX950-SDAG-NEXT: .LBB83_4: ; %atomicrmw.private
+; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; GFX950-SDAG-NEXT: s_nop 1
+; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc
+; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX950-SDAG-NEXT: v_cmp_le_u64_e32 vcc, v[0:1], v[2:3]
+; GFX950-SDAG-NEXT: s_nop 1
+; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc
+; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc
+; GFX950-SDAG-NEXT: scratch_store_dwordx2 v4, v[2:3], off
+; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX950-SDAG-NEXT: s_branch .LBB83_5
+; GFX950-SDAG-NEXT: .LBB83_5:
+;
+; GFX950-GISEL-LABEL: flat_umin_saddr_i64_rtn_neg128:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1
+; GFX950-GISEL-NEXT: s_nop 0
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 0xffffff80, v0
+; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
+; GFX950-GISEL-NEXT: s_nop 0
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v1, vcc
+; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3
+; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB83_3
+; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow
+; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB83_4
+; GFX950-GISEL-NEXT: .LBB83_2: ; %atomicrmw.phi
+; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX950-GISEL-NEXT: s_branch .LBB83_5
+; GFX950-GISEL-NEXT: .LBB83_3: ; %atomicrmw.global
+; GFX950-GISEL-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3], v[4:5] sc0
+; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execz .LBB83_2
+; GFX950-GISEL-NEXT: .LBB83_4: ; %atomicrmw.private
+; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc
+; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v6, off
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX950-GISEL-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[4:5]
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v0, vcc
+; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v1, vcc
+; GFX950-GISEL-NEXT: scratch_store_dwordx2 v6, v[2:3], off
+; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX950-GISEL-NEXT: s_branch .LBB83_5
+; GFX950-GISEL-NEXT: .LBB83_5:
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128
@@ -4478,6 +8737,80 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-GISEL-NEXT: v_min_u64 v[0:1], v[0:1], v[4:5]
; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[0:1], off scope:SCOPE_SE
; GFX1250-GISEL-NEXT: s_endpgm
+;
+; GFX950-SDAG-LABEL: flat_umin_saddr_i64_nortn:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
+; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
+; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
+; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB84_3
+; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow
+; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB84_4
+; GFX950-SDAG-NEXT: .LBB84_2: ; %atomicrmw.phi
+; GFX950-SDAG-NEXT: s_endpgm
+; GFX950-SDAG-NEXT: .LBB84_3: ; %atomicrmw.global
+; GFX950-SDAG-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3]
+; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execz .LBB84_2
+; GFX950-SDAG-NEXT: .LBB84_4: ; %atomicrmw.private
+; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GFX950-SDAG-NEXT: s_nop 1
+; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
+; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX950-SDAG-NEXT: v_cmp_le_u64_e32 vcc, v[0:1], v[2:3]
+; GFX950-SDAG-NEXT: s_nop 1
+; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX950-SDAG-NEXT: scratch_store_dwordx2 v4, v[0:1], off
+; GFX950-SDAG-NEXT: s_endpgm
+;
+; GFX950-GISEL-LABEL: flat_umin_saddr_i64_nortn:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1
+; GFX950-GISEL-NEXT: s_nop 0
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
+; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
+; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
+; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB84_3
+; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow
+; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB84_4
+; GFX950-GISEL-NEXT: .LBB84_2: ; %atomicrmw.phi
+; GFX950-GISEL-NEXT: s_endpgm
+; GFX950-GISEL-NEXT: .LBB84_3: ; %atomicrmw.global
+; GFX950-GISEL-NEXT: flat_atomic_umin_x2 v[0:1], v[4:5]
+; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execz .LBB84_2
+; GFX950-GISEL-NEXT: .LBB84_4: ; %atomicrmw.private
+; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc
+; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v2, off
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX950-GISEL-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[4:5]
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX950-GISEL-NEXT: scratch_store_dwordx2 v2, v[0:1], off
+; GFX950-GISEL-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
%unused = atomicrmw umin ptr %gep0, i64 %data syncscope("workgroup") seq_cst
@@ -4567,6 +8900,86 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v
; GFX1250-GISEL-NEXT: v_min_u64 v[0:1], v[0:1], v[4:5]
; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[0:1], off scope:SCOPE_SE
; GFX1250-GISEL-NEXT: s_endpgm
+;
+; GFX950-SDAG-LABEL: flat_umin_saddr_i64_nortn_neg128:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX950-SDAG-NEXT: s_movk_i32 s0, 0xff80
+; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
+; GFX950-SDAG-NEXT: s_mov_b32 s1, -1
+; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
+; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
+; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
+; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB85_3
+; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow
+; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB85_4
+; GFX950-SDAG-NEXT: .LBB85_2: ; %atomicrmw.phi
+; GFX950-SDAG-NEXT: s_endpgm
+; GFX950-SDAG-NEXT: .LBB85_3: ; %atomicrmw.global
+; GFX950-SDAG-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3]
+; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execz .LBB85_2
+; GFX950-SDAG-NEXT: .LBB85_4: ; %atomicrmw.private
+; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GFX950-SDAG-NEXT: s_nop 1
+; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc
+; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v4, off
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX950-SDAG-NEXT: v_cmp_le_u64_e32 vcc, v[0:1], v[2:3]
+; GFX950-SDAG-NEXT: s_nop 1
+; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX950-SDAG-NEXT: scratch_store_dwordx2 v4, v[0:1], off
+; GFX950-SDAG-NEXT: s_endpgm
+;
+; GFX950-GISEL-LABEL: flat_umin_saddr_i64_nortn_neg128:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1
+; GFX950-GISEL-NEXT: s_nop 0
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0
+; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
+; GFX950-GISEL-NEXT: s_nop 0
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
+; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB85_3
+; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow
+; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB85_4
+; GFX950-GISEL-NEXT: .LBB85_2: ; %atomicrmw.phi
+; GFX950-GISEL-NEXT: s_endpgm
+; GFX950-GISEL-NEXT: .LBB85_3: ; %atomicrmw.global
+; GFX950-GISEL-NEXT: flat_atomic_umin_x2 v[0:1], v[4:5]
+; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execz .LBB85_2
+; GFX950-GISEL-NEXT: .LBB85_4: ; %atomicrmw.private
+; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc
+; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v2, off
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX950-GISEL-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[4:5]
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX950-GISEL-NEXT: scratch_store_dwordx2 v2, v[0:1], off
+; GFX950-GISEL-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128
@@ -4589,6 +9002,30 @@ define amdgpu_ps float @flat_cmpxchg_saddr_i32_rtn(ptr inreg %sbase, i32 %voffse
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: ; return to shader part epilog
+;
+; GFX950-SDAG-LABEL: flat_cmpxchg_saddr_i32_rtn:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
+; GFX950-SDAG-NEXT: buffer_wbl2 sc0 sc1
+; GFX950-SDAG-NEXT: flat_atomic_cmpswap v0, v[0:1], v[2:3] sc0 sc1
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT: buffer_inv sc0 sc1
+; GFX950-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX950-GISEL-LABEL: flat_cmpxchg_saddr_i32_rtn:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[2:3]
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v4, v0
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v3, v1
+; GFX950-GISEL-NEXT: s_nop 0
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v5, vcc
+; GFX950-GISEL-NEXT: buffer_wbl2 sc0 sc1
+; GFX950-GISEL-NEXT: flat_atomic_cmpswap v0, v[0:1], v[2:3] sc0 sc1
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT: buffer_inv sc0 sc1
+; GFX950-GISEL-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
%cmpxchg = cmpxchg ptr %gep0, i32 %cmp, i32 %data seq_cst seq_cst
@@ -4608,6 +9045,36 @@ define amdgpu_ps float @flat_cmpxchg_saddr_i32_rtn_neg128(ptr inreg %sbase, i32
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_wait_loadcnt 0x0
; GFX1250-NEXT: ; return to shader part epilog
+;
+; GFX950-SDAG-LABEL: flat_cmpxchg_saddr_i32_rtn_neg128:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
+; GFX950-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0
+; GFX950-SDAG-NEXT: s_nop 1
+; GFX950-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX950-SDAG-NEXT: buffer_wbl2 sc0 sc1
+; GFX950-SDAG-NEXT: flat_atomic_cmpswap v0, v[0:1], v[2:3] sc0 sc1
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT: buffer_inv sc0 sc1
+; GFX950-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX950-GISEL-LABEL: flat_cmpxchg_saddr_i32_rtn_neg128:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[2:3]
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v4, v0
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v3, v1
+; GFX950-GISEL-NEXT: s_nop 0
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v5, vcc
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX950-GISEL-NEXT: buffer_wbl2 sc0 sc1
+; GFX950-GISEL-NEXT: flat_atomic_cmpswap v0, v[0:1], v[2:3] sc0 sc1
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT: buffer_inv sc0 sc1
+; GFX950-GISEL-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128
@@ -4627,6 +9094,30 @@ define amdgpu_ps void @flat_cmpxchg_saddr_i32_nortn(ptr inreg %sbase, i32 %voffs
; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_endpgm
+;
+; GFX950-SDAG-LABEL: flat_cmpxchg_saddr_i32_nortn:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
+; GFX950-SDAG-NEXT: buffer_wbl2 sc0 sc1
+; GFX950-SDAG-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] sc1
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT: buffer_inv sc0 sc1
+; GFX950-SDAG-NEXT: s_endpgm
+;
+; GFX950-GISEL-LABEL: flat_cmpxchg_saddr_i32_nortn:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[2:3]
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v4, v0
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v3, v1
+; GFX950-GISEL-NEXT: s_nop 0
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v5, vcc
+; GFX950-GISEL-NEXT: buffer_wbl2 sc0 sc1
+; GFX950-GISEL-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] sc1
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT: buffer_inv sc0 sc1
+; GFX950-GISEL-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
%unused = cmpxchg ptr %gep0, i32 %cmp, i32 %data seq_cst seq_cst
@@ -4643,6 +9134,36 @@ define amdgpu_ps void @flat_cmpxchg_saddr_i32_nortn_neg128(ptr inreg %sbase, i32
; GFX1250-NEXT: s_wait_storecnt_dscnt 0x0
; GFX1250-NEXT: global_inv scope:SCOPE_SYS
; GFX1250-NEXT: s_endpgm
+;
+; GFX950-SDAG-LABEL: flat_cmpxchg_saddr_i32_nortn_neg128:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
+; GFX950-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0
+; GFX950-SDAG-NEXT: s_nop 1
+; GFX950-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX950-SDAG-NEXT: buffer_wbl2 sc0 sc1
+; GFX950-SDAG-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] sc1
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT: buffer_inv sc0 sc1
+; GFX950-SDAG-NEXT: s_endpgm
+;
+; GFX950-GISEL-LABEL: flat_cmpxchg_saddr_i32_nortn_neg128:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[4:5], s[2:3]
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v4, v0
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v3, v1
+; GFX950-GISEL-NEXT: s_nop 0
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v5, vcc
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX950-GISEL-NEXT: buffer_wbl2 sc0 sc1
+; GFX950-GISEL-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] sc1
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT: buffer_inv sc0 sc1
+; GFX950-GISEL-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128
@@ -4748,6 +9269,98 @@ define amdgpu_ps <2 x float> @flat_cmpxchg_saddr_i64_rtn(ptr inreg %sbase, i32 %
; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-GISEL-NEXT: s_branch .LBB90_5
; GFX1250-GISEL-NEXT: .LBB90_5:
+;
+; GFX950-SDAG-LABEL: flat_cmpxchg_saddr_i64_rtn:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v6, v1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, v4
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, v3
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v7, v2
+; GFX950-SDAG-NEXT: v_lshl_add_u64 v[2:3], s[2:3], 0, v[0:1]
+; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
+; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3
+; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB90_3
+; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow
+; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB90_4
+; GFX950-SDAG-NEXT: .LBB90_2: ; %atomicrmw.phi
+; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-SDAG-NEXT: s_branch .LBB90_5
+; GFX950-SDAG-NEXT: .LBB90_3: ; %atomicrmw.global
+; GFX950-SDAG-NEXT: buffer_wbl2 sc0 sc1
+; GFX950-SDAG-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] sc0 sc1
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT: buffer_inv sc0 sc1
+; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX950-SDAG-NEXT: ; implicit-def: $vgpr6_vgpr7
+; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execz .LBB90_2
+; GFX950-SDAG-NEXT: .LBB90_4: ; %atomicrmw.private
+; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
+; GFX950-SDAG-NEXT: s_nop 1
+; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v8, -1, v2, vcc
+; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v8, off
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX950-SDAG-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
+; GFX950-SDAG-NEXT: s_nop 1
+; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc
+; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v2, v0, v4, vcc
+; GFX950-SDAG-NEXT: scratch_store_dwordx2 v8, v[2:3], off
+; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX950-SDAG-NEXT: s_branch .LBB90_5
+; GFX950-SDAG-NEXT: .LBB90_5:
+;
+; GFX950-GISEL-LABEL: flat_cmpxchg_saddr_i64_rtn:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v9, v2
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v6, v3
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0
+; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
+; GFX950-GISEL-NEXT: s_nop 0
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v8, v1
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v7, v4
+; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3
+; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB90_3
+; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow
+; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB90_4
+; GFX950-GISEL-NEXT: .LBB90_2: ; %atomicrmw.phi
+; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-GISEL-NEXT: s_branch .LBB90_5
+; GFX950-GISEL-NEXT: .LBB90_3: ; %atomicrmw.global
+; GFX950-GISEL-NEXT: buffer_wbl2 sc0 sc1
+; GFX950-GISEL-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] sc0 sc1
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT: buffer_inv sc0 sc1
+; GFX950-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX950-GISEL-NEXT: ; implicit-def: $vgpr8_vgpr9
+; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execz .LBB90_2
+; GFX950-GISEL-NEXT: .LBB90_4: ; %atomicrmw.private
+; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v4, -1, v2, vcc
+; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v4, off
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX950-GISEL-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v2, v0, v6, vcc
+; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v3, v1, v7, vcc
+; GFX950-GISEL-NEXT: scratch_store_dwordx2 v4, v[2:3], off
+; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX950-GISEL-NEXT: s_branch .LBB90_5
+; GFX950-GISEL-NEXT: .LBB90_5:
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
%cmpxchg = cmpxchg ptr %gep0, i64 %cmp, i64 %data seq_cst seq_cst
@@ -4860,6 +9473,104 @@ define amdgpu_ps <2 x float> @flat_cmpxchg_saddr_i64_rtn_neg128(ptr inreg %sbase
; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-GISEL-NEXT: s_branch .LBB91_5
; GFX1250-GISEL-NEXT: .LBB91_5:
+;
+; GFX950-SDAG-LABEL: flat_cmpxchg_saddr_i64_rtn_neg128:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v6, v1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX950-SDAG-NEXT: s_movk_i32 s0, 0xff80
+; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
+; GFX950-SDAG-NEXT: s_mov_b32 s1, -1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, v4
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, v3
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v7, v2
+; GFX950-SDAG-NEXT: v_lshl_add_u64 v[2:3], v[0:1], 0, s[0:1]
+; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
+; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3
+; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB91_3
+; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow
+; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB91_4
+; GFX950-SDAG-NEXT: .LBB91_2: ; %atomicrmw.phi
+; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-SDAG-NEXT: s_branch .LBB91_5
+; GFX950-SDAG-NEXT: .LBB91_3: ; %atomicrmw.global
+; GFX950-SDAG-NEXT: buffer_wbl2 sc0 sc1
+; GFX950-SDAG-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] sc0 sc1
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT: buffer_inv sc0 sc1
+; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX950-SDAG-NEXT: ; implicit-def: $vgpr6_vgpr7
+; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execz .LBB91_2
+; GFX950-SDAG-NEXT: .LBB91_4: ; %atomicrmw.private
+; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
+; GFX950-SDAG-NEXT: s_nop 1
+; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v8, -1, v2, vcc
+; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v8, off
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX950-SDAG-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
+; GFX950-SDAG-NEXT: s_nop 1
+; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v3, v1, v5, vcc
+; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v2, v0, v4, vcc
+; GFX950-SDAG-NEXT: scratch_store_dwordx2 v8, v[2:3], off
+; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX950-SDAG-NEXT: s_branch .LBB91_5
+; GFX950-SDAG-NEXT: .LBB91_5:
+;
+; GFX950-GISEL-LABEL: flat_cmpxchg_saddr_i64_rtn_neg128:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v9, v2
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v6, v3
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v8, v1
+; GFX950-GISEL-NEXT: s_nop 0
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 0xffffff80, v0
+; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
+; GFX950-GISEL-NEXT: s_nop 0
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v1, vcc
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v7, v4
+; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3
+; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB91_3
+; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow
+; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB91_4
+; GFX950-GISEL-NEXT: .LBB91_2: ; %atomicrmw.phi
+; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-GISEL-NEXT: s_branch .LBB91_5
+; GFX950-GISEL-NEXT: .LBB91_3: ; %atomicrmw.global
+; GFX950-GISEL-NEXT: buffer_wbl2 sc0 sc1
+; GFX950-GISEL-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] sc0 sc1
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT: buffer_inv sc0 sc1
+; GFX950-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX950-GISEL-NEXT: ; implicit-def: $vgpr8_vgpr9
+; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execz .LBB91_2
+; GFX950-GISEL-NEXT: .LBB91_4: ; %atomicrmw.private
+; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v4, -1, v2, vcc
+; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v4, off
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX950-GISEL-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v2, v0, v6, vcc
+; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v3, v1, v7, vcc
+; GFX950-GISEL-NEXT: scratch_store_dwordx2 v4, v[2:3], off
+; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX950-GISEL-NEXT: s_branch .LBB91_5
+; GFX950-GISEL-NEXT: .LBB91_5:
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128
@@ -4956,6 +9667,88 @@ define amdgpu_ps void @flat_cmpxchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffs
; GFX1250-GISEL-NEXT: v_dual_cndmask_b32 v0, v0, v6 :: v_dual_cndmask_b32 v1, v1, v7
; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[0:1], off scope:SCOPE_SE
; GFX1250-GISEL-NEXT: s_endpgm
+;
+; GFX950-SDAG-LABEL: flat_cmpxchg_saddr_i64_nortn:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v6, v1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
+; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, v4
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, v3
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v7, v2
+; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
+; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB92_3
+; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow
+; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB92_4
+; GFX950-SDAG-NEXT: .LBB92_2: ; %atomicrmw.phi
+; GFX950-SDAG-NEXT: s_endpgm
+; GFX950-SDAG-NEXT: .LBB92_3: ; %atomicrmw.global
+; GFX950-SDAG-NEXT: buffer_wbl2 sc0 sc1
+; GFX950-SDAG-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:7] sc1
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT: buffer_inv sc0 sc1
+; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-SDAG-NEXT: ; implicit-def: $vgpr6_vgpr7
+; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execz .LBB92_2
+; GFX950-SDAG-NEXT: .LBB92_4: ; %atomicrmw.private
+; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GFX950-SDAG-NEXT: s_nop 1
+; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc
+; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v2, off
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX950-SDAG-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
+; GFX950-SDAG-NEXT: s_nop 1
+; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc
+; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
+; GFX950-SDAG-NEXT: scratch_store_dwordx2 v2, v[0:1], off
+; GFX950-SDAG-NEXT: s_endpgm
+;
+; GFX950-GISEL-LABEL: flat_cmpxchg_saddr_i64_nortn:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v9, v2
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v6, v3
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v8, v1
+; GFX950-GISEL-NEXT: s_nop 0
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
+; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v7, v4
+; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
+; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB92_3
+; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow
+; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB92_4
+; GFX950-GISEL-NEXT: .LBB92_2: ; %atomicrmw.phi
+; GFX950-GISEL-NEXT: s_endpgm
+; GFX950-GISEL-NEXT: .LBB92_3: ; %atomicrmw.global
+; GFX950-GISEL-NEXT: buffer_wbl2 sc0 sc1
+; GFX950-GISEL-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:9] sc1
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT: buffer_inv sc0 sc1
+; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-GISEL-NEXT: ; implicit-def: $vgpr8_vgpr9
+; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execz .LBB92_2
+; GFX950-GISEL-NEXT: .LBB92_4: ; %atomicrmw.private
+; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc
+; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v2, off
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX950-GISEL-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc
+; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc
+; GFX950-GISEL-NEXT: scratch_store_dwordx2 v2, v[0:1], off
+; GFX950-GISEL-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
%unused = cmpxchg ptr %gep0, i64 %cmp, i64 %data seq_cst seq_cst
@@ -5055,6 +9848,94 @@ define amdgpu_ps void @flat_cmpxchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32
; GFX1250-GISEL-NEXT: v_dual_cndmask_b32 v0, v0, v6 :: v_dual_cndmask_b32 v1, v1, v7
; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[0:1], off scope:SCOPE_SE
; GFX1250-GISEL-NEXT: s_endpgm
+;
+; GFX950-SDAG-LABEL: flat_cmpxchg_saddr_i64_nortn_neg128:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v6, v1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX950-SDAG-NEXT: s_movk_i32 s0, 0xff80
+; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
+; GFX950-SDAG-NEXT: s_mov_b32 s1, -1
+; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
+; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, v4
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v4, v3
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v7, v2
+; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
+; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB93_3
+; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow
+; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB93_4
+; GFX950-SDAG-NEXT: .LBB93_2: ; %atomicrmw.phi
+; GFX950-SDAG-NEXT: s_endpgm
+; GFX950-SDAG-NEXT: .LBB93_3: ; %atomicrmw.global
+; GFX950-SDAG-NEXT: buffer_wbl2 sc0 sc1
+; GFX950-SDAG-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:7] sc1
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT: buffer_inv sc0 sc1
+; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-SDAG-NEXT: ; implicit-def: $vgpr6_vgpr7
+; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execz .LBB93_2
+; GFX950-SDAG-NEXT: .LBB93_4: ; %atomicrmw.private
+; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GFX950-SDAG-NEXT: s_nop 1
+; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc
+; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v2, off
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX950-SDAG-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
+; GFX950-SDAG-NEXT: s_nop 1
+; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc
+; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc
+; GFX950-SDAG-NEXT: scratch_store_dwordx2 v2, v[0:1], off
+; GFX950-SDAG-NEXT: s_endpgm
+;
+; GFX950-GISEL-LABEL: flat_cmpxchg_saddr_i64_nortn_neg128:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v9, v2
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v6, v3
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v8, v1
+; GFX950-GISEL-NEXT: s_nop 0
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0
+; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
+; GFX950-GISEL-NEXT: s_nop 0
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v7, v4
+; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
+; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB93_3
+; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow
+; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB93_4
+; GFX950-GISEL-NEXT: .LBB93_2: ; %atomicrmw.phi
+; GFX950-GISEL-NEXT: s_endpgm
+; GFX950-GISEL-NEXT: .LBB93_3: ; %atomicrmw.global
+; GFX950-GISEL-NEXT: buffer_wbl2 sc0 sc1
+; GFX950-GISEL-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[6:9] sc1
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT: buffer_inv sc0 sc1
+; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-GISEL-NEXT: ; implicit-def: $vgpr8_vgpr9
+; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execz .LBB93_2
+; GFX950-GISEL-NEXT: .LBB93_4: ; %atomicrmw.private
+; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc
+; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v2, off
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX950-GISEL-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9]
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc
+; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc
+; GFX950-GISEL-NEXT: scratch_store_dwordx2 v2, v[0:1], off
+; GFX950-GISEL-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128
@@ -5072,6 +9953,25 @@ define amdgpu_ps float @flat_inc_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i
; GFX1250-NEXT: flat_atomic_inc_u32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: ; return to shader part epilog
+;
+; GFX950-SDAG-LABEL: flat_inc_saddr_i32_rtn:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
+; GFX950-SDAG-NEXT: flat_atomic_inc v0, v[0:1], v2 sc0
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX950-GISEL-LABEL: flat_inc_saddr_i32_rtn:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX950-GISEL-NEXT: flat_atomic_inc v0, v[2:3], v1 sc0
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
%rtn = atomicrmw uinc_wrap ptr %gep0, i32 %data syncscope("agent") monotonic
@@ -5085,6 +9985,31 @@ define amdgpu_ps float @flat_inc_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %vof
; GFX1250-NEXT: flat_atomic_inc_u32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: ; return to shader part epilog
+;
+; GFX950-SDAG-LABEL: flat_inc_saddr_i32_rtn_neg128:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
+; GFX950-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0
+; GFX950-SDAG-NEXT: s_nop 1
+; GFX950-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX950-SDAG-NEXT: flat_atomic_inc v0, v[0:1], v2 sc0
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX950-GISEL-LABEL: flat_inc_saddr_i32_rtn_neg128:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 0xffffff80, v0
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc
+; GFX950-GISEL-NEXT: flat_atomic_inc v0, v[2:3], v1 sc0
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128
@@ -5098,6 +10023,23 @@ define amdgpu_ps void @flat_inc_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250: ; %bb.0:
; GFX1250-NEXT: flat_atomic_inc_u32 v0, v1, s[2:3] scope:SCOPE_DEV
; GFX1250-NEXT: s_endpgm
+;
+; GFX950-SDAG-LABEL: flat_inc_saddr_i32_nortn:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
+; GFX950-SDAG-NEXT: flat_atomic_inc v[0:1], v2
+; GFX950-SDAG-NEXT: s_endpgm
+;
+; GFX950-GISEL-LABEL: flat_inc_saddr_i32_nortn:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX950-GISEL-NEXT: flat_atomic_inc v[2:3], v1
+; GFX950-GISEL-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
%unused = atomicrmw uinc_wrap ptr %gep0, i32 %data syncscope("agent") monotonic
@@ -5109,6 +10051,29 @@ define amdgpu_ps void @flat_inc_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250: ; %bb.0:
; GFX1250-NEXT: flat_atomic_inc_u32 v0, v1, s[2:3] offset:-128 scope:SCOPE_DEV
; GFX1250-NEXT: s_endpgm
+;
+; GFX950-SDAG-LABEL: flat_inc_saddr_i32_nortn_neg128:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
+; GFX950-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0
+; GFX950-SDAG-NEXT: s_nop 1
+; GFX950-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX950-SDAG-NEXT: flat_atomic_inc v[0:1], v2
+; GFX950-SDAG-NEXT: s_endpgm
+;
+; GFX950-GISEL-LABEL: flat_inc_saddr_i32_nortn_neg128:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 0xffffff80, v0
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc
+; GFX950-GISEL-NEXT: flat_atomic_inc v[2:3], v1
+; GFX950-GISEL-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128
@@ -5212,6 +10177,96 @@ define amdgpu_ps <2 x float> @flat_inc_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-GISEL-NEXT: s_branch .LBB98_5
; GFX1250-GISEL-NEXT: .LBB98_5:
+;
+; GFX950-SDAG-LABEL: flat_inc_saddr_i64_rtn:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1]
+; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
+; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5
+; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB98_3
+; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow
+; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB98_4
+; GFX950-SDAG-NEXT: .LBB98_2: ; %atomicrmw.phi
+; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT: s_branch .LBB98_5
+; GFX950-SDAG-NEXT: .LBB98_3: ; %atomicrmw.global
+; GFX950-SDAG-NEXT: flat_atomic_inc_x2 v[0:1], v[4:5], v[2:3] sc0
+; GFX950-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execz .LBB98_2
+; GFX950-SDAG-NEXT: .LBB98_4: ; %atomicrmw.private
+; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; GFX950-SDAG-NEXT: s_nop 1
+; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc
+; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v6, off
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, 1
+; GFX950-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3]
+; GFX950-SDAG-NEXT: s_nop 1
+; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v3, 0, v5, vcc
+; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc
+; GFX950-SDAG-NEXT: scratch_store_dwordx2 v6, v[2:3], off
+; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX950-SDAG-NEXT: s_branch .LBB98_5
+; GFX950-SDAG-NEXT: .LBB98_5:
+;
+; GFX950-GISEL-LABEL: flat_inc_saddr_i64_rtn:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0
+; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
+; GFX950-GISEL-NEXT: s_nop 0
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1
+; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3
+; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB98_3
+; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow
+; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB98_4
+; GFX950-GISEL-NEXT: .LBB98_2: ; %atomicrmw.phi
+; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT: s_branch .LBB98_5
+; GFX950-GISEL-NEXT: .LBB98_3: ; %atomicrmw.global
+; GFX950-GISEL-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[4:5] sc0
+; GFX950-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execz .LBB98_2
+; GFX950-GISEL-NEXT: .LBB98_4: ; %atomicrmw.private
+; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc
+; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v6, off
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 1, v0
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
+; GFX950-GISEL-NEXT: v_cmp_ge_u64_e32 vcc, v[0:1], v[4:5]
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
+; GFX950-GISEL-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc
+; GFX950-GISEL-NEXT: scratch_store_dwordx2 v6, v[2:3], off
+; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX950-GISEL-NEXT: s_branch .LBB98_5
+; GFX950-GISEL-NEXT: .LBB98_5:
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
%rtn = atomicrmw uinc_wrap ptr %gep0, i64 %data syncscope("agent") monotonic
@@ -5321,6 +10376,102 @@ define amdgpu_ps <2 x float> @flat_inc_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s0
; GFX1250-GISEL-NEXT: s_branch .LBB99_5
; GFX1250-GISEL-NEXT: .LBB99_5:
+;
+; GFX950-SDAG-LABEL: flat_inc_saddr_i64_rtn_neg128:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX950-SDAG-NEXT: s_movk_i32 s0, 0xff80
+; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
+; GFX950-SDAG-NEXT: s_mov_b32 s1, -1
+; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
+; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
+; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5
+; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB99_3
+; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow
+; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB99_4
+; GFX950-SDAG-NEXT: .LBB99_2: ; %atomicrmw.phi
+; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT: s_branch .LBB99_5
+; GFX950-SDAG-NEXT: .LBB99_3: ; %atomicrmw.global
+; GFX950-SDAG-NEXT: flat_atomic_inc_x2 v[0:1], v[4:5], v[2:3] sc0
+; GFX950-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execz .LBB99_2
+; GFX950-SDAG-NEXT: .LBB99_4: ; %atomicrmw.private
+; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; GFX950-SDAG-NEXT: s_nop 1
+; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc
+; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v6, off
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, 1
+; GFX950-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3]
+; GFX950-SDAG-NEXT: s_nop 1
+; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v3, 0, v5, vcc
+; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v2, 0, v4, vcc
+; GFX950-SDAG-NEXT: scratch_store_dwordx2 v6, v[2:3], off
+; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX950-SDAG-NEXT: s_branch .LBB99_5
+; GFX950-SDAG-NEXT: .LBB99_5:
+;
+; GFX950-GISEL-LABEL: flat_inc_saddr_i64_rtn_neg128:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1
+; GFX950-GISEL-NEXT: s_nop 0
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 0xffffff80, v0
+; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
+; GFX950-GISEL-NEXT: s_nop 0
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v1, vcc
+; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3
+; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB99_3
+; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow
+; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB99_4
+; GFX950-GISEL-NEXT: .LBB99_2: ; %atomicrmw.phi
+; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT: s_branch .LBB99_5
+; GFX950-GISEL-NEXT: .LBB99_3: ; %atomicrmw.global
+; GFX950-GISEL-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[4:5] sc0
+; GFX950-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execz .LBB99_2
+; GFX950-GISEL-NEXT: .LBB99_4: ; %atomicrmw.private
+; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc
+; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v6, off
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 1, v0
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc
+; GFX950-GISEL-NEXT: v_cmp_ge_u64_e32 vcc, v[0:1], v[4:5]
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc
+; GFX950-GISEL-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc
+; GFX950-GISEL-NEXT: scratch_store_dwordx2 v6, v[2:3], off
+; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX950-GISEL-NEXT: s_branch .LBB99_5
+; GFX950-GISEL-NEXT: .LBB99_5:
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128
@@ -5410,6 +10561,82 @@ define amdgpu_ps void @flat_inc_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-GISEL-NEXT: v_cndmask_b32_e64 v1, v3, 0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[0:1], off scope:SCOPE_SE
; GFX1250-GISEL-NEXT: s_endpgm
+;
+; GFX950-SDAG-LABEL: flat_inc_saddr_i64_nortn:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
+; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
+; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
+; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB100_3
+; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow
+; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB100_4
+; GFX950-SDAG-NEXT: .LBB100_2: ; %atomicrmw.phi
+; GFX950-SDAG-NEXT: s_endpgm
+; GFX950-SDAG-NEXT: .LBB100_3: ; %atomicrmw.global
+; GFX950-SDAG-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3]
+; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execz .LBB100_2
+; GFX950-SDAG-NEXT: .LBB100_4: ; %atomicrmw.private
+; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GFX950-SDAG-NEXT: s_nop 1
+; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc
+; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v6, off
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, 1
+; GFX950-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3]
+; GFX950-SDAG-NEXT: s_nop 1
+; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v5, vcc
+; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc
+; GFX950-SDAG-NEXT: scratch_store_dwordx2 v6, v[0:1], off
+; GFX950-SDAG-NEXT: s_endpgm
+;
+; GFX950-GISEL-LABEL: flat_inc_saddr_i64_nortn:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1
+; GFX950-GISEL-NEXT: s_nop 0
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
+; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
+; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
+; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB100_3
+; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow
+; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB100_4
+; GFX950-GISEL-NEXT: .LBB100_2: ; %atomicrmw.phi
+; GFX950-GISEL-NEXT: s_endpgm
+; GFX950-GISEL-NEXT: .LBB100_3: ; %atomicrmw.global
+; GFX950-GISEL-NEXT: flat_atomic_inc_x2 v[0:1], v[4:5]
+; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execz .LBB100_2
+; GFX950-GISEL-NEXT: .LBB100_4: ; %atomicrmw.private
+; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc
+; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v2, off
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v3, vcc, 1, v0
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v1, vcc
+; GFX950-GISEL-NEXT: v_cmp_ge_u64_e32 vcc, v[0:1], v[4:5]
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_cndmask_b32_e64 v0, v3, 0, vcc
+; GFX950-GISEL-NEXT: v_cndmask_b32_e64 v1, v6, 0, vcc
+; GFX950-GISEL-NEXT: scratch_store_dwordx2 v2, v[0:1], off
+; GFX950-GISEL-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
%unused = atomicrmw uinc_wrap ptr %gep0, i64 %data syncscope("agent") monotonic
@@ -5503,6 +10730,88 @@ define amdgpu_ps void @flat_inc_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-GISEL-NEXT: v_cndmask_b32_e64 v1, v3, 0, vcc_lo
; GFX1250-GISEL-NEXT: scratch_store_b64 v6, v[0:1], off scope:SCOPE_SE
; GFX1250-GISEL-NEXT: s_endpgm
+;
+; GFX950-SDAG-LABEL: flat_inc_saddr_i64_nortn_neg128:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX950-SDAG-NEXT: s_movk_i32 s0, 0xff80
+; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
+; GFX950-SDAG-NEXT: s_mov_b32 s1, -1
+; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
+; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
+; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
+; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB101_3
+; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow
+; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB101_4
+; GFX950-SDAG-NEXT: .LBB101_2: ; %atomicrmw.phi
+; GFX950-SDAG-NEXT: s_endpgm
+; GFX950-SDAG-NEXT: .LBB101_3: ; %atomicrmw.global
+; GFX950-SDAG-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3]
+; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execz .LBB101_2
+; GFX950-SDAG-NEXT: .LBB101_4: ; %atomicrmw.private
+; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GFX950-SDAG-NEXT: s_nop 1
+; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc
+; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v6, off
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, 1
+; GFX950-SDAG-NEXT: v_cmp_lt_u64_e32 vcc, v[0:1], v[2:3]
+; GFX950-SDAG-NEXT: s_nop 1
+; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v5, vcc
+; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc
+; GFX950-SDAG-NEXT: scratch_store_dwordx2 v6, v[0:1], off
+; GFX950-SDAG-NEXT: s_endpgm
+;
+; GFX950-GISEL-LABEL: flat_inc_saddr_i64_nortn_neg128:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1
+; GFX950-GISEL-NEXT: s_nop 0
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0
+; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
+; GFX950-GISEL-NEXT: s_nop 0
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
+; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB101_3
+; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow
+; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB101_4
+; GFX950-GISEL-NEXT: .LBB101_2: ; %atomicrmw.phi
+; GFX950-GISEL-NEXT: s_endpgm
+; GFX950-GISEL-NEXT: .LBB101_3: ; %atomicrmw.global
+; GFX950-GISEL-NEXT: flat_atomic_inc_x2 v[0:1], v[4:5]
+; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execz .LBB101_2
+; GFX950-GISEL-NEXT: .LBB101_4: ; %atomicrmw.private
+; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc
+; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v2, off
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v3, vcc, 1, v0
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v1, vcc
+; GFX950-GISEL-NEXT: v_cmp_ge_u64_e32 vcc, v[0:1], v[4:5]
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_cndmask_b32_e64 v0, v3, 0, vcc
+; GFX950-GISEL-NEXT: v_cndmask_b32_e64 v1, v6, 0, vcc
+; GFX950-GISEL-NEXT: scratch_store_dwordx2 v2, v[0:1], off
+; GFX950-GISEL-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128
@@ -5521,6 +10830,25 @@ define amdgpu_ps float @flat_dec_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i
; GFX1250-NEXT: flat_atomic_dec_u32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: ; return to shader part epilog
+;
+; GFX950-SDAG-LABEL: flat_dec_saddr_i32_rtn:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
+; GFX950-SDAG-NEXT: flat_atomic_dec v0, v[0:1], v2 sc0
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX950-GISEL-LABEL: flat_dec_saddr_i32_rtn:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX950-GISEL-NEXT: flat_atomic_dec v0, v[2:3], v1 sc0
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
%rtn = atomicrmw udec_wrap ptr %gep0, i32 %data syncscope("agent") monotonic
@@ -5534,6 +10862,31 @@ define amdgpu_ps float @flat_dec_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %vof
; GFX1250-NEXT: flat_atomic_dec_u32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
; GFX1250-NEXT: ; return to shader part epilog
+;
+; GFX950-SDAG-LABEL: flat_dec_saddr_i32_rtn_neg128:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
+; GFX950-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0
+; GFX950-SDAG-NEXT: s_nop 1
+; GFX950-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX950-SDAG-NEXT: flat_atomic_dec v0, v[0:1], v2 sc0
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT: ; return to shader part epilog
+;
+; GFX950-GISEL-LABEL: flat_dec_saddr_i32_rtn_neg128:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 0xffffff80, v0
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc
+; GFX950-GISEL-NEXT: flat_atomic_dec v0, v[2:3], v1 sc0
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT: ; return to shader part epilog
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128
@@ -5547,6 +10900,23 @@ define amdgpu_ps void @flat_dec_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250: ; %bb.0:
; GFX1250-NEXT: flat_atomic_dec_u32 v0, v1, s[2:3] scope:SCOPE_DEV
; GFX1250-NEXT: s_endpgm
+;
+; GFX950-SDAG-LABEL: flat_dec_saddr_i32_nortn:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
+; GFX950-SDAG-NEXT: flat_atomic_dec v[0:1], v2
+; GFX950-SDAG-NEXT: s_endpgm
+;
+; GFX950-GISEL-LABEL: flat_dec_saddr_i32_nortn:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX950-GISEL-NEXT: flat_atomic_dec v[2:3], v1
+; GFX950-GISEL-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
%unused = atomicrmw udec_wrap ptr %gep0, i32 %data syncscope("agent") monotonic
@@ -5558,6 +10928,29 @@ define amdgpu_ps void @flat_dec_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250: ; %bb.0:
; GFX1250-NEXT: flat_atomic_dec_u32 v0, v1, s[2:3] offset:-128 scope:SCOPE_DEV
; GFX1250-NEXT: s_endpgm
+;
+; GFX950-SDAG-LABEL: flat_dec_saddr_i32_nortn_neg128:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
+; GFX950-SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0
+; GFX950-SDAG-NEXT: s_nop 1
+; GFX950-SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX950-SDAG-NEXT: flat_atomic_dec v[0:1], v2
+; GFX950-SDAG-NEXT: s_endpgm
+;
+; GFX950-GISEL-LABEL: flat_dec_saddr_i32_nortn_neg128:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 0xffffff80, v0
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v3, vcc
+; GFX950-GISEL-NEXT: flat_atomic_dec v[2:3], v1
+; GFX950-GISEL-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128
@@ -5665,6 +11058,98 @@ define amdgpu_ps <2 x float> @flat_dec_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX1250-GISEL-NEXT: s_branch .LBB106_5
; GFX1250-GISEL-NEXT: .LBB106_5:
+;
+; GFX950-SDAG-LABEL: flat_dec_saddr_i64_rtn:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], s[2:3], 0, v[0:1]
+; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
+; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5
+; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB106_3
+; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow
+; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[2:3], s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB106_4
+; GFX950-SDAG-NEXT: .LBB106_2: ; %atomicrmw.phi
+; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT: s_branch .LBB106_5
+; GFX950-SDAG-NEXT: .LBB106_3: ; %atomicrmw.global
+; GFX950-SDAG-NEXT: flat_atomic_dec_x2 v[0:1], v[4:5], v[2:3] sc0
+; GFX950-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[2:3], s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execz .LBB106_2
+; GFX950-SDAG-NEXT: .LBB106_4: ; %atomicrmw.private
+; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; GFX950-SDAG-NEXT: s_nop 1
+; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc
+; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v6, off
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX950-SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
+; GFX950-SDAG-NEXT: v_cmp_gt_u64_e64 s[0:1], v[0:1], v[2:3]
+; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, -1
+; GFX950-SDAG-NEXT: s_or_b64 vcc, vcc, s[0:1]
+; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
+; GFX950-SDAG-NEXT: scratch_store_dwordx2 v6, v[2:3], off
+; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX950-SDAG-NEXT: s_branch .LBB106_5
+; GFX950-SDAG-NEXT: .LBB106_5:
+;
+; GFX950-GISEL-LABEL: flat_dec_saddr_i64_rtn:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, v2, v0
+; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
+; GFX950-GISEL-NEXT: s_nop 0
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1
+; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3
+; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB106_3
+; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow
+; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[2:3], s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB106_4
+; GFX950-GISEL-NEXT: .LBB106_2: ; %atomicrmw.phi
+; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT: s_branch .LBB106_5
+; GFX950-GISEL-NEXT: .LBB106_3: ; %atomicrmw.global
+; GFX950-GISEL-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[4:5] sc0
+; GFX950-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[2:3], s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execz .LBB106_2
+; GFX950-GISEL-NEXT: .LBB106_4: ; %atomicrmw.private
+; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc
+; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v6, off
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, -1, v0
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v1, vcc
+; GFX950-GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
+; GFX950-GISEL-NEXT: v_cmp_gt_u64_e64 s[0:1], v[0:1], v[4:5]
+; GFX950-GISEL-NEXT: s_or_b64 vcc, vcc, s[0:1]
+; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
+; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
+; GFX950-GISEL-NEXT: scratch_store_dwordx2 v6, v[2:3], off
+; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX950-GISEL-NEXT: s_branch .LBB106_5
+; GFX950-GISEL-NEXT: .LBB106_5:
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
%rtn = atomicrmw udec_wrap ptr %gep0, i64 %data syncscope("agent") monotonic
@@ -5778,6 +11263,104 @@ define amdgpu_ps <2 x float> @flat_dec_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
; GFX1250-GISEL-NEXT: s_or_b32 exec_lo, exec_lo, s1
; GFX1250-GISEL-NEXT: s_branch .LBB107_5
; GFX1250-GISEL-NEXT: .LBB107_5:
+;
+; GFX950-SDAG-LABEL: flat_dec_saddr_i64_rtn_neg128:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX950-SDAG-NEXT: s_movk_i32 s0, 0xff80
+; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
+; GFX950-SDAG-NEXT: s_mov_b32 s1, -1
+; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, s[0:1]
+; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
+; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v5
+; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB107_3
+; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow
+; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[2:3], s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB107_4
+; GFX950-SDAG-NEXT: .LBB107_2: ; %atomicrmw.phi
+; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT: s_branch .LBB107_5
+; GFX950-SDAG-NEXT: .LBB107_3: ; %atomicrmw.global
+; GFX950-SDAG-NEXT: flat_atomic_dec_x2 v[0:1], v[4:5], v[2:3] sc0
+; GFX950-SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[2:3], s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execz .LBB107_2
+; GFX950-SDAG-NEXT: .LBB107_4: ; %atomicrmw.private
+; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; GFX950-SDAG-NEXT: s_nop 1
+; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v6, -1, v4, vcc
+; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v6, off
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX950-SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
+; GFX950-SDAG-NEXT: v_cmp_gt_u64_e64 s[0:1], v[0:1], v[2:3]
+; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, -1
+; GFX950-SDAG-NEXT: s_or_b64 vcc, vcc, s[0:1]
+; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
+; GFX950-SDAG-NEXT: scratch_store_dwordx2 v6, v[2:3], off
+; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX950-SDAG-NEXT: s_branch .LBB107_5
+; GFX950-SDAG-NEXT: .LBB107_5:
+;
+; GFX950-GISEL-LABEL: flat_dec_saddr_i64_rtn_neg128:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1
+; GFX950-GISEL-NEXT: s_nop 0
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 0xffffff80, v0
+; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
+; GFX950-GISEL-NEXT: s_nop 0
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v1, vcc
+; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v3
+; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB107_3
+; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow
+; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[2:3], s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB107_4
+; GFX950-GISEL-NEXT: .LBB107_2: ; %atomicrmw.phi
+; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT: s_branch .LBB107_5
+; GFX950-GISEL-NEXT: .LBB107_3: ; %atomicrmw.global
+; GFX950-GISEL-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[4:5] sc0
+; GFX950-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[2:3], s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execz .LBB107_2
+; GFX950-GISEL-NEXT: .LBB107_4: ; %atomicrmw.private
+; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v6, -1, v2, vcc
+; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v6, off
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v2, vcc, -1, v0
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v1, vcc
+; GFX950-GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
+; GFX950-GISEL-NEXT: v_cmp_gt_u64_e64 s[0:1], v[0:1], v[4:5]
+; GFX950-GISEL-NEXT: s_or_b64 vcc, vcc, s[0:1]
+; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc
+; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc
+; GFX950-GISEL-NEXT: scratch_store_dwordx2 v6, v[2:3], off
+; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX950-GISEL-NEXT: s_branch .LBB107_5
+; GFX950-GISEL-NEXT: .LBB107_5:
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128
@@ -5871,6 +11454,84 @@ define amdgpu_ps void @flat_dec_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
; GFX1250-GISEL-NEXT: v_dual_cndmask_b32 v0, v0, v4 :: v_dual_cndmask_b32 v1, v1, v5
; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[0:1], off scope:SCOPE_SE
; GFX1250-GISEL-NEXT: s_endpgm
+;
+; GFX950-SDAG-LABEL: flat_dec_saddr_i64_nortn:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
+; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
+; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
+; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB108_3
+; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow
+; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB108_4
+; GFX950-SDAG-NEXT: .LBB108_2: ; %atomicrmw.phi
+; GFX950-SDAG-NEXT: s_endpgm
+; GFX950-SDAG-NEXT: .LBB108_3: ; %atomicrmw.global
+; GFX950-SDAG-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3]
+; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execz .LBB108_2
+; GFX950-SDAG-NEXT: .LBB108_4: ; %atomicrmw.private
+; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GFX950-SDAG-NEXT: s_nop 1
+; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc
+; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v6, off
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX950-SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
+; GFX950-SDAG-NEXT: v_cmp_gt_u64_e64 s[0:1], v[0:1], v[2:3]
+; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, -1
+; GFX950-SDAG-NEXT: s_or_b64 vcc, vcc, s[0:1]
+; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc
+; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc
+; GFX950-SDAG-NEXT: scratch_store_dwordx2 v6, v[0:1], off
+; GFX950-SDAG-NEXT: s_endpgm
+;
+; GFX950-GISEL-LABEL: flat_dec_saddr_i64_nortn:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1
+; GFX950-GISEL-NEXT: s_nop 0
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
+; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
+; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
+; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB108_3
+; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow
+; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB108_4
+; GFX950-GISEL-NEXT: .LBB108_2: ; %atomicrmw.phi
+; GFX950-GISEL-NEXT: s_endpgm
+; GFX950-GISEL-NEXT: .LBB108_3: ; %atomicrmw.global
+; GFX950-GISEL-NEXT: flat_atomic_dec_x2 v[0:1], v[4:5]
+; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execz .LBB108_2
+; GFX950-GISEL-NEXT: .LBB108_4: ; %atomicrmw.private
+; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc
+; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v2, off
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v3, vcc, -1, v0
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v6, vcc, -1, v1, vcc
+; GFX950-GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
+; GFX950-GISEL-NEXT: v_cmp_gt_u64_e64 s[0:1], v[0:1], v[4:5]
+; GFX950-GISEL-NEXT: s_or_b64 vcc, vcc, s[0:1]
+; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
+; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v1, v6, v5, vcc
+; GFX950-GISEL-NEXT: scratch_store_dwordx2 v2, v[0:1], off
+; GFX950-GISEL-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
%unused = atomicrmw udec_wrap ptr %gep0, i64 %data syncscope("agent") monotonic
@@ -5968,6 +11629,90 @@ define amdgpu_ps void @flat_dec_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
; GFX1250-GISEL-NEXT: v_dual_cndmask_b32 v0, v0, v4 :: v_dual_cndmask_b32 v1, v1, v5
; GFX1250-GISEL-NEXT: scratch_store_b64 v2, v[0:1], off scope:SCOPE_SE
; GFX1250-GISEL-NEXT: s_endpgm
+;
+; GFX950-SDAG-LABEL: flat_dec_saddr_i64_nortn_neg128:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, v1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX950-SDAG-NEXT: s_movk_i32 s0, 0xff80
+; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], s[2:3], 0, v[0:1]
+; GFX950-SDAG-NEXT: s_mov_b32 s1, -1
+; GFX950-SDAG-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
+; GFX950-SDAG-NEXT: s_mov_b64 s[0:1], src_private_base
+; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
+; GFX950-SDAG-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX950-SDAG-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB109_3
+; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow
+; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB109_4
+; GFX950-SDAG-NEXT: .LBB109_2: ; %atomicrmw.phi
+; GFX950-SDAG-NEXT: s_endpgm
+; GFX950-SDAG-NEXT: .LBB109_3: ; %atomicrmw.global
+; GFX950-SDAG-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3]
+; GFX950-SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX950-SDAG-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-SDAG-NEXT: s_cbranch_execz .LBB109_2
+; GFX950-SDAG-NEXT: .LBB109_4: ; %atomicrmw.private
+; GFX950-SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GFX950-SDAG-NEXT: s_nop 1
+; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v6, -1, v0, vcc
+; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[0:1], v6, off
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX950-SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
+; GFX950-SDAG-NEXT: v_cmp_gt_u64_e64 s[0:1], v[0:1], v[2:3]
+; GFX950-SDAG-NEXT: v_lshl_add_u64 v[4:5], v[0:1], 0, -1
+; GFX950-SDAG-NEXT: s_or_b64 vcc, vcc, s[0:1]
+; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v1, v5, v3, vcc
+; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc
+; GFX950-SDAG-NEXT: scratch_store_dwordx2 v6, v[0:1], off
+; GFX950-SDAG-NEXT: s_endpgm
+;
+; GFX950-GISEL-LABEL: flat_dec_saddr_i64_nortn_neg128:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v2
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[2:3]
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, v2, v0
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v4, v1
+; GFX950-GISEL-NEXT: s_nop 0
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff80, v0
+; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], src_private_base
+; GFX950-GISEL-NEXT: s_nop 0
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX950-GISEL-NEXT: v_cmp_ne_u32_e32 vcc, s1, v1
+; GFX950-GISEL-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX950-GISEL-NEXT: s_xor_b64 s[0:1], exec, s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB109_3
+; GFX950-GISEL-NEXT: ; %bb.1: ; %Flow
+; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB109_4
+; GFX950-GISEL-NEXT: .LBB109_2: ; %atomicrmw.phi
+; GFX950-GISEL-NEXT: s_endpgm
+; GFX950-GISEL-NEXT: .LBB109_3: ; %atomicrmw.global
+; GFX950-GISEL-NEXT: flat_atomic_dec_x2 v[0:1], v[4:5]
+; GFX950-GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1
+; GFX950-GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GFX950-GISEL-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execz .LBB109_2
+; GFX950-GISEL-NEXT: .LBB109_4: ; %atomicrmw.private
+; GFX950-GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v0, vcc
+; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[0:1], v2, off
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX950-GISEL-NEXT: v_add_co_u32_e32 v3, vcc, -1, v0
+; GFX950-GISEL-NEXT: s_nop 1
+; GFX950-GISEL-NEXT: v_addc_co_u32_e32 v6, vcc, -1, v1, vcc
+; GFX950-GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
+; GFX950-GISEL-NEXT: v_cmp_gt_u64_e64 s[0:1], v[0:1], v[4:5]
+; GFX950-GISEL-NEXT: s_or_b64 vcc, vcc, s[0:1]
+; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v0, v3, v4, vcc
+; GFX950-GISEL-NEXT: v_cndmask_b32_e32 v1, v6, v5, vcc
+; GFX950-GISEL-NEXT: scratch_store_dwordx2 v2, v[0:1], off
+; GFX950-GISEL-NEXT: s_endpgm
%zext.offset = zext i32 %voffset to i64
%gep0 = getelementptr inbounds i8, ptr %sbase, i64 %zext.offset
%gep1 = getelementptr inbounds i8, ptr %gep0, i64 -128
@@ -5975,4 +11720,2004 @@ define amdgpu_ps void @flat_dec_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
ret void
}
+define double @flat_atomic_fadd_f64_saddr_rtn(ptr inreg %ptr, double %data) {
+; GFX1250-SDAG-LABEL: flat_atomic_fadd_f64_saddr_rtn:
+; GFX1250-SDAG: ; %bb.0:
+; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX1250-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], 0x50
+; GFX1250-SDAG-NEXT: s_mov_b64 s[2:3], src_shared_base
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT: s_cmp_eq_u32 s1, s3
+; GFX1250-SDAG-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-SDAG-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2
+; GFX1250-SDAG-NEXT: s_cbranch_vccz .LBB110_3
+; GFX1250-SDAG-NEXT: ; %bb.1: ; %atomicrmw.check.private
+; GFX1250-SDAG-NEXT: s_mov_b32 s2, src_flat_scratch_base_hi
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT: s_xor_b32 s2, s1, s2
+; GFX1250-SDAG-NEXT: s_cmp_lt_u32 s2, 0x4000000
+; GFX1250-SDAG-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2
+; GFX1250-SDAG-NEXT: s_cbranch_vccz .LBB110_4
+; GFX1250-SDAG-NEXT: ; %bb.2: ; %atomicrmw.global
+; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX1250-SDAG-NEXT: global_atomic_add_f64 v[2:3], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB110_5
+; GFX1250-SDAG-NEXT: s_branch .LBB110_6
+; GFX1250-SDAG-NEXT: .LBB110_3:
+; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX1250-SDAG-NEXT: s_branch .LBB110_7
+; GFX1250-SDAG-NEXT: .LBB110_4:
+; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX1250-SDAG-NEXT: .LBB110_5: ; %atomicrmw.private
+; GFX1250-SDAG-NEXT: s_mov_b32 s2, src_flat_scratch_base_lo
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT: s_sub_co_i32 s2, s0, s2
+; GFX1250-SDAG-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1250-SDAG-NEXT: s_cselect_b32 s2, s2, -1
+; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX1250-SDAG-NEXT: scratch_load_b64 v[2:3], off, s2
+; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX1250-SDAG-NEXT: v_add_f64_e32 v[4:5], v[2:3], v[0:1]
+; GFX1250-SDAG-NEXT: scratch_store_b64 off, v[4:5], s2 scope:SCOPE_SE
+; GFX1250-SDAG-NEXT: .LBB110_6: ; %Flow1
+; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
+; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB110_8
+; GFX1250-SDAG-NEXT: .LBB110_7: ; %atomicrmw.shared
+; GFX1250-SDAG-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1250-SDAG-NEXT: s_cselect_b32 s0, s0, -1
+; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-SDAG-NEXT: ds_add_rtn_f64 v[2:3], v2, v[0:1]
+; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0
+; GFX1250-SDAG-NEXT: .LBB110_8: ; %atomicrmw.end
+; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3
+; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX1250-GISEL-LABEL: flat_atomic_fadd_f64_saddr_rtn:
+; GFX1250-GISEL: ; %bb.0:
+; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX1250-GISEL-NEXT: s_add_co_u32 s0, s0, 0x50
+; GFX1250-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 0
+; GFX1250-GISEL-NEXT: s_mov_b64 s[2:3], src_shared_base
+; GFX1250-GISEL-NEXT: s_mov_b32 s2, 1
+; GFX1250-GISEL-NEXT: s_cmp_lg_u32 s1, s3
+; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX1250-GISEL-NEXT: s_cbranch_scc0 .LBB110_6
+; GFX1250-GISEL-NEXT: ; %bb.1: ; %atomicrmw.check.private
+; GFX1250-GISEL-NEXT: s_mov_b32 s2, src_flat_scratch_base_hi
+; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1250-GISEL-NEXT: s_xor_b32 s2, s1, s2
+; GFX1250-GISEL-NEXT: s_cmp_ge_u32 s2, 0x4000000
+; GFX1250-GISEL-NEXT: s_mov_b32 s2, 1
+; GFX1250-GISEL-NEXT: s_cbranch_scc0 .LBB110_3
+; GFX1250-GISEL-NEXT: ; %bb.2: ; %atomicrmw.global
+; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX1250-GISEL-NEXT: s_mov_b32 s2, 0
+; GFX1250-GISEL-NEXT: global_atomic_add_f64 v[2:3], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-GISEL-NEXT: .LBB110_3: ; %Flow
+; GFX1250-GISEL-NEXT: s_xor_b32 s2, s2, 1
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-GISEL-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1250-GISEL-NEXT: s_cbranch_scc1 .LBB110_5
+; GFX1250-GISEL-NEXT: ; %bb.4: ; %atomicrmw.private
+; GFX1250-GISEL-NEXT: s_mov_b32 s2, src_flat_scratch_base_lo
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-GISEL-NEXT: s_sub_co_i32 s2, s0, s2
+; GFX1250-GISEL-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1250-GISEL-NEXT: s_cselect_b32 s2, s2, -1
+; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX1250-GISEL-NEXT: scratch_load_b64 v[2:3], off, s2
+; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX1250-GISEL-NEXT: v_add_f64_e32 v[4:5], v[2:3], v[0:1]
+; GFX1250-GISEL-NEXT: scratch_store_b64 off, v[4:5], s2 scope:SCOPE_SE
+; GFX1250-GISEL-NEXT: .LBB110_5: ; %Flow1
+; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
+; GFX1250-GISEL-NEXT: s_mov_b32 s2, 0
+; GFX1250-GISEL-NEXT: .LBB110_6: ; %Flow2
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1250-GISEL-NEXT: s_xor_b32 s2, s2, 1
+; GFX1250-GISEL-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1250-GISEL-NEXT: s_cbranch_scc1 .LBB110_8
+; GFX1250-GISEL-NEXT: ; %bb.7: ; %atomicrmw.shared
+; GFX1250-GISEL-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1250-GISEL-NEXT: s_cselect_b32 s0, s0, -1
+; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-GISEL-NEXT: ds_add_rtn_f64 v[2:3], v2, v[0:1]
+; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0
+; GFX1250-GISEL-NEXT: .LBB110_8: ; %atomicrmw.end
+; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3
+; GFX1250-GISEL-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX950-SDAG-LABEL: flat_atomic_fadd_f64_saddr_rtn:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT: s_add_u32 s0, s0, 0x50
+; GFX950-SDAG-NEXT: s_addc_u32 s1, s1, 0
+; GFX950-SDAG-NEXT: s_mov_b64 s[2:3], src_shared_base
+; GFX950-SDAG-NEXT: s_cmp_eq_u32 s1, s3
+; GFX950-SDAG-NEXT: s_cselect_b64 s[2:3], -1, 0
+; GFX950-SDAG-NEXT: s_andn2_b64 vcc, exec, s[2:3]
+; GFX950-SDAG-NEXT: s_cbranch_vccz .LBB110_3
+; GFX950-SDAG-NEXT: ; %bb.1: ; %atomicrmw.check.private
+; GFX950-SDAG-NEXT: s_mov_b64 s[2:3], src_private_base
+; GFX950-SDAG-NEXT: s_cmp_eq_u32 s1, s3
+; GFX950-SDAG-NEXT: s_cselect_b64 s[2:3], -1, 0
+; GFX950-SDAG-NEXT: s_andn2_b64 vcc, exec, s[2:3]
+; GFX950-SDAG-NEXT: s_cbranch_vccz .LBB110_4
+; GFX950-SDAG-NEXT: ; %bb.2: ; %atomicrmw.global
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX950-SDAG-NEXT: global_atomic_add_f64 v[2:3], v2, v[0:1], s[0:1] sc0
+; GFX950-SDAG-NEXT: s_cbranch_execz .LBB110_5
+; GFX950-SDAG-NEXT: s_branch .LBB110_6
+; GFX950-SDAG-NEXT: .LBB110_3:
+; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX950-SDAG-NEXT: s_branch .LBB110_7
+; GFX950-SDAG-NEXT: .LBB110_4:
+; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX950-SDAG-NEXT: .LBB110_5: ; %atomicrmw.private
+; GFX950-SDAG-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX950-SDAG-NEXT: s_cselect_b32 s2, s0, -1
+; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[2:3], off, s2
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX950-SDAG-NEXT: v_add_f64 v[4:5], v[2:3], v[0:1]
+; GFX950-SDAG-NEXT: scratch_store_dwordx2 off, v[4:5], s2
+; GFX950-SDAG-NEXT: .LBB110_6: ; %Flow1
+; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB110_8
+; GFX950-SDAG-NEXT: .LBB110_7: ; %atomicrmw.shared
+; GFX950-SDAG-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX950-SDAG-NEXT: s_cselect_b32 s0, s0, -1
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, s0
+; GFX950-SDAG-NEXT: ds_add_rtn_f64 v[2:3], v2, v[0:1]
+; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT: .LBB110_8: ; %atomicrmw.end
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, v2
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, v3
+; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-GISEL-LABEL: flat_atomic_fadd_f64_saddr_rtn:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT: s_add_u32 s0, s0, 0x50
+; GFX950-GISEL-NEXT: s_addc_u32 s1, s1, 0
+; GFX950-GISEL-NEXT: s_mov_b64 s[2:3], src_shared_base
+; GFX950-GISEL-NEXT: s_cmp_lg_u32 s1, s3
+; GFX950-GISEL-NEXT: s_mov_b32 s2, 1
+; GFX950-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX950-GISEL-NEXT: s_cbranch_scc0 .LBB110_6
+; GFX950-GISEL-NEXT: ; %bb.1: ; %atomicrmw.check.private
+; GFX950-GISEL-NEXT: s_mov_b64 s[2:3], src_private_base
+; GFX950-GISEL-NEXT: s_cmp_lg_u32 s1, s3
+; GFX950-GISEL-NEXT: s_mov_b32 s2, 1
+; GFX950-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX950-GISEL-NEXT: s_cbranch_scc0 .LBB110_3
+; GFX950-GISEL-NEXT: ; %bb.2: ; %atomicrmw.global
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX950-GISEL-NEXT: global_atomic_add_f64 v[2:3], v2, v[0:1], s[0:1] sc0
+; GFX950-GISEL-NEXT: s_mov_b32 s2, 0
+; GFX950-GISEL-NEXT: .LBB110_3: ; %Flow
+; GFX950-GISEL-NEXT: s_xor_b32 s2, s2, 1
+; GFX950-GISEL-NEXT: s_cmp_lg_u32 s2, 0
+; GFX950-GISEL-NEXT: s_cbranch_scc1 .LBB110_5
+; GFX950-GISEL-NEXT: ; %bb.4: ; %atomicrmw.private
+; GFX950-GISEL-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX950-GISEL-NEXT: s_cselect_b32 s2, s0, -1
+; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[2:3], off, s2
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX950-GISEL-NEXT: v_add_f64 v[4:5], v[2:3], v[0:1]
+; GFX950-GISEL-NEXT: scratch_store_dwordx2 off, v[4:5], s2
+; GFX950-GISEL-NEXT: .LBB110_5: ; %Flow1
+; GFX950-GISEL-NEXT: s_mov_b32 s2, 0
+; GFX950-GISEL-NEXT: .LBB110_6: ; %Flow2
+; GFX950-GISEL-NEXT: s_xor_b32 s2, s2, 1
+; GFX950-GISEL-NEXT: s_cmp_lg_u32 s2, 0
+; GFX950-GISEL-NEXT: s_cbranch_scc1 .LBB110_8
+; GFX950-GISEL-NEXT: ; %bb.7: ; %atomicrmw.shared
+; GFX950-GISEL-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX950-GISEL-NEXT: s_cselect_b32 s0, s0, -1
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; GFX950-GISEL-NEXT: ds_add_rtn_f64 v[2:3], v2, v[0:1]
+; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT: .LBB110_8: ; %atomicrmw.end
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, v2
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v1, v3
+; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x double], ptr %ptr, i64 0, i64 10
+ %result = atomicrmw fadd ptr %gep.0, double %data syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory !0
+ ret double %result
+}
+
+define void @flat_atomic_fadd_f64_saddr_nortn(ptr inreg %ptr, double %data) {
+; GFX1250-SDAG-LABEL: flat_atomic_fadd_f64_saddr_nortn:
+; GFX1250-SDAG: ; %bb.0:
+; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX1250-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], 0x50
+; GFX1250-SDAG-NEXT: s_mov_b64 s[2:3], src_shared_base
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT: s_cmp_eq_u32 s1, s3
+; GFX1250-SDAG-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-SDAG-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2
+; GFX1250-SDAG-NEXT: s_mov_b32 s2, -1
+; GFX1250-SDAG-NEXT: s_cbranch_vccnz .LBB111_3
+; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow2
+; GFX1250-SDAG-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2
+; GFX1250-SDAG-NEXT: s_cbranch_vccz .LBB111_8
+; GFX1250-SDAG-NEXT: .LBB111_2: ; %atomicrmw.phi
+; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31]
+; GFX1250-SDAG-NEXT: .LBB111_3: ; %atomicrmw.check.private
+; GFX1250-SDAG-NEXT: s_mov_b32 s2, src_flat_scratch_base_hi
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT: s_xor_b32 s2, s1, s2
+; GFX1250-SDAG-NEXT: s_cmp_lt_u32 s2, 0x4000000
+; GFX1250-SDAG-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2
+; GFX1250-SDAG-NEXT: s_mov_b32 s2, -1
+; GFX1250-SDAG-NEXT: s_cbranch_vccz .LBB111_5
+; GFX1250-SDAG-NEXT: ; %bb.4: ; %atomicrmw.global
+; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX1250-SDAG-NEXT: s_mov_b32 s2, 0
+; GFX1250-SDAG-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1]
+; GFX1250-SDAG-NEXT: .LBB111_5: ; %Flow
+; GFX1250-SDAG-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2
+; GFX1250-SDAG-NEXT: s_cbranch_vccnz .LBB111_7
+; GFX1250-SDAG-NEXT: ; %bb.6: ; %atomicrmw.private
+; GFX1250-SDAG-NEXT: s_mov_b32 s2, src_flat_scratch_base_lo
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT: s_sub_co_i32 s2, s0, s2
+; GFX1250-SDAG-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1250-SDAG-NEXT: s_cselect_b32 s2, s2, -1
+; GFX1250-SDAG-NEXT: scratch_load_b64 v[2:3], off, s2
+; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX1250-SDAG-NEXT: v_add_f64_e32 v[2:3], v[2:3], v[0:1]
+; GFX1250-SDAG-NEXT: scratch_store_b64 off, v[2:3], s2 scope:SCOPE_SE
+; GFX1250-SDAG-NEXT: .LBB111_7: ; %Flow1
+; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
+; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB111_2
+; GFX1250-SDAG-NEXT: .LBB111_8: ; %atomicrmw.shared
+; GFX1250-SDAG-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1250-SDAG-NEXT: s_cselect_b32 s0, s0, -1
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-SDAG-NEXT: ds_add_f64 v2, v[0:1]
+; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0
+; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX1250-GISEL-LABEL: flat_atomic_fadd_f64_saddr_nortn:
+; GFX1250-GISEL: ; %bb.0:
+; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX1250-GISEL-NEXT: s_add_co_u32 s0, s0, 0x50
+; GFX1250-GISEL-NEXT: s_add_co_ci_u32 s1, s1, 0
+; GFX1250-GISEL-NEXT: s_mov_b64 s[2:3], src_shared_base
+; GFX1250-GISEL-NEXT: s_mov_b32 s2, 1
+; GFX1250-GISEL-NEXT: s_cmp_lg_u32 s1, s3
+; GFX1250-GISEL-NEXT: s_cbranch_scc0 .LBB111_6
+; GFX1250-GISEL-NEXT: ; %bb.1: ; %atomicrmw.check.private
+; GFX1250-GISEL-NEXT: s_mov_b32 s2, src_flat_scratch_base_hi
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1250-GISEL-NEXT: s_xor_b32 s2, s1, s2
+; GFX1250-GISEL-NEXT: s_cmp_ge_u32 s2, 0x4000000
+; GFX1250-GISEL-NEXT: s_mov_b32 s2, 1
+; GFX1250-GISEL-NEXT: s_cbranch_scc0 .LBB111_3
+; GFX1250-GISEL-NEXT: ; %bb.2: ; %atomicrmw.global
+; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX1250-GISEL-NEXT: s_mov_b32 s2, 0
+; GFX1250-GISEL-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1]
+; GFX1250-GISEL-NEXT: .LBB111_3: ; %Flow
+; GFX1250-GISEL-NEXT: s_xor_b32 s2, s2, 1
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-GISEL-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1250-GISEL-NEXT: s_cbranch_scc1 .LBB111_5
+; GFX1250-GISEL-NEXT: ; %bb.4: ; %atomicrmw.private
+; GFX1250-GISEL-NEXT: s_mov_b32 s2, src_flat_scratch_base_lo
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-GISEL-NEXT: s_sub_co_i32 s2, s0, s2
+; GFX1250-GISEL-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1250-GISEL-NEXT: s_cselect_b32 s2, s2, -1
+; GFX1250-GISEL-NEXT: scratch_load_b64 v[2:3], off, s2
+; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX1250-GISEL-NEXT: v_add_f64_e32 v[2:3], v[2:3], v[0:1]
+; GFX1250-GISEL-NEXT: scratch_store_b64 off, v[2:3], s2 scope:SCOPE_SE
+; GFX1250-GISEL-NEXT: .LBB111_5: ; %Flow1
+; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
+; GFX1250-GISEL-NEXT: s_mov_b32 s2, 0
+; GFX1250-GISEL-NEXT: .LBB111_6: ; %Flow2
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1250-GISEL-NEXT: s_xor_b32 s2, s2, 1
+; GFX1250-GISEL-NEXT: s_cmp_lg_u32 s2, 0
+; GFX1250-GISEL-NEXT: s_cbranch_scc1 .LBB111_8
+; GFX1250-GISEL-NEXT: ; %bb.7: ; %atomicrmw.shared
+; GFX1250-GISEL-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1250-GISEL-NEXT: s_cselect_b32 s0, s0, -1
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; GFX1250-GISEL-NEXT: ds_add_f64 v2, v[0:1]
+; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0
+; GFX1250-GISEL-NEXT: .LBB111_8: ; %atomicrmw.phi
+; GFX1250-GISEL-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX950-SDAG-LABEL: flat_atomic_fadd_f64_saddr_nortn:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT: s_add_u32 s0, s0, 0x50
+; GFX950-SDAG-NEXT: s_addc_u32 s1, s1, 0
+; GFX950-SDAG-NEXT: s_mov_b64 s[2:3], src_shared_base
+; GFX950-SDAG-NEXT: s_cmp_eq_u32 s1, s3
+; GFX950-SDAG-NEXT: s_cselect_b64 s[2:3], -1, 0
+; GFX950-SDAG-NEXT: s_andn2_b64 vcc, exec, s[2:3]
+; GFX950-SDAG-NEXT: s_mov_b64 s[2:3], -1
+; GFX950-SDAG-NEXT: s_cbranch_vccnz .LBB111_3
+; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow2
+; GFX950-SDAG-NEXT: s_andn2_b64 vcc, exec, s[2:3]
+; GFX950-SDAG-NEXT: s_cbranch_vccz .LBB111_8
+; GFX950-SDAG-NEXT: .LBB111_2: ; %atomicrmw.phi
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31]
+; GFX950-SDAG-NEXT: .LBB111_3: ; %atomicrmw.check.private
+; GFX950-SDAG-NEXT: s_mov_b64 s[2:3], src_private_base
+; GFX950-SDAG-NEXT: s_cmp_eq_u32 s1, s3
+; GFX950-SDAG-NEXT: s_cselect_b64 s[2:3], -1, 0
+; GFX950-SDAG-NEXT: s_andn2_b64 vcc, exec, s[2:3]
+; GFX950-SDAG-NEXT: s_mov_b64 s[2:3], -1
+; GFX950-SDAG-NEXT: s_cbranch_vccz .LBB111_5
+; GFX950-SDAG-NEXT: ; %bb.4: ; %atomicrmw.global
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX950-SDAG-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1]
+; GFX950-SDAG-NEXT: s_mov_b64 s[2:3], 0
+; GFX950-SDAG-NEXT: .LBB111_5: ; %Flow
+; GFX950-SDAG-NEXT: s_andn2_b64 vcc, exec, s[2:3]
+; GFX950-SDAG-NEXT: s_cbranch_vccnz .LBB111_7
+; GFX950-SDAG-NEXT: ; %bb.6: ; %atomicrmw.private
+; GFX950-SDAG-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX950-SDAG-NEXT: s_cselect_b32 s2, s0, -1
+; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[2:3], off, s2
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX950-SDAG-NEXT: v_add_f64 v[2:3], v[2:3], v[0:1]
+; GFX950-SDAG-NEXT: scratch_store_dwordx2 off, v[2:3], s2
+; GFX950-SDAG-NEXT: .LBB111_7: ; %Flow1
+; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB111_2
+; GFX950-SDAG-NEXT: .LBB111_8: ; %atomicrmw.shared
+; GFX950-SDAG-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX950-SDAG-NEXT: s_cselect_b32 s0, s0, -1
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, s0
+; GFX950-SDAG-NEXT: ds_add_f64 v2, v[0:1]
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-GISEL-LABEL: flat_atomic_fadd_f64_saddr_nortn:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT: s_add_u32 s0, s0, 0x50
+; GFX950-GISEL-NEXT: s_addc_u32 s1, s1, 0
+; GFX950-GISEL-NEXT: s_mov_b64 s[2:3], src_shared_base
+; GFX950-GISEL-NEXT: s_cmp_lg_u32 s1, s3
+; GFX950-GISEL-NEXT: s_mov_b32 s2, 1
+; GFX950-GISEL-NEXT: s_cbranch_scc0 .LBB111_6
+; GFX950-GISEL-NEXT: ; %bb.1: ; %atomicrmw.check.private
+; GFX950-GISEL-NEXT: s_mov_b64 s[2:3], src_private_base
+; GFX950-GISEL-NEXT: s_cmp_lg_u32 s1, s3
+; GFX950-GISEL-NEXT: s_mov_b32 s2, 1
+; GFX950-GISEL-NEXT: s_cbranch_scc0 .LBB111_3
+; GFX950-GISEL-NEXT: ; %bb.2: ; %atomicrmw.global
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX950-GISEL-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1]
+; GFX950-GISEL-NEXT: s_mov_b32 s2, 0
+; GFX950-GISEL-NEXT: .LBB111_3: ; %Flow
+; GFX950-GISEL-NEXT: s_xor_b32 s2, s2, 1
+; GFX950-GISEL-NEXT: s_cmp_lg_u32 s2, 0
+; GFX950-GISEL-NEXT: s_cbranch_scc1 .LBB111_5
+; GFX950-GISEL-NEXT: ; %bb.4: ; %atomicrmw.private
+; GFX950-GISEL-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX950-GISEL-NEXT: s_cselect_b32 s2, s0, -1
+; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[2:3], off, s2
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX950-GISEL-NEXT: v_add_f64 v[2:3], v[2:3], v[0:1]
+; GFX950-GISEL-NEXT: scratch_store_dwordx2 off, v[2:3], s2
+; GFX950-GISEL-NEXT: .LBB111_5: ; %Flow1
+; GFX950-GISEL-NEXT: s_mov_b32 s2, 0
+; GFX950-GISEL-NEXT: .LBB111_6: ; %Flow2
+; GFX950-GISEL-NEXT: s_xor_b32 s2, s2, 1
+; GFX950-GISEL-NEXT: s_cmp_lg_u32 s2, 0
+; GFX950-GISEL-NEXT: s_cbranch_scc1 .LBB111_8
+; GFX950-GISEL-NEXT: ; %bb.7: ; %atomicrmw.shared
+; GFX950-GISEL-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX950-GISEL-NEXT: s_cselect_b32 s0, s0, -1
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v2, s0
+; GFX950-GISEL-NEXT: ds_add_f64 v2, v[0:1]
+; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT: .LBB111_8: ; %atomicrmw.phi
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x double], ptr %ptr, i64 0, i64 10
+ %unused = atomicrmw fadd ptr %gep.0, double %data syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory !0
+ ret void
+}
+
+define double @flat_atomic_fmax_f64_saddr_rtn(ptr inreg %ptr, double %data) { ; returning f64 fmax through an inreg flat pointer (s[0:1] in the expected output)
+; GFX1250-SDAG-LABEL: flat_atomic_fmax_f64_saddr_rtn:
+; GFX1250-SDAG: ; %bb.0:
+; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX1250-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], 0x50
+; GFX1250-SDAG-NEXT: s_mov_b32 s2, src_flat_scratch_base_hi
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT: s_xor_b32 s2, s1, s2
+; GFX1250-SDAG-NEXT: s_cmp_lt_u32 s2, 0x4000000
+; GFX1250-SDAG-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2
+; GFX1250-SDAG-NEXT: s_cbranch_vccz .LBB112_2
+; GFX1250-SDAG-NEXT: ; %bb.1: ; %atomicrmw.global
+; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX1250-SDAG-NEXT: flat_atomic_max_num_f64 v[2:3], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0
+; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB112_3
+; GFX1250-SDAG-NEXT: s_branch .LBB112_4
+; GFX1250-SDAG-NEXT: .LBB112_2:
+; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX1250-SDAG-NEXT: .LBB112_3: ; %atomicrmw.private
+; GFX1250-SDAG-NEXT: s_mov_b32 s2, src_flat_scratch_base_lo
+; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
+; GFX1250-SDAG-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1]
+; GFX1250-SDAG-NEXT: s_sub_co_i32 s2, s0, s2
+; GFX1250-SDAG-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1250-SDAG-NEXT: s_cselect_b32 s0, s2, -1
+; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX1250-SDAG-NEXT: scratch_load_b64 v[2:3], off, s0
+; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX1250-SDAG-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3]
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_max_num_f64_e32 v[0:1], v[4:5], v[0:1]
+; GFX1250-SDAG-NEXT: scratch_store_b64 off, v[0:1], s0 scope:SCOPE_SE
+; GFX1250-SDAG-NEXT: .LBB112_4: ; %atomicrmw.end
+; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3
+; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX1250-GISEL-LABEL: flat_atomic_fmax_f64_saddr_rtn:
+; GFX1250-GISEL: ; %bb.0:
+; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX1250-GISEL-NEXT: s_add_co_u32 s2, s0, 0x50
+; GFX1250-GISEL-NEXT: s_add_co_ci_u32 s3, s1, 0
+; GFX1250-GISEL-NEXT: s_mov_b32 s4, src_flat_scratch_base_hi
+; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1250-GISEL-NEXT: s_xor_b32 s4, s3, s4
+; GFX1250-GISEL-NEXT: s_cmp_ge_u32 s4, 0x4000000
+; GFX1250-GISEL-NEXT: s_mov_b32 s4, 1
+; GFX1250-GISEL-NEXT: s_cbranch_scc0 .LBB112_2
+; GFX1250-GISEL-NEXT: ; %bb.1: ; %atomicrmw.global
+; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX1250-GISEL-NEXT: s_mov_b32 s4, 0
+; GFX1250-GISEL-NEXT: flat_atomic_max_num_f64 v[2:3], v2, v[0:1], s[0:1] offset:80 th:TH_ATOMIC_RETURN
+; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0
+; GFX1250-GISEL-NEXT: .LBB112_2: ; %Flow
+; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
+; GFX1250-GISEL-NEXT: s_xor_b32 s0, s4, 1
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-GISEL-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1250-GISEL-NEXT: s_cbranch_scc1 .LBB112_4
+; GFX1250-GISEL-NEXT: ; %bb.3: ; %atomicrmw.private
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo
+; GFX1250-GISEL-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1]
+; GFX1250-GISEL-NEXT: s_sub_co_i32 s0, s2, s0
+; GFX1250-GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1250-GISEL-NEXT: s_cselect_b32 s0, s0, -1
+; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX1250-GISEL-NEXT: scratch_load_b64 v[2:3], off, s0
+; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX1250-GISEL-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3]
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_max_num_f64_e32 v[0:1], v[4:5], v[0:1]
+; GFX1250-GISEL-NEXT: scratch_store_b64 off, v[0:1], s0 scope:SCOPE_SE
+; GFX1250-GISEL-NEXT: .LBB112_4: ; %atomicrmw.end
+; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3
+; GFX1250-GISEL-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX950-SDAG-LABEL: flat_atomic_fmax_f64_saddr_rtn:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT: s_add_u32 s0, s0, 0x50
+; GFX950-SDAG-NEXT: s_addc_u32 s1, s1, 0
+; GFX950-SDAG-NEXT: s_mov_b64 s[2:3], src_private_base
+; GFX950-SDAG-NEXT: s_cmp_eq_u32 s1, s3
+; GFX950-SDAG-NEXT: s_cselect_b64 s[2:3], -1, 0
+; GFX950-SDAG-NEXT: s_andn2_b64 vcc, exec, s[2:3]
+; GFX950-SDAG-NEXT: s_cbranch_vccz .LBB112_2
+; GFX950-SDAG-NEXT: ; %bb.1: ; %atomicrmw.global
+; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
+; GFX950-SDAG-NEXT: flat_atomic_max_f64 v[2:3], v[2:3], v[0:1] sc0
+; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT: s_cbranch_execz .LBB112_3
+; GFX950-SDAG-NEXT: s_branch .LBB112_4
+; GFX950-SDAG-NEXT: .LBB112_2:
+; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX950-SDAG-NEXT: .LBB112_3: ; %atomicrmw.private
+; GFX950-SDAG-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX950-SDAG-NEXT: s_cselect_b32 s0, s0, -1
+; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[2:3], off, s0
+; GFX950-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX950-SDAG-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
+; GFX950-SDAG-NEXT: v_max_f64 v[0:1], v[4:5], v[0:1]
+; GFX950-SDAG-NEXT: scratch_store_dwordx2 off, v[0:1], s0
+; GFX950-SDAG-NEXT: .LBB112_4: ; %atomicrmw.end
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, v2
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, v3
+; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-GISEL-LABEL: flat_atomic_fmax_f64_saddr_rtn:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT: s_add_u32 s2, s0, 0x50
+; GFX950-GISEL-NEXT: s_addc_u32 s3, s1, 0
+; GFX950-GISEL-NEXT: s_mov_b64 s[4:5], src_private_base
+; GFX950-GISEL-NEXT: s_cmp_lg_u32 s3, s5
+; GFX950-GISEL-NEXT: s_mov_b32 s4, 1
+; GFX950-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX950-GISEL-NEXT: s_cbranch_scc0 .LBB112_2
+; GFX950-GISEL-NEXT: ; %bb.1: ; %atomicrmw.global
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
+; GFX950-GISEL-NEXT: flat_atomic_max_f64 v[2:3], v[2:3], v[0:1] offset:80 sc0
+; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT: s_mov_b32 s4, 0
+; GFX950-GISEL-NEXT: .LBB112_2: ; %Flow
+; GFX950-GISEL-NEXT: s_xor_b32 s0, s4, 1
+; GFX950-GISEL-NEXT: s_cmp_lg_u32 s0, 0
+; GFX950-GISEL-NEXT: s_cbranch_scc1 .LBB112_4
+; GFX950-GISEL-NEXT: ; %bb.3: ; %atomicrmw.private
+; GFX950-GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX950-GISEL-NEXT: s_cselect_b32 s0, s2, -1
+; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[2:3], off, s0
+; GFX950-GISEL-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX950-GISEL-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
+; GFX950-GISEL-NEXT: v_max_f64 v[0:1], v[4:5], v[0:1]
+; GFX950-GISEL-NEXT: scratch_store_dwordx2 off, v[0:1], s0
+; GFX950-GISEL-NEXT: .LBB112_4: ; %atomicrmw.end
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, v2
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v1, v3
+; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x double], ptr %ptr, i64 0, i64 10 ; element 10 of 8-byte doubles, i.e. byte offset 80 (0x50 / offset:80 in the expected output)
+ %result = atomicrmw fmax ptr %gep.0, double %data syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory !0 ; expected output splits into %atomicrmw.global and %atomicrmw.private; gfx1250 keeps s[0:1] as the scalar address operand of the flat atomic, gfx950 first copies it into v[2:3]
+ ret double %result ; value read before the max was applied (v[2:3] in the expected output)
+}
+
+define void @flat_atomic_fmax_f64_saddr_nortn(ptr inreg %ptr, double %data) { ; non-returning f64 fmax through an inreg flat pointer (s[0:1] in the expected output)
+; GFX1250-SDAG-LABEL: flat_atomic_fmax_f64_saddr_nortn:
+; GFX1250-SDAG: ; %bb.0:
+; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX1250-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], 0x50
+; GFX1250-SDAG-NEXT: s_mov_b32 s2, src_flat_scratch_base_hi
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT: s_xor_b32 s2, s1, s2
+; GFX1250-SDAG-NEXT: s_cmp_lt_u32 s2, 0x4000000
+; GFX1250-SDAG-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2
+; GFX1250-SDAG-NEXT: s_mov_b32 s2, -1
+; GFX1250-SDAG-NEXT: s_cbranch_vccnz .LBB113_3
+; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
+; GFX1250-SDAG-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2
+; GFX1250-SDAG-NEXT: s_cbranch_vccz .LBB113_4
+; GFX1250-SDAG-NEXT: .LBB113_2: ; %atomicrmw.phi
+; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31]
+; GFX1250-SDAG-NEXT: .LBB113_3: ; %atomicrmw.global
+; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX1250-SDAG-NEXT: flat_atomic_max_num_f64 v2, v[0:1], s[0:1]
+; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0
+; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB113_2
+; GFX1250-SDAG-NEXT: .LBB113_4: ; %atomicrmw.private
+; GFX1250-SDAG-NEXT: s_mov_b32 s2, src_flat_scratch_base_lo
+; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
+; GFX1250-SDAG-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1]
+; GFX1250-SDAG-NEXT: s_sub_co_i32 s2, s0, s2
+; GFX1250-SDAG-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1250-SDAG-NEXT: s_cselect_b32 s0, s2, -1
+; GFX1250-SDAG-NEXT: scratch_load_b64 v[2:3], off, s0
+; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX1250-SDAG-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3]
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[0:1]
+; GFX1250-SDAG-NEXT: scratch_store_b64 off, v[0:1], s0 scope:SCOPE_SE
+; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX1250-GISEL-LABEL: flat_atomic_fmax_f64_saddr_nortn:
+; GFX1250-GISEL: ; %bb.0:
+; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX1250-GISEL-NEXT: s_add_co_u32 s2, s0, 0x50
+; GFX1250-GISEL-NEXT: s_add_co_ci_u32 s3, s1, 0
+; GFX1250-GISEL-NEXT: s_mov_b32 s4, src_flat_scratch_base_hi
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1250-GISEL-NEXT: s_xor_b32 s4, s3, s4
+; GFX1250-GISEL-NEXT: s_cmp_ge_u32 s4, 0x4000000
+; GFX1250-GISEL-NEXT: s_mov_b32 s4, 1
+; GFX1250-GISEL-NEXT: s_cbranch_scc0 .LBB113_2
+; GFX1250-GISEL-NEXT: ; %bb.1: ; %atomicrmw.global
+; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX1250-GISEL-NEXT: s_mov_b32 s4, 0
+; GFX1250-GISEL-NEXT: flat_atomic_max_num_f64 v2, v[0:1], s[0:1] offset:80
+; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0
+; GFX1250-GISEL-NEXT: .LBB113_2: ; %Flow
+; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
+; GFX1250-GISEL-NEXT: s_xor_b32 s0, s4, 1
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-GISEL-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1250-GISEL-NEXT: s_cbranch_scc1 .LBB113_4
+; GFX1250-GISEL-NEXT: ; %bb.3: ; %atomicrmw.private
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo
+; GFX1250-GISEL-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1]
+; GFX1250-GISEL-NEXT: s_sub_co_i32 s0, s2, s0
+; GFX1250-GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1250-GISEL-NEXT: s_cselect_b32 s0, s0, -1
+; GFX1250-GISEL-NEXT: scratch_load_b64 v[2:3], off, s0
+; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX1250-GISEL-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3]
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_max_num_f64_e32 v[0:1], v[2:3], v[0:1]
+; GFX1250-GISEL-NEXT: scratch_store_b64 off, v[0:1], s0 scope:SCOPE_SE
+; GFX1250-GISEL-NEXT: .LBB113_4: ; %atomicrmw.phi
+; GFX1250-GISEL-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX950-SDAG-LABEL: flat_atomic_fmax_f64_saddr_nortn:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT: s_add_u32 s0, s0, 0x50
+; GFX950-SDAG-NEXT: s_addc_u32 s1, s1, 0
+; GFX950-SDAG-NEXT: s_mov_b64 s[2:3], src_private_base
+; GFX950-SDAG-NEXT: s_cmp_eq_u32 s1, s3
+; GFX950-SDAG-NEXT: s_cselect_b64 s[2:3], -1, 0
+; GFX950-SDAG-NEXT: s_andn2_b64 vcc, exec, s[2:3]
+; GFX950-SDAG-NEXT: s_mov_b64 s[2:3], -1
+; GFX950-SDAG-NEXT: s_cbranch_vccnz .LBB113_3
+; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow
+; GFX950-SDAG-NEXT: s_andn2_b64 vcc, exec, s[2:3]
+; GFX950-SDAG-NEXT: s_cbranch_vccz .LBB113_4
+; GFX950-SDAG-NEXT: .LBB113_2: ; %atomicrmw.phi
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31]
+; GFX950-SDAG-NEXT: .LBB113_3: ; %atomicrmw.global
+; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
+; GFX950-SDAG-NEXT: flat_atomic_max_f64 v[2:3], v[0:1]
+; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB113_2
+; GFX950-SDAG-NEXT: .LBB113_4: ; %atomicrmw.private
+; GFX950-SDAG-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX950-SDAG-NEXT: s_cselect_b32 s0, s0, -1
+; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[2:3], off, s0
+; GFX950-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX950-SDAG-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX950-SDAG-NEXT: v_max_f64 v[0:1], v[2:3], v[0:1]
+; GFX950-SDAG-NEXT: scratch_store_dwordx2 off, v[0:1], s0
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-GISEL-LABEL: flat_atomic_fmax_f64_saddr_nortn:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT: s_add_u32 s2, s0, 0x50
+; GFX950-GISEL-NEXT: s_addc_u32 s3, s1, 0
+; GFX950-GISEL-NEXT: s_mov_b64 s[4:5], src_private_base
+; GFX950-GISEL-NEXT: s_cmp_lg_u32 s3, s5
+; GFX950-GISEL-NEXT: s_mov_b32 s4, 1
+; GFX950-GISEL-NEXT: s_cbranch_scc0 .LBB113_2
+; GFX950-GISEL-NEXT: ; %bb.1: ; %atomicrmw.global
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
+; GFX950-GISEL-NEXT: flat_atomic_max_f64 v[2:3], v[0:1] offset:80
+; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT: s_mov_b32 s4, 0
+; GFX950-GISEL-NEXT: .LBB113_2: ; %Flow
+; GFX950-GISEL-NEXT: s_xor_b32 s0, s4, 1
+; GFX950-GISEL-NEXT: s_cmp_lg_u32 s0, 0
+; GFX950-GISEL-NEXT: s_cbranch_scc1 .LBB113_4
+; GFX950-GISEL-NEXT: ; %bb.3: ; %atomicrmw.private
+; GFX950-GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX950-GISEL-NEXT: s_cselect_b32 s0, s2, -1
+; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[2:3], off, s0
+; GFX950-GISEL-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX950-GISEL-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX950-GISEL-NEXT: v_max_f64 v[0:1], v[2:3], v[0:1]
+; GFX950-GISEL-NEXT: scratch_store_dwordx2 off, v[0:1], s0
+; GFX950-GISEL-NEXT: .LBB113_4: ; %atomicrmw.phi
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x double], ptr %ptr, i64 0, i64 10 ; element 10 of 8-byte doubles, i.e. byte offset 80 (0x50 / offset:80 in the expected output)
+ %unused = atomicrmw fmax ptr %gep.0, double %data syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory !0 ; result is never read, so the expected flat atomics carry no destination register and no th:TH_ATOMIC_RETURN / sc0 modifier
+ ret void
+}
+
+define double @flat_atomic_fmin_f64_saddr_rtn(ptr inreg %ptr, double %data) { ; returning f64 fmin through an inreg flat pointer (s[0:1] in the expected output)
+; GFX1250-SDAG-LABEL: flat_atomic_fmin_f64_saddr_rtn:
+; GFX1250-SDAG: ; %bb.0:
+; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX1250-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], 0x50
+; GFX1250-SDAG-NEXT: s_mov_b32 s2, src_flat_scratch_base_hi
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT: s_xor_b32 s2, s1, s2
+; GFX1250-SDAG-NEXT: s_cmp_lt_u32 s2, 0x4000000
+; GFX1250-SDAG-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2
+; GFX1250-SDAG-NEXT: s_cbranch_vccz .LBB114_2
+; GFX1250-SDAG-NEXT: ; %bb.1: ; %atomicrmw.global
+; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX1250-SDAG-NEXT: flat_atomic_min_num_f64 v[2:3], v2, v[0:1], s[0:1] th:TH_ATOMIC_RETURN
+; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0
+; GFX1250-SDAG-NEXT: s_cbranch_execz .LBB114_3
+; GFX1250-SDAG-NEXT: s_branch .LBB114_4
+; GFX1250-SDAG-NEXT: .LBB114_2:
+; GFX1250-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX1250-SDAG-NEXT: .LBB114_3: ; %atomicrmw.private
+; GFX1250-SDAG-NEXT: s_mov_b32 s2, src_flat_scratch_base_lo
+; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
+; GFX1250-SDAG-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1]
+; GFX1250-SDAG-NEXT: s_sub_co_i32 s2, s0, s2
+; GFX1250-SDAG-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1250-SDAG-NEXT: s_cselect_b32 s0, s2, -1
+; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX1250-SDAG-NEXT: scratch_load_b64 v[2:3], off, s0
+; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX1250-SDAG-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3]
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_min_num_f64_e32 v[0:1], v[4:5], v[0:1]
+; GFX1250-SDAG-NEXT: scratch_store_b64 off, v[0:1], s0 scope:SCOPE_SE
+; GFX1250-SDAG-NEXT: .LBB114_4: ; %atomicrmw.end
+; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX1250-SDAG-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3
+; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX1250-GISEL-LABEL: flat_atomic_fmin_f64_saddr_rtn:
+; GFX1250-GISEL: ; %bb.0:
+; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX1250-GISEL-NEXT: s_add_co_u32 s2, s0, 0x50
+; GFX1250-GISEL-NEXT: s_add_co_ci_u32 s3, s1, 0
+; GFX1250-GISEL-NEXT: s_mov_b32 s4, src_flat_scratch_base_hi
+; GFX1250-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1250-GISEL-NEXT: s_xor_b32 s4, s3, s4
+; GFX1250-GISEL-NEXT: s_cmp_ge_u32 s4, 0x4000000
+; GFX1250-GISEL-NEXT: s_mov_b32 s4, 1
+; GFX1250-GISEL-NEXT: s_cbranch_scc0 .LBB114_2
+; GFX1250-GISEL-NEXT: ; %bb.1: ; %atomicrmw.global
+; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX1250-GISEL-NEXT: s_mov_b32 s4, 0
+; GFX1250-GISEL-NEXT: flat_atomic_min_num_f64 v[2:3], v2, v[0:1], s[0:1] offset:80 th:TH_ATOMIC_RETURN
+; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0
+; GFX1250-GISEL-NEXT: .LBB114_2: ; %Flow
+; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
+; GFX1250-GISEL-NEXT: s_xor_b32 s0, s4, 1
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-GISEL-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1250-GISEL-NEXT: s_cbranch_scc1 .LBB114_4
+; GFX1250-GISEL-NEXT: ; %bb.3: ; %atomicrmw.private
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo
+; GFX1250-GISEL-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1]
+; GFX1250-GISEL-NEXT: s_sub_co_i32 s0, s2, s0
+; GFX1250-GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1250-GISEL-NEXT: s_cselect_b32 s0, s0, -1
+; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX1250-GISEL-NEXT: scratch_load_b64 v[2:3], off, s0
+; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX1250-GISEL-NEXT: v_max_num_f64_e32 v[4:5], v[2:3], v[2:3]
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_min_num_f64_e32 v[0:1], v[4:5], v[0:1]
+; GFX1250-GISEL-NEXT: scratch_store_b64 off, v[0:1], s0 scope:SCOPE_SE
+; GFX1250-GISEL-NEXT: .LBB114_4: ; %atomicrmw.end
+; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX1250-GISEL-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3
+; GFX1250-GISEL-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX950-SDAG-LABEL: flat_atomic_fmin_f64_saddr_rtn:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT: s_add_u32 s0, s0, 0x50
+; GFX950-SDAG-NEXT: s_addc_u32 s1, s1, 0
+; GFX950-SDAG-NEXT: s_mov_b64 s[2:3], src_private_base
+; GFX950-SDAG-NEXT: s_cmp_eq_u32 s1, s3
+; GFX950-SDAG-NEXT: s_cselect_b64 s[2:3], -1, 0
+; GFX950-SDAG-NEXT: s_andn2_b64 vcc, exec, s[2:3]
+; GFX950-SDAG-NEXT: s_cbranch_vccz .LBB114_2
+; GFX950-SDAG-NEXT: ; %bb.1: ; %atomicrmw.global
+; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
+; GFX950-SDAG-NEXT: flat_atomic_min_f64 v[2:3], v[2:3], v[0:1] sc0
+; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT: s_cbranch_execz .LBB114_3
+; GFX950-SDAG-NEXT: s_branch .LBB114_4
+; GFX950-SDAG-NEXT: .LBB114_2:
+; GFX950-SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX950-SDAG-NEXT: .LBB114_3: ; %atomicrmw.private
+; GFX950-SDAG-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX950-SDAG-NEXT: s_cselect_b32 s0, s0, -1
+; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[2:3], off, s0
+; GFX950-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX950-SDAG-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
+; GFX950-SDAG-NEXT: v_min_f64 v[0:1], v[4:5], v[0:1]
+; GFX950-SDAG-NEXT: scratch_store_dwordx2 off, v[0:1], s0
+; GFX950-SDAG-NEXT: .LBB114_4: ; %atomicrmw.end
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v0, v2
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, v3
+; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-GISEL-LABEL: flat_atomic_fmin_f64_saddr_rtn:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT: s_add_u32 s2, s0, 0x50
+; GFX950-GISEL-NEXT: s_addc_u32 s3, s1, 0
+; GFX950-GISEL-NEXT: s_mov_b64 s[4:5], src_private_base
+; GFX950-GISEL-NEXT: s_cmp_lg_u32 s3, s5
+; GFX950-GISEL-NEXT: s_mov_b32 s4, 1
+; GFX950-GISEL-NEXT: ; implicit-def: $vgpr2_vgpr3
+; GFX950-GISEL-NEXT: s_cbranch_scc0 .LBB114_2
+; GFX950-GISEL-NEXT: ; %bb.1: ; %atomicrmw.global
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
+; GFX950-GISEL-NEXT: flat_atomic_min_f64 v[2:3], v[2:3], v[0:1] offset:80 sc0
+; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT: s_mov_b32 s4, 0
+; GFX950-GISEL-NEXT: .LBB114_2: ; %Flow
+; GFX950-GISEL-NEXT: s_xor_b32 s0, s4, 1
+; GFX950-GISEL-NEXT: s_cmp_lg_u32 s0, 0
+; GFX950-GISEL-NEXT: s_cbranch_scc1 .LBB114_4
+; GFX950-GISEL-NEXT: ; %bb.3: ; %atomicrmw.private
+; GFX950-GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX950-GISEL-NEXT: s_cselect_b32 s0, s2, -1
+; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[2:3], off, s0
+; GFX950-GISEL-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX950-GISEL-NEXT: v_max_f64 v[4:5], v[2:3], v[2:3]
+; GFX950-GISEL-NEXT: v_min_f64 v[0:1], v[4:5], v[0:1]
+; GFX950-GISEL-NEXT: scratch_store_dwordx2 off, v[0:1], s0
+; GFX950-GISEL-NEXT: .LBB114_4: ; %atomicrmw.end
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v0, v2
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v1, v3
+; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x double], ptr %ptr, i64 0, i64 10 ; element 10 of 8-byte doubles, i.e. byte offset 80 (0x50 / offset:80 in the expected output)
+ %result = atomicrmw fmin ptr %gep.0, double %data syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory !0 ; expected output splits into %atomicrmw.global and %atomicrmw.private; gfx1250 keeps s[0:1] as the scalar address operand of the flat atomic, gfx950 first copies it into v[2:3]
+ ret double %result ; value read before the min was applied (v[2:3] in the expected output)
+}
+
+define void @flat_atomic_fmin_f64_saddr_nortn(ptr inreg %ptr, double %data) { ; non-returning f64 fmin through an inreg flat pointer (s[0:1] in the expected output)
+; GFX1250-SDAG-LABEL: flat_atomic_fmin_f64_saddr_nortn:
+; GFX1250-SDAG: ; %bb.0:
+; GFX1250-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX1250-SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], 0x50
+; GFX1250-SDAG-NEXT: s_mov_b32 s2, src_flat_scratch_base_hi
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT: s_xor_b32 s2, s1, s2
+; GFX1250-SDAG-NEXT: s_cmp_lt_u32 s2, 0x4000000
+; GFX1250-SDAG-NEXT: s_cselect_b32 s2, -1, 0
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-SDAG-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2
+; GFX1250-SDAG-NEXT: s_mov_b32 s2, -1
+; GFX1250-SDAG-NEXT: s_cbranch_vccnz .LBB115_3
+; GFX1250-SDAG-NEXT: ; %bb.1: ; %Flow
+; GFX1250-SDAG-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s2
+; GFX1250-SDAG-NEXT: s_cbranch_vccz .LBB115_4
+; GFX1250-SDAG-NEXT: .LBB115_2: ; %atomicrmw.phi
+; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31]
+; GFX1250-SDAG-NEXT: .LBB115_3: ; %atomicrmw.global
+; GFX1250-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX1250-SDAG-NEXT: flat_atomic_min_num_f64 v2, v[0:1], s[0:1]
+; GFX1250-SDAG-NEXT: s_wait_dscnt 0x0
+; GFX1250-SDAG-NEXT: s_cbranch_execnz .LBB115_2
+; GFX1250-SDAG-NEXT: .LBB115_4: ; %atomicrmw.private
+; GFX1250-SDAG-NEXT: s_mov_b32 s2, src_flat_scratch_base_lo
+; GFX1250-SDAG-NEXT: s_wait_xcnt 0x0
+; GFX1250-SDAG-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1]
+; GFX1250-SDAG-NEXT: s_sub_co_i32 s2, s0, s2
+; GFX1250-SDAG-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX1250-SDAG-NEXT: s_cselect_b32 s0, s2, -1
+; GFX1250-SDAG-NEXT: scratch_load_b64 v[2:3], off, s0
+; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX1250-SDAG-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3]
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_min_num_f64_e32 v[0:1], v[2:3], v[0:1]
+; GFX1250-SDAG-NEXT: scratch_store_b64 off, v[0:1], s0 scope:SCOPE_SE
+; GFX1250-SDAG-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX1250-GISEL-LABEL: flat_atomic_fmin_f64_saddr_nortn:
+; GFX1250-GISEL: ; %bb.0:
+; GFX1250-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX1250-GISEL-NEXT: s_add_co_u32 s2, s0, 0x50
+; GFX1250-GISEL-NEXT: s_add_co_ci_u32 s3, s1, 0
+; GFX1250-GISEL-NEXT: s_mov_b32 s4, src_flat_scratch_base_hi
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1250-GISEL-NEXT: s_xor_b32 s4, s3, s4
+; GFX1250-GISEL-NEXT: s_cmp_ge_u32 s4, 0x4000000
+; GFX1250-GISEL-NEXT: s_mov_b32 s4, 1
+; GFX1250-GISEL-NEXT: s_cbranch_scc0 .LBB115_2
+; GFX1250-GISEL-NEXT: ; %bb.1: ; %atomicrmw.global
+; GFX1250-GISEL-NEXT: v_mov_b32_e32 v2, 0
+; GFX1250-GISEL-NEXT: s_mov_b32 s4, 0
+; GFX1250-GISEL-NEXT: flat_atomic_min_num_f64 v2, v[0:1], s[0:1] offset:80
+; GFX1250-GISEL-NEXT: s_wait_dscnt 0x0
+; GFX1250-GISEL-NEXT: .LBB115_2: ; %Flow
+; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
+; GFX1250-GISEL-NEXT: s_xor_b32 s0, s4, 1
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-GISEL-NEXT: s_cmp_lg_u32 s0, 0
+; GFX1250-GISEL-NEXT: s_cbranch_scc1 .LBB115_4
+; GFX1250-GISEL-NEXT: ; %bb.3: ; %atomicrmw.private
+; GFX1250-GISEL-NEXT: s_mov_b32 s0, src_flat_scratch_base_lo
+; GFX1250-GISEL-NEXT: v_max_num_f64_e32 v[0:1], v[0:1], v[0:1]
+; GFX1250-GISEL-NEXT: s_sub_co_i32 s0, s2, s0
+; GFX1250-GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX1250-GISEL-NEXT: s_cselect_b32 s0, s0, -1
+; GFX1250-GISEL-NEXT: scratch_load_b64 v[2:3], off, s0
+; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX1250-GISEL-NEXT: v_max_num_f64_e32 v[2:3], v[2:3], v[2:3]
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_min_num_f64_e32 v[0:1], v[2:3], v[0:1]
+; GFX1250-GISEL-NEXT: scratch_store_b64 off, v[0:1], s0 scope:SCOPE_SE
+; GFX1250-GISEL-NEXT: .LBB115_4: ; %atomicrmw.phi
+; GFX1250-GISEL-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX950-SDAG-LABEL: flat_atomic_fmin_f64_saddr_nortn:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT: s_add_u32 s0, s0, 0x50
+; GFX950-SDAG-NEXT: s_addc_u32 s1, s1, 0
+; GFX950-SDAG-NEXT: s_mov_b64 s[2:3], src_private_base
+; GFX950-SDAG-NEXT: s_cmp_eq_u32 s1, s3
+; GFX950-SDAG-NEXT: s_cselect_b64 s[2:3], -1, 0
+; GFX950-SDAG-NEXT: s_andn2_b64 vcc, exec, s[2:3]
+; GFX950-SDAG-NEXT: s_mov_b64 s[2:3], -1
+; GFX950-SDAG-NEXT: s_cbranch_vccnz .LBB115_3
+; GFX950-SDAG-NEXT: ; %bb.1: ; %Flow
+; GFX950-SDAG-NEXT: s_andn2_b64 vcc, exec, s[2:3]
+; GFX950-SDAG-NEXT: s_cbranch_vccz .LBB115_4
+; GFX950-SDAG-NEXT: .LBB115_2: ; %atomicrmw.phi
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31]
+; GFX950-SDAG-NEXT: .LBB115_3: ; %atomicrmw.global
+; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
+; GFX950-SDAG-NEXT: flat_atomic_min_f64 v[2:3], v[0:1]
+; GFX950-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB115_2
+; GFX950-SDAG-NEXT: .LBB115_4: ; %atomicrmw.private
+; GFX950-SDAG-NEXT: s_cmp_lg_u64 s[0:1], 0
+; GFX950-SDAG-NEXT: s_cselect_b32 s0, s0, -1
+; GFX950-SDAG-NEXT: scratch_load_dwordx2 v[2:3], off, s0
+; GFX950-SDAG-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX950-SDAG-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX950-SDAG-NEXT: v_min_f64 v[0:1], v[2:3], v[0:1]
+; GFX950-SDAG-NEXT: scratch_store_dwordx2 off, v[0:1], s0
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0)
+; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-GISEL-LABEL: flat_atomic_fmin_f64_saddr_nortn:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT: s_add_u32 s2, s0, 0x50
+; GFX950-GISEL-NEXT: s_addc_u32 s3, s1, 0
+; GFX950-GISEL-NEXT: s_mov_b64 s[4:5], src_private_base
+; GFX950-GISEL-NEXT: s_cmp_lg_u32 s3, s5
+; GFX950-GISEL-NEXT: s_mov_b32 s4, 1
+; GFX950-GISEL-NEXT: s_cbranch_scc0 .LBB115_2
+; GFX950-GISEL-NEXT: ; %bb.1: ; %atomicrmw.global
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
+; GFX950-GISEL-NEXT: flat_atomic_min_f64 v[2:3], v[0:1] offset:80
+; GFX950-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-GISEL-NEXT: s_mov_b32 s4, 0
+; GFX950-GISEL-NEXT: .LBB115_2: ; %Flow
+; GFX950-GISEL-NEXT: s_xor_b32 s0, s4, 1
+; GFX950-GISEL-NEXT: s_cmp_lg_u32 s0, 0
+; GFX950-GISEL-NEXT: s_cbranch_scc1 .LBB115_4
+; GFX950-GISEL-NEXT: ; %bb.3: ; %atomicrmw.private
+; GFX950-GISEL-NEXT: s_cmp_lg_u64 s[2:3], 0
+; GFX950-GISEL-NEXT: s_cselect_b32 s0, s2, -1
+; GFX950-GISEL-NEXT: scratch_load_dwordx2 v[2:3], off, s0
+; GFX950-GISEL-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1]
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX950-GISEL-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX950-GISEL-NEXT: v_min_f64 v[0:1], v[2:3], v[0:1]
+; GFX950-GISEL-NEXT: scratch_store_dwordx2 off, v[0:1], s0
+; GFX950-GISEL-NEXT: .LBB115_4: ; %atomicrmw.phi
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0)
+; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x double], ptr %ptr, i64 0, i64 10 ; element 10 of 8-byte doubles, i.e. byte offset 80 (0x50 / offset:80 in the expected output)
+ %unused = atomicrmw fmin ptr %gep.0, double %data syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory !0 ; result is never read, so the expected flat atomics carry no destination register and no th:TH_ATOMIC_RETURN / sc0 modifier
+ ret void
+}
+
+define float @flat_atomic_fadd_f32_saddr_rtn(ptr inreg %ptr, float %data) { ; returning f32 fadd through an inreg flat pointer (s[0:1] in the expected output)
+; GFX1250-LABEL: flat_atomic_fadd_f32_saddr_rtn:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, 0
+; GFX1250-NEXT: flat_atomic_add_f32 v0, v1, v0, s[0:1] offset:40 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX950-SDAG-LABEL: flat_atomic_fadd_f32_saddr_rtn:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, s0
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; GFX950-SDAG-NEXT: flat_atomic_add_f32 v0, v[2:3], v0 offset:40 sc0
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-GISEL-LABEL: flat_atomic_fadd_f32_saddr_rtn:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
+; GFX950-GISEL-NEXT: flat_atomic_add_f32 v0, v[2:3], v0 offset:40 sc0
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x float], ptr %ptr, i64 0, i64 10 ; element 10 of 4-byte floats, i.e. byte offset 40 (offset:40 in the expected output)
+ %result = atomicrmw fadd ptr %gep.0, float %data syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory !0 ; expected output is a single flat_atomic_add_f32 with no address-space branches; gfx1250 uses s[0:1] directly, gfx950 copies it into v[2:3]. NOTE(review): align 8 on a 4-byte float looks carried over from the f64 tests - confirm it is intentional
+ ret float %result ; value returned by the atomic in v0
+}
+
+define void @flat_atomic_fadd_f32_saddr_nortn(ptr inreg %ptr, float %data) {
+; GFX1250-LABEL: flat_atomic_fadd_f32_saddr_nortn:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, 0
+; GFX1250-NEXT: flat_atomic_add_f32 v1, v0, s[0:1] offset:40
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX950-SDAG-LABEL: flat_atomic_fadd_f32_saddr_nortn:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, s0
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; GFX950-SDAG-NEXT: flat_atomic_add_f32 v[2:3], v0 offset:40
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-GISEL-LABEL: flat_atomic_fadd_f32_saddr_nortn:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
+; GFX950-GISEL-NEXT: flat_atomic_add_f32 v[2:3], v0 offset:40
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x float], ptr %ptr, i64 0, i64 10
+ %unused = atomicrmw fadd ptr %gep.0, float %data syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory !0
+ ret void
+}
+
+define float @flat_atomic_fmax_f32_saddr_rtn(ptr inreg %ptr, float %data) {
+; GFX1250-LABEL: flat_atomic_fmax_f32_saddr_rtn:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, 0
+; GFX1250-NEXT: flat_atomic_max_num_f32 v0, v1, v0, s[0:1] offset:40 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX950-SDAG-LABEL: flat_atomic_fmax_f32_saddr_rtn:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, v0
+; GFX950-SDAG-NEXT: flat_load_dword v0, v[2:3] offset:40
+; GFX950-SDAG-NEXT: s_mov_b64 s[2:3], 0
+; GFX950-SDAG-NEXT: v_max_f32_e32 v1, v1, v1
+; GFX950-SDAG-NEXT: .LBB118_1: ; %atomicrmw.start
+; GFX950-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, v0
+; GFX950-SDAG-NEXT: v_max_f32_e32 v0, v5, v5
+; GFX950-SDAG-NEXT: v_max_f32_e32 v4, v0, v1
+; GFX950-SDAG-NEXT: flat_atomic_cmpswap v0, v[2:3], v[4:5] offset:40 sc0
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
+; GFX950-SDAG-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX950-SDAG-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB118_1
+; GFX950-SDAG-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-GISEL-LABEL: flat_atomic_fmax_f32_saddr_rtn:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v1, v0
+; GFX950-GISEL-NEXT: flat_load_dword v0, v[2:3] offset:40
+; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], 0
+; GFX950-GISEL-NEXT: v_max_f32_e32 v1, v1, v1
+; GFX950-GISEL-NEXT: .LBB118_1: ; %atomicrmw.start
+; GFX950-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v0
+; GFX950-GISEL-NEXT: v_max_f32_e32 v0, v5, v5
+; GFX950-GISEL-NEXT: v_max_f32_e32 v4, v0, v1
+; GFX950-GISEL-NEXT: flat_atomic_cmpswap v0, v[2:3], v[4:5] offset:40 sc0
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
+; GFX950-GISEL-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX950-GISEL-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB118_1
+; GFX950-GISEL-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x float], ptr %ptr, i64 0, i64 10
+ %result = atomicrmw fmax ptr %gep.0, float %data syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory !0
+ ret float %result
+}
+
+define void @flat_atomic_fmax_f32_saddr_nortn(ptr inreg %ptr, float %data) {
+; GFX1250-LABEL: flat_atomic_fmax_f32_saddr_nortn:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, 0
+; GFX1250-NEXT: flat_atomic_max_num_f32 v1, v0, s[0:1] offset:40
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX950-SDAG-LABEL: flat_atomic_fmax_f32_saddr_nortn:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
+; GFX950-SDAG-NEXT: flat_load_dword v3, v[2:3] offset:40
+; GFX950-SDAG-NEXT: s_mov_b64 s[2:3], 0
+; GFX950-SDAG-NEXT: v_max_f32_e32 v4, v0, v0
+; GFX950-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX950-SDAG-NEXT: .LBB119_1: ; %atomicrmw.start
+; GFX950-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT: v_max_f32_e32 v2, v3, v3
+; GFX950-SDAG-NEXT: v_max_f32_e32 v2, v2, v4
+; GFX950-SDAG-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX950-SDAG-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2
+; GFX950-SDAG-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB119_1
+; GFX950-SDAG-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-GISEL-LABEL: flat_atomic_fmax_f32_saddr_nortn:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
+; GFX950-GISEL-NEXT: flat_load_dword v1, v[2:3] offset:40
+; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], 0
+; GFX950-GISEL-NEXT: v_max_f32_e32 v4, v0, v0
+; GFX950-GISEL-NEXT: .LBB119_1: ; %atomicrmw.start
+; GFX950-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT: v_max_f32_e32 v0, v1, v1
+; GFX950-GISEL-NEXT: v_max_f32_e32 v0, v0, v4
+; GFX950-GISEL-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX950-GISEL-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v1, v0
+; GFX950-GISEL-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB119_1
+; GFX950-GISEL-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x float], ptr %ptr, i64 0, i64 10
+ %unused = atomicrmw fmax ptr %gep.0, float %data syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory !0
+ ret void
+}
+
+define float @flat_atomic_fmin_f32_saddr_rtn(ptr inreg %ptr, float %data) {
+; GFX1250-LABEL: flat_atomic_fmin_f32_saddr_rtn:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, 0
+; GFX1250-NEXT: flat_atomic_min_num_f32 v0, v1, v0, s[0:1] offset:40 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX950-SDAG-LABEL: flat_atomic_fmin_f32_saddr_rtn:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, v0
+; GFX950-SDAG-NEXT: flat_load_dword v0, v[2:3] offset:40
+; GFX950-SDAG-NEXT: s_mov_b64 s[2:3], 0
+; GFX950-SDAG-NEXT: v_max_f32_e32 v1, v1, v1
+; GFX950-SDAG-NEXT: .LBB120_1: ; %atomicrmw.start
+; GFX950-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, v0
+; GFX950-SDAG-NEXT: v_max_f32_e32 v0, v5, v5
+; GFX950-SDAG-NEXT: v_min_f32_e32 v4, v0, v1
+; GFX950-SDAG-NEXT: flat_atomic_cmpswap v0, v[2:3], v[4:5] offset:40 sc0
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
+; GFX950-SDAG-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX950-SDAG-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB120_1
+; GFX950-SDAG-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-GISEL-LABEL: flat_atomic_fmin_f32_saddr_rtn:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v1, v0
+; GFX950-GISEL-NEXT: flat_load_dword v0, v[2:3] offset:40
+; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], 0
+; GFX950-GISEL-NEXT: v_max_f32_e32 v1, v1, v1
+; GFX950-GISEL-NEXT: .LBB120_1: ; %atomicrmw.start
+; GFX950-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v0
+; GFX950-GISEL-NEXT: v_max_f32_e32 v0, v5, v5
+; GFX950-GISEL-NEXT: v_min_f32_e32 v4, v0, v1
+; GFX950-GISEL-NEXT: flat_atomic_cmpswap v0, v[2:3], v[4:5] offset:40 sc0
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
+; GFX950-GISEL-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX950-GISEL-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB120_1
+; GFX950-GISEL-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x float], ptr %ptr, i64 0, i64 10
+ %result = atomicrmw fmin ptr %gep.0, float %data syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory !0
+ ret float %result
+}
+
+define void @flat_atomic_fmin_f32_saddr_nortn(ptr inreg %ptr, float %data) {
+; GFX1250-LABEL: flat_atomic_fmin_f32_saddr_nortn:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, 0
+; GFX1250-NEXT: flat_atomic_min_num_f32 v1, v0, s[0:1] offset:40
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX950-SDAG-LABEL: flat_atomic_fmin_f32_saddr_nortn:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
+; GFX950-SDAG-NEXT: flat_load_dword v3, v[2:3] offset:40
+; GFX950-SDAG-NEXT: s_mov_b64 s[2:3], 0
+; GFX950-SDAG-NEXT: v_max_f32_e32 v4, v0, v0
+; GFX950-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX950-SDAG-NEXT: .LBB121_1: ; %atomicrmw.start
+; GFX950-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT: v_max_f32_e32 v2, v3, v3
+; GFX950-SDAG-NEXT: v_min_f32_e32 v2, v2, v4
+; GFX950-SDAG-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX950-SDAG-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2
+; GFX950-SDAG-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB121_1
+; GFX950-SDAG-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-GISEL-LABEL: flat_atomic_fmin_f32_saddr_nortn:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
+; GFX950-GISEL-NEXT: flat_load_dword v1, v[2:3] offset:40
+; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], 0
+; GFX950-GISEL-NEXT: v_max_f32_e32 v4, v0, v0
+; GFX950-GISEL-NEXT: .LBB121_1: ; %atomicrmw.start
+; GFX950-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT: v_max_f32_e32 v0, v1, v1
+; GFX950-GISEL-NEXT: v_min_f32_e32 v0, v0, v4
+; GFX950-GISEL-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX950-GISEL-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v1, v0
+; GFX950-GISEL-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB121_1
+; GFX950-GISEL-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x float], ptr %ptr, i64 0, i64 10
+ %unused = atomicrmw fmin ptr %gep.0, float %data syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory !0
+ ret void
+}
+
+define <2 x half> @flat_atomic_fadd_v2f16_saddr_rtn(ptr inreg %ptr, <2 x half> %data) {
+; GFX1250-LABEL: flat_atomic_fadd_v2f16_saddr_rtn:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, 0
+; GFX1250-NEXT: flat_atomic_pk_add_f16 v0, v1, v0, s[0:1] offset:40 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX950-SDAG-LABEL: flat_atomic_fadd_v2f16_saddr_rtn:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, s0
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; GFX950-SDAG-NEXT: flat_atomic_pk_add_f16 v0, v[2:3], v0 offset:40 sc0
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-GISEL-LABEL: flat_atomic_fadd_v2f16_saddr_rtn:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
+; GFX950-GISEL-NEXT: flat_atomic_pk_add_f16 v0, v[2:3], v0 offset:40 sc0
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x <2 x half>], ptr %ptr, i64 0, i64 10
+ %result = atomicrmw fadd ptr %gep.0, <2 x half> %data syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory !0
+ ret <2 x half> %result
+}
+
+define void @flat_atomic_fadd_v2f16_saddr_nortn(ptr inreg %ptr, <2 x half> %data) {
+; GFX1250-LABEL: flat_atomic_fadd_v2f16_saddr_nortn:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, 0
+; GFX1250-NEXT: flat_atomic_pk_add_f16 v1, v0, s[0:1] offset:40
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX950-SDAG-LABEL: flat_atomic_fadd_v2f16_saddr_nortn:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v2, s0
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; GFX950-SDAG-NEXT: flat_atomic_pk_add_f16 v[2:3], v0 offset:40
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-GISEL-LABEL: flat_atomic_fadd_v2f16_saddr_nortn:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
+; GFX950-GISEL-NEXT: flat_atomic_pk_add_f16 v[2:3], v0 offset:40
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x <2 x half>], ptr %ptr, i64 0, i64 10
+ %unused = atomicrmw fadd ptr %gep.0, <2 x half> %data syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory !0
+ ret void
+}
+
+define <2 x half> @flat_atomic_fmax_v2f16_saddr_rtn(ptr inreg %ptr, <2 x half> %data) {
+; GFX1250-LABEL: flat_atomic_fmax_v2f16_saddr_rtn:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v2, 0
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_pk_max_num_f16 v1, v1, v1
+; GFX1250-NEXT: flat_load_b32 v0, v2, s[0:1] offset:40
+; GFX1250-NEXT: .LBB124_1: ; %atomicrmw.start
+; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v5, v0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_pk_max_num_f16 v0, v5, v5
+; GFX1250-NEXT: v_pk_max_num_f16 v4, v0, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v2, v[4:5], s[0:1] offset:40 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5
+; GFX1250-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1250-NEXT: s_cbranch_execnz .LBB124_1
+; GFX1250-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s2
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX950-SDAG-LABEL: flat_atomic_fmax_v2f16_saddr_rtn:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, v0
+; GFX950-SDAG-NEXT: flat_load_dword v0, v[2:3] offset:40
+; GFX950-SDAG-NEXT: s_mov_b64 s[2:3], 0
+; GFX950-SDAG-NEXT: v_pk_max_f16 v1, v1, v1
+; GFX950-SDAG-NEXT: .LBB124_1: ; %atomicrmw.start
+; GFX950-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, v0
+; GFX950-SDAG-NEXT: v_pk_max_f16 v0, v5, v5
+; GFX950-SDAG-NEXT: s_nop 0
+; GFX950-SDAG-NEXT: v_pk_max_f16 v4, v0, v1
+; GFX950-SDAG-NEXT: flat_atomic_cmpswap v0, v[2:3], v[4:5] offset:40 sc0
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
+; GFX950-SDAG-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX950-SDAG-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB124_1
+; GFX950-SDAG-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-GISEL-LABEL: flat_atomic_fmax_v2f16_saddr_rtn:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v1, v0
+; GFX950-GISEL-NEXT: flat_load_dword v0, v[2:3] offset:40
+; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], 0
+; GFX950-GISEL-NEXT: v_pk_max_f16 v1, v1, v1
+; GFX950-GISEL-NEXT: .LBB124_1: ; %atomicrmw.start
+; GFX950-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v0
+; GFX950-GISEL-NEXT: v_pk_max_f16 v0, v5, v5
+; GFX950-GISEL-NEXT: s_nop 0
+; GFX950-GISEL-NEXT: v_pk_max_f16 v4, v0, v1
+; GFX950-GISEL-NEXT: flat_atomic_cmpswap v0, v[2:3], v[4:5] offset:40 sc0
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
+; GFX950-GISEL-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX950-GISEL-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB124_1
+; GFX950-GISEL-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x <2 x half>], ptr %ptr, i64 0, i64 10
+ %result = atomicrmw fmax ptr %gep.0, <2 x half> %data syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory !0
+ ret <2 x half> %result
+}
+
+define void @flat_atomic_fmax_v2f16_saddr_nortn(ptr inreg %ptr, <2 x half> %data) {
+; GFX1250-LABEL: flat_atomic_fmax_v2f16_saddr_nortn:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, 0
+; GFX1250-NEXT: v_pk_max_num_f16 v3, v0, v0
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: flat_load_b32 v1, v2, s[0:1] offset:40
+; GFX1250-NEXT: .LBB125_1: ; %atomicrmw.start
+; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: v_pk_max_num_f16 v0, v1, v1
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_pk_max_num_f16 v0, v0, v3
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:40 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, v0
+; GFX1250-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1250-NEXT: s_cbranch_execnz .LBB125_1
+; GFX1250-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s2
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX950-SDAG-LABEL: flat_atomic_fmax_v2f16_saddr_nortn:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
+; GFX950-SDAG-NEXT: flat_load_dword v3, v[2:3] offset:40
+; GFX950-SDAG-NEXT: s_mov_b64 s[2:3], 0
+; GFX950-SDAG-NEXT: v_pk_max_f16 v4, v0, v0
+; GFX950-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX950-SDAG-NEXT: .LBB125_1: ; %atomicrmw.start
+; GFX950-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT: v_pk_max_f16 v2, v3, v3
+; GFX950-SDAG-NEXT: s_nop 0
+; GFX950-SDAG-NEXT: v_pk_max_f16 v2, v2, v4
+; GFX950-SDAG-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX950-SDAG-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2
+; GFX950-SDAG-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB125_1
+; GFX950-SDAG-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-GISEL-LABEL: flat_atomic_fmax_v2f16_saddr_nortn:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
+; GFX950-GISEL-NEXT: flat_load_dword v1, v[2:3] offset:40
+; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], 0
+; GFX950-GISEL-NEXT: v_pk_max_f16 v4, v0, v0
+; GFX950-GISEL-NEXT: .LBB125_1: ; %atomicrmw.start
+; GFX950-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT: v_pk_max_f16 v0, v1, v1
+; GFX950-GISEL-NEXT: s_nop 0
+; GFX950-GISEL-NEXT: v_pk_max_f16 v0, v0, v4
+; GFX950-GISEL-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX950-GISEL-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v1, v0
+; GFX950-GISEL-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB125_1
+; GFX950-GISEL-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x <2 x half>], ptr %ptr, i64 0, i64 10
+ %unused = atomicrmw fmax ptr %gep.0, <2 x half> %data syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory !0
+ ret void
+}
+
+define <2 x half> @flat_atomic_fmin_v2f16_saddr_rtn(ptr inreg %ptr, <2 x half> %data) {
+; GFX1250-LABEL: flat_atomic_fmin_v2f16_saddr_rtn:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v2, 0
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_pk_max_num_f16 v1, v1, v1
+; GFX1250-NEXT: flat_load_b32 v0, v2, s[0:1] offset:40
+; GFX1250-NEXT: .LBB126_1: ; %atomicrmw.start
+; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v5, v0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_pk_max_num_f16 v0, v5, v5
+; GFX1250-NEXT: v_pk_min_num_f16 v4, v0, v1
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v2, v[4:5], s[0:1] offset:40 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v5
+; GFX1250-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1250-NEXT: s_cbranch_execnz .LBB126_1
+; GFX1250-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s2
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX950-SDAG-LABEL: flat_atomic_fmin_v2f16_saddr_rtn:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v1, v0
+; GFX950-SDAG-NEXT: flat_load_dword v0, v[2:3] offset:40
+; GFX950-SDAG-NEXT: s_mov_b64 s[2:3], 0
+; GFX950-SDAG-NEXT: v_pk_max_f16 v1, v1, v1
+; GFX950-SDAG-NEXT: .LBB126_1: ; %atomicrmw.start
+; GFX950-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v5, v0
+; GFX950-SDAG-NEXT: v_pk_max_f16 v0, v5, v5
+; GFX950-SDAG-NEXT: s_nop 0
+; GFX950-SDAG-NEXT: v_pk_min_f16 v4, v0, v1
+; GFX950-SDAG-NEXT: flat_atomic_cmpswap v0, v[2:3], v[4:5] offset:40 sc0
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
+; GFX950-SDAG-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX950-SDAG-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB126_1
+; GFX950-SDAG-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-GISEL-LABEL: flat_atomic_fmin_v2f16_saddr_rtn:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v1, v0
+; GFX950-GISEL-NEXT: flat_load_dword v0, v[2:3] offset:40
+; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], 0
+; GFX950-GISEL-NEXT: v_pk_max_f16 v1, v1, v1
+; GFX950-GISEL-NEXT: .LBB126_1: ; %atomicrmw.start
+; GFX950-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v5, v0
+; GFX950-GISEL-NEXT: v_pk_max_f16 v0, v5, v5
+; GFX950-GISEL-NEXT: s_nop 0
+; GFX950-GISEL-NEXT: v_pk_min_f16 v4, v0, v1
+; GFX950-GISEL-NEXT: flat_atomic_cmpswap v0, v[2:3], v[4:5] offset:40 sc0
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v0, v5
+; GFX950-GISEL-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX950-GISEL-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB126_1
+; GFX950-GISEL-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x <2 x half>], ptr %ptr, i64 0, i64 10
+ %result = atomicrmw fmin ptr %gep.0, <2 x half> %data syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory !0
+ ret <2 x half> %result
+}
+
+define void @flat_atomic_fmin_v2f16_saddr_nortn(ptr inreg %ptr, <2 x half> %data) {
+; GFX1250-LABEL: flat_atomic_fmin_v2f16_saddr_nortn:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, 0
+; GFX1250-NEXT: v_pk_max_num_f16 v3, v0, v0
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: flat_load_b32 v1, v2, s[0:1] offset:40
+; GFX1250-NEXT: .LBB127_1: ; %atomicrmw.start
+; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: v_pk_max_num_f16 v0, v1, v1
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_pk_min_num_f16 v0, v0, v3
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:40 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, v0, v1
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, v0
+; GFX1250-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1250-NEXT: s_cbranch_execnz .LBB127_1
+; GFX1250-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s2
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX950-SDAG-LABEL: flat_atomic_fmin_v2f16_saddr_nortn:
+; GFX950-SDAG: ; %bb.0:
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
+; GFX950-SDAG-NEXT: flat_load_dword v3, v[2:3] offset:40
+; GFX950-SDAG-NEXT: s_mov_b64 s[2:3], 0
+; GFX950-SDAG-NEXT: v_pk_max_f16 v4, v0, v0
+; GFX950-SDAG-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GFX950-SDAG-NEXT: .LBB127_1: ; %atomicrmw.start
+; GFX950-SDAG-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT: v_pk_max_f16 v2, v3, v3
+; GFX950-SDAG-NEXT: s_nop 0
+; GFX950-SDAG-NEXT: v_pk_min_f16 v2, v2, v4
+; GFX950-SDAG-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] offset:40 sc0
+; GFX950-SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX950-SDAG-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX950-SDAG-NEXT: v_mov_b32_e32 v3, v2
+; GFX950-SDAG-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX950-SDAG-NEXT: s_cbranch_execnz .LBB127_1
+; GFX950-SDAG-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX950-SDAG-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX950-GISEL-LABEL: flat_atomic_fmin_v2f16_saddr_nortn:
+; GFX950-GISEL: ; %bb.0:
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
+; GFX950-GISEL-NEXT: flat_load_dword v1, v[2:3] offset:40
+; GFX950-GISEL-NEXT: s_mov_b64 s[0:1], 0
+; GFX950-GISEL-NEXT: v_pk_max_f16 v4, v0, v0
+; GFX950-GISEL-NEXT: .LBB127_1: ; %atomicrmw.start
+; GFX950-GISEL-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT: v_pk_max_f16 v0, v1, v1
+; GFX950-GISEL-NEXT: s_nop 0
+; GFX950-GISEL-NEXT: v_pk_min_f16 v0, v0, v4
+; GFX950-GISEL-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0
+; GFX950-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX950-GISEL-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX950-GISEL-NEXT: v_mov_b32_e32 v1, v0
+; GFX950-GISEL-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX950-GISEL-NEXT: s_cbranch_execnz .LBB127_1
+; GFX950-GISEL-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX950-GISEL-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX950-GISEL-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x <2 x half>], ptr %ptr, i64 0, i64 10
+ %unused = atomicrmw fmin ptr %gep.0, <2 x half> %data syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory !0
+ ret void
+}
+
+; Returning atomicrmw fadd of <2 x bfloat> (workgroup scope, seq_cst) through a
+; flat pointer passed inreg. The GEP picks element 10 of an array of 4-byte
+; <2 x bfloat> elements, which shows up as "offset:40" in the checks.
+; The gfx1250 checks expect the SGPR-base form of flat_atomic_pk_add_bf16
+; (s[0:1] plus a zero VGPR offset); the gfx950 checks expect the pointer to be
+; copied into v[2:3] and the VGPR-address form to be used instead.
+define <2 x bfloat> @flat_atomic_fadd_v2bf16_saddr_rtn(ptr inreg %ptr, <2 x bfloat> %data) {
+; GFX1250-LABEL: flat_atomic_fadd_v2bf16_saddr_rtn:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, 0
+; GFX1250-NEXT: flat_atomic_pk_add_bf16 v0, v1, v0, s[0:1] offset:40 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX950-LABEL: flat_atomic_fadd_v2bf16_saddr_rtn:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_mov_b32_e32 v2, s0
+; GFX950-NEXT: v_mov_b32_e32 v3, s1
+; GFX950-NEXT: flat_atomic_pk_add_bf16 v0, v[2:3], v0 offset:40 sc0
+; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr %ptr, i64 0, i64 10
+ %result = atomicrmw fadd ptr %gep.0, <2 x bfloat> %data syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory !0
+ ret <2 x bfloat> %result
+}
+
+; Same <2 x bfloat> atomicrmw fadd at byte offset 40 as the _rtn variant, but
+; the result is unused and the function returns void. The expected
+; instructions therefore have no destination register and no return modifier
+; (no th:TH_ATOMIC_RETURN in the gfx1250 checks, no sc0 in the gfx950 checks).
+; The gfx1250 checks keep the base in s[0:1]; the gfx950 checks copy it to v[2:3].
+define void @flat_atomic_fadd_v2bf16_saddr_nortn(ptr inreg %ptr, <2 x bfloat> %data) {
+; GFX1250-LABEL: flat_atomic_fadd_v2bf16_saddr_nortn:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, 0
+; GFX1250-NEXT: flat_atomic_pk_add_bf16 v1, v0, s[0:1] offset:40
+; GFX1250-NEXT: s_wait_dscnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX950-LABEL: flat_atomic_fadd_v2bf16_saddr_nortn:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_mov_b32_e32 v2, s0
+; GFX950-NEXT: v_mov_b32_e32 v3, s1
+; GFX950-NEXT: flat_atomic_pk_add_bf16 v[2:3], v0 offset:40
+; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr %ptr, i64 0, i64 10
+ %unused = atomicrmw fadd ptr %gep.0, <2 x bfloat> %data syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory !0
+ ret void
+}
+
+; Returning atomicrmw fmax of <2 x bfloat> through an inreg flat pointer at
+; byte offset 40 (element 10 of 4-byte elements). Neither check block contains
+; a single atomic max instruction: both expect an initial 32-bit load followed
+; by a compare-and-swap retry loop (%atomicrmw.start).
+; - gfx1250 checks: v_pk_max_num_bf16 computes the new value and the load and
+;   cmpswap use the SGPR base s[0:1].
+; - gfx950 checks: each bf16 half is widened to f32 (mask 0xffff0000 / shift
+;   left by 16), combined with v_max_f32, repacked with v_cvt_pk_bf16_f32, and
+;   the load and cmpswap address through v[2:3].
+define <2 x bfloat> @flat_atomic_fmax_v2bf16_saddr_rtn(ptr inreg %ptr, <2 x bfloat> %data) {
+; GFX1250-LABEL: flat_atomic_fmax_v2bf16_saddr_rtn:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, 0
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: flat_load_b32 v1, v2, s[0:1] offset:40
+; GFX1250-NEXT: .LBB130_1: ; %atomicrmw.start
+; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v5, v1
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_pk_max_num_bf16 v4, v5, v0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v2, v[4:5], s[0:1] offset:40 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5
+; GFX1250-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1250-NEXT: s_cbranch_execnz .LBB130_1
+; GFX1250-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s2
+; GFX1250-NEXT: v_mov_b32_e32 v0, v1
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX950-LABEL: flat_atomic_fmax_v2bf16_saddr_rtn:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
+; GFX950-NEXT: v_mov_b32_e32 v1, v0
+; GFX950-NEXT: flat_load_dword v0, v[2:3] offset:40
+; GFX950-NEXT: s_mov_b64 s[2:3], 0
+; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v1
+; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX950-NEXT: .LBB130_1: ; %atomicrmw.start
+; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_mov_b32_e32 v7, v0
+; GFX950-NEXT: v_and_b32_e32 v0, 0xffff0000, v7
+; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v7
+; GFX950-NEXT: v_max_f32_e32 v0, v0, v4
+; GFX950-NEXT: v_max_f32_e32 v5, v5, v1
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v6, v5, v0
+; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[6:7] offset:40 sc0
+; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7
+; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX950-NEXT: s_cbranch_execnz .LBB130_1
+; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX950-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr %ptr, i64 0, i64 10
+ %result = atomicrmw fmax ptr %gep.0, <2 x bfloat> %data syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory !0
+ ret <2 x bfloat> %result
+}
+
+; Void-returning counterpart of the <2 x bfloat> atomicrmw fmax test at byte
+; offset 40; the atomic result is unused. Both check blocks still expect a
+; load followed by a compare-and-swap retry loop, and the cmpswap still
+; returns the old value because the loop compares it against the expected one.
+; - gfx1250 checks: v_pk_max_num_bf16 with the base kept in s[0:1].
+; - gfx950 checks: per-half widening to f32, v_max_f32, v_cvt_pk_bf16_f32, with
+;   the address copied into v[2:3].
+define void @flat_atomic_fmax_v2bf16_saddr_nortn(ptr inreg %ptr, <2 x bfloat> %data) {
+; GFX1250-LABEL: flat_atomic_fmax_v2bf16_saddr_nortn:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, 0
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: flat_load_b32 v3, v1, s[0:1] offset:40
+; GFX1250-NEXT: .LBB131_1: ; %atomicrmw.start
+; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: v_pk_max_num_bf16 v2, v3, v0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v2, v1, v[2:3], s[0:1] offset:40 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v3, v2
+; GFX1250-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1250-NEXT: s_cbranch_execnz .LBB131_1
+; GFX1250-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s2
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX950-LABEL: flat_atomic_fmax_v2bf16_saddr_nortn:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
+; GFX950-NEXT: flat_load_dword v1, v[2:3] offset:40
+; GFX950-NEXT: s_mov_b64 s[2:3], 0
+; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
+; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX950-NEXT: .LBB131_1: ; %atomicrmw.start
+; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_and_b32_e32 v0, 0xffff0000, v1
+; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v1
+; GFX950-NEXT: v_max_f32_e32 v0, v0, v4
+; GFX950-NEXT: v_max_f32_e32 v6, v6, v5
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v6, v0
+; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0
+; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX950-NEXT: v_mov_b32_e32 v1, v0
+; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX950-NEXT: s_cbranch_execnz .LBB131_1
+; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX950-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr %ptr, i64 0, i64 10
+ %unused = atomicrmw fmax ptr %gep.0, <2 x bfloat> %data syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory !0
+ ret void
+}
+
+; Returning atomicrmw fmin of <2 x bfloat> through an inreg flat pointer at
+; byte offset 40 (element 10 of 4-byte elements). Mirrors the fmax _rtn test:
+; both check blocks expect an initial 32-bit load followed by a
+; compare-and-swap retry loop (%atomicrmw.start) rather than one atomic op.
+; - gfx1250 checks: v_pk_min_num_bf16 computes the new value and the load and
+;   cmpswap use the SGPR base s[0:1].
+; - gfx950 checks: each bf16 half is widened to f32 (mask 0xffff0000 / shift
+;   left by 16), combined with v_min_f32, repacked with v_cvt_pk_bf16_f32, and
+;   the load and cmpswap address through v[2:3].
+define <2 x bfloat> @flat_atomic_fmin_v2bf16_saddr_rtn(ptr inreg %ptr, <2 x bfloat> %data) {
+; GFX1250-LABEL: flat_atomic_fmin_v2bf16_saddr_rtn:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v2, 0
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: flat_load_b32 v1, v2, s[0:1] offset:40
+; GFX1250-NEXT: .LBB132_1: ; %atomicrmw.start
+; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v5, v1
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_pk_min_num_bf16 v4, v5, v0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v1, v2, v[4:5], s[0:1] offset:40 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, v1, v5
+; GFX1250-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1250-NEXT: s_cbranch_execnz .LBB132_1
+; GFX1250-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s2
+; GFX1250-NEXT: v_mov_b32_e32 v0, v1
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX950-LABEL: flat_atomic_fmin_v2bf16_saddr_rtn:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
+; GFX950-NEXT: v_mov_b32_e32 v1, v0
+; GFX950-NEXT: flat_load_dword v0, v[2:3] offset:40
+; GFX950-NEXT: s_mov_b64 s[2:3], 0
+; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v1
+; GFX950-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX950-NEXT: .LBB132_1: ; %atomicrmw.start
+; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_mov_b32_e32 v7, v0
+; GFX950-NEXT: v_and_b32_e32 v0, 0xffff0000, v7
+; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v7
+; GFX950-NEXT: v_min_f32_e32 v0, v0, v4
+; GFX950-NEXT: v_min_f32_e32 v5, v5, v1
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v6, v5, v0
+; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[6:7] offset:40 sc0
+; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v7
+; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX950-NEXT: s_cbranch_execnz .LBB132_1
+; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX950-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr %ptr, i64 0, i64 10
+ %result = atomicrmw fmin ptr %gep.0, <2 x bfloat> %data syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory !0
+ ret <2 x bfloat> %result
+}
+
+; Void-returning counterpart of the <2 x bfloat> atomicrmw fmin test at byte
+; offset 40; the atomic result is unused. Both check blocks still expect a
+; load followed by a compare-and-swap retry loop, and the cmpswap still
+; returns the old value because the loop compares it against the expected one.
+; - gfx1250 checks: v_pk_min_num_bf16 with the base kept in s[0:1].
+; - gfx950 checks: per-half widening to f32, v_min_f32, v_cvt_pk_bf16_f32, with
+;   the address copied into v[2:3].
+define void @flat_atomic_fmin_v2bf16_saddr_nortn(ptr inreg %ptr, <2 x bfloat> %data) {
+; GFX1250-LABEL: flat_atomic_fmin_v2bf16_saddr_nortn:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v1, 0
+; GFX1250-NEXT: s_mov_b32 s2, 0
+; GFX1250-NEXT: flat_load_b32 v3, v1, s[0:1] offset:40
+; GFX1250-NEXT: .LBB133_1: ; %atomicrmw.start
+; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: v_pk_min_num_bf16 v2, v3, v0
+; GFX1250-NEXT: flat_atomic_cmpswap_b32 v2, v1, v[2:3], s[0:1] offset:40 th:TH_ATOMIC_RETURN
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: v_cmp_eq_u32_e32 vcc_lo, v2, v3
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v3, v2
+; GFX1250-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1250-NEXT: s_cbranch_execnz .LBB133_1
+; GFX1250-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1250-NEXT: s_or_b32 exec_lo, exec_lo, s2
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX950-LABEL: flat_atomic_fmin_v2bf16_saddr_nortn:
+; GFX950: ; %bb.0:
+; GFX950-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_mov_b64_e32 v[2:3], s[0:1]
+; GFX950-NEXT: flat_load_dword v1, v[2:3] offset:40
+; GFX950-NEXT: s_mov_b64 s[2:3], 0
+; GFX950-NEXT: v_and_b32_e32 v4, 0xffff0000, v0
+; GFX950-NEXT: v_lshlrev_b32_e32 v5, 16, v0
+; GFX950-NEXT: .LBB133_1: ; %atomicrmw.start
+; GFX950-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_and_b32_e32 v0, 0xffff0000, v1
+; GFX950-NEXT: v_lshlrev_b32_e32 v6, 16, v1
+; GFX950-NEXT: v_min_f32_e32 v0, v0, v4
+; GFX950-NEXT: v_min_f32_e32 v6, v6, v5
+; GFX950-NEXT: v_cvt_pk_bf16_f32 v0, v6, v0
+; GFX950-NEXT: flat_atomic_cmpswap v0, v[2:3], v[0:1] offset:40 sc0
+; GFX950-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX950-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX950-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX950-NEXT: v_mov_b32_e32 v1, v0
+; GFX950-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX950-NEXT: s_cbranch_execnz .LBB133_1
+; GFX950-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX950-NEXT: s_or_b64 exec, exec, s[2:3]
+; GFX950-NEXT: s_setpc_b64 s[30:31]
+ %gep.0 = getelementptr inbounds [512 x <2 x bfloat>], ptr %ptr, i64 0, i64 10
+ %unused = atomicrmw fmin ptr %gep.0, <2 x bfloat> %data syncscope("workgroup") seq_cst, align 8, !amdgpu.no.fine.grained.memory !0
+ ret void
+}
+
attributes #0 = { argmemonly nounwind willreturn }
+
+!0 = !{} ; empty metadata node; used as the !amdgpu.no.fine.grained.memory operand on the atomicrmw instructions above
More information about the llvm-commits mailing list