[llvm] [AMDGPU] Remove the AnnotateKernelFeatures pass (PR #130198)
Jun Wang via llvm-commits
llvm-commits at lists.llvm.org
Thu Mar 6 14:48:44 PST 2025
https://github.com/jwanggit86 created https://github.com/llvm/llvm-project/pull/130198
Previously, the AnnotateKernelFeatures pass inferred two attributes, amdgpu-calls and amdgpu-stack-objects, which were used to help determine whether flat scratch init is allowed. PR #118907 created the amdgpu-no-flat-scratch-init attribute. Continuing with that work, this patch uses that attribute to determine flat scratch init, replacing amdgpu-calls and amdgpu-stack-objects. This also leads to the removal of the AnnotateKernelFeatures pass.
>From 84f99ed722b151e8872482f14dafe9d4f2886a35 Mon Sep 17 00:00:00 2001
From: Jun Wang <jwang86 at yahoo.com>
Date: Thu, 6 Mar 2025 14:36:00 -0800
Subject: [PATCH] [AMDGPU] Remove the AnnotateKernelFeatures pass
Previously, the AnnotateKernelFeatures pass inferred two attributes,
amdgpu-calls and amdgpu-stack-objects, which were used to help determine
whether flat scratch init is allowed. PR #118907 created the
amdgpu-no-flat-scratch-init attribute. Continuing with that work, this
patch uses that attribute to determine flat scratch init, replacing
amdgpu-calls and amdgpu-stack-objects. This also leads to the removal of
the AnnotateKernelFeatures pass.
---
llvm/lib/Target/AMDGPU/AMDGPU.h | 3 -
.../AMDGPU/AMDGPUAnnotateKernelFeatures.cpp | 9 -
.../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 7 -
llvm/lib/Target/AMDGPU/GCNSubtarget.cpp | 16 +-
.../AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll | 368 ++++-
.../AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll | 390 ++++-
.../AMDGPU/GlobalISel/extractelement.ll | 71 +-
.../AMDGPU/GlobalISel/flat-scratch-init.ll | 4 +-
...licit-kernarg-backend-usage-global-isel.ll | 36 +-
.../GlobalISel/insertelement-stack-lower.ll | 2 +-
.../AMDGPU/GlobalISel/lds-global-value.ll | 5 +-
.../GlobalISel/llvm.amdgcn.if.break.i64.ll | 3 +
.../GlobalISel/llvm.amdgcn.trig.preop.ll | 24 +
.../test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll | 33 +
.../test/CodeGen/AMDGPU/GlobalISel/udivrem.ll | 33 +
.../abi-attribute-hints-undefined-behavior.ll | 18 +-
.../AMDGPU/addrspacecast-constantexpr.ll | 62 -
llvm/test/CodeGen/AMDGPU/always-uniform.ll | 3 +
...amdgpu-codegenprepare-fold-binop-select.ll | 3 +
.../CodeGen/AMDGPU/amdhsa-trap-num-sgprs.ll | 4 +-
.../annotate-kernel-features-hsa-call.ll | 331 ----
.../AMDGPU/annotate-kernel-features-hsa.ll | 165 --
.../AMDGPU/annotate-kernel-features.ll | 103 --
.../attr-amdgpu-flat-work-group-size.ll | 4 +-
.../CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll | 4 +-
.../AMDGPU/attr-amdgpu-waves-per-eu.ll | 4 +-
.../attributor-flatscratchinit-globalisel.ll | 21 +-
llvm/test/CodeGen/AMDGPU/attributor-noopt.ll | 2 +-
.../callee-special-input-sgprs-fixed-abi.ll | 40 +-
llvm/test/CodeGen/AMDGPU/code-object-v3.ll | 12 +-
.../CodeGen/AMDGPU/combine-reg-or-const.ll | 3 +
...dagcomb-extract-vec-elt-different-sizes.ll | 2 +
.../AMDGPU/duplicate-attribute-indirect.ll | 13 -
...cannot-create-empty-or-backward-segment.ll | 2 +-
.../expand-scalar-carry-out-select-user.ll | 3 +
.../CodeGen/AMDGPU/extract_vector_elt-i8.ll | 100 +-
llvm/test/CodeGen/AMDGPU/fabs.f16.ll | 66 +
.../fast-unaligned-load-store.global.ll | 20 +-
llvm/test/CodeGen/AMDGPU/fcanonicalize.ll | 236 ++-
.../flat-for-global-subtarget-feature.ll | 7 +-
llvm/test/CodeGen/AMDGPU/flat-scratch-reg.ll | 80 +-
.../AMDGPU/fmul-2-combine-multi-use.ll | 48 +
llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll | 60 +
.../CodeGen/AMDGPU/fneg-modifier-casting.ll | 3 +
llvm/test/CodeGen/AMDGPU/fneg.f16.ll | 62 +-
llvm/test/CodeGen/AMDGPU/half.ll | 231 +++
.../AMDGPU/hsa-metadata-kernel-code-props.ll | 7 +-
llvm/test/CodeGen/AMDGPU/hsa.ll | 4 +-
.../AMDGPU/implicit-kernarg-backend-usage.ll | 36 +-
llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll | 24 +-
.../AMDGPU/insert_vector_elt.v2bf16.ll | 58 +-
.../CodeGen/AMDGPU/insert_vector_elt.v2i16.ll | 214 ++-
.../CodeGen/AMDGPU/invalid-addrspacecast.ll | 3 +
.../CodeGen/AMDGPU/invalid-cast-load-i1.ll | 2 +
llvm/test/CodeGen/AMDGPU/kernarg-size.ll | 2 +-
llvm/test/CodeGen/AMDGPU/llc-pipeline.ll | 30 +-
.../CodeGen/AMDGPU/llvm.amdgcn.is.private.ll | 12 +
.../CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll | 12 +
.../AMDGPU/llvm.amdgcn.lds.kernel.id.ll | 8 +-
.../AMDGPU/llvm.amdgcn.readfirstlane.ll | 70 +-
.../CodeGen/AMDGPU/llvm.amdgcn.readlane.ll | 114 +-
.../CodeGen/AMDGPU/llvm.amdgcn.writelane.ll | 126 +-
llvm/test/CodeGen/AMDGPU/load-constant-f64.ll | 6 +
llvm/test/CodeGen/AMDGPU/load-constant-i16.ll | 125 +-
llvm/test/CodeGen/AMDGPU/load-constant-i32.ll | 83 +-
llvm/test/CodeGen/AMDGPU/load-constant-i64.ll | 18 +
llvm/test/CodeGen/AMDGPU/load-constant-i8.ll | 164 +-
llvm/test/CodeGen/AMDGPU/load-global-i16.ll | 129 +-
llvm/test/CodeGen/AMDGPU/load-global-i32.ll | 105 +-
llvm/test/CodeGen/AMDGPU/load-select-ptr.ll | 3 +-
.../CodeGen/AMDGPU/mad24-get-global-id.ll | 2 +-
.../match-perm-extract-vector-elt-bug.ll | 8 +-
llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll | 300 ++--
.../AMDGPU/memory-legalizer-flat-agent.ll | 1380 +++++++++++++++++
.../memory-legalizer-flat-nontemporal.ll | 75 +
.../memory-legalizer-flat-singlethread.ll | 1380 +++++++++++++++++
.../AMDGPU/memory-legalizer-flat-system.ll | 1380 +++++++++++++++++
.../AMDGPU/memory-legalizer-flat-volatile.ll | 66 +
.../AMDGPU/memory-legalizer-flat-wavefront.ll | 1365 ++++++++++++++++
.../AMDGPU/memory-legalizer-flat-workgroup.ll | 1320 ++++++++++++++++
.../AMDGPU/memory-legalizer-global-agent.ll | 273 ++++
.../memory-legalizer-global-nontemporal.ll | 15 +
.../memory-legalizer-global-singlethread.ll | 276 ++++
.../AMDGPU/memory-legalizer-global-system.ll | 261 ++++
.../memory-legalizer-global-volatile.ll | 18 +
.../memory-legalizer-global-wavefront.ll | 276 ++++
.../memory-legalizer-global-workgroup.ll | 276 ++++
.../memory-legalizer-local-nontemporal.ll | 9 +
.../AMDGPU/memory-legalizer-local-volatile.ll | 6 +
.../memory-legalizer-private-nontemporal.ll | 59 +-
.../memory-legalizer-private-volatile.ll | 30 +-
llvm/test/CodeGen/AMDGPU/min.ll | 210 +++
llvm/test/CodeGen/AMDGPU/pack.v2f16.ll | 21 +
llvm/test/CodeGen/AMDGPU/pack.v2i16.ll | 18 +
.../AMDGPU/pal-simple-indirect-call.ll | 8 -
...al-regcopy-and-spill-missed-at-regalloc.ll | 48 +-
.../AMDGPU/preload-implicit-kernargs.ll | 178 +--
llvm/test/CodeGen/AMDGPU/preload-kernargs.ll | 379 +++--
llvm/test/CodeGen/AMDGPU/sad.ll | 114 +-
.../CodeGen/AMDGPU/scalar_to_vector.v8i16.ll | 16 +
.../scc-clobbered-sgpr-to-vmem-spill.ll | 464 +++---
.../CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll | 2 +-
llvm/test/CodeGen/AMDGPU/shift-i128.ll | 24 +-
.../CodeGen/AMDGPU/simple-indirect-call.ll | 15 -
llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll | 70 +-
.../CodeGen/AMDGPU/spill-vector-superclass.ll | 6 +-
llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll | 6 +
llvm/test/CodeGen/AMDGPU/tid-kd-xnack-any.ll | 2 +-
llvm/test/CodeGen/AMDGPU/tid-kd-xnack-off.ll | 2 +-
llvm/test/CodeGen/AMDGPU/tid-kd-xnack-on.ll | 2 +-
llvm/test/CodeGen/AMDGPU/trap-abis.ll | 16 +-
llvm/test/CodeGen/AMDGPU/udiv.ll | 45 +
llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll | 91 +-
.../AMDGPU/vgpr-spill-placement-issue61083.ll | 2 +-
...ine-function-info-long-branch-reg-debug.ll | 7 +-
.../machine-function-info-long-branch-reg.ll | 7 +-
116 files changed, 12755 insertions(+), 1844 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index 57297288eecb4..c30c1cd3c8fb0 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -95,11 +95,8 @@ void initializeAMDGPUDAGToDAGISelLegacyPass(PassRegistry &);
void initializeAMDGPUAlwaysInlinePass(PassRegistry&);
-Pass *createAMDGPUAnnotateKernelFeaturesPass();
Pass *createAMDGPUAttributorLegacyPass();
void initializeAMDGPUAttributorLegacyPass(PassRegistry &);
-void initializeAMDGPUAnnotateKernelFeaturesPass(PassRegistry &);
-extern char &AMDGPUAnnotateKernelFeaturesID;
// DPP/Iterative option enables the atomic optimizer with given strategy
// whereas None disables the atomic optimizer.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
index a9bd41382c255..9c9fa5c6e2f0f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
@@ -52,11 +52,6 @@ class AMDGPUAnnotateKernelFeatures : public CallGraphSCCPass {
char AMDGPUAnnotateKernelFeatures::ID = 0;
-char &llvm::AMDGPUAnnotateKernelFeaturesID = AMDGPUAnnotateKernelFeatures::ID;
-
-INITIALIZE_PASS(AMDGPUAnnotateKernelFeatures, DEBUG_TYPE,
- "Add AMDGPU function attributes", false, false)
-
bool AMDGPUAnnotateKernelFeatures::addFeatureAttributes(Function &F) {
bool HaveStackObjects = false;
bool Changed = false;
@@ -131,7 +126,3 @@ bool AMDGPUAnnotateKernelFeatures::doInitialization(CallGraph &CG) {
TM = &TPC->getTM<TargetMachine>();
return false;
}
-
-Pass *llvm::createAMDGPUAnnotateKernelFeaturesPass() {
- return new AMDGPUAnnotateKernelFeatures();
-}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index ce3dcd920bce3..bb139e2c185c4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -510,7 +510,6 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
initializeAMDGPUAlwaysInlinePass(*PR);
initializeAMDGPUSwLowerLDSLegacyPass(*PR);
initializeAMDGPUAttributorLegacyPass(*PR);
- initializeAMDGPUAnnotateKernelFeaturesPass(*PR);
initializeAMDGPUAnnotateUniformValuesLegacyPass(*PR);
initializeAMDGPUArgumentUsageInfoPass(*PR);
initializeAMDGPUAtomicOptimizerPass(*PR);
@@ -1294,12 +1293,6 @@ void AMDGPUPassConfig::addIRPasses() {
}
void AMDGPUPassConfig::addCodeGenPrepare() {
- if (TM->getTargetTriple().getArch() == Triple::amdgcn) {
- // FIXME: This pass adds 2 hacky attributes that can be replaced with an
- // analysis, and should be removed.
- addPass(createAMDGPUAnnotateKernelFeaturesPass());
- }
-
if (TM->getTargetTriple().getArch() == Triple::amdgcn &&
EnableLowerKernelArguments)
addPass(createAMDGPULowerKernelArgumentsPass());
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
index 55af5826e90d0..c812837c29a46 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.cpp
@@ -601,12 +601,6 @@ GCNUserSGPRUsageInfo::GCNUserSGPRUsageInfo(const Function &F,
const CallingConv::ID CC = F.getCallingConv();
const bool IsKernel =
CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL;
- // FIXME: Should have analysis or something rather than attribute to detect
- // calls.
- const bool HasCalls = F.hasFnAttribute("amdgpu-calls");
- // FIXME: This attribute is a hack, we just need an analysis on the function
- // to look for allocas.
- const bool HasStackObjects = F.hasFnAttribute("amdgpu-stack-objects");
if (IsKernel && (!F.arg_empty() || ST.getImplicitArgNumBytes(F) != 0))
KernargSegmentPtr = true;
@@ -629,12 +623,14 @@ GCNUserSGPRUsageInfo::GCNUserSGPRUsageInfo(const Function &F,
DispatchID = true;
}
- // TODO: This could be refined a lot. The attribute is a poor way of
- // detecting calls or stack objects that may require it before argument
- // lowering.
+ const bool IsNoFlatScratchInitSet = F.hasFnAttribute("amdgpu-no-flat-scratch-init");
+
if (ST.hasFlatAddressSpace() && AMDGPU::isEntryFunctionCC(CC) &&
(IsAmdHsaOrMesa || ST.enableFlatScratch()) &&
- (HasCalls || HasStackObjects || ST.enableFlatScratch()) &&
+ // The line below: If enableFlatScratch() is true, whether
+ // no-flat-scratch-init is set is not important. If enableFlatScratch()
+ // is false, FlatScratchInit cannot be true for graphics CC.
+ (ST.enableFlatScratch() || (!IsNoFlatScratchInitSet && !AMDGPU::isGraphics(CC))) &&
!ST.flatScratchIsArchitected()) {
FlatScratchInit = true;
}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll
index b96fc71be057e..6c2272c389a61 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_udec_wrap.ll
@@ -20,11 +20,14 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr add
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; CI-NEXT: v_mov_b32_e32 v0, 42
; CI-NEXT: s_mov_b32 m0, -1
+; CI-NEXT: s_add_i32 s12, s12, s17
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s2
; CI-NEXT: ds_dec_rtn_u32 v2, v1, v0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s0
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: flat_store_dword v[0:1], v2
; CI-NEXT: s_endpgm
@@ -35,11 +38,14 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr add
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT: v_mov_b32_e32 v0, 42
; VI-NEXT: s_mov_b32 m0, -1
+; VI-NEXT: s_add_i32 s12, s12, s17
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s2
; VI-NEXT: ds_dec_rtn_u32 v2, v1, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
@@ -97,11 +103,14 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i32_offset(ptr addrspace(1) %out,
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; CI-NEXT: v_mov_b32_e32 v0, 42
; CI-NEXT: s_mov_b32 m0, -1
+; CI-NEXT: s_add_i32 s12, s12, s17
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s2
; CI-NEXT: ds_dec_rtn_u32 v2, v1, v0 offset:16
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s0
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: flat_store_dword v[0:1], v2
; CI-NEXT: s_endpgm
@@ -112,11 +121,14 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i32_offset(ptr addrspace(1) %out,
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT: v_mov_b32_e32 v0, 42
; VI-NEXT: s_mov_b32 m0, -1
+; VI-NEXT: s_add_i32 s12, s12, s17
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s2
; VI-NEXT: ds_dec_rtn_u32 v2, v1, v0 offset:16
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
@@ -287,6 +299,9 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr
; CI-LABEL: global_atomic_dec_ret_i32:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v2, 42
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
@@ -302,6 +317,9 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32(ptr addrspace(1) %out, ptr
; VI-LABEL: global_atomic_dec_ret_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v2, 42
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
@@ -359,6 +377,9 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset(ptr addrspace(1) %ou
; CI-LABEL: global_atomic_dec_ret_i32_offset:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: v_mov_b32_e32 v2, 42
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_add_u32 s2, s2, 16
@@ -376,6 +397,9 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset(ptr addrspace(1) %ou
; VI-LABEL: global_atomic_dec_ret_i32_offset:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: v_mov_b32_e32 v2, 42
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s2, s2, 16
@@ -436,6 +460,9 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_system(ptr addrspace
; CI-LABEL: global_atomic_dec_ret_i32_offset_system:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: v_mov_b32_e32 v2, 42
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_add_u32 s2, s2, 16
@@ -453,6 +480,9 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_system(ptr addrspace
; VI-LABEL: global_atomic_dec_ret_i32_offset_system:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: v_mov_b32_e32 v2, 42
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s2, s2, 16
@@ -513,6 +543,9 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32(ptr addrspace(1) %ptr) #1
; CI-LABEL: global_atomic_dec_noret_i32:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v2, 42
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s0
@@ -525,6 +558,9 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32(ptr addrspace(1) %ptr) #1
; VI-LABEL: global_atomic_dec_noret_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v2, 42
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -575,6 +611,9 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset(ptr addrspace(1) %
; CI-LABEL: global_atomic_dec_noret_i32_offset:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: v_mov_b32_e32 v2, 42
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_add_u32 s0, s0, 16
@@ -589,6 +628,9 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset(ptr addrspace(1) %
; VI-LABEL: global_atomic_dec_noret_i32_offset:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: v_mov_b32_e32 v2, 42
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s0, s0, 16
@@ -642,6 +684,9 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset_system(ptr addrspa
; CI-LABEL: global_atomic_dec_noret_i32_offset_system:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: v_mov_b32_e32 v2, 42
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_add_u32 s0, s0, 16
@@ -656,6 +701,9 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset_system(ptr addrspa
; VI-LABEL: global_atomic_dec_noret_i32_offset_system:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: v_mov_b32_e32 v2, 42
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s0, s0, 16
@@ -710,7 +758,9 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_addr64(ptr addrspace
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; CI-NEXT: v_mov_b32_e32 v3, 42
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: v_mov_b32_e32 v1, s3
@@ -718,6 +768,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_addr64(ptr addrspace
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: v_add_i32_e32 v0, vcc, 20, v0
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; CI-NEXT: v_mov_b32_e32 v3, 42
; CI-NEXT: flat_atomic_dec v3, v[0:1], v3 glc
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: buffer_wbinvl1_vol
@@ -732,7 +783,9 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_addr64(ptr addrspace
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT: v_mov_b32_e32 v3, 42
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
@@ -740,6 +793,7 @@ define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_addr64(ptr addrspace
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_add_u32_e32 v0, vcc, 20, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: v_mov_b32_e32 v3, 42
; VI-NEXT: flat_atomic_dec v3, v[0:1], v3 glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
@@ -802,6 +856,9 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset_addr64(ptr addrspa
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
@@ -819,6 +876,9 @@ define amdgpu_kernel void @global_atomic_dec_noret_i32_offset_addr64(ptr addrspa
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
@@ -878,6 +938,9 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32(ptr %out, ptr %ptr) #1 {
; CI-LABEL: flat_atomic_dec_ret_i32:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v2, 42
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
@@ -893,6 +956,9 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32(ptr %out, ptr %ptr) #1 {
; VI-LABEL: flat_atomic_dec_ret_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v2, 42
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
@@ -908,6 +974,8 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32(ptr %out, ptr %ptr) #1 {
; GFX9-LABEL: flat_atomic_dec_ret_i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX9-NEXT: v_mov_b32_e32 v2, 42
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s2
@@ -922,6 +990,10 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32(ptr %out, ptr %ptr) #1 {
;
; GFX10-LABEL: flat_atomic_dec_ret_i32:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_add_u32 s12, s12, s17
+; GFX10-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX10-NEXT: v_mov_b32_e32 v2, 42
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
@@ -958,6 +1030,9 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset(ptr %out, ptr %ptr) #1
; CI-LABEL: flat_atomic_dec_ret_i32_offset:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: v_mov_b32_e32 v2, 42
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_add_u32 s2, s2, 16
@@ -975,6 +1050,9 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset(ptr %out, ptr %ptr) #1
; VI-LABEL: flat_atomic_dec_ret_i32_offset:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: v_mov_b32_e32 v2, 42
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s2, s2, 16
@@ -992,6 +1070,8 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset(ptr %out, ptr %ptr) #1
; GFX9-LABEL: flat_atomic_dec_ret_i32_offset:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX9-NEXT: v_mov_b32_e32 v2, 42
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s2
@@ -1006,6 +1086,10 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset(ptr %out, ptr %ptr) #1
;
; GFX10-LABEL: flat_atomic_dec_ret_i32_offset:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_add_u32 s12, s12, s17
+; GFX10-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX10-NEXT: v_mov_b32_e32 v2, 42
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
@@ -1045,6 +1129,9 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_system(ptr %out, ptr %
; CI-LABEL: flat_atomic_dec_ret_i32_offset_system:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: v_mov_b32_e32 v2, 42
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_add_u32 s2, s2, 16
@@ -1062,6 +1149,9 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_system(ptr %out, ptr %
; VI-LABEL: flat_atomic_dec_ret_i32_offset_system:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: v_mov_b32_e32 v2, 42
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s2, s2, 16
@@ -1079,6 +1169,8 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_system(ptr %out, ptr %
; GFX9-LABEL: flat_atomic_dec_ret_i32_offset_system:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX9-NEXT: v_mov_b32_e32 v2, 42
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s2
@@ -1093,6 +1185,10 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_system(ptr %out, ptr %
;
; GFX10-LABEL: flat_atomic_dec_ret_i32_offset_system:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_add_u32 s12, s12, s17
+; GFX10-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX10-NEXT: v_mov_b32_e32 v2, 42
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
@@ -1132,6 +1228,9 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32(ptr %ptr) #1 {
; CI-LABEL: flat_atomic_dec_noret_i32:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v2, 42
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s0
@@ -1144,6 +1243,9 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32(ptr %ptr) #1 {
; VI-LABEL: flat_atomic_dec_noret_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v2, 42
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -1156,6 +1258,8 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32(ptr %ptr) #1 {
; GFX9-LABEL: flat_atomic_dec_noret_i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX9-NEXT: v_mov_b32_e32 v2, 42
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s0
@@ -1167,6 +1271,10 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32(ptr %ptr) #1 {
;
; GFX10-LABEL: flat_atomic_dec_noret_i32:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_add_u32 s12, s12, s17
+; GFX10-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX10-NEXT: v_mov_b32_e32 v2, 42
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
@@ -1199,6 +1307,9 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset(ptr %ptr) #1 {
; CI-LABEL: flat_atomic_dec_noret_i32_offset:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: v_mov_b32_e32 v2, 42
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_add_u32 s0, s0, 16
@@ -1213,6 +1324,9 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset(ptr %ptr) #1 {
; VI-LABEL: flat_atomic_dec_noret_i32_offset:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: v_mov_b32_e32 v2, 42
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s0, s0, 16
@@ -1227,6 +1341,8 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset(ptr %ptr) #1 {
; GFX9-LABEL: flat_atomic_dec_noret_i32_offset:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX9-NEXT: v_mov_b32_e32 v2, 42
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s0
@@ -1238,6 +1354,10 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset(ptr %ptr) #1 {
;
; GFX10-LABEL: flat_atomic_dec_noret_i32_offset:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_add_u32 s12, s12, s17
+; GFX10-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX10-NEXT: v_mov_b32_e32 v2, 42
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
@@ -1273,6 +1393,9 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_system(ptr %ptr) #1
; CI-LABEL: flat_atomic_dec_noret_i32_offset_system:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: v_mov_b32_e32 v2, 42
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_add_u32 s0, s0, 16
@@ -1287,6 +1410,9 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_system(ptr %ptr) #1
; VI-LABEL: flat_atomic_dec_noret_i32_offset_system:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: v_mov_b32_e32 v2, 42
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s0, s0, 16
@@ -1301,6 +1427,8 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_system(ptr %ptr) #1
; GFX9-LABEL: flat_atomic_dec_noret_i32_offset_system:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX9-NEXT: v_mov_b32_e32 v2, 42
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s0
@@ -1312,6 +1440,10 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_system(ptr %ptr) #1
;
; GFX10-LABEL: flat_atomic_dec_noret_i32_offset_system:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_add_u32 s12, s12, s17
+; GFX10-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX10-NEXT: v_mov_b32_e32 v2, 42
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
@@ -1348,7 +1480,9 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_addr64(ptr %out, ptr %
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; CI-NEXT: v_mov_b32_e32 v3, 42
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: v_mov_b32_e32 v1, s3
@@ -1356,6 +1490,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_addr64(ptr %out, ptr %
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: v_add_i32_e32 v0, vcc, 20, v0
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; CI-NEXT: v_mov_b32_e32 v3, 42
; CI-NEXT: flat_atomic_dec v3, v[0:1], v3 glc
; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CI-NEXT: buffer_wbinvl1_vol
@@ -1370,7 +1505,9 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_addr64(ptr %out, ptr %
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT: v_mov_b32_e32 v3, 42
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
@@ -1378,6 +1515,7 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_addr64(ptr %out, ptr %
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_add_u32_e32 v0, vcc, 20, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: v_mov_b32_e32 v3, 42
; VI-NEXT: flat_atomic_dec v3, v[0:1], v3 glc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
@@ -1392,6 +1530,8 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_addr64(ptr %out, ptr %
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX9-NEXT: v_mov_b32_e32 v3, 42
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s2
@@ -1410,6 +1550,10 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_addr64(ptr %out, ptr %
;
; GFX10-LABEL: flat_atomic_dec_ret_i32_offset_addr64:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_add_u32 s12, s12, s17
+; GFX10-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX10-NEXT: v_mov_b32_e32 v3, 42
@@ -1466,6 +1610,9 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_addr64(ptr %ptr) #1
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
@@ -1483,6 +1630,9 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_addr64(ptr %ptr) #1
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
@@ -1500,6 +1650,8 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_addr64(ptr %ptr) #1
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
@@ -1513,6 +1665,10 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_addr64(ptr %ptr) #1
;
; GFX10-LABEL: flat_atomic_dec_noret_i32_offset_addr64:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_add_u32 s12, s12, s17
+; GFX10-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
@@ -1559,10 +1715,13 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64(ptr %out, ptr %ptr) #1 {
; CI-LABEL: flat_atomic_dec_ret_i64:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
; CI-NEXT: v_mov_b32_e32 v0, 42
-; CI-NEXT: v_mov_b32_e32 v1, 0
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v2, s2
+; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: v_mov_b32_e32 v3, s3
; CI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1580,10 +1739,13 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64(ptr %out, ptr %ptr) #1 {
; VI-LABEL: flat_atomic_dec_ret_i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
; VI-NEXT: v_mov_b32_e32 v0, 42
-; VI-NEXT: v_mov_b32_e32 v1, 0
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1601,7 +1763,9 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64(ptr %out, ptr %ptr) #1 {
; GFX9-LABEL: flat_atomic_dec_ret_i64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
; GFX9-NEXT: v_mov_b32_e32 v0, 42
+; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v2, s2
@@ -1616,6 +1780,10 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64(ptr %out, ptr %ptr) #1 {
;
; GFX10-LABEL: flat_atomic_dec_ret_i64:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_add_u32 s12, s12, s17
+; GFX10-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX10-NEXT: v_mov_b32_e32 v0, 42
; GFX10-NEXT: v_mov_b32_e32 v1, 0
@@ -1654,12 +1822,15 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset(ptr %out, ptr %ptr) #1
; CI-LABEL: flat_atomic_dec_ret_i64_offset:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v0, 42
-; CI-NEXT: v_mov_b32_e32 v1, 0
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_add_u32 s2, s2, 32
; CI-NEXT: s_addc_u32 s3, s3, 0
; CI-NEXT: v_mov_b32_e32 v2, s2
+; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: v_mov_b32_e32 v3, s3
; CI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1677,12 +1848,15 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset(ptr %out, ptr %ptr) #1
; VI-LABEL: flat_atomic_dec_ret_i64_offset:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v0, 42
-; VI-NEXT: v_mov_b32_e32 v1, 0
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s2, s2, 32
; VI-NEXT: s_addc_u32 s3, s3, 0
; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1700,7 +1874,9 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset(ptr %out, ptr %ptr) #1
; GFX9-LABEL: flat_atomic_dec_ret_i64_offset:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
; GFX9-NEXT: v_mov_b32_e32 v0, 42
+; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v2, s2
@@ -1715,6 +1891,10 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset(ptr %out, ptr %ptr) #1
;
; GFX10-LABEL: flat_atomic_dec_ret_i64_offset:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_add_u32 s12, s12, s17
+; GFX10-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX10-NEXT: v_mov_b32_e32 v0, 42
; GFX10-NEXT: v_mov_b32_e32 v1, 0
@@ -1756,10 +1936,13 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64(ptr %ptr) #1 {
; CI-LABEL: flat_atomic_dec_noret_i64:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
; CI-NEXT: v_mov_b32_e32 v0, 42
-; CI-NEXT: v_mov_b32_e32 v1, 0
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v3, s1
+; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1]
; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1769,10 +1952,13 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64(ptr %ptr) #1 {
; VI-LABEL: flat_atomic_dec_noret_i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
; VI-NEXT: v_mov_b32_e32 v0, 42
-; VI-NEXT: v_mov_b32_e32 v1, 0
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1782,7 +1968,9 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64(ptr %ptr) #1 {
; GFX9-LABEL: flat_atomic_dec_noret_i64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
; GFX9-NEXT: v_mov_b32_e32 v0, 42
+; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v3, s1
@@ -1794,6 +1982,10 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64(ptr %ptr) #1 {
;
; GFX10-LABEL: flat_atomic_dec_noret_i64:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_add_u32 s12, s12, s17
+; GFX10-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX10-NEXT: v_mov_b32_e32 v0, 42
; GFX10-NEXT: v_mov_b32_e32 v1, 0
@@ -1828,12 +2020,15 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset(ptr %ptr) #1 {
; CI-LABEL: flat_atomic_dec_noret_i64_offset:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v0, 42
-; CI-NEXT: v_mov_b32_e32 v1, 0
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_add_u32 s0, s0, 32
; CI-NEXT: s_addc_u32 s1, s1, 0
; CI-NEXT: v_mov_b32_e32 v3, s1
+; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1]
; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1843,12 +2038,15 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset(ptr %ptr) #1 {
; VI-LABEL: flat_atomic_dec_noret_i64_offset:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v0, 42
-; VI-NEXT: v_mov_b32_e32 v1, 0
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s0, s0, 32
; VI-NEXT: s_addc_u32 s1, s1, 0
; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1858,7 +2056,9 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset(ptr %ptr) #1 {
; GFX9-LABEL: flat_atomic_dec_noret_i64_offset:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
; GFX9-NEXT: v_mov_b32_e32 v0, 42
+; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v3, s1
@@ -1870,6 +2070,10 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset(ptr %ptr) #1 {
;
; GFX10-LABEL: flat_atomic_dec_noret_i64_offset:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_add_u32 s12, s12, s17
+; GFX10-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX10-NEXT: v_mov_b32_e32 v0, 42
; GFX10-NEXT: v_mov_b32_e32 v1, 0
@@ -1907,12 +2111,15 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_system(ptr %ptr) #1
; CI-LABEL: flat_atomic_dec_noret_i64_offset_system:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v0, 42
-; CI-NEXT: v_mov_b32_e32 v1, 0
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_add_u32 s0, s0, 32
; CI-NEXT: s_addc_u32 s1, s1, 0
; CI-NEXT: v_mov_b32_e32 v3, s1
+; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1]
; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1922,12 +2129,15 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_system(ptr %ptr) #1
; VI-LABEL: flat_atomic_dec_noret_i64_offset_system:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v0, 42
-; VI-NEXT: v_mov_b32_e32 v1, 0
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s0, s0, 32
; VI-NEXT: s_addc_u32 s1, s1, 0
; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1937,7 +2147,9 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_system(ptr %ptr) #1
; GFX9-LABEL: flat_atomic_dec_noret_i64_offset_system:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
; GFX9-NEXT: v_mov_b32_e32 v0, 42
+; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v3, s1
@@ -1949,6 +2161,10 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_system(ptr %ptr) #1
;
; GFX10-LABEL: flat_atomic_dec_noret_i64_offset_system:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_add_u32 s12, s12, s17
+; GFX10-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX10-NEXT: v_mov_b32_e32 v0, 42
; GFX10-NEXT: v_mov_b32_e32 v1, 0
@@ -1987,6 +2203,9 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(ptr %out, ptr %
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: v_lshlrev_b32_e32 v4, 3, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: v_mov_b32_e32 v1, s3
@@ -2013,6 +2232,9 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(ptr %out, ptr %
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
@@ -2039,12 +2261,14 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(ptr %out, ptr %
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX9-NEXT: v_lshlrev_b32_e32 v5, 3, v0
+; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
; GFX9-NEXT: v_mov_b32_e32 v1, 42
-; GFX9-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v4, s3
; GFX9-NEXT: v_mov_b32_e32 v3, s2
; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
; GFX9-NEXT: flat_atomic_dec_x2 v[0:1], v[3:4], v[1:2] offset:40 glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -2058,6 +2282,10 @@ define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(ptr %out, ptr %
;
; GFX10-LABEL: flat_atomic_dec_ret_i64_offset_addr64:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_add_u32 s12, s12, s17
+; GFX10-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
@@ -2116,6 +2344,9 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_addr64(ptr %ptr) #1
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
@@ -2134,6 +2365,9 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_addr64(ptr %ptr) #1
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
@@ -2152,12 +2386,14 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_addr64(ptr %ptr) #1
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
; GFX9-NEXT: v_mov_b32_e32 v1, 42
-; GFX9-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v4, s1
; GFX9-NEXT: v_mov_b32_e32 v3, s0
; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v0
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
; GFX9-NEXT: flat_atomic_dec_x2 v[3:4], v[1:2] offset:40
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -2166,6 +2402,10 @@ define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_addr64(ptr %ptr) #1
;
; GFX10-LABEL: flat_atomic_dec_noret_i64_offset_addr64:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_add_u32 s12, s12, s17
+; GFX10-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
@@ -2219,8 +2459,11 @@ define amdgpu_kernel void @atomic_dec_shl_base_lds_0(ptr addrspace(1) %out, ptr
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: ds_dec_rtn_u32 v2, v1, v2 offset:8
; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: s_add_i32 s12, s12, s17
; CI-NEXT: v_add_i32_e32 v3, vcc, 2, v0
; CI-NEXT: v_mov_b32_e32 v0, s2
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: flat_store_dword v[0:1], v3
; CI-NEXT: v_mov_b32_e32 v0, s0
@@ -2237,8 +2480,11 @@ define amdgpu_kernel void @atomic_dec_shl_base_lds_0(ptr addrspace(1) %out, ptr
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: ds_dec_rtn_u32 v2, v1, v2 offset:8
; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_add_i32 s12, s12, s17
; VI-NEXT: v_add_u32_e32 v3, vcc, 2, v0
; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v3
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -2312,7 +2558,10 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i64(ptr addrspace(1) %out, ptr add
; CI-NEXT: v_mov_b32_e32 v2, s2
; CI-NEXT: ds_dec_rtn_u64 v[0:1], v2, v[0:1]
; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: s_add_i32 s12, s12, s17
; CI-NEXT: v_mov_b32_e32 v3, s1
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CI-NEXT: s_endpgm
@@ -2328,7 +2577,10 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i64(ptr addrspace(1) %out, ptr add
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: ds_dec_rtn_u64 v[0:1], v2, v[0:1]
; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_add_i32 s12, s12, s17
; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
@@ -2394,7 +2646,10 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i64_offset(ptr addrspace(1) %out,
; CI-NEXT: v_mov_b32_e32 v2, s2
; CI-NEXT: ds_dec_rtn_u64 v[0:1], v2, v[0:1] offset:32
; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: s_add_i32 s12, s12, s17
; CI-NEXT: v_mov_b32_e32 v3, s1
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CI-NEXT: s_endpgm
@@ -2410,7 +2665,10 @@ define amdgpu_kernel void @lds_atomic_dec_ret_i64_offset(ptr addrspace(1) %out,
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: ds_dec_rtn_u64 v[0:1], v2, v[0:1] offset:32
; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_add_i32 s12, s12, s17
; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
@@ -2594,10 +2852,13 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64(ptr addrspace(1) %out, ptr
; CI-LABEL: global_atomic_dec_ret_i64:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
; CI-NEXT: v_mov_b32_e32 v0, 42
-; CI-NEXT: v_mov_b32_e32 v1, 0
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v2, s2
+; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: v_mov_b32_e32 v3, s3
; CI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
; CI-NEXT: s_waitcnt vmcnt(0)
@@ -2610,10 +2871,13 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64(ptr addrspace(1) %out, ptr
; VI-LABEL: global_atomic_dec_ret_i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
; VI-NEXT: v_mov_b32_e32 v0, 42
-; VI-NEXT: v_mov_b32_e32 v1, 0
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -2671,12 +2935,15 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset(ptr addrspace(1) %ou
; CI-LABEL: global_atomic_dec_ret_i64_offset:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v0, 42
-; CI-NEXT: v_mov_b32_e32 v1, 0
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_add_u32 s2, s2, 32
; CI-NEXT: s_addc_u32 s3, s3, 0
; CI-NEXT: v_mov_b32_e32 v2, s2
+; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: v_mov_b32_e32 v3, s3
; CI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
; CI-NEXT: s_waitcnt vmcnt(0)
@@ -2689,12 +2956,15 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset(ptr addrspace(1) %ou
; VI-LABEL: global_atomic_dec_ret_i64_offset:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v0, 42
-; VI-NEXT: v_mov_b32_e32 v1, 0
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s2, s2, 32
; VI-NEXT: s_addc_u32 s3, s3, 0
; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -2753,12 +3023,15 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_system(ptr addrspace
; CI-LABEL: global_atomic_dec_ret_i64_offset_system:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v0, 42
-; CI-NEXT: v_mov_b32_e32 v1, 0
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_add_u32 s2, s2, 32
; CI-NEXT: s_addc_u32 s3, s3, 0
; CI-NEXT: v_mov_b32_e32 v2, s2
+; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: v_mov_b32_e32 v3, s3
; CI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
; CI-NEXT: s_waitcnt vmcnt(0)
@@ -2771,12 +3044,15 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_system(ptr addrspace
; VI-LABEL: global_atomic_dec_ret_i64_offset_system:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v0, 42
-; VI-NEXT: v_mov_b32_e32 v1, 0
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s2, s2, 32
; VI-NEXT: s_addc_u32 s3, s3, 0
; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -2835,10 +3111,13 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64(ptr addrspace(1) %ptr) #1
; CI-LABEL: global_atomic_dec_noret_i64:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
; CI-NEXT: v_mov_b32_e32 v0, 42
-; CI-NEXT: v_mov_b32_e32 v1, 0
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v3, s1
+; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1]
; CI-NEXT: s_waitcnt vmcnt(0)
@@ -2848,10 +3127,13 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64(ptr addrspace(1) %ptr) #1
; VI-LABEL: global_atomic_dec_noret_i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
; VI-NEXT: v_mov_b32_e32 v0, 42
-; VI-NEXT: v_mov_b32_e32 v1, 0
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -2902,12 +3184,15 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset(ptr addrspace(1) %
; CI-LABEL: global_atomic_dec_noret_i64_offset:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v0, 42
-; CI-NEXT: v_mov_b32_e32 v1, 0
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_add_u32 s0, s0, 32
; CI-NEXT: s_addc_u32 s1, s1, 0
; CI-NEXT: v_mov_b32_e32 v3, s1
+; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1]
; CI-NEXT: s_waitcnt vmcnt(0)
@@ -2917,12 +3202,15 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset(ptr addrspace(1) %
; VI-LABEL: global_atomic_dec_noret_i64_offset:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v0, 42
-; VI-NEXT: v_mov_b32_e32 v1, 0
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s0, s0, 32
; VI-NEXT: s_addc_u32 s1, s1, 0
; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -2974,12 +3262,15 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_system(ptr addrspa
; CI-LABEL: global_atomic_dec_noret_i64_offset_system:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v0, 42
-; CI-NEXT: v_mov_b32_e32 v1, 0
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_add_u32 s0, s0, 32
; CI-NEXT: s_addc_u32 s1, s1, 0
; CI-NEXT: v_mov_b32_e32 v3, s1
+; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1]
; CI-NEXT: s_waitcnt vmcnt(0)
@@ -2989,12 +3280,15 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_system(ptr addrspa
; VI-LABEL: global_atomic_dec_noret_i64_offset_system:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v0, 42
-; VI-NEXT: v_mov_b32_e32 v1, 0
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s0, s0, 32
; VI-NEXT: s_addc_u32 s1, s1, 0
; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -3047,6 +3341,9 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_addr64(ptr addrspace
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: v_lshlrev_b32_e32 v4, 3, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: v_mov_b32_e32 v1, s3
@@ -3070,6 +3367,9 @@ define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_addr64(ptr addrspace
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
@@ -3144,6 +3444,9 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_addr64(ptr addrspa
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
@@ -3162,6 +3465,9 @@ define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_addr64(ptr addrspa
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
@@ -3232,7 +3538,10 @@ define amdgpu_kernel void @atomic_dec_shl_base_lds_0_i64(ptr addrspace(1) %out,
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: ds_dec_rtn_u64 v[1:2], v3, v[1:2] offset:16
; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: s_add_i32 s12, s12, s17
; CI-NEXT: v_mov_b32_e32 v4, s3
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_add_i32_e32 v0, vcc, 2, v0
; CI-NEXT: v_mov_b32_e32 v3, s2
; CI-NEXT: flat_store_dword v[3:4], v0
@@ -3251,7 +3560,10 @@ define amdgpu_kernel void @atomic_dec_shl_base_lds_0_i64(ptr addrspace(1) %out,
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: ds_dec_rtn_u64 v[1:2], v3, v[1:2] offset:16
; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_add_i32 s12, s12, s17
; VI-NEXT: v_mov_b32_e32 v4, s3
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_add_u32_e32 v0, vcc, 2, v0
; VI-NEXT: v_mov_b32_e32 v3, s2
; VI-NEXT: flat_store_dword v[3:4], v0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll
index e1397e7331d3c..e46dce24bfd53 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomicrmw_uinc_wrap.ll
@@ -21,11 +21,14 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i32(ptr addrspace(1) %out, ptr add
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; CI-NEXT: v_mov_b32_e32 v0, 42
; CI-NEXT: s_mov_b32 m0, -1
+; CI-NEXT: s_add_i32 s12, s12, s17
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s2
; CI-NEXT: ds_inc_rtn_u32 v2, v1, v0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s0
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: flat_store_dword v[0:1], v2
; CI-NEXT: s_endpgm
@@ -36,11 +39,14 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i32(ptr addrspace(1) %out, ptr add
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT: v_mov_b32_e32 v0, 42
; VI-NEXT: s_mov_b32 m0, -1
+; VI-NEXT: s_add_i32 s12, s12, s17
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s2
; VI-NEXT: ds_inc_rtn_u32 v2, v1, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
@@ -110,11 +116,14 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i32_offset(ptr addrspace(1) %out,
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; CI-NEXT: v_mov_b32_e32 v0, 42
; CI-NEXT: s_mov_b32 m0, -1
+; CI-NEXT: s_add_i32 s12, s12, s17
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s2
; CI-NEXT: ds_inc_rtn_u32 v2, v1, v0 offset:16
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s0
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: flat_store_dword v[0:1], v2
; CI-NEXT: s_endpgm
@@ -125,11 +134,14 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i32_offset(ptr addrspace(1) %out,
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT: v_mov_b32_e32 v0, 42
; VI-NEXT: s_mov_b32 m0, -1
+; VI-NEXT: s_add_i32 s12, s12, s17
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s2
; VI-NEXT: ds_inc_rtn_u32 v2, v1, v0 offset:16
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
@@ -332,6 +344,9 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32(ptr addrspace(1) %out, ptr
; CI-LABEL: global_atomic_inc_ret_i32:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v2, 42
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
@@ -347,6 +362,9 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32(ptr addrspace(1) %out, ptr
; VI-LABEL: global_atomic_inc_ret_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v2, 42
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
@@ -415,6 +433,9 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset(ptr addrspace(1) %ou
; CI-LABEL: global_atomic_inc_ret_i32_offset:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: v_mov_b32_e32 v2, 42
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_add_u32 s2, s2, 16
@@ -432,6 +453,9 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset(ptr addrspace(1) %ou
; VI-LABEL: global_atomic_inc_ret_i32_offset:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: v_mov_b32_e32 v2, 42
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s2, s2, 16
@@ -503,6 +527,9 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_sistem(ptr addrspace
; CI-LABEL: global_atomic_inc_ret_i32_offset_sistem:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: v_mov_b32_e32 v2, 42
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_add_u32 s2, s2, 16
@@ -520,6 +547,9 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_sistem(ptr addrspace
; VI-LABEL: global_atomic_inc_ret_i32_offset_sistem:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: v_mov_b32_e32 v2, 42
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s2, s2, 16
@@ -592,6 +622,9 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32(ptr addrspace(1) %ptr) #1
; CI-LABEL: global_atomic_inc_noret_i32:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v2, 42
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s0
@@ -604,6 +637,9 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32(ptr addrspace(1) %ptr) #1
; VI-LABEL: global_atomic_inc_noret_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v2, 42
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -664,6 +700,9 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset(ptr addrspace(1) %
; CI-LABEL: global_atomic_inc_noret_i32_offset:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: v_mov_b32_e32 v2, 42
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_add_u32 s0, s0, 16
@@ -678,6 +717,9 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset(ptr addrspace(1) %
; VI-LABEL: global_atomic_inc_noret_i32_offset:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: v_mov_b32_e32 v2, 42
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s0, s0, 16
@@ -741,6 +783,9 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_system(ptr addrspa
; CI-LABEL: global_atomic_inc_noret_i32_offset_system:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: v_mov_b32_e32 v2, 42
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_add_u32 s0, s0, 16
@@ -755,6 +800,9 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_system(ptr addrspa
; VI-LABEL: global_atomic_inc_noret_i32_offset_system:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: v_mov_b32_e32 v2, 42
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s0, s0, 16
@@ -820,7 +868,9 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_addr64(ptr addrspace
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; CI-NEXT: v_mov_b32_e32 v3, 42
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: v_mov_b32_e32 v1, s3
@@ -828,6 +878,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_addr64(ptr addrspace
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: v_add_i32_e32 v0, vcc, 20, v0
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; CI-NEXT: v_mov_b32_e32 v3, 42
; CI-NEXT: flat_atomic_inc v3, v[0:1], v3 glc
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: buffer_wbinvl1_vol
@@ -842,7 +893,9 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_addr64(ptr addrspace
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT: v_mov_b32_e32 v3, 42
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
@@ -850,6 +903,7 @@ define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_addr64(ptr addrspace
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_add_u32_e32 v0, vcc, 20, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: v_mov_b32_e32 v3, 42
; VI-NEXT: flat_atomic_inc v3, v[0:1], v3 glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
@@ -925,6 +979,9 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_addr64(ptr addrspa
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
@@ -942,6 +999,9 @@ define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_addr64(ptr addrspa
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
@@ -1019,8 +1079,11 @@ define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i32(ptr addrspace(1) %out,
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: ds_inc_rtn_u32 v2, v1, v2 offset:8
; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: s_add_i32 s12, s12, s17
; CI-NEXT: v_add_i32_e32 v3, vcc, 2, v0
; CI-NEXT: v_mov_b32_e32 v0, s2
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: flat_store_dword v[0:1], v3
; CI-NEXT: v_mov_b32_e32 v0, s0
@@ -1037,8 +1100,11 @@ define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i32(ptr addrspace(1) %out,
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: ds_inc_rtn_u32 v2, v1, v2 offset:8
; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_add_i32 s12, s12, s17
; VI-NEXT: v_add_u32_e32 v3, vcc, 2, v0
; VI-NEXT: v_mov_b32_e32 v0, s2
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: flat_store_dword v[0:1], v3
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -1129,7 +1195,10 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr add
; CI-NEXT: v_mov_b32_e32 v2, s2
; CI-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1]
; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: s_add_i32 s12, s12, s17
; CI-NEXT: v_mov_b32_e32 v3, s1
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CI-NEXT: s_endpgm
@@ -1145,7 +1214,10 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr add
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1]
; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_add_i32 s12, s12, s17
; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
@@ -1224,7 +1296,10 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i64_offset(ptr addrspace(1) %out,
; CI-NEXT: v_mov_b32_e32 v2, s2
; CI-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1] offset:32
; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: s_add_i32 s12, s12, s17
; CI-NEXT: v_mov_b32_e32 v3, s1
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CI-NEXT: s_endpgm
@@ -1240,7 +1315,10 @@ define amdgpu_kernel void @lds_atomic_inc_ret_i64_offset(ptr addrspace(1) %out,
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1] offset:32
; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_add_i32 s12, s12, s17
; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; VI-NEXT: s_endpgm
@@ -1459,10 +1537,13 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr
; CI-LABEL: global_atomic_inc_ret_i64:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
; CI-NEXT: v_mov_b32_e32 v0, 42
-; CI-NEXT: v_mov_b32_e32 v1, 0
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v2, s2
+; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: v_mov_b32_e32 v3, s3
; CI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
; CI-NEXT: s_waitcnt vmcnt(0)
@@ -1475,10 +1556,13 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64(ptr addrspace(1) %out, ptr
; VI-LABEL: global_atomic_inc_ret_i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
; VI-NEXT: v_mov_b32_e32 v0, 42
-; VI-NEXT: v_mov_b32_e32 v1, 0
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -1548,12 +1632,15 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset(ptr addrspace(1) %ou
; CI-LABEL: global_atomic_inc_ret_i64_offset:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v0, 42
-; CI-NEXT: v_mov_b32_e32 v1, 0
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_add_u32 s2, s2, 32
; CI-NEXT: s_addc_u32 s3, s3, 0
; CI-NEXT: v_mov_b32_e32 v2, s2
+; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: v_mov_b32_e32 v3, s3
; CI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
; CI-NEXT: s_waitcnt vmcnt(0)
@@ -1566,12 +1653,15 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset(ptr addrspace(1) %ou
; VI-LABEL: global_atomic_inc_ret_i64_offset:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v0, 42
-; VI-NEXT: v_mov_b32_e32 v1, 0
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s2, s2, 32
; VI-NEXT: s_addc_u32 s3, s3, 0
; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -1642,12 +1732,15 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_system(ptr addrspace
; CI-LABEL: global_atomic_inc_ret_i64_offset_system:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v0, 42
-; CI-NEXT: v_mov_b32_e32 v1, 0
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_add_u32 s2, s2, 32
; CI-NEXT: s_addc_u32 s3, s3, 0
; CI-NEXT: v_mov_b32_e32 v2, s2
+; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: v_mov_b32_e32 v3, s3
; CI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
; CI-NEXT: s_waitcnt vmcnt(0)
@@ -1660,12 +1753,15 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_system(ptr addrspace
; VI-LABEL: global_atomic_inc_ret_i64_offset_system:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v0, 42
-; VI-NEXT: v_mov_b32_e32 v1, 0
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s2, s2, 32
; VI-NEXT: s_addc_u32 s3, s3, 0
; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -1737,10 +1833,13 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64(ptr addrspace(1) %ptr) #1
; CI-LABEL: global_atomic_inc_noret_i64:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
; CI-NEXT: v_mov_b32_e32 v0, 42
-; CI-NEXT: v_mov_b32_e32 v1, 0
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v3, s1
+; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1]
; CI-NEXT: s_waitcnt vmcnt(0)
@@ -1750,10 +1849,13 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64(ptr addrspace(1) %ptr) #1
; VI-LABEL: global_atomic_inc_noret_i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
; VI-NEXT: v_mov_b32_e32 v0, 42
-; VI-NEXT: v_mov_b32_e32 v1, 0
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -1815,12 +1917,15 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset(ptr addrspace(1) %
; CI-LABEL: global_atomic_inc_noret_i64_offset:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v0, 42
-; CI-NEXT: v_mov_b32_e32 v1, 0
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_add_u32 s0, s0, 32
; CI-NEXT: s_addc_u32 s1, s1, 0
; CI-NEXT: v_mov_b32_e32 v3, s1
+; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1]
; CI-NEXT: s_waitcnt vmcnt(0)
@@ -1830,12 +1935,15 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset(ptr addrspace(1) %
; VI-LABEL: global_atomic_inc_noret_i64_offset:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v0, 42
-; VI-NEXT: v_mov_b32_e32 v1, 0
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s0, s0, 32
; VI-NEXT: s_addc_u32 s1, s1, 0
; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -1898,12 +2006,15 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_system(ptr addrspa
; CI-LABEL: global_atomic_inc_noret_i64_offset_system:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v0, 42
-; CI-NEXT: v_mov_b32_e32 v1, 0
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_add_u32 s0, s0, 32
; CI-NEXT: s_addc_u32 s1, s1, 0
; CI-NEXT: v_mov_b32_e32 v3, s1
+; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1]
; CI-NEXT: s_waitcnt vmcnt(0)
@@ -1913,12 +2024,15 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_system(ptr addrspa
; VI-LABEL: global_atomic_inc_noret_i64_offset_system:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v0, 42
-; VI-NEXT: v_mov_b32_e32 v1, 0
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s0, s0, 32
; VI-NEXT: s_addc_u32 s1, s1, 0
; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -1983,6 +2097,9 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_addr64(ptr addrspace
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: v_lshlrev_b32_e32 v4, 3, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: v_mov_b32_e32 v1, s3
@@ -2006,6 +2123,9 @@ define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_addr64(ptr addrspace
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
@@ -2094,6 +2214,9 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_addr64(ptr addrspa
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
@@ -2112,6 +2235,9 @@ define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_addr64(ptr addrspa
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
@@ -2188,6 +2314,9 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32(ptr %out, ptr %ptr) #1 {
; CI-LABEL: flat_atomic_inc_ret_i32:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v2, 42
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
@@ -2203,6 +2332,9 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32(ptr %out, ptr %ptr) #1 {
; VI-LABEL: flat_atomic_inc_ret_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v2, 42
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
@@ -2218,6 +2350,8 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32(ptr %out, ptr %ptr) #1 {
; GFX9-LABEL: flat_atomic_inc_ret_i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX9-NEXT: v_mov_b32_e32 v2, 42
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s2
@@ -2232,6 +2366,10 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32(ptr %out, ptr %ptr) #1 {
;
; GFX10-LABEL: flat_atomic_inc_ret_i32:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_add_u32 s12, s12, s17
+; GFX10-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX10-NEXT: v_mov_b32_e32 v2, 42
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
@@ -2281,6 +2419,9 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset(ptr %out, ptr %ptr) #1
; CI-LABEL: flat_atomic_inc_ret_i32_offset:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: v_mov_b32_e32 v2, 42
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_add_u32 s2, s2, 16
@@ -2298,6 +2439,9 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset(ptr %out, ptr %ptr) #1
; VI-LABEL: flat_atomic_inc_ret_i32_offset:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: v_mov_b32_e32 v2, 42
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s2, s2, 16
@@ -2315,6 +2459,8 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset(ptr %out, ptr %ptr) #1
; GFX9-LABEL: flat_atomic_inc_ret_i32_offset:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX9-NEXT: v_mov_b32_e32 v2, 42
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s2
@@ -2329,6 +2475,10 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset(ptr %out, ptr %ptr) #1
;
; GFX10-LABEL: flat_atomic_inc_ret_i32_offset:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_add_u32 s12, s12, s17
+; GFX10-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX10-NEXT: v_mov_b32_e32 v2, 42
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
@@ -2381,6 +2531,9 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_system(ptr %out, ptr %
; CI-LABEL: flat_atomic_inc_ret_i32_offset_system:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: v_mov_b32_e32 v2, 42
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_add_u32 s2, s2, 16
@@ -2398,6 +2551,9 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_system(ptr %out, ptr %
; VI-LABEL: flat_atomic_inc_ret_i32_offset_system:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: v_mov_b32_e32 v2, 42
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s2, s2, 16
@@ -2415,6 +2571,8 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_system(ptr %out, ptr %
; GFX9-LABEL: flat_atomic_inc_ret_i32_offset_system:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX9-NEXT: v_mov_b32_e32 v2, 42
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s2
@@ -2429,6 +2587,10 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_system(ptr %out, ptr %
;
; GFX10-LABEL: flat_atomic_inc_ret_i32_offset_system:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_add_u32 s12, s12, s17
+; GFX10-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX10-NEXT: v_mov_b32_e32 v2, 42
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
@@ -2482,6 +2644,9 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32(ptr %ptr) #1 {
; CI-LABEL: flat_atomic_inc_noret_i32:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v2, 42
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s0
@@ -2494,6 +2659,9 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32(ptr %ptr) #1 {
; VI-LABEL: flat_atomic_inc_noret_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v2, 42
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -2506,6 +2674,8 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32(ptr %ptr) #1 {
; GFX9-LABEL: flat_atomic_inc_noret_i32:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX9-NEXT: v_mov_b32_e32 v2, 42
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s0
@@ -2517,6 +2687,10 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32(ptr %ptr) #1 {
;
; GFX10-LABEL: flat_atomic_inc_noret_i32:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_add_u32 s12, s12, s17
+; GFX10-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX10-NEXT: v_mov_b32_e32 v2, 42
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
@@ -2560,6 +2734,9 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset(ptr %ptr) #1 {
; CI-LABEL: flat_atomic_inc_noret_i32_offset:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: v_mov_b32_e32 v2, 42
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_add_u32 s0, s0, 16
@@ -2574,6 +2751,9 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset(ptr %ptr) #1 {
; VI-LABEL: flat_atomic_inc_noret_i32_offset:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: v_mov_b32_e32 v2, 42
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s0, s0, 16
@@ -2588,6 +2768,8 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset(ptr %ptr) #1 {
; GFX9-LABEL: flat_atomic_inc_noret_i32_offset:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX9-NEXT: v_mov_b32_e32 v2, 42
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s0
@@ -2599,6 +2781,10 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset(ptr %ptr) #1 {
;
; GFX10-LABEL: flat_atomic_inc_noret_i32_offset:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_add_u32 s12, s12, s17
+; GFX10-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX10-NEXT: v_mov_b32_e32 v2, 42
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
@@ -2645,6 +2831,9 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_system(ptr %ptr) #1
; CI-LABEL: flat_atomic_inc_noret_i32_offset_system:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: v_mov_b32_e32 v2, 42
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_add_u32 s0, s0, 16
@@ -2659,6 +2848,9 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_system(ptr %ptr) #1
; VI-LABEL: flat_atomic_inc_noret_i32_offset_system:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: v_mov_b32_e32 v2, 42
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s0, s0, 16
@@ -2673,6 +2865,8 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_system(ptr %ptr) #1
; GFX9-LABEL: flat_atomic_inc_noret_i32_offset_system:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX9-NEXT: v_mov_b32_e32 v2, 42
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s0
@@ -2684,6 +2878,10 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_system(ptr %ptr) #1
;
; GFX10-LABEL: flat_atomic_inc_noret_i32_offset_system:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_add_u32 s12, s12, s17
+; GFX10-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX10-NEXT: v_mov_b32_e32 v2, 42
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
@@ -2732,7 +2930,9 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(ptr %out, ptr %
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; CI-NEXT: v_mov_b32_e32 v3, 42
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: v_mov_b32_e32 v1, s3
@@ -2740,6 +2940,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(ptr %out, ptr %
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: v_add_i32_e32 v0, vcc, 20, v0
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; CI-NEXT: v_mov_b32_e32 v3, 42
; CI-NEXT: flat_atomic_inc v3, v[0:1], v3 glc
; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CI-NEXT: buffer_wbinvl1_vol
@@ -2754,7 +2955,9 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(ptr %out, ptr %
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
-; VI-NEXT: v_mov_b32_e32 v3, 42
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
@@ -2762,6 +2965,7 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(ptr %out, ptr %
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_add_u32_e32 v0, vcc, 20, v0
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: v_mov_b32_e32 v3, 42
; VI-NEXT: flat_atomic_inc v3, v[0:1], v3 glc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; VI-NEXT: buffer_wbinvl1_vol
@@ -2776,6 +2980,8 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(ptr %out, ptr %
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX9-NEXT: v_mov_b32_e32 v3, 42
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s2
@@ -2794,6 +3000,10 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(ptr %out, ptr %
;
; GFX10-LABEL: flat_atomic_inc_ret_i32_offset_addr64:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_add_u32 s12, s12, s17
+; GFX10-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX10-NEXT: v_mov_b32_e32 v3, 42
@@ -2872,6 +3082,9 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(ptr %ptr) #1
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
@@ -2889,6 +3102,9 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(ptr %ptr) #1
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
@@ -2906,6 +3122,8 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(ptr %ptr) #1
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s0
; GFX9-NEXT: v_mov_b32_e32 v1, s1
@@ -2919,6 +3137,10 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(ptr %ptr) #1
;
; GFX10-LABEL: flat_atomic_inc_noret_i32_offset_addr64:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_add_u32 s12, s12, s17
+; GFX10-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 2, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
@@ -2989,7 +3211,10 @@ define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i64(ptr addrspace(1) %out,
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: ds_inc_rtn_u64 v[1:2], v3, v[1:2] offset:16
; CI-NEXT: s_waitcnt lgkmcnt(0)
+; CI-NEXT: s_add_i32 s12, s12, s17
; CI-NEXT: v_mov_b32_e32 v4, s3
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_add_i32_e32 v0, vcc, 2, v0
; CI-NEXT: v_mov_b32_e32 v3, s2
; CI-NEXT: flat_store_dword v[3:4], v0
@@ -3008,7 +3233,10 @@ define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i64(ptr addrspace(1) %out,
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: ds_inc_rtn_u64 v[1:2], v3, v[1:2] offset:16
; VI-NEXT: s_waitcnt lgkmcnt(0)
+; VI-NEXT: s_add_i32 s12, s12, s17
; VI-NEXT: v_mov_b32_e32 v4, s3
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_add_u32_e32 v0, vcc, 2, v0
; VI-NEXT: v_mov_b32_e32 v3, s2
; VI-NEXT: flat_store_dword v[3:4], v0
@@ -3098,10 +3326,13 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64(ptr %out, ptr %ptr) #1 {
; CI-LABEL: flat_atomic_inc_ret_i64:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
; CI-NEXT: v_mov_b32_e32 v0, 42
-; CI-NEXT: v_mov_b32_e32 v1, 0
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v2, s2
+; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: v_mov_b32_e32 v3, s3
; CI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3119,10 +3350,13 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64(ptr %out, ptr %ptr) #1 {
; VI-LABEL: flat_atomic_inc_ret_i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
; VI-NEXT: v_mov_b32_e32 v0, 42
-; VI-NEXT: v_mov_b32_e32 v1, 0
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3140,7 +3374,9 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64(ptr %out, ptr %ptr) #1 {
; GFX9-LABEL: flat_atomic_inc_ret_i64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
; GFX9-NEXT: v_mov_b32_e32 v0, 42
+; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v2, s2
@@ -3155,6 +3391,10 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64(ptr %out, ptr %ptr) #1 {
;
; GFX10-LABEL: flat_atomic_inc_ret_i64:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_add_u32 s12, s12, s17
+; GFX10-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX10-NEXT: v_mov_b32_e32 v0, 42
; GFX10-NEXT: v_mov_b32_e32 v1, 0
@@ -3207,12 +3447,15 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset(ptr %out, ptr %ptr) #1
; CI-LABEL: flat_atomic_inc_ret_i64_offset:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v0, 42
-; CI-NEXT: v_mov_b32_e32 v1, 0
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_add_u32 s2, s2, 32
; CI-NEXT: s_addc_u32 s3, s3, 0
; CI-NEXT: v_mov_b32_e32 v2, s2
+; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: v_mov_b32_e32 v3, s3
; CI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3230,12 +3473,15 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset(ptr %out, ptr %ptr) #1
; VI-LABEL: flat_atomic_inc_ret_i64_offset:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v0, 42
-; VI-NEXT: v_mov_b32_e32 v1, 0
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s2, s2, 32
; VI-NEXT: s_addc_u32 s3, s3, 0
; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3253,7 +3499,9 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset(ptr %out, ptr %ptr) #1
; GFX9-LABEL: flat_atomic_inc_ret_i64_offset:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
; GFX9-NEXT: v_mov_b32_e32 v0, 42
+; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v2, s2
@@ -3268,6 +3516,10 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset(ptr %out, ptr %ptr) #1
;
; GFX10-LABEL: flat_atomic_inc_ret_i64_offset:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_add_u32 s12, s12, s17
+; GFX10-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX10-NEXT: v_mov_b32_e32 v0, 42
; GFX10-NEXT: v_mov_b32_e32 v1, 0
@@ -3323,12 +3575,15 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_system(ptr %out, ptr %
; CI-LABEL: flat_atomic_inc_ret_i64_offset_system:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v0, 42
-; CI-NEXT: v_mov_b32_e32 v1, 0
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_add_u32 s2, s2, 32
; CI-NEXT: s_addc_u32 s3, s3, 0
; CI-NEXT: v_mov_b32_e32 v2, s2
+; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: v_mov_b32_e32 v3, s3
; CI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3346,12 +3601,15 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_system(ptr %out, ptr %
; VI-LABEL: flat_atomic_inc_ret_i64_offset_system:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v0, 42
-; VI-NEXT: v_mov_b32_e32 v1, 0
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s2, s2, 32
; VI-NEXT: s_addc_u32 s3, s3, 0
; VI-NEXT: v_mov_b32_e32 v2, s2
+; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3369,7 +3627,9 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_system(ptr %out, ptr %
; GFX9-LABEL: flat_atomic_inc_ret_i64_offset_system:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
; GFX9-NEXT: v_mov_b32_e32 v0, 42
+; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v2, s2
@@ -3384,6 +3644,10 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_system(ptr %out, ptr %
;
; GFX10-LABEL: flat_atomic_inc_ret_i64_offset_system:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_add_u32 s12, s12, s17
+; GFX10-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX10-NEXT: v_mov_b32_e32 v0, 42
; GFX10-NEXT: v_mov_b32_e32 v1, 0
@@ -3440,10 +3704,13 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64(ptr %ptr) #1 {
; CI-LABEL: flat_atomic_inc_noret_i64:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
; CI-NEXT: v_mov_b32_e32 v0, 42
-; CI-NEXT: v_mov_b32_e32 v1, 0
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v3, s1
+; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1]
; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3453,10 +3720,13 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64(ptr %ptr) #1 {
; VI-LABEL: flat_atomic_inc_noret_i64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
; VI-NEXT: v_mov_b32_e32 v0, 42
-; VI-NEXT: v_mov_b32_e32 v1, 0
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3466,7 +3736,9 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64(ptr %ptr) #1 {
; GFX9-LABEL: flat_atomic_inc_noret_i64:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
; GFX9-NEXT: v_mov_b32_e32 v0, 42
+; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v3, s1
@@ -3478,6 +3750,10 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64(ptr %ptr) #1 {
;
; GFX10-LABEL: flat_atomic_inc_noret_i64:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_add_u32 s12, s12, s17
+; GFX10-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX10-NEXT: v_mov_b32_e32 v0, 42
; GFX10-NEXT: v_mov_b32_e32 v1, 0
@@ -3524,12 +3800,15 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset(ptr %ptr) #1 {
; CI-LABEL: flat_atomic_inc_noret_i64_offset:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v0, 42
-; CI-NEXT: v_mov_b32_e32 v1, 0
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_add_u32 s0, s0, 32
; CI-NEXT: s_addc_u32 s1, s1, 0
; CI-NEXT: v_mov_b32_e32 v3, s1
+; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1]
; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3539,12 +3818,15 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset(ptr %ptr) #1 {
; VI-LABEL: flat_atomic_inc_noret_i64_offset:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v0, 42
-; VI-NEXT: v_mov_b32_e32 v1, 0
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s0, s0, 32
; VI-NEXT: s_addc_u32 s1, s1, 0
; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3554,7 +3836,9 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset(ptr %ptr) #1 {
; GFX9-LABEL: flat_atomic_inc_noret_i64_offset:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
; GFX9-NEXT: v_mov_b32_e32 v0, 42
+; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v3, s1
@@ -3566,6 +3850,10 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset(ptr %ptr) #1 {
;
; GFX10-LABEL: flat_atomic_inc_noret_i64_offset:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_add_u32 s12, s12, s17
+; GFX10-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX10-NEXT: v_mov_b32_e32 v0, 42
; GFX10-NEXT: v_mov_b32_e32 v1, 0
@@ -3615,12 +3903,15 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_system(ptr %ptr) #1
; CI-LABEL: flat_atomic_inc_noret_i64_offset_system:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v0, 42
-; CI-NEXT: v_mov_b32_e32 v1, 0
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_add_u32 s0, s0, 32
; CI-NEXT: s_addc_u32 s1, s1, 0
; CI-NEXT: v_mov_b32_e32 v3, s1
+; CI-NEXT: v_mov_b32_e32 v1, 0
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1]
; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3630,12 +3921,15 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_system(ptr %ptr) #1
; VI-LABEL: flat_atomic_inc_noret_i64_offset_system:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v0, 42
-; VI-NEXT: v_mov_b32_e32 v1, 0
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s0, s0, 32
; VI-NEXT: s_addc_u32 s1, s1, 0
; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1]
; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3645,7 +3939,9 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_system(ptr %ptr) #1
; GFX9-LABEL: flat_atomic_inc_noret_i64_offset_system:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
; GFX9-NEXT: v_mov_b32_e32 v0, 42
+; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX9-NEXT: v_mov_b32_e32 v1, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v3, s1
@@ -3657,6 +3953,10 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_system(ptr %ptr) #1
;
; GFX10-LABEL: flat_atomic_inc_noret_i64_offset_system:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_add_u32 s12, s12, s17
+; GFX10-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX10-NEXT: v_mov_b32_e32 v0, 42
; GFX10-NEXT: v_mov_b32_e32 v1, 0
@@ -3708,6 +4008,9 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(ptr %out, ptr %
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: v_lshlrev_b32_e32 v4, 3, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: v_mov_b32_e32 v1, s3
@@ -3734,6 +4037,9 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(ptr %out, ptr %
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
@@ -3760,12 +4066,14 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(ptr %out, ptr %
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX9-NEXT: v_lshlrev_b32_e32 v5, 3, v0
+; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
; GFX9-NEXT: v_mov_b32_e32 v1, 42
-; GFX9-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v4, s3
; GFX9-NEXT: v_mov_b32_e32 v3, s2
; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v5
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
; GFX9-NEXT: flat_atomic_inc_x2 v[0:1], v[3:4], v[1:2] offset:40 glc
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3779,6 +4087,10 @@ define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(ptr %out, ptr %
;
; GFX10-LABEL: flat_atomic_inc_ret_i64_offset_addr64:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_add_u32 s12, s12, s17
+; GFX10-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX10-NEXT: v_lshlrev_b32_e32 v4, 3, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
@@ -3860,6 +4172,9 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(ptr %ptr) #1
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
@@ -3878,6 +4193,9 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(ptr %ptr) #1
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
@@ -3896,12 +4214,14 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(ptr %ptr) #1
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
; GFX9-NEXT: v_mov_b32_e32 v1, 42
-; GFX9-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v4, s1
; GFX9-NEXT: v_mov_b32_e32 v3, s0
; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v0
+; GFX9-NEXT: v_mov_b32_e32 v2, 0
; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc
; GFX9-NEXT: flat_atomic_inc_x2 v[3:4], v[1:2] offset:40
; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -3910,6 +4230,10 @@ define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(ptr %ptr) #1
;
; GFX10-LABEL: flat_atomic_inc_noret_i64_offset_addr64:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_add_u32 s12, s12, s17
+; GFX10-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX10-NEXT: v_lshlrev_b32_e32 v2, 3, v0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
@@ -3977,6 +4301,7 @@ define amdgpu_kernel void @nocse_lds_atomic_inc_ret_i32(ptr addrspace(1) %out0,
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: v_mov_b32_e32 v0, 42
; CI-NEXT: s_mov_b32 m0, -1
+; CI-NEXT: s_add_i32 s12, s12, s17
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s4
; CI-NEXT: ds_inc_rtn_u32 v2, v1, v0
@@ -3984,6 +4309,8 @@ define amdgpu_kernel void @nocse_lds_atomic_inc_ret_i32(ptr addrspace(1) %out0,
; CI-NEXT: ds_inc_rtn_u32 v3, v1, v0
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s0
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: flat_store_dword v[0:1], v2
; CI-NEXT: v_mov_b32_e32 v0, s2
@@ -3997,6 +4324,7 @@ define amdgpu_kernel void @nocse_lds_atomic_inc_ret_i32(ptr addrspace(1) %out0,
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: v_mov_b32_e32 v0, 42
; VI-NEXT: s_mov_b32 m0, -1
+; VI-NEXT: s_add_i32 s12, s12, s17
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s4
; VI-NEXT: ds_inc_rtn_u32 v2, v1, v0
@@ -4004,6 +4332,8 @@ define amdgpu_kernel void @nocse_lds_atomic_inc_ret_i32(ptr addrspace(1) %out0,
; VI-NEXT: ds_inc_rtn_u32 v3, v1, v0
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: v_mov_b32_e32 v0, s2
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
index c136028f2de43..1758fb85a0b21 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll
@@ -3016,7 +3016,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
; GPRIDX-NEXT: kernel_code_entry_byte_offset = 256
; GPRIDX-NEXT: kernel_code_prefetch_byte_size = 0
; GPRIDX-NEXT: granulated_workitem_vgpr_count = 0
-; GPRIDX-NEXT: granulated_wavefront_sgpr_count = 1
+; GPRIDX-NEXT: granulated_wavefront_sgpr_count = 2
; GPRIDX-NEXT: priority = 0
; GPRIDX-NEXT: float_mode = 240
; GPRIDX-NEXT: priv = 0
@@ -3027,7 +3027,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
; GPRIDX-NEXT: enable_mem_ordered = 0
; GPRIDX-NEXT: enable_fwd_progress = 0
; GPRIDX-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0
-; GPRIDX-NEXT: user_sgpr_count = 12
+; GPRIDX-NEXT: user_sgpr_count = 14
; GPRIDX-NEXT: enable_trap_handler = 0
; GPRIDX-NEXT: enable_sgpr_workgroup_id_x = 1
; GPRIDX-NEXT: enable_sgpr_workgroup_id_y = 1
@@ -3042,7 +3042,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
; GPRIDX-NEXT: enable_sgpr_queue_ptr = 1
; GPRIDX-NEXT: enable_sgpr_kernarg_segment_ptr = 1
; GPRIDX-NEXT: enable_sgpr_dispatch_id = 1
-; GPRIDX-NEXT: enable_sgpr_flat_scratch_init = 0
+; GPRIDX-NEXT: enable_sgpr_flat_scratch_init = 1
; GPRIDX-NEXT: enable_sgpr_private_segment_size = 0
; GPRIDX-NEXT: enable_sgpr_grid_workgroup_count_x = 0
; GPRIDX-NEXT: enable_sgpr_grid_workgroup_count_y = 0
@@ -3059,7 +3059,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
; GPRIDX-NEXT: gds_segment_byte_size = 0
; GPRIDX-NEXT: kernarg_segment_byte_size = 28
; GPRIDX-NEXT: workgroup_fbarrier_count = 0
-; GPRIDX-NEXT: wavefront_sgpr_count = 15
+; GPRIDX-NEXT: wavefront_sgpr_count = 17
; GPRIDX-NEXT: workitem_vgpr_count = 3
; GPRIDX-NEXT: reserved_vgpr_first = 0
; GPRIDX-NEXT: reserved_vgpr_count = 0
@@ -3107,7 +3107,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
; MOVREL-NEXT: kernel_code_entry_byte_offset = 256
; MOVREL-NEXT: kernel_code_prefetch_byte_size = 0
; MOVREL-NEXT: granulated_workitem_vgpr_count = 0
-; MOVREL-NEXT: granulated_wavefront_sgpr_count = 1
+; MOVREL-NEXT: granulated_wavefront_sgpr_count = 2
; MOVREL-NEXT: priority = 0
; MOVREL-NEXT: float_mode = 240
; MOVREL-NEXT: priv = 0
@@ -3118,7 +3118,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
; MOVREL-NEXT: enable_mem_ordered = 0
; MOVREL-NEXT: enable_fwd_progress = 0
; MOVREL-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0
-; MOVREL-NEXT: user_sgpr_count = 12
+; MOVREL-NEXT: user_sgpr_count = 14
; MOVREL-NEXT: enable_trap_handler = 0
; MOVREL-NEXT: enable_sgpr_workgroup_id_x = 1
; MOVREL-NEXT: enable_sgpr_workgroup_id_y = 1
@@ -3133,7 +3133,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
; MOVREL-NEXT: enable_sgpr_queue_ptr = 1
; MOVREL-NEXT: enable_sgpr_kernarg_segment_ptr = 1
; MOVREL-NEXT: enable_sgpr_dispatch_id = 1
-; MOVREL-NEXT: enable_sgpr_flat_scratch_init = 0
+; MOVREL-NEXT: enable_sgpr_flat_scratch_init = 1
; MOVREL-NEXT: enable_sgpr_private_segment_size = 0
; MOVREL-NEXT: enable_sgpr_grid_workgroup_count_x = 0
; MOVREL-NEXT: enable_sgpr_grid_workgroup_count_y = 0
@@ -3150,7 +3150,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
; MOVREL-NEXT: gds_segment_byte_size = 0
; MOVREL-NEXT: kernarg_segment_byte_size = 28
; MOVREL-NEXT: workgroup_fbarrier_count = 0
-; MOVREL-NEXT: wavefront_sgpr_count = 10
+; MOVREL-NEXT: wavefront_sgpr_count = 24
; MOVREL-NEXT: workitem_vgpr_count = 4
; MOVREL-NEXT: reserved_vgpr_first = 0
; MOVREL-NEXT: reserved_vgpr_count = 0
@@ -3168,21 +3168,24 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
; MOVREL-NEXT: ; %bb.0: ; %entry
; MOVREL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; MOVREL-NEXT: s_load_dword s8, s[8:9], 0x8
+; MOVREL-NEXT: s_add_i32 s12, s12, s17
+; MOVREL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; MOVREL-NEXT: s_mov_b32 s4, 0
; MOVREL-NEXT: s_mov_b32 s5, 0x40080000
-; MOVREL-NEXT: s_mov_b32 s2, 0
-; MOVREL-NEXT: s_mov_b32 s3, 0x40140000
; MOVREL-NEXT: s_waitcnt lgkmcnt(0)
; MOVREL-NEXT: s_cmp_eq_u32 s8, 1
; MOVREL-NEXT: s_cselect_b64 s[6:7], 2.0, 1.0
; MOVREL-NEXT: s_cmp_eq_u32 s8, 2
+; MOVREL-NEXT: s_mov_b32 s2, 0
; MOVREL-NEXT: s_cselect_b64 s[4:5], s[4:5], s[6:7]
; MOVREL-NEXT: s_cmp_eq_u32 s8, 3
+; MOVREL-NEXT: s_mov_b32 s3, 0x40140000
; MOVREL-NEXT: s_cselect_b64 s[4:5], 4.0, s[4:5]
; MOVREL-NEXT: s_cmp_eq_u32 s8, 4
; MOVREL-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5]
; MOVREL-NEXT: v_mov_b32_e32 v0, s2
; MOVREL-NEXT: v_mov_b32_e32 v3, s1
+; MOVREL-NEXT: s_mov_b32 flat_scratch_lo, s13
; MOVREL-NEXT: v_mov_b32_e32 v1, s3
; MOVREL-NEXT: v_mov_b32_e32 v2, s0
; MOVREL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
@@ -3210,7 +3213,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
; GFX10-NEXT: enable_mem_ordered = 1
; GFX10-NEXT: enable_fwd_progress = 0
; GFX10-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0
-; GFX10-NEXT: user_sgpr_count = 12
+; GFX10-NEXT: user_sgpr_count = 14
; GFX10-NEXT: enable_trap_handler = 0
; GFX10-NEXT: enable_sgpr_workgroup_id_x = 1
; GFX10-NEXT: enable_sgpr_workgroup_id_y = 1
@@ -3225,7 +3228,7 @@ define amdgpu_kernel void @dyn_extract_v5f64_s_s(ptr addrspace(1) %out, i32 %sel
; GFX10-NEXT: enable_sgpr_queue_ptr = 1
; GFX10-NEXT: enable_sgpr_kernarg_segment_ptr = 1
; GFX10-NEXT: enable_sgpr_dispatch_id = 1
-; GFX10-NEXT: enable_sgpr_flat_scratch_init = 0
+; GFX10-NEXT: enable_sgpr_flat_scratch_init = 1
; GFX10-NEXT: enable_sgpr_private_segment_size = 0
; GFX10-NEXT: enable_sgpr_grid_workgroup_count_x = 0
; GFX10-NEXT: enable_sgpr_grid_workgroup_count_y = 0
@@ -4042,7 +4045,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
; GPRIDX-NEXT: enable_mem_ordered = 0
; GPRIDX-NEXT: enable_fwd_progress = 0
; GPRIDX-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0
-; GPRIDX-NEXT: user_sgpr_count = 12
+; GPRIDX-NEXT: user_sgpr_count = 14
; GPRIDX-NEXT: enable_trap_handler = 0
; GPRIDX-NEXT: enable_sgpr_workgroup_id_x = 1
; GPRIDX-NEXT: enable_sgpr_workgroup_id_y = 1
@@ -4057,7 +4060,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
; GPRIDX-NEXT: enable_sgpr_queue_ptr = 1
; GPRIDX-NEXT: enable_sgpr_kernarg_segment_ptr = 1
; GPRIDX-NEXT: enable_sgpr_dispatch_id = 1
-; GPRIDX-NEXT: enable_sgpr_flat_scratch_init = 0
+; GPRIDX-NEXT: enable_sgpr_flat_scratch_init = 1
; GPRIDX-NEXT: enable_sgpr_private_segment_size = 0
; GPRIDX-NEXT: enable_sgpr_grid_workgroup_count_x = 0
; GPRIDX-NEXT: enable_sgpr_grid_workgroup_count_y = 0
@@ -4074,7 +4077,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
; GPRIDX-NEXT: gds_segment_byte_size = 0
; GPRIDX-NEXT: kernarg_segment_byte_size = 28
; GPRIDX-NEXT: workgroup_fbarrier_count = 0
-; GPRIDX-NEXT: wavefront_sgpr_count = 14
+; GPRIDX-NEXT: wavefront_sgpr_count = 16
; GPRIDX-NEXT: workitem_vgpr_count = 2
; GPRIDX-NEXT: reserved_vgpr_first = 0
; GPRIDX-NEXT: reserved_vgpr_count = 0
@@ -4115,7 +4118,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
; MOVREL-NEXT: kernel_code_entry_byte_offset = 256
; MOVREL-NEXT: kernel_code_prefetch_byte_size = 0
; MOVREL-NEXT: granulated_workitem_vgpr_count = 0
-; MOVREL-NEXT: granulated_wavefront_sgpr_count = 1
+; MOVREL-NEXT: granulated_wavefront_sgpr_count = 2
; MOVREL-NEXT: priority = 0
; MOVREL-NEXT: float_mode = 240
; MOVREL-NEXT: priv = 0
@@ -4126,7 +4129,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
; MOVREL-NEXT: enable_mem_ordered = 0
; MOVREL-NEXT: enable_fwd_progress = 0
; MOVREL-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0
-; MOVREL-NEXT: user_sgpr_count = 12
+; MOVREL-NEXT: user_sgpr_count = 14
; MOVREL-NEXT: enable_trap_handler = 0
; MOVREL-NEXT: enable_sgpr_workgroup_id_x = 1
; MOVREL-NEXT: enable_sgpr_workgroup_id_y = 1
@@ -4141,7 +4144,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
; MOVREL-NEXT: enable_sgpr_queue_ptr = 1
; MOVREL-NEXT: enable_sgpr_kernarg_segment_ptr = 1
; MOVREL-NEXT: enable_sgpr_dispatch_id = 1
-; MOVREL-NEXT: enable_sgpr_flat_scratch_init = 0
+; MOVREL-NEXT: enable_sgpr_flat_scratch_init = 1
; MOVREL-NEXT: enable_sgpr_private_segment_size = 0
; MOVREL-NEXT: enable_sgpr_grid_workgroup_count_x = 0
; MOVREL-NEXT: enable_sgpr_grid_workgroup_count_y = 0
@@ -4158,7 +4161,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
; MOVREL-NEXT: gds_segment_byte_size = 0
; MOVREL-NEXT: kernarg_segment_byte_size = 28
; MOVREL-NEXT: workgroup_fbarrier_count = 0
-; MOVREL-NEXT: wavefront_sgpr_count = 10
+; MOVREL-NEXT: wavefront_sgpr_count = 24
; MOVREL-NEXT: workitem_vgpr_count = 3
; MOVREL-NEXT: reserved_vgpr_first = 0
; MOVREL-NEXT: reserved_vgpr_count = 0
@@ -4176,6 +4179,9 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
; MOVREL-NEXT: ; %bb.0: ; %entry
; MOVREL-NEXT: s_load_dword s2, s[8:9], 0x8
; MOVREL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; MOVREL-NEXT: s_add_i32 s12, s12, s17
+; MOVREL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; MOVREL-NEXT: s_mov_b32 flat_scratch_lo, s13
; MOVREL-NEXT: s_waitcnt lgkmcnt(0)
; MOVREL-NEXT: s_cmp_eq_u32 s2, 1
; MOVREL-NEXT: s_cselect_b32 s3, 2.0, 1.0
@@ -4211,7 +4217,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
; GFX10-NEXT: enable_mem_ordered = 1
; GFX10-NEXT: enable_fwd_progress = 0
; GFX10-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0
-; GFX10-NEXT: user_sgpr_count = 12
+; GFX10-NEXT: user_sgpr_count = 14
; GFX10-NEXT: enable_trap_handler = 0
; GFX10-NEXT: enable_sgpr_workgroup_id_x = 1
; GFX10-NEXT: enable_sgpr_workgroup_id_y = 1
@@ -4226,7 +4232,7 @@ define amdgpu_kernel void @dyn_extract_v4f32_s_s_s(ptr addrspace(1) %out, i32 %s
; GFX10-NEXT: enable_sgpr_queue_ptr = 1
; GFX10-NEXT: enable_sgpr_kernarg_segment_ptr = 1
; GFX10-NEXT: enable_sgpr_dispatch_id = 1
-; GFX10-NEXT: enable_sgpr_flat_scratch_init = 0
+; GFX10-NEXT: enable_sgpr_flat_scratch_init = 1
; GFX10-NEXT: enable_sgpr_private_segment_size = 0
; GFX10-NEXT: enable_sgpr_grid_workgroup_count_x = 0
; GFX10-NEXT: enable_sgpr_grid_workgroup_count_y = 0
@@ -4387,7 +4393,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
; GPRIDX-NEXT: enable_mem_ordered = 0
; GPRIDX-NEXT: enable_fwd_progress = 0
; GPRIDX-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0
-; GPRIDX-NEXT: user_sgpr_count = 12
+; GPRIDX-NEXT: user_sgpr_count = 14
; GPRIDX-NEXT: enable_trap_handler = 0
; GPRIDX-NEXT: enable_sgpr_workgroup_id_x = 1
; GPRIDX-NEXT: enable_sgpr_workgroup_id_y = 1
@@ -4402,7 +4408,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
; GPRIDX-NEXT: enable_sgpr_queue_ptr = 1
; GPRIDX-NEXT: enable_sgpr_kernarg_segment_ptr = 1
; GPRIDX-NEXT: enable_sgpr_dispatch_id = 1
-; GPRIDX-NEXT: enable_sgpr_flat_scratch_init = 0
+; GPRIDX-NEXT: enable_sgpr_flat_scratch_init = 1
; GPRIDX-NEXT: enable_sgpr_private_segment_size = 0
; GPRIDX-NEXT: enable_sgpr_grid_workgroup_count_x = 0
; GPRIDX-NEXT: enable_sgpr_grid_workgroup_count_y = 0
@@ -4419,7 +4425,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
; GPRIDX-NEXT: gds_segment_byte_size = 0
; GPRIDX-NEXT: kernarg_segment_byte_size = 28
; GPRIDX-NEXT: workgroup_fbarrier_count = 0
-; GPRIDX-NEXT: wavefront_sgpr_count = 14
+; GPRIDX-NEXT: wavefront_sgpr_count = 16
; GPRIDX-NEXT: workitem_vgpr_count = 3
; GPRIDX-NEXT: reserved_vgpr_first = 0
; GPRIDX-NEXT: reserved_vgpr_count = 0
@@ -4463,7 +4469,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
; MOVREL-NEXT: kernel_code_entry_byte_offset = 256
; MOVREL-NEXT: kernel_code_prefetch_byte_size = 0
; MOVREL-NEXT: granulated_workitem_vgpr_count = 0
-; MOVREL-NEXT: granulated_wavefront_sgpr_count = 1
+; MOVREL-NEXT: granulated_wavefront_sgpr_count = 2
; MOVREL-NEXT: priority = 0
; MOVREL-NEXT: float_mode = 240
; MOVREL-NEXT: priv = 0
@@ -4474,7 +4480,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
; MOVREL-NEXT: enable_mem_ordered = 0
; MOVREL-NEXT: enable_fwd_progress = 0
; MOVREL-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0
-; MOVREL-NEXT: user_sgpr_count = 12
+; MOVREL-NEXT: user_sgpr_count = 14
; MOVREL-NEXT: enable_trap_handler = 0
; MOVREL-NEXT: enable_sgpr_workgroup_id_x = 1
; MOVREL-NEXT: enable_sgpr_workgroup_id_y = 1
@@ -4489,7 +4495,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
; MOVREL-NEXT: enable_sgpr_queue_ptr = 1
; MOVREL-NEXT: enable_sgpr_kernarg_segment_ptr = 1
; MOVREL-NEXT: enable_sgpr_dispatch_id = 1
-; MOVREL-NEXT: enable_sgpr_flat_scratch_init = 0
+; MOVREL-NEXT: enable_sgpr_flat_scratch_init = 1
; MOVREL-NEXT: enable_sgpr_private_segment_size = 0
; MOVREL-NEXT: enable_sgpr_grid_workgroup_count_x = 0
; MOVREL-NEXT: enable_sgpr_grid_workgroup_count_y = 0
@@ -4506,7 +4512,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
; MOVREL-NEXT: gds_segment_byte_size = 0
; MOVREL-NEXT: kernarg_segment_byte_size = 28
; MOVREL-NEXT: workgroup_fbarrier_count = 0
-; MOVREL-NEXT: wavefront_sgpr_count = 10
+; MOVREL-NEXT: wavefront_sgpr_count = 24
; MOVREL-NEXT: workitem_vgpr_count = 4
; MOVREL-NEXT: reserved_vgpr_first = 0
; MOVREL-NEXT: reserved_vgpr_count = 0
@@ -4524,10 +4530,12 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
; MOVREL-NEXT: ; %bb.0: ; %entry
; MOVREL-NEXT: s_load_dword s6, s[8:9], 0x8
; MOVREL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; MOVREL-NEXT: s_add_i32 s12, s12, s17
+; MOVREL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; MOVREL-NEXT: s_mov_b32 s2, 0
-; MOVREL-NEXT: s_mov_b32 s3, 0x40080000
; MOVREL-NEXT: s_waitcnt lgkmcnt(0)
; MOVREL-NEXT: s_cmp_eq_u32 s6, 1
+; MOVREL-NEXT: s_mov_b32 s3, 0x40080000
; MOVREL-NEXT: s_cselect_b64 s[4:5], 2.0, 1.0
; MOVREL-NEXT: s_cmp_eq_u32 s6, 2
; MOVREL-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5]
@@ -4535,6 +4543,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
; MOVREL-NEXT: s_cselect_b64 s[2:3], 4.0, s[2:3]
; MOVREL-NEXT: v_mov_b32_e32 v0, s2
; MOVREL-NEXT: v_mov_b32_e32 v3, s1
+; MOVREL-NEXT: s_mov_b32 flat_scratch_lo, s13
; MOVREL-NEXT: v_mov_b32_e32 v1, s3
; MOVREL-NEXT: v_mov_b32_e32 v2, s0
; MOVREL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
@@ -4562,7 +4571,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
; GFX10-NEXT: enable_mem_ordered = 1
; GFX10-NEXT: enable_fwd_progress = 0
; GFX10-NEXT: enable_sgpr_private_segment_wave_byte_offset = 0
-; GFX10-NEXT: user_sgpr_count = 12
+; GFX10-NEXT: user_sgpr_count = 14
; GFX10-NEXT: enable_trap_handler = 0
; GFX10-NEXT: enable_sgpr_workgroup_id_x = 1
; GFX10-NEXT: enable_sgpr_workgroup_id_y = 1
@@ -4577,7 +4586,7 @@ define amdgpu_kernel void @dyn_extract_v4f64_s_s_s(ptr addrspace(1) %out, i32 %s
; GFX10-NEXT: enable_sgpr_queue_ptr = 1
; GFX10-NEXT: enable_sgpr_kernarg_segment_ptr = 1
; GFX10-NEXT: enable_sgpr_dispatch_id = 1
-; GFX10-NEXT: enable_sgpr_flat_scratch_init = 0
+; GFX10-NEXT: enable_sgpr_flat_scratch_init = 1
; GFX10-NEXT: enable_sgpr_private_segment_size = 0
; GFX10-NEXT: enable_sgpr_grid_workgroup_count_x = 0
; GFX10-NEXT: enable_sgpr_grid_workgroup_count_y = 0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch-init.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch-init.ll
index 00c44c27257bb..e207d95287783 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch-init.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/flat-scratch-init.ll
@@ -35,7 +35,7 @@ define amdgpu_kernel void @stack_object_addrspacecast_in_kernel_no_calls() {
; RO-FLAT: scratch_store_dword
; RW-FLAT: .amdhsa_user_sgpr_private_segment_buffer 1
; RO-FLAT-NOT: .amdhsa_user_sgpr_private_segment_buffer
-; RW-FLAT: .amdhsa_user_sgpr_flat_scratch_init 1
+; RW-FLAT: .amdhsa_user_sgpr_flat_scratch_init 0
; RO-FLAT-NOT: .amdhsa_user_sgpr_flat_scratch_init
; RW-FLAT: .amdhsa_system_sgpr_private_segment_wavefront_offset 1
; RW-FLAT-NOT: .amdhsa_enable_private_segment
@@ -43,7 +43,7 @@ define amdgpu_kernel void @stack_object_addrspacecast_in_kernel_no_calls() {
; RO-FLAT: .amdhsa_enable_private_segment 1
; RW-FLAT: .amdhsa_reserve_flat_scratch 0
; GCN: COMPUTE_PGM_RSRC2:SCRATCH_EN: 1
-; RW-FLAT: COMPUTE_PGM_RSRC2:USER_SGPR: 6
+; RW-FLAT: COMPUTE_PGM_RSRC2:USER_SGPR: 4
; RO-FLAT: COMPUTE_PGM_RSRC2:USER_SGPR: 0
define amdgpu_kernel void @stack_object_in_kernel_no_calls() {
%alloca = alloca i32, addrspace(5)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll
index 54cb0777e9b2b..c778b674af8ca 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/implicit-kernarg-backend-usage-global-isel.ll
@@ -12,7 +12,9 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
; GFX8V4: ; %bb.0:
; GFX8V4-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX8V4-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x40
-; GFX8V4-NEXT: v_mov_b32_e32 v2, 1
+; GFX8V4-NEXT: s_add_i32 s12, s12, s17
+; GFX8V4-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX8V4-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX8V4-NEXT: s_waitcnt lgkmcnt(0)
; GFX8V4-NEXT: s_mov_b32 s4, s0
; GFX8V4-NEXT: s_mov_b32 s5, s3
@@ -23,6 +25,7 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
; GFX8V4-NEXT: s_cmp_lg_u32 s1, -1
; GFX8V4-NEXT: v_mov_b32_e32 v0, s4
; GFX8V4-NEXT: s_cselect_b64 s[0:1], s[6:7], 0
+; GFX8V4-NEXT: v_mov_b32_e32 v2, 1
; GFX8V4-NEXT: v_mov_b32_e32 v1, s5
; GFX8V4-NEXT: flat_store_dword v[0:1], v2
; GFX8V4-NEXT: s_waitcnt vmcnt(0)
@@ -37,7 +40,9 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
; GFX8V5: ; %bb.0:
; GFX8V5-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX8V5-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0xc8
-; GFX8V5-NEXT: v_mov_b32_e32 v2, 1
+; GFX8V5-NEXT: s_add_i32 s12, s12, s17
+; GFX8V5-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX8V5-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX8V5-NEXT: s_waitcnt lgkmcnt(0)
; GFX8V5-NEXT: s_mov_b32 s4, s0
; GFX8V5-NEXT: s_mov_b32 s5, s2
@@ -47,6 +52,7 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
; GFX8V5-NEXT: s_cmp_lg_u32 s1, -1
; GFX8V5-NEXT: v_mov_b32_e32 v0, s4
; GFX8V5-NEXT: s_cselect_b64 s[0:1], s[2:3], 0
+; GFX8V5-NEXT: v_mov_b32_e32 v2, 1
; GFX8V5-NEXT: v_mov_b32_e32 v1, s5
; GFX8V5-NEXT: flat_store_dword v[0:1], v2
; GFX8V5-NEXT: s_waitcnt vmcnt(0)
@@ -60,9 +66,10 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
; GFX9V4-LABEL: addrspacecast:
; GFX9V4: ; %bb.0:
; GFX9V4-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX9V4-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX9V4-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX9V4-NEXT: s_mov_b64 s[2:3], src_private_base
; GFX9V4-NEXT: s_mov_b64 s[4:5], src_shared_base
-; GFX9V4-NEXT: v_mov_b32_e32 v2, 1
; GFX9V4-NEXT: s_waitcnt lgkmcnt(0)
; GFX9V4-NEXT: s_mov_b32 s2, s0
; GFX9V4-NEXT: s_cmp_lg_u32 s0, -1
@@ -71,6 +78,7 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
; GFX9V4-NEXT: s_cmp_lg_u32 s1, -1
; GFX9V4-NEXT: v_mov_b32_e32 v0, s2
; GFX9V4-NEXT: s_cselect_b64 s[0:1], s[4:5], 0
+; GFX9V4-NEXT: v_mov_b32_e32 v2, 1
; GFX9V4-NEXT: v_mov_b32_e32 v1, s3
; GFX9V4-NEXT: flat_store_dword v[0:1], v2
; GFX9V4-NEXT: s_waitcnt vmcnt(0)
@@ -84,9 +92,10 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
; GFX9V5-LABEL: addrspacecast:
; GFX9V5: ; %bb.0:
; GFX9V5-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX9V5-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX9V5-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX9V5-NEXT: s_mov_b64 s[2:3], src_private_base
; GFX9V5-NEXT: s_mov_b64 s[4:5], src_shared_base
-; GFX9V5-NEXT: v_mov_b32_e32 v2, 1
; GFX9V5-NEXT: s_waitcnt lgkmcnt(0)
; GFX9V5-NEXT: s_mov_b32 s2, s0
; GFX9V5-NEXT: s_cmp_lg_u32 s0, -1
@@ -95,6 +104,7 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
; GFX9V5-NEXT: s_cmp_lg_u32 s1, -1
; GFX9V5-NEXT: v_mov_b32_e32 v0, s2
; GFX9V5-NEXT: s_cselect_b64 s[0:1], s[4:5], 0
+; GFX9V5-NEXT: v_mov_b32_e32 v2, 1
; GFX9V5-NEXT: v_mov_b32_e32 v1, s3
; GFX9V5-NEXT: flat_store_dword v[0:1], v2
; GFX9V5-NEXT: s_waitcnt vmcnt(0)
@@ -117,6 +127,9 @@ define amdgpu_kernel void @llvm_amdgcn_is_shared(ptr %ptr) {
; GFX8V4-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX8V4-NEXT: s_waitcnt lgkmcnt(0)
; GFX8V4-NEXT: s_load_dword s0, s[6:7], 0x40
+; GFX8V4-NEXT: s_add_i32 s12, s12, s17
+; GFX8V4-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX8V4-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX8V4-NEXT: s_waitcnt lgkmcnt(0)
; GFX8V4-NEXT: s_cmp_eq_u32 s1, s0
; GFX8V4-NEXT: s_cselect_b32 s0, 1, 0
@@ -130,6 +143,9 @@ define amdgpu_kernel void @llvm_amdgcn_is_shared(ptr %ptr) {
; GFX8V5-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX8V5-NEXT: s_waitcnt lgkmcnt(0)
; GFX8V5-NEXT: s_load_dword s0, s[8:9], 0xcc
+; GFX8V5-NEXT: s_add_i32 s12, s12, s17
+; GFX8V5-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX8V5-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX8V5-NEXT: s_waitcnt lgkmcnt(0)
; GFX8V5-NEXT: s_cmp_eq_u32 s1, s0
; GFX8V5-NEXT: s_cselect_b32 s0, 1, 0
@@ -173,6 +189,9 @@ define amdgpu_kernel void @llvm_amdgcn_is_private(ptr %ptr) {
; GFX8V4-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX8V4-NEXT: s_waitcnt lgkmcnt(0)
; GFX8V4-NEXT: s_load_dword s0, s[6:7], 0x44
+; GFX8V4-NEXT: s_add_i32 s12, s12, s17
+; GFX8V4-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX8V4-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX8V4-NEXT: s_waitcnt lgkmcnt(0)
; GFX8V4-NEXT: s_cmp_eq_u32 s1, s0
; GFX8V4-NEXT: s_cselect_b32 s0, 1, 0
@@ -186,6 +205,9 @@ define amdgpu_kernel void @llvm_amdgcn_is_private(ptr %ptr) {
; GFX8V5-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX8V5-NEXT: s_waitcnt lgkmcnt(0)
; GFX8V5-NEXT: s_load_dword s0, s[8:9], 0xc8
+; GFX8V5-NEXT: s_add_i32 s12, s12, s17
+; GFX8V5-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX8V5-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX8V5-NEXT: s_waitcnt lgkmcnt(0)
; GFX8V5-NEXT: s_cmp_eq_u32 s1, s0
; GFX8V5-NEXT: s_cselect_b32 s0, 1, 0
@@ -269,7 +291,10 @@ define amdgpu_kernel void @llvm_debugtrap() {
define amdgpu_kernel void @llvm_amdgcn_queue_ptr(ptr addrspace(1) %ptr) {
; GFX8V4-LABEL: llvm_amdgcn_queue_ptr:
; GFX8V4: ; %bb.0:
+; GFX8V4-NEXT: s_add_i32 s12, s12, s17
; GFX8V4-NEXT: v_mov_b32_e32 v0, s6
+; GFX8V4-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX8V4-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8V4-NEXT: v_mov_b32_e32 v1, s7
; GFX8V4-NEXT: s_add_u32 s0, s8, 8
; GFX8V4-NEXT: flat_load_ubyte v0, v[0:1] glc
@@ -295,7 +320,10 @@ define amdgpu_kernel void @llvm_amdgcn_queue_ptr(ptr addrspace(1) %ptr) {
;
; GFX8V5-LABEL: llvm_amdgcn_queue_ptr:
; GFX8V5: ; %bb.0:
+; GFX8V5-NEXT: s_add_i32 s12, s12, s17
; GFX8V5-NEXT: v_mov_b32_e32 v0, s6
+; GFX8V5-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX8V5-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8V5-NEXT: v_mov_b32_e32 v1, s7
; GFX8V5-NEXT: s_add_u32 s0, s8, 8
; GFX8V5-NEXT: flat_load_ubyte v0, v[0:1] glc
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll
index c3b48b5d2ddff..7f5a757bac68b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll
@@ -9,7 +9,7 @@ define amdgpu_kernel void @v_insert_v64i32_varidx(ptr addrspace(1) %out.ptr, ptr
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[20:23], s[8:9], 0x0
; GCN-NEXT: s_load_dwordx2 s[24:25], s[8:9], 0x10
-; GCN-NEXT: s_add_u32 s0, s0, s15
+; GCN-NEXT: s_add_u32 s0, s0, s17
; GCN-NEXT: s_addc_u32 s1, s1, 0
; GCN-NEXT: v_mov_b32_e32 v64, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-value.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-value.ll
index b250e016492bc..0f2d70da2a8f2 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-value.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/lds-global-value.ll
@@ -11,13 +11,16 @@ define amdgpu_kernel void @use_lds_globals(ptr addrspace(1) %out, ptr addrspace(
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; CHECK-NEXT: v_mov_b32_e32 v0, 4
; CHECK-NEXT: s_mov_b32 m0, -1
+; CHECK-NEXT: s_add_i32 s12, s12, s17
; CHECK-NEXT: ds_read_b32 v2, v0
-; CHECK-NEXT: v_mov_b32_e32 v3, 9
+; CHECK-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_add_u32 s0, s0, 4
; CHECK-NEXT: s_addc_u32 s1, s1, 0
; CHECK-NEXT: v_mov_b32_e32 v0, s0
+; CHECK-NEXT: s_mov_b32 flat_scratch_lo, s13
; CHECK-NEXT: v_mov_b32_e32 v1, s1
+; CHECK-NEXT: v_mov_b32_e32 v3, 9
; CHECK-NEXT: flat_store_dword v[0:1], v2
; CHECK-NEXT: v_mov_b32_e32 v0, 0x200
; CHECK-NEXT: ds_write_b32 v0, v3
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i64.ll
index 78b2a5bf1050c..d6fd926e09ea3 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.if.break.i64.ll
@@ -6,6 +6,9 @@ define amdgpu_kernel void @test_wave64(i32 %arg0, [8 x i32], i64 %saved) {
; GCN: ; %bb.0: ; %entry
; GCN-NEXT: s_load_dword s2, s[8:9], 0x0
; GCN-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0xa
+; GCN-NEXT: s_add_i32 s12, s12, s17
+; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_cmp_eq_u32 s2, 0
; GCN-NEXT: s_cselect_b32 s2, 1, 0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.trig.preop.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.trig.preop.ll
index dd2f26f7b73a1..bffb12ca56859 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.trig.preop.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.trig.preop.ll
@@ -42,6 +42,9 @@ define amdgpu_kernel void @s_trig_preop_f64(double %a, i32 %b) {
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s2, s[8:9], 0x2
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: v_trig_preop_f64 v[0:1], s[0:1], v0
@@ -59,6 +62,9 @@ define amdgpu_kernel void @s_trig_preop_f64(double %a, i32 %b) {
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_trig_preop_f64 v[0:1], s[0:1], v0
@@ -76,6 +82,8 @@ define amdgpu_kernel void @s_trig_preop_f64(double %a, i32 %b) {
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dword s2, s[8:9], 0x8
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: v_trig_preop_f64 v[0:1], s[0:1], v0
@@ -85,6 +93,10 @@ define amdgpu_kernel void @s_trig_preop_f64(double %a, i32 %b) {
;
; GFX10-LABEL: s_trig_preop_f64:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_add_u32 s12, s12, s17
+; GFX10-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-NEXT: s_clause 0x1
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX10-NEXT: s_load_dword s2, s[8:9], 0x8
@@ -113,6 +125,9 @@ define amdgpu_kernel void @s_trig_preop_f64_imm(double %a, i32 %b) {
; CI-LABEL: s_trig_preop_f64_imm:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_trig_preop_f64 v[0:1], s[0:1], 7
; CI-NEXT: s_add_u32 s0, s0, 4
@@ -128,6 +143,9 @@ define amdgpu_kernel void @s_trig_preop_f64_imm(double %a, i32 %b) {
; VI-LABEL: s_trig_preop_f64_imm:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_trig_preop_f64 v[0:1], s[0:1], 7
; VI-NEXT: s_add_u32 s0, s0, 4
@@ -143,6 +161,8 @@ define amdgpu_kernel void @s_trig_preop_f64_imm(double %a, i32 %b) {
; GFX9-LABEL: s_trig_preop_f64_imm:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX9-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: v_trig_preop_f64 v[0:1], s[0:1], 7
; GFX9-NEXT: flat_store_dwordx2 v[0:1], v[0:1]
@@ -151,6 +171,10 @@ define amdgpu_kernel void @s_trig_preop_f64_imm(double %a, i32 %b) {
;
; GFX10-LABEL: s_trig_preop_f64_imm:
; GFX10: ; %bb.0:
+; GFX10-NEXT: s_add_u32 s12, s12, s17
+; GFX10-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: v_trig_preop_f64 v[0:1], s[0:1], 7
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll
index 40f29c56c8f12..b59f85b2dfa38 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll
@@ -7,6 +7,9 @@ define amdgpu_kernel void @sdivrem_i32(ptr addrspace(1) %out0, ptr addrspace(1)
; GFX8-LABEL: sdivrem_i32:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
+; GFX8-NEXT: s_add_i32 s12, s12, s17
+; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_ashr_i32 s6, s5, 31
; GFX8-NEXT: s_add_i32 s0, s5, s6
@@ -146,6 +149,9 @@ define amdgpu_kernel void @sdivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
; GFX8-LABEL: sdivrem_i64:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx8 s[4:11], s[8:9], 0x0
+; GFX8-NEXT: s_add_i32 s12, s12, s17
+; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_ashr_i32 s2, s9, 31
; GFX8-NEXT: s_ashr_i32 s12, s11, 31
@@ -617,6 +623,9 @@ define amdgpu_kernel void @sdivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1
; GFX8-LABEL: sdivrem_v2i32:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx8 s[4:11], s[8:9], 0x0
+; GFX8-NEXT: s_add_i32 s12, s12, s17
+; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_ashr_i32 s2, s10, 31
; GFX8-NEXT: s_add_i32 s0, s10, s2
@@ -845,6 +854,9 @@ define amdgpu_kernel void @sdivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1
define amdgpu_kernel void @sdivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <4 x i32> %x, <4 x i32> %y) {
; GFX8-LABEL: sdivrem_v4i32:
; GFX8: ; %bb.0:
+; GFX8-NEXT: s_add_i32 s12, s12, s17
+; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_load_dwordx8 s[12:19], s[8:9], 0x10
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
@@ -1271,6 +1283,9 @@ define amdgpu_kernel void @sdivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1
define amdgpu_kernel void @sdivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x i64> %x, <2 x i64> %y) {
; GFX8-LABEL: sdivrem_v2i64:
; GFX8: ; %bb.0:
+; GFX8-NEXT: s_add_i32 s12, s12, s17
+; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_load_dwordx8 s[12:19], s[8:9], 0x0
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x20
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
@@ -2187,6 +2202,9 @@ define amdgpu_kernel void @sdiv_i8(ptr addrspace(1) %out0, ptr addrspace(1) %out
; GFX8-LABEL: sdiv_i8:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dword s4, s[8:9], 0x10
+; GFX8-NEXT: s_add_i32 s12, s12, s17
+; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_bfe_i32 s0, s4, 0x80008
; GFX8-NEXT: s_ashr_i32 s5, s0, 31
@@ -2332,6 +2350,9 @@ define amdgpu_kernel void @sdivrem_v2i8(ptr addrspace(1) %out0, ptr addrspace(1)
; GFX8-LABEL: sdivrem_v2i8:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dword s2, s[8:9], 0x10
+; GFX8-NEXT: s_add_i32 s12, s12, s17
+; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_bfe_i32 s0, s2, 0x80010
; GFX8-NEXT: s_ashr_i32 s3, s0, 31
@@ -2596,6 +2617,9 @@ define amdgpu_kernel void @sdiv_i16(ptr addrspace(1) %out0, ptr addrspace(1) %ou
; GFX8-LABEL: sdiv_i16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dword s4, s[8:9], 0x10
+; GFX8-NEXT: s_add_i32 s12, s12, s17
+; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_bfe_i32 s0, s4, 0x100010
; GFX8-NEXT: s_ashr_i32 s5, s0, 31
@@ -2741,6 +2765,9 @@ define amdgpu_kernel void @sdivrem_v2i16(ptr addrspace(1) %out0, ptr addrspace(1
; GFX8-LABEL: sdivrem_v2i16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x10
+; GFX8-NEXT: s_add_i32 s12, s12, s17
+; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_sext_i32_i16 s0, s3
; GFX8-NEXT: s_ashr_i32 s10, s0, 31
@@ -3002,6 +3029,9 @@ define amdgpu_kernel void @sdivrem_i3(ptr addrspace(1) %out0, ptr addrspace(1) %
; GFX8-LABEL: sdivrem_i3:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dword s4, s[8:9], 0x10
+; GFX8-NEXT: s_add_i32 s12, s12, s17
+; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_bfe_i32 s0, s4, 0x30008
; GFX8-NEXT: s_ashr_i32 s5, s0, 31
@@ -3153,6 +3183,9 @@ define amdgpu_kernel void @sdivrem_i27(ptr addrspace(1) %out0, ptr addrspace(1)
; GFX8-LABEL: sdivrem_i27:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
+; GFX8-NEXT: s_add_i32 s12, s12, s17
+; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_bfe_i32 s0, s5, 0x1b0000
; GFX8-NEXT: s_ashr_i32 s5, s0, 31
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll
index e3c1a52696b47..ff0114cfc3ddb 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll
@@ -7,6 +7,9 @@ define amdgpu_kernel void @udivrem_i32(ptr addrspace(1) %out0, ptr addrspace(1)
; GFX8-LABEL: udivrem_i32:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
+; GFX8-NEXT: s_add_i32 s12, s12, s17
+; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s5
; GFX8-NEXT: s_sub_i32 s0, 0, s5
@@ -113,6 +116,9 @@ define amdgpu_kernel void @udivrem_i64(ptr addrspace(1) %out0, ptr addrspace(1)
; GFX8-LABEL: udivrem_i64:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx8 s[4:11], s[8:9], 0x0
+; GFX8-NEXT: s_add_i32 s12, s12, s17
+; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s11
; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s10
@@ -523,6 +529,9 @@ define amdgpu_kernel void @udivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1
; GFX8-LABEL: udivrem_v2i32:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx8 s[4:11], s[8:9], 0x0
+; GFX8-NEXT: s_add_i32 s12, s12, s17
+; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s10
; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s11
@@ -685,6 +694,9 @@ define amdgpu_kernel void @udivrem_v2i32(ptr addrspace(1) %out0, ptr addrspace(1
define amdgpu_kernel void @udivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <4 x i32> %x, <4 x i32> %y) {
; GFX8-LABEL: udivrem_v4i32:
; GFX8: ; %bb.0:
+; GFX8-NEXT: s_add_i32 s12, s12, s17
+; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_load_dwordx8 s[12:19], s[8:9], 0x10
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
@@ -980,6 +992,9 @@ define amdgpu_kernel void @udivrem_v4i32(ptr addrspace(1) %out0, ptr addrspace(1
define amdgpu_kernel void @udivrem_v2i64(ptr addrspace(1) %out0, ptr addrspace(1) %out1, <2 x i64> %x, <2 x i64> %y) {
; GFX8-LABEL: udivrem_v2i64:
; GFX8: ; %bb.0:
+; GFX8-NEXT: s_add_i32 s12, s12, s17
+; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_load_dwordx4 s[12:15], s[8:9], 0x20
; GFX8-NEXT: s_load_dwordx8 s[4:11], s[8:9], 0x0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
@@ -1772,6 +1787,9 @@ define amdgpu_kernel void @udiv_i8(ptr addrspace(1) %out0, ptr addrspace(1) %out
; GFX8-LABEL: udiv_i8:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dword s4, s[8:9], 0x10
+; GFX8-NEXT: s_add_i32 s12, s12, s17
+; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_bfe_u32 s5, s4, 0x80008
; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v0, s5
@@ -1885,6 +1903,9 @@ define amdgpu_kernel void @udivrem_v2i8(ptr addrspace(1) %out0, ptr addrspace(1)
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dword s0, s[8:9], 0x10
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x0
+; GFX8-NEXT: s_add_i32 s12, s12, s17
+; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_bfe_u32 s2, s0, 0x80010
; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v0, s2
@@ -2081,6 +2102,9 @@ define amdgpu_kernel void @udiv_i16(ptr addrspace(1) %out0, ptr addrspace(1) %ou
; GFX8-LABEL: udiv_i16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dword s4, s[8:9], 0x10
+; GFX8-NEXT: s_add_i32 s12, s12, s17
+; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_lshr_b32 s5, s4, 16
; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s5
@@ -2194,6 +2218,9 @@ define amdgpu_kernel void @udivrem_v2i16(ptr addrspace(1) %out0, ptr addrspace(1
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x10
; GFX8-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x0
+; GFX8-NEXT: s_add_i32 s12, s12, s17
+; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_and_b32 s2, s1, 0xffff
; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s2
@@ -2387,6 +2414,9 @@ define amdgpu_kernel void @udivrem_i3(ptr addrspace(1) %out0, ptr addrspace(1) %
; GFX8-LABEL: udivrem_i3:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dword s4, s[8:9], 0x10
+; GFX8-NEXT: s_add_i32 s12, s12, s17
+; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_bfe_u32 s5, s4, 0x30008
; GFX8-NEXT: v_cvt_f32_ubyte0_e32 v0, s5
@@ -2505,6 +2535,9 @@ define amdgpu_kernel void @udivrem_i27(ptr addrspace(1) %out0, ptr addrspace(1)
; GFX8-LABEL: udivrem_i27:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
+; GFX8-NEXT: s_add_i32 s12, s12, s17
+; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_and_b32 s5, s5, 0x7ffffff
; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s5
diff --git a/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll b/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll
index b7fbb8b2236c3..805ebcb26274a 100644
--- a/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll
+++ b/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll
@@ -135,6 +135,9 @@ define amdgpu_kernel void @marked_kernel_use_workitem_id(ptr addrspace(1) %ptr)
; FIXEDABI-LABEL: marked_kernel_use_workitem_id:
; FIXEDABI: ; %bb.0:
; FIXEDABI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
+; FIXEDABI-NEXT: s_add_i32 s6, s6, s11
+; FIXEDABI-NEXT: s_mov_b32 flat_scratch_lo, s7
+; FIXEDABI-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8
; FIXEDABI-NEXT: s_waitcnt lgkmcnt(0)
; FIXEDABI-NEXT: v_mov_b32_e32 v4, s1
; FIXEDABI-NEXT: v_mov_b32_e32 v3, s0
@@ -181,16 +184,19 @@ define amdgpu_kernel void @marked_kernel_use_workgroup_id(ptr addrspace(1) %ptr)
; FIXEDABI-LABEL: marked_kernel_use_workgroup_id:
; FIXEDABI: ; %bb.0:
; FIXEDABI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
-; FIXEDABI-NEXT: v_mov_b32_e32 v2, s6
+; FIXEDABI-NEXT: s_add_i32 s6, s6, s11
+; FIXEDABI-NEXT: s_mov_b32 flat_scratch_lo, s7
+; FIXEDABI-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8
+; FIXEDABI-NEXT: v_mov_b32_e32 v2, s8
; FIXEDABI-NEXT: s_waitcnt lgkmcnt(0)
; FIXEDABI-NEXT: v_mov_b32_e32 v0, s0
; FIXEDABI-NEXT: v_mov_b32_e32 v1, s1
; FIXEDABI-NEXT: flat_store_dword v[0:1], v2
; FIXEDABI-NEXT: s_waitcnt vmcnt(0)
-; FIXEDABI-NEXT: v_mov_b32_e32 v2, s7
+; FIXEDABI-NEXT: v_mov_b32_e32 v2, s9
; FIXEDABI-NEXT: flat_store_dword v[0:1], v2
; FIXEDABI-NEXT: s_waitcnt vmcnt(0)
-; FIXEDABI-NEXT: v_mov_b32_e32 v2, s8
+; FIXEDABI-NEXT: v_mov_b32_e32 v2, s10
; FIXEDABI-NEXT: flat_store_dword v[0:1], v2
; FIXEDABI-NEXT: s_waitcnt vmcnt(0)
; FIXEDABI-NEXT: s_endpgm
@@ -238,6 +244,9 @@ define void @marked_func_use_other_sgpr(ptr addrspace(1) %ptr) #0 {
define amdgpu_kernel void @marked_kernel_use_other_sgpr(ptr addrspace(1) %ptr) #0 {
; FIXEDABI-LABEL: marked_kernel_use_other_sgpr:
; FIXEDABI: ; %bb.0:
+; FIXEDABI-NEXT: s_add_i32 s6, s6, s11
+; FIXEDABI-NEXT: s_mov_b32 flat_scratch_lo, s7
+; FIXEDABI-NEXT: s_lshr_b32 flat_scratch_hi, s6, 8
; FIXEDABI-NEXT: s_add_u32 s0, s4, 8
; FIXEDABI-NEXT: flat_load_ubyte v0, v[0:1] glc
; FIXEDABI-NEXT: s_addc_u32 s1, s5, 0
@@ -261,7 +270,10 @@ define amdgpu_kernel void @marked_kernel_use_other_sgpr(ptr addrspace(1) %ptr) #
define amdgpu_kernel void @marked_kernel_nokernargs_implicitarg_ptr() #0 {
; FIXEDABI-LABEL: marked_kernel_nokernargs_implicitarg_ptr:
; FIXEDABI: ; %bb.0:
+; FIXEDABI-NEXT: s_add_i32 s4, s4, s9
; FIXEDABI-NEXT: v_mov_b32_e32 v0, 0
+; FIXEDABI-NEXT: s_mov_b32 flat_scratch_lo, s5
+; FIXEDABI-NEXT: s_lshr_b32 flat_scratch_hi, s4, 8
; FIXEDABI-NEXT: v_mov_b32_e32 v1, 0
; FIXEDABI-NEXT: flat_load_ubyte v0, v[0:1] glc
; FIXEDABI-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll b/llvm/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll
index d316e10037757..da1a4c93dd6d2 100644
--- a/llvm/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll
@@ -1,5 +1,4 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-globals
-; RUN: opt -mtriple=amdgcn-unknown-amdhsa -S -amdgpu-annotate-kernel-features < %s | FileCheck -check-prefixes=HSA,AKF_HSA %s
; RUN: opt -mtriple=amdgcn-unknown-amdhsa -S -passes=amdgpu-attributor < %s | FileCheck -check-prefixes=HSA,ATTRIBUTOR_HSA %s
declare void @llvm.memcpy.p1.p4.i32(ptr addrspace(1) nocapture, ptr addrspace(4) nocapture, i32, i1) #0
@@ -27,11 +26,6 @@ define amdgpu_kernel void @store_cast_0_flat_to_group_addrspacecast() #1 {
}
define amdgpu_kernel void @store_cast_0_group_to_flat_addrspacecast() #1 {
-; AKF_HSA-LABEL: define {{[^@]+}}@store_cast_0_group_to_flat_addrspacecast
-; AKF_HSA-SAME: () #[[ATTR1]] {
-; AKF_HSA-NEXT: store i32 7, ptr addrspace(4) addrspacecast (ptr addrspace(3) null to ptr addrspace(4)), align 4
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@store_cast_0_group_to_flat_addrspacecast
; ATTRIBUTOR_HSA-SAME: () #[[ATTR2:[0-9]+]] {
; ATTRIBUTOR_HSA-NEXT: store i32 7, ptr addrspace(4) addrspacecast (ptr addrspace(3) null to ptr addrspace(4)), align 4
@@ -42,11 +36,6 @@ define amdgpu_kernel void @store_cast_0_group_to_flat_addrspacecast() #1 {
}
define amdgpu_kernel void @store_constant_cast_group_gv_to_flat() #1 {
-; AKF_HSA-LABEL: define {{[^@]+}}@store_constant_cast_group_gv_to_flat
-; AKF_HSA-SAME: () #[[ATTR1]] {
-; AKF_HSA-NEXT: store i32 7, ptr addrspace(4) addrspacecast (ptr addrspace(3) @lds.i32 to ptr addrspace(4)), align 4
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@store_constant_cast_group_gv_to_flat
; ATTRIBUTOR_HSA-SAME: () #[[ATTR2]] {
; ATTRIBUTOR_HSA-NEXT: store i32 7, ptr addrspace(4) addrspacecast (ptr addrspace(3) @lds.i32 to ptr addrspace(4)), align 4
@@ -57,11 +46,6 @@ define amdgpu_kernel void @store_constant_cast_group_gv_to_flat() #1 {
}
define amdgpu_kernel void @store_constant_cast_group_gv_gep_to_flat() #1 {
-; AKF_HSA-LABEL: define {{[^@]+}}@store_constant_cast_group_gv_gep_to_flat
-; AKF_HSA-SAME: () #[[ATTR1]] {
-; AKF_HSA-NEXT: store i32 7, ptr addrspace(4) getelementptr ([256 x i32], ptr addrspace(4) addrspacecast (ptr addrspace(3) @lds.arr to ptr addrspace(4)), i64 0, i64 8), align 4
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@store_constant_cast_group_gv_gep_to_flat
; ATTRIBUTOR_HSA-SAME: () #[[ATTR2]] {
; ATTRIBUTOR_HSA-NEXT: store i32 7, ptr addrspace(4) getelementptr ([256 x i32], ptr addrspace(4) addrspacecast (ptr addrspace(3) @lds.arr to ptr addrspace(4)), i64 0, i64 8), align 4
@@ -92,12 +76,6 @@ define amdgpu_kernel void @store_constant_cast_global_gv_gep_to_flat() #1 {
}
define amdgpu_kernel void @load_constant_cast_group_gv_gep_to_flat(ptr addrspace(1) %out) #1 {
-; AKF_HSA-LABEL: define {{[^@]+}}@load_constant_cast_group_gv_gep_to_flat
-; AKF_HSA-SAME: (ptr addrspace(1) [[OUT:%.*]]) #[[ATTR1]] {
-; AKF_HSA-NEXT: [[VAL:%.*]] = load i32, ptr addrspace(4) getelementptr ([256 x i32], ptr addrspace(4) addrspacecast (ptr addrspace(3) @lds.arr to ptr addrspace(4)), i64 0, i64 8), align 4
-; AKF_HSA-NEXT: store i32 [[VAL]], ptr addrspace(1) [[OUT]], align 4
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@load_constant_cast_group_gv_gep_to_flat
; ATTRIBUTOR_HSA-SAME: (ptr addrspace(1) [[OUT:%.*]]) #[[ATTR2]] {
; ATTRIBUTOR_HSA-NEXT: [[VAL:%.*]] = load i32, ptr addrspace(4) getelementptr ([256 x i32], ptr addrspace(4) addrspacecast (ptr addrspace(3) @lds.arr to ptr addrspace(4)), i64 0, i64 8), align 4
@@ -110,12 +88,6 @@ define amdgpu_kernel void @load_constant_cast_group_gv_gep_to_flat(ptr addrspace
}
define amdgpu_kernel void @atomicrmw_constant_cast_group_gv_gep_to_flat(ptr addrspace(1) %out) #1 {
-; AKF_HSA-LABEL: define {{[^@]+}}@atomicrmw_constant_cast_group_gv_gep_to_flat
-; AKF_HSA-SAME: (ptr addrspace(1) [[OUT:%.*]]) #[[ATTR1]] {
-; AKF_HSA-NEXT: [[VAL:%.*]] = atomicrmw add ptr addrspace(4) getelementptr ([256 x i32], ptr addrspace(4) addrspacecast (ptr addrspace(3) @lds.arr to ptr addrspace(4)), i64 0, i64 8), i32 1 seq_cst, align 4
-; AKF_HSA-NEXT: store i32 [[VAL]], ptr addrspace(1) [[OUT]], align 4
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@atomicrmw_constant_cast_group_gv_gep_to_flat
; ATTRIBUTOR_HSA-SAME: (ptr addrspace(1) [[OUT:%.*]]) #[[ATTR2]] {
; ATTRIBUTOR_HSA-NEXT: [[VAL:%.*]] = atomicrmw add ptr addrspace(4) getelementptr ([256 x i32], ptr addrspace(4) addrspacecast (ptr addrspace(3) @lds.arr to ptr addrspace(4)), i64 0, i64 8), i32 1 seq_cst, align 4
@@ -128,13 +100,6 @@ define amdgpu_kernel void @atomicrmw_constant_cast_group_gv_gep_to_flat(ptr addr
}
define amdgpu_kernel void @cmpxchg_constant_cast_group_gv_gep_to_flat(ptr addrspace(1) %out) #1 {
-; AKF_HSA-LABEL: define {{[^@]+}}@cmpxchg_constant_cast_group_gv_gep_to_flat
-; AKF_HSA-SAME: (ptr addrspace(1) [[OUT:%.*]]) #[[ATTR1]] {
-; AKF_HSA-NEXT: [[VAL:%.*]] = cmpxchg ptr addrspace(4) getelementptr ([256 x i32], ptr addrspace(4) addrspacecast (ptr addrspace(3) @lds.arr to ptr addrspace(4)), i64 0, i64 8), i32 0, i32 1 seq_cst seq_cst, align 4
-; AKF_HSA-NEXT: [[VAL0:%.*]] = extractvalue { i32, i1 } [[VAL]], 0
-; AKF_HSA-NEXT: store i32 [[VAL0]], ptr addrspace(1) [[OUT]], align 4
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@cmpxchg_constant_cast_group_gv_gep_to_flat
; ATTRIBUTOR_HSA-SAME: (ptr addrspace(1) [[OUT:%.*]]) #[[ATTR2]] {
; ATTRIBUTOR_HSA-NEXT: [[VAL:%.*]] = cmpxchg ptr addrspace(4) getelementptr ([256 x i32], ptr addrspace(4) addrspacecast (ptr addrspace(3) @lds.arr to ptr addrspace(4)), i64 0, i64 8), i32 0, i32 1 seq_cst seq_cst, align 4
@@ -149,11 +114,6 @@ define amdgpu_kernel void @cmpxchg_constant_cast_group_gv_gep_to_flat(ptr addrsp
}
define amdgpu_kernel void @memcpy_constant_cast_group_gv_gep_to_flat(ptr addrspace(1) %out) #1 {
-; AKF_HSA-LABEL: define {{[^@]+}}@memcpy_constant_cast_group_gv_gep_to_flat
-; AKF_HSA-SAME: (ptr addrspace(1) [[OUT:%.*]]) #[[ATTR1]] {
-; AKF_HSA-NEXT: call void @llvm.memcpy.p1.p4.i32(ptr addrspace(1) align 4 [[OUT]], ptr addrspace(4) align 4 getelementptr ([256 x i32], ptr addrspace(4) addrspacecast (ptr addrspace(3) @lds.arr to ptr addrspace(4)), i64 0, i64 8), i32 32, i1 false)
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@memcpy_constant_cast_group_gv_gep_to_flat
; ATTRIBUTOR_HSA-SAME: (ptr addrspace(1) [[OUT:%.*]]) #[[ATTR2]] {
; ATTRIBUTOR_HSA-NEXT: call void @llvm.memcpy.p1.p4.i32(ptr addrspace(1) align 4 [[OUT]], ptr addrspace(4) align 4 getelementptr ([256 x i32], ptr addrspace(4) addrspacecast (ptr addrspace(3) @lds.arr to ptr addrspace(4)), i64 0, i64 8), i32 32, i1 false)
@@ -165,11 +125,6 @@ define amdgpu_kernel void @memcpy_constant_cast_group_gv_gep_to_flat(ptr addrspa
; Can't just search the pointer value
define amdgpu_kernel void @store_value_constant_cast_lds_gv_gep_to_flat(ptr addrspace(1) %out) #1 {
-; AKF_HSA-LABEL: define {{[^@]+}}@store_value_constant_cast_lds_gv_gep_to_flat
-; AKF_HSA-SAME: (ptr addrspace(1) [[OUT:%.*]]) #[[ATTR1]] {
-; AKF_HSA-NEXT: store ptr addrspace(4) getelementptr ([256 x i32], ptr addrspace(4) addrspacecast (ptr addrspace(3) @lds.arr to ptr addrspace(4)), i64 0, i64 8), ptr addrspace(1) [[OUT]], align 8
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@store_value_constant_cast_lds_gv_gep_to_flat
; ATTRIBUTOR_HSA-SAME: (ptr addrspace(1) [[OUT:%.*]]) #[[ATTR2]] {
; ATTRIBUTOR_HSA-NEXT: store ptr addrspace(4) getelementptr ([256 x i32], ptr addrspace(4) addrspacecast (ptr addrspace(3) @lds.arr to ptr addrspace(4)), i64 0, i64 8), ptr addrspace(1) [[OUT]], align 8
@@ -181,11 +136,6 @@ define amdgpu_kernel void @store_value_constant_cast_lds_gv_gep_to_flat(ptr addr
; Can't just search pointer types
define amdgpu_kernel void @store_ptrtoint_value_constant_cast_lds_gv_gep_to_flat(ptr addrspace(1) %out) #1 {
-; AKF_HSA-LABEL: define {{[^@]+}}@store_ptrtoint_value_constant_cast_lds_gv_gep_to_flat
-; AKF_HSA-SAME: (ptr addrspace(1) [[OUT:%.*]]) #[[ATTR1]] {
-; AKF_HSA-NEXT: store i64 ptrtoint (ptr addrspace(4) getelementptr ([256 x i32], ptr addrspace(4) addrspacecast (ptr addrspace(3) @lds.arr to ptr addrspace(4)), i64 0, i64 8) to i64), ptr addrspace(1) [[OUT]], align 8
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@store_ptrtoint_value_constant_cast_lds_gv_gep_to_flat
; ATTRIBUTOR_HSA-SAME: (ptr addrspace(1) [[OUT:%.*]]) #[[ATTR2]] {
; ATTRIBUTOR_HSA-NEXT: store i64 ptrtoint (ptr addrspace(4) getelementptr ([256 x i32], ptr addrspace(4) addrspacecast (ptr addrspace(3) @lds.arr to ptr addrspace(4)), i64 0, i64 8) to i64), ptr addrspace(1) [[OUT]], align 8
@@ -197,11 +147,6 @@ define amdgpu_kernel void @store_ptrtoint_value_constant_cast_lds_gv_gep_to_flat
; Cast group to flat, do GEP, cast back to group
define amdgpu_kernel void @store_constant_cast_group_gv_gep_to_flat_to_group() #1 {
-; AKF_HSA-LABEL: define {{[^@]+}}@store_constant_cast_group_gv_gep_to_flat_to_group
-; AKF_HSA-SAME: () #[[ATTR1]] {
-; AKF_HSA-NEXT: store i32 7, ptr addrspace(3) addrspacecast (ptr addrspace(4) getelementptr ([256 x i32], ptr addrspace(4) addrspacecast (ptr addrspace(3) @lds.arr to ptr addrspace(4)), i64 0, i64 8) to ptr addrspace(3)), align 4
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@store_constant_cast_group_gv_gep_to_flat_to_group
; ATTRIBUTOR_HSA-SAME: () #[[ATTR2]] {
; ATTRIBUTOR_HSA-NEXT: store i32 7, ptr addrspace(3) addrspacecast (ptr addrspace(4) getelementptr ([256 x i32], ptr addrspace(4) addrspacecast (ptr addrspace(3) @lds.arr to ptr addrspace(4)), i64 0, i64 8) to ptr addrspace(3)), align 4
@@ -212,10 +157,6 @@ define amdgpu_kernel void @store_constant_cast_group_gv_gep_to_flat_to_group() #
}
define ptr addrspace(3) @ret_constant_cast_group_gv_gep_to_flat_to_group() #1 {
-; AKF_HSA-LABEL: define {{[^@]+}}@ret_constant_cast_group_gv_gep_to_flat_to_group
-; AKF_HSA-SAME: () #[[ATTR1]] {
-; AKF_HSA-NEXT: ret ptr addrspace(3) addrspacecast (ptr addrspace(4) getelementptr ([256 x i32], ptr addrspace(4) addrspacecast (ptr addrspace(3) @lds.arr to ptr addrspace(4)), i64 0, i64 8) to ptr addrspace(3))
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@ret_constant_cast_group_gv_gep_to_flat_to_group
; ATTRIBUTOR_HSA-SAME: () #[[ATTR2]] {
; ATTRIBUTOR_HSA-NEXT: ret ptr addrspace(3) addrspacecast (ptr addrspace(4) getelementptr ([256 x i32], ptr addrspace(4) addrspacecast (ptr addrspace(3) @lds.arr to ptr addrspace(4)), i64 0, i64 8) to ptr addrspace(3))
@@ -229,14 +170,11 @@ attributes #1 = { nounwind }
!llvm.module.flags = !{!0}
!0 = !{i32 1, !"amdhsa_code_object_version", i32 500}
;.
-; AKF_HSA: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) }
-; AKF_HSA: attributes #[[ATTR1]] = { nounwind }
;.
; ATTRIBUTOR_HSA: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) }
; ATTRIBUTOR_HSA: attributes #[[ATTR1]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
; ATTRIBUTOR_HSA: attributes #[[ATTR2]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
;.
-; AKF_HSA: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 500}
;.
; ATTRIBUTOR_HSA: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 500}
;.
diff --git a/llvm/test/CodeGen/AMDGPU/always-uniform.ll b/llvm/test/CodeGen/AMDGPU/always-uniform.ll
index b6c0271e5f56f..4e7022710c671 100644
--- a/llvm/test/CodeGen/AMDGPU/always-uniform.ll
+++ b/llvm/test/CodeGen/AMDGPU/always-uniform.ll
@@ -8,8 +8,10 @@ define amdgpu_kernel void @readfirstlane_uniform(ptr addrspace(1) noalias nocapt
; GCN-LABEL: readfirstlane_uniform:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-NEXT: s_add_i32 s12, s12, s17
; GCN-NEXT: v_readfirstlane_b32 s4, v0
; GCN-NEXT: s_mov_b32 s5, 0
+; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_add_u32 s0, s0, s4
@@ -18,6 +20,7 @@ define amdgpu_kernel void @readfirstlane_uniform(ptr addrspace(1) noalias nocapt
; GCN-NEXT: s_add_u32 s0, s2, 40
; GCN-NEXT: s_addc_u32 s1, s3, 0
; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v2, s4
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll
index 598b4a5fcbd33..fd4bc0aff05cc 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll
@@ -393,6 +393,9 @@ define amdgpu_kernel void @select_add_lhs_const_i16(i1 %cond) {
; GCN-LABEL: select_add_lhs_const_i16:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dword s0, s[8:9], 0x0
+; GCN-NEXT: s_add_i32 s12, s12, s17
+; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_bitcmp1_b32 s0, 0
; GCN-NEXT: s_movk_i32 s0, 0x80
diff --git a/llvm/test/CodeGen/AMDGPU/amdhsa-trap-num-sgprs.ll b/llvm/test/CodeGen/AMDGPU/amdhsa-trap-num-sgprs.ll
index 3e19ee5567929..a4fe7121e347d 100644
--- a/llvm/test/CodeGen/AMDGPU/amdhsa-trap-num-sgprs.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdhsa-trap-num-sgprs.ll
@@ -2,8 +2,8 @@
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -mattr=-trap-handler < %s | FileCheck %s --check-prefixes=GCN,TRAP-HANDLER-DISABLE
; GCN-LABEL: {{^}}amdhsa_trap_num_sgprs
-; TRAP-HANDLER-ENABLE: NumSgprs: 61
-; TRAP-HANDLER-DISABLE: NumSgprs: 77
+; TRAP-HANDLER-ENABLE: NumSgprs: 67
+; TRAP-HANDLER-DISABLE: NumSgprs: 83
define amdgpu_kernel void @amdhsa_trap_num_sgprs(
ptr addrspace(1) %out0, i32 %in0,
ptr addrspace(1) %out1, i32 %in1,
diff --git a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll
index ea3f08ede2c5d..79a8e7ee6b3c6 100644
--- a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll
@@ -1,5 +1,4 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-globals
-; RUN: opt -mtriple=amdgcn-unknown-amdhsa -S -amdgpu-annotate-kernel-features < %s | FileCheck -check-prefixes=AKF_HSA %s
; RUN: opt -mtriple=amdgcn-unknown-amdhsa -S -passes=amdgpu-attributor < %s | FileCheck -check-prefixes=ATTRIBUTOR_HSA %s
; TODO: The test contains UB which is refined by the Attributor and should be removed.
@@ -19,12 +18,6 @@ declare ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() #0
declare i64 @llvm.amdgcn.dispatch.id() #0
define void @use_workitem_id_x() #1 {
-; AKF_HSA-LABEL: define {{[^@]+}}@use_workitem_id_x
-; AKF_HSA-SAME: () #[[ATTR1:[0-9]+]] {
-; AKF_HSA-NEXT: [[VAL:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
-; AKF_HSA-NEXT: store volatile i32 [[VAL]], ptr addrspace(1) undef, align 4
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_workitem_id_x
; ATTRIBUTOR_HSA-SAME: () #[[ATTR1:[0-9]+]] {
; ATTRIBUTOR_HSA-NEXT: [[VAL:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
@@ -37,12 +30,6 @@ define void @use_workitem_id_x() #1 {
}
define void @use_workitem_id_y() #1 {
-; AKF_HSA-LABEL: define {{[^@]+}}@use_workitem_id_y
-; AKF_HSA-SAME: () #[[ATTR1]] {
-; AKF_HSA-NEXT: [[VAL:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
-; AKF_HSA-NEXT: store volatile i32 [[VAL]], ptr addrspace(1) undef, align 4
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_workitem_id_y
; ATTRIBUTOR_HSA-SAME: () #[[ATTR2:[0-9]+]] {
; ATTRIBUTOR_HSA-NEXT: [[VAL:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
@@ -55,12 +42,6 @@ define void @use_workitem_id_y() #1 {
}
define void @use_workitem_id_z() #1 {
-; AKF_HSA-LABEL: define {{[^@]+}}@use_workitem_id_z
-; AKF_HSA-SAME: () #[[ATTR1]] {
-; AKF_HSA-NEXT: [[VAL:%.*]] = call i32 @llvm.amdgcn.workitem.id.z()
-; AKF_HSA-NEXT: store volatile i32 [[VAL]], ptr addrspace(1) undef, align 4
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_workitem_id_z
; ATTRIBUTOR_HSA-SAME: () #[[ATTR3:[0-9]+]] {
; ATTRIBUTOR_HSA-NEXT: [[VAL:%.*]] = call i32 @llvm.amdgcn.workitem.id.z()
@@ -73,12 +54,6 @@ define void @use_workitem_id_z() #1 {
}
define void @use_workgroup_id_x() #1 {
-; AKF_HSA-LABEL: define {{[^@]+}}@use_workgroup_id_x
-; AKF_HSA-SAME: () #[[ATTR1]] {
-; AKF_HSA-NEXT: [[VAL:%.*]] = call i32 @llvm.amdgcn.workgroup.id.x()
-; AKF_HSA-NEXT: store volatile i32 [[VAL]], ptr addrspace(1) undef, align 4
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_workgroup_id_x
; ATTRIBUTOR_HSA-SAME: () #[[ATTR4:[0-9]+]] {
; ATTRIBUTOR_HSA-NEXT: [[VAL:%.*]] = call i32 @llvm.amdgcn.workgroup.id.x()
@@ -91,12 +66,6 @@ define void @use_workgroup_id_x() #1 {
}
define void @use_workgroup_id_y() #1 {
-; AKF_HSA-LABEL: define {{[^@]+}}@use_workgroup_id_y
-; AKF_HSA-SAME: () #[[ATTR1]] {
-; AKF_HSA-NEXT: [[VAL:%.*]] = call i32 @llvm.amdgcn.workgroup.id.y()
-; AKF_HSA-NEXT: store volatile i32 [[VAL]], ptr addrspace(1) undef, align 4
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_workgroup_id_y
; ATTRIBUTOR_HSA-SAME: () #[[ATTR5:[0-9]+]] {
; ATTRIBUTOR_HSA-NEXT: [[VAL:%.*]] = call i32 @llvm.amdgcn.workgroup.id.y()
@@ -109,12 +78,6 @@ define void @use_workgroup_id_y() #1 {
}
define void @use_workgroup_id_z() #1 {
-; AKF_HSA-LABEL: define {{[^@]+}}@use_workgroup_id_z
-; AKF_HSA-SAME: () #[[ATTR1]] {
-; AKF_HSA-NEXT: [[VAL:%.*]] = call i32 @llvm.amdgcn.workgroup.id.z()
-; AKF_HSA-NEXT: store volatile i32 [[VAL]], ptr addrspace(1) undef, align 4
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_workgroup_id_z
; ATTRIBUTOR_HSA-SAME: () #[[ATTR6:[0-9]+]] {
; ATTRIBUTOR_HSA-NEXT: [[VAL:%.*]] = call i32 @llvm.amdgcn.workgroup.id.z()
@@ -127,12 +90,6 @@ define void @use_workgroup_id_z() #1 {
}
define void @use_dispatch_ptr() #1 {
-; AKF_HSA-LABEL: define {{[^@]+}}@use_dispatch_ptr
-; AKF_HSA-SAME: () #[[ATTR1]] {
-; AKF_HSA-NEXT: [[DISPATCH_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
-; AKF_HSA-NEXT: store volatile ptr addrspace(4) [[DISPATCH_PTR]], ptr addrspace(1) undef, align 8
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_dispatch_ptr
; ATTRIBUTOR_HSA-SAME: () #[[ATTR7:[0-9]+]] {
; ATTRIBUTOR_HSA-NEXT: [[DISPATCH_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
@@ -145,12 +102,6 @@ define void @use_dispatch_ptr() #1 {
}
define void @use_queue_ptr() #1 {
-; AKF_HSA-LABEL: define {{[^@]+}}@use_queue_ptr
-; AKF_HSA-SAME: () #[[ATTR1]] {
-; AKF_HSA-NEXT: [[QUEUE_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.queue.ptr()
-; AKF_HSA-NEXT: store volatile ptr addrspace(4) [[QUEUE_PTR]], ptr addrspace(1) undef, align 8
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_queue_ptr
; ATTRIBUTOR_HSA-SAME: () #[[ATTR8:[0-9]+]] {
; ATTRIBUTOR_HSA-NEXT: [[QUEUE_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.queue.ptr()
@@ -163,12 +114,6 @@ define void @use_queue_ptr() #1 {
}
define void @use_dispatch_id() #1 {
-; AKF_HSA-LABEL: define {{[^@]+}}@use_dispatch_id
-; AKF_HSA-SAME: () #[[ATTR1]] {
-; AKF_HSA-NEXT: [[VAL:%.*]] = call i64 @llvm.amdgcn.dispatch.id()
-; AKF_HSA-NEXT: store volatile i64 [[VAL]], ptr addrspace(1) undef, align 8
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_dispatch_id
; ATTRIBUTOR_HSA-SAME: () #[[ATTR9:[0-9]+]] {
; ATTRIBUTOR_HSA-NEXT: [[VAL:%.*]] = call i64 @llvm.amdgcn.dispatch.id()
@@ -181,14 +126,6 @@ define void @use_dispatch_id() #1 {
}
define void @use_workgroup_id_y_workgroup_id_z() #1 {
-; AKF_HSA-LABEL: define {{[^@]+}}@use_workgroup_id_y_workgroup_id_z
-; AKF_HSA-SAME: () #[[ATTR1]] {
-; AKF_HSA-NEXT: [[VAL0:%.*]] = call i32 @llvm.amdgcn.workgroup.id.y()
-; AKF_HSA-NEXT: [[VAL1:%.*]] = call i32 @llvm.amdgcn.workgroup.id.z()
-; AKF_HSA-NEXT: store volatile i32 [[VAL0]], ptr addrspace(1) undef, align 4
-; AKF_HSA-NEXT: store volatile i32 [[VAL1]], ptr addrspace(1) undef, align 4
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_workgroup_id_y_workgroup_id_z
; ATTRIBUTOR_HSA-SAME: () #[[ATTR10:[0-9]+]] {
; ATTRIBUTOR_HSA-NEXT: [[VAL0:%.*]] = call i32 @llvm.amdgcn.workgroup.id.y()
@@ -205,11 +142,6 @@ define void @use_workgroup_id_y_workgroup_id_z() #1 {
}
define void @func_indirect_use_workitem_id_x() #1 {
-; AKF_HSA-LABEL: define {{[^@]+}}@func_indirect_use_workitem_id_x
-; AKF_HSA-SAME: () #[[ATTR1]] {
-; AKF_HSA-NEXT: call void @use_workitem_id_x()
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_indirect_use_workitem_id_x
; ATTRIBUTOR_HSA-SAME: () #[[ATTR1]] {
; ATTRIBUTOR_HSA-NEXT: call void @use_workitem_id_x()
@@ -220,11 +152,6 @@ define void @func_indirect_use_workitem_id_x() #1 {
}
define void @kernel_indirect_use_workitem_id_x() #1 {
-; AKF_HSA-LABEL: define {{[^@]+}}@kernel_indirect_use_workitem_id_x
-; AKF_HSA-SAME: () #[[ATTR1]] {
-; AKF_HSA-NEXT: call void @use_workitem_id_x()
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@kernel_indirect_use_workitem_id_x
; ATTRIBUTOR_HSA-SAME: () #[[ATTR1]] {
; ATTRIBUTOR_HSA-NEXT: call void @use_workitem_id_x()
@@ -235,11 +162,6 @@ define void @kernel_indirect_use_workitem_id_x() #1 {
}
define void @func_indirect_use_workitem_id_y() #1 {
-; AKF_HSA-LABEL: define {{[^@]+}}@func_indirect_use_workitem_id_y
-; AKF_HSA-SAME: () #[[ATTR1]] {
-; AKF_HSA-NEXT: call void @use_workitem_id_y()
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_indirect_use_workitem_id_y
; ATTRIBUTOR_HSA-SAME: () #[[ATTR2]] {
; ATTRIBUTOR_HSA-NEXT: call void @use_workitem_id_y()
@@ -250,11 +172,6 @@ define void @func_indirect_use_workitem_id_y() #1 {
}
define void @func_indirect_use_workitem_id_z() #1 {
-; AKF_HSA-LABEL: define {{[^@]+}}@func_indirect_use_workitem_id_z
-; AKF_HSA-SAME: () #[[ATTR1]] {
-; AKF_HSA-NEXT: call void @use_workitem_id_z()
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_indirect_use_workitem_id_z
; ATTRIBUTOR_HSA-SAME: () #[[ATTR3]] {
; ATTRIBUTOR_HSA-NEXT: call void @use_workitem_id_z()
@@ -265,11 +182,6 @@ define void @func_indirect_use_workitem_id_z() #1 {
}
define void @func_indirect_use_workgroup_id_x() #1 {
-; AKF_HSA-LABEL: define {{[^@]+}}@func_indirect_use_workgroup_id_x
-; AKF_HSA-SAME: () #[[ATTR1]] {
-; AKF_HSA-NEXT: call void @use_workgroup_id_x()
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_indirect_use_workgroup_id_x
; ATTRIBUTOR_HSA-SAME: () #[[ATTR4]] {
; ATTRIBUTOR_HSA-NEXT: call void @use_workgroup_id_x()
@@ -280,11 +192,6 @@ define void @func_indirect_use_workgroup_id_x() #1 {
}
define void @kernel_indirect_use_workgroup_id_x() #1 {
-; AKF_HSA-LABEL: define {{[^@]+}}@kernel_indirect_use_workgroup_id_x
-; AKF_HSA-SAME: () #[[ATTR1]] {
-; AKF_HSA-NEXT: call void @use_workgroup_id_x()
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@kernel_indirect_use_workgroup_id_x
; ATTRIBUTOR_HSA-SAME: () #[[ATTR4]] {
; ATTRIBUTOR_HSA-NEXT: call void @use_workgroup_id_x()
@@ -295,11 +202,6 @@ define void @kernel_indirect_use_workgroup_id_x() #1 {
}
define void @func_indirect_use_workgroup_id_y() #1 {
-; AKF_HSA-LABEL: define {{[^@]+}}@func_indirect_use_workgroup_id_y
-; AKF_HSA-SAME: () #[[ATTR1]] {
-; AKF_HSA-NEXT: call void @use_workgroup_id_y()
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_indirect_use_workgroup_id_y
; ATTRIBUTOR_HSA-SAME: () #[[ATTR5]] {
; ATTRIBUTOR_HSA-NEXT: call void @use_workgroup_id_y()
@@ -310,11 +212,6 @@ define void @func_indirect_use_workgroup_id_y() #1 {
}
define void @func_indirect_use_workgroup_id_z() #1 {
-; AKF_HSA-LABEL: define {{[^@]+}}@func_indirect_use_workgroup_id_z
-; AKF_HSA-SAME: () #[[ATTR1]] {
-; AKF_HSA-NEXT: call void @use_workgroup_id_z()
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_indirect_use_workgroup_id_z
; ATTRIBUTOR_HSA-SAME: () #[[ATTR6]] {
; ATTRIBUTOR_HSA-NEXT: call void @use_workgroup_id_z()
@@ -325,11 +222,6 @@ define void @func_indirect_use_workgroup_id_z() #1 {
}
define void @func_indirect_indirect_use_workgroup_id_y() #1 {
-; AKF_HSA-LABEL: define {{[^@]+}}@func_indirect_indirect_use_workgroup_id_y
-; AKF_HSA-SAME: () #[[ATTR1]] {
-; AKF_HSA-NEXT: call void @func_indirect_use_workgroup_id_y()
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_indirect_indirect_use_workgroup_id_y
; ATTRIBUTOR_HSA-SAME: () #[[ATTR5]] {
; ATTRIBUTOR_HSA-NEXT: call void @func_indirect_use_workgroup_id_y()
@@ -340,11 +232,6 @@ define void @func_indirect_indirect_use_workgroup_id_y() #1 {
}
define void @indirect_x2_use_workgroup_id_y() #1 {
-; AKF_HSA-LABEL: define {{[^@]+}}@indirect_x2_use_workgroup_id_y
-; AKF_HSA-SAME: () #[[ATTR1]] {
-; AKF_HSA-NEXT: call void @func_indirect_indirect_use_workgroup_id_y()
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@indirect_x2_use_workgroup_id_y
; ATTRIBUTOR_HSA-SAME: () #[[ATTR5]] {
; ATTRIBUTOR_HSA-NEXT: call void @func_indirect_indirect_use_workgroup_id_y()
@@ -355,11 +242,6 @@ define void @indirect_x2_use_workgroup_id_y() #1 {
}
define void @func_indirect_use_dispatch_ptr() #1 {
-; AKF_HSA-LABEL: define {{[^@]+}}@func_indirect_use_dispatch_ptr
-; AKF_HSA-SAME: () #[[ATTR1]] {
-; AKF_HSA-NEXT: call void @use_dispatch_ptr()
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_indirect_use_dispatch_ptr
; ATTRIBUTOR_HSA-SAME: () #[[ATTR7]] {
; ATTRIBUTOR_HSA-NEXT: call void @use_dispatch_ptr()
@@ -370,11 +252,6 @@ define void @func_indirect_use_dispatch_ptr() #1 {
}
define void @func_indirect_use_queue_ptr() #1 {
-; AKF_HSA-LABEL: define {{[^@]+}}@func_indirect_use_queue_ptr
-; AKF_HSA-SAME: () #[[ATTR1]] {
-; AKF_HSA-NEXT: call void @use_queue_ptr()
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_indirect_use_queue_ptr
; ATTRIBUTOR_HSA-SAME: () #[[ATTR8]] {
; ATTRIBUTOR_HSA-NEXT: call void @use_queue_ptr()
@@ -385,11 +262,6 @@ define void @func_indirect_use_queue_ptr() #1 {
}
define void @func_indirect_use_dispatch_id() #1 {
-; AKF_HSA-LABEL: define {{[^@]+}}@func_indirect_use_dispatch_id
-; AKF_HSA-SAME: () #[[ATTR1]] {
-; AKF_HSA-NEXT: call void @use_dispatch_id()
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_indirect_use_dispatch_id
; ATTRIBUTOR_HSA-SAME: () #[[ATTR9]] {
; ATTRIBUTOR_HSA-NEXT: call void @use_dispatch_id()
@@ -400,11 +272,6 @@ define void @func_indirect_use_dispatch_id() #1 {
}
define void @func_indirect_use_workgroup_id_y_workgroup_id_z() #1 {
-; AKF_HSA-LABEL: define {{[^@]+}}@func_indirect_use_workgroup_id_y_workgroup_id_z
-; AKF_HSA-SAME: () #[[ATTR1]] {
-; AKF_HSA-NEXT: call void @func_indirect_use_workgroup_id_y_workgroup_id_z()
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_indirect_use_workgroup_id_y_workgroup_id_z
; ATTRIBUTOR_HSA-SAME: () #[[ATTR11:[0-9]+]] {
; ATTRIBUTOR_HSA-NEXT: call void @func_indirect_use_workgroup_id_y_workgroup_id_z()
@@ -415,13 +282,6 @@ define void @func_indirect_use_workgroup_id_y_workgroup_id_z() #1 {
}
define void @recursive_use_workitem_id_y() #1 {
-; AKF_HSA-LABEL: define {{[^@]+}}@recursive_use_workitem_id_y
-; AKF_HSA-SAME: () #[[ATTR1]] {
-; AKF_HSA-NEXT: [[VAL:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
-; AKF_HSA-NEXT: store volatile i32 [[VAL]], ptr addrspace(1) undef, align 4
-; AKF_HSA-NEXT: call void @recursive_use_workitem_id_y()
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@recursive_use_workitem_id_y
; ATTRIBUTOR_HSA-SAME: () #[[ATTR2]] {
; ATTRIBUTOR_HSA-NEXT: [[VAL:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
@@ -436,11 +296,6 @@ define void @recursive_use_workitem_id_y() #1 {
}
define void @call_recursive_use_workitem_id_y() #1 {
-; AKF_HSA-LABEL: define {{[^@]+}}@call_recursive_use_workitem_id_y
-; AKF_HSA-SAME: () #[[ATTR1]] {
-; AKF_HSA-NEXT: call void @recursive_use_workitem_id_y()
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@call_recursive_use_workitem_id_y
; ATTRIBUTOR_HSA-SAME: () #[[ATTR2]] {
; ATTRIBUTOR_HSA-NEXT: call void @recursive_use_workitem_id_y()
@@ -451,12 +306,6 @@ define void @call_recursive_use_workitem_id_y() #1 {
}
define void @use_group_to_flat_addrspacecast(ptr addrspace(3) %ptr) #1 {
-; AKF_HSA-LABEL: define {{[^@]+}}@use_group_to_flat_addrspacecast
-; AKF_HSA-SAME: (ptr addrspace(3) [[PTR:%.*]]) #[[ATTR1]] {
-; AKF_HSA-NEXT: [[STOF:%.*]] = addrspacecast ptr addrspace(3) [[PTR]] to ptr addrspace(4)
-; AKF_HSA-NEXT: store volatile i32 0, ptr addrspace(4) [[STOF]], align 4
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_group_to_flat_addrspacecast
; ATTRIBUTOR_HSA-SAME: (ptr addrspace(3) [[PTR:%.*]]) #[[ATTR12:[0-9]+]] {
; ATTRIBUTOR_HSA-NEXT: [[STOF:%.*]] = addrspacecast ptr addrspace(3) [[PTR]] to ptr addrspace(4)
@@ -470,12 +319,6 @@ define void @use_group_to_flat_addrspacecast(ptr addrspace(3) %ptr) #1 {
define void @use_group_to_flat_addrspacecast_gfx9(ptr addrspace(3) %ptr) #2 {
-; AKF_HSA-LABEL: define {{[^@]+}}@use_group_to_flat_addrspacecast_gfx9
-; AKF_HSA-SAME: (ptr addrspace(3) [[PTR:%.*]]) #[[ATTR2:[0-9]+]] {
-; AKF_HSA-NEXT: [[STOF:%.*]] = addrspacecast ptr addrspace(3) [[PTR]] to ptr addrspace(4)
-; AKF_HSA-NEXT: store volatile i32 0, ptr addrspace(4) [[STOF]], align 4
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_group_to_flat_addrspacecast_gfx9
; ATTRIBUTOR_HSA-SAME: (ptr addrspace(3) [[PTR:%.*]]) #[[ATTR13:[0-9]+]] {
; ATTRIBUTOR_HSA-NEXT: [[STOF:%.*]] = addrspacecast ptr addrspace(3) [[PTR]] to ptr addrspace(4)
@@ -488,13 +331,6 @@ define void @use_group_to_flat_addrspacecast_gfx9(ptr addrspace(3) %ptr) #2 {
}
define void @use_group_to_flat_addrspacecast_queue_ptr_gfx9(ptr addrspace(3) %ptr) #2 {
-; AKF_HSA-LABEL: define {{[^@]+}}@use_group_to_flat_addrspacecast_queue_ptr_gfx9
-; AKF_HSA-SAME: (ptr addrspace(3) [[PTR:%.*]]) #[[ATTR2]] {
-; AKF_HSA-NEXT: [[STOF:%.*]] = addrspacecast ptr addrspace(3) [[PTR]] to ptr addrspace(4)
-; AKF_HSA-NEXT: store volatile i32 0, ptr addrspace(4) [[STOF]], align 4
-; AKF_HSA-NEXT: call void @func_indirect_use_queue_ptr()
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_group_to_flat_addrspacecast_queue_ptr_gfx9
; ATTRIBUTOR_HSA-SAME: (ptr addrspace(3) [[PTR:%.*]]) #[[ATTR14:[0-9]+]] {
; ATTRIBUTOR_HSA-NEXT: [[STOF:%.*]] = addrspacecast ptr addrspace(3) [[PTR]] to ptr addrspace(4)
@@ -509,11 +345,6 @@ define void @use_group_to_flat_addrspacecast_queue_ptr_gfx9(ptr addrspace(3) %pt
}
define void @indirect_use_group_to_flat_addrspacecast() #1 {
-; AKF_HSA-LABEL: define {{[^@]+}}@indirect_use_group_to_flat_addrspacecast
-; AKF_HSA-SAME: () #[[ATTR1]] {
-; AKF_HSA-NEXT: call void @use_group_to_flat_addrspacecast(ptr addrspace(3) null)
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@indirect_use_group_to_flat_addrspacecast
; ATTRIBUTOR_HSA-SAME: () #[[ATTR12]] {
; ATTRIBUTOR_HSA-NEXT: call void @use_group_to_flat_addrspacecast(ptr addrspace(3) null)
@@ -524,11 +355,6 @@ define void @indirect_use_group_to_flat_addrspacecast() #1 {
}
define void @indirect_use_group_to_flat_addrspacecast_gfx9() #1 {
-; AKF_HSA-LABEL: define {{[^@]+}}@indirect_use_group_to_flat_addrspacecast_gfx9
-; AKF_HSA-SAME: () #[[ATTR1]] {
-; AKF_HSA-NEXT: call void @use_group_to_flat_addrspacecast_gfx9(ptr addrspace(3) null)
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@indirect_use_group_to_flat_addrspacecast_gfx9
; ATTRIBUTOR_HSA-SAME: () #[[ATTR11]] {
; ATTRIBUTOR_HSA-NEXT: call void @use_group_to_flat_addrspacecast_gfx9(ptr addrspace(3) null)
@@ -539,11 +365,6 @@ define void @indirect_use_group_to_flat_addrspacecast_gfx9() #1 {
}
define void @indirect_use_group_to_flat_addrspacecast_queue_ptr_gfx9() #1 {
-; AKF_HSA-LABEL: define {{[^@]+}}@indirect_use_group_to_flat_addrspacecast_queue_ptr_gfx9
-; AKF_HSA-SAME: () #[[ATTR1]] {
-; AKF_HSA-NEXT: call void @use_group_to_flat_addrspacecast_queue_ptr_gfx9(ptr addrspace(3) null)
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@indirect_use_group_to_flat_addrspacecast_queue_ptr_gfx9
; ATTRIBUTOR_HSA-SAME: () #[[ATTR8]] {
; ATTRIBUTOR_HSA-NEXT: call void @use_group_to_flat_addrspacecast_queue_ptr_gfx9(ptr addrspace(3) null)
@@ -554,12 +375,6 @@ define void @indirect_use_group_to_flat_addrspacecast_queue_ptr_gfx9() #1 {
}
define void @use_kernarg_segment_ptr() #1 {
-; AKF_HSA-LABEL: define {{[^@]+}}@use_kernarg_segment_ptr
-; AKF_HSA-SAME: () #[[ATTR1]] {
-; AKF_HSA-NEXT: [[KERNARG_SEGMENT_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
-; AKF_HSA-NEXT: store volatile ptr addrspace(4) [[KERNARG_SEGMENT_PTR]], ptr addrspace(1) undef, align 8
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_kernarg_segment_ptr
; ATTRIBUTOR_HSA-SAME: () #[[ATTR11]] {
; ATTRIBUTOR_HSA-NEXT: [[KERNARG_SEGMENT_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.kernarg.segment.ptr()
@@ -571,11 +386,6 @@ define void @use_kernarg_segment_ptr() #1 {
ret void
}
define void @func_indirect_use_kernarg_segment_ptr() #1 {
-; AKF_HSA-LABEL: define {{[^@]+}}@func_indirect_use_kernarg_segment_ptr
-; AKF_HSA-SAME: () #[[ATTR1]] {
-; AKF_HSA-NEXT: call void @use_kernarg_segment_ptr()
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_indirect_use_kernarg_segment_ptr
; ATTRIBUTOR_HSA-SAME: () #[[ATTR11]] {
; ATTRIBUTOR_HSA-NEXT: call void @use_kernarg_segment_ptr()
@@ -586,12 +396,6 @@ define void @func_indirect_use_kernarg_segment_ptr() #1 {
}
define amdgpu_kernel void @kern_use_implicitarg_ptr() #1 {
-; AKF_HSA-LABEL: define {{[^@]+}}@kern_use_implicitarg_ptr
-; AKF_HSA-SAME: () #[[ATTR1]] {
-; AKF_HSA-NEXT: [[IMPLICITARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
-; AKF_HSA-NEXT: store volatile ptr addrspace(4) [[IMPLICITARG_PTR]], ptr addrspace(1) undef, align 8
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@kern_use_implicitarg_ptr
; ATTRIBUTOR_HSA-SAME: () #[[ATTR12]] {
; ATTRIBUTOR_HSA-NEXT: [[IMPLICITARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
@@ -604,12 +408,6 @@ define amdgpu_kernel void @kern_use_implicitarg_ptr() #1 {
}
define void @use_implicitarg_ptr() #1 {
-; AKF_HSA-LABEL: define {{[^@]+}}@use_implicitarg_ptr
-; AKF_HSA-SAME: () #[[ATTR1]] {
-; AKF_HSA-NEXT: [[IMPLICITARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
-; AKF_HSA-NEXT: store volatile ptr addrspace(4) [[IMPLICITARG_PTR]], ptr addrspace(1) undef, align 8
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_implicitarg_ptr
; ATTRIBUTOR_HSA-SAME: () #[[ATTR12]] {
; ATTRIBUTOR_HSA-NEXT: [[IMPLICITARG_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
@@ -622,11 +420,6 @@ define void @use_implicitarg_ptr() #1 {
}
define void @func_indirect_use_implicitarg_ptr() #1 {
-; AKF_HSA-LABEL: define {{[^@]+}}@func_indirect_use_implicitarg_ptr
-; AKF_HSA-SAME: () #[[ATTR1]] {
-; AKF_HSA-NEXT: call void @use_implicitarg_ptr()
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_indirect_use_implicitarg_ptr
; ATTRIBUTOR_HSA-SAME: () #[[ATTR12]] {
; ATTRIBUTOR_HSA-NEXT: call void @use_implicitarg_ptr()
@@ -640,10 +433,6 @@ declare void @external.func() #3
; This function gets deleted.
define internal void @defined.func() #3 {
-; AKF_HSA-LABEL: define {{[^@]+}}@defined.func
-; AKF_HSA-SAME: () #[[ATTR3:[0-9]+]] {
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@defined.func
; ATTRIBUTOR_HSA-SAME: () #[[ATTR16:[0-9]+]] {
; ATTRIBUTOR_HSA-NEXT: ret void
@@ -652,11 +441,6 @@ define internal void @defined.func() #3 {
}
define void @func_call_external() #3 {
-; AKF_HSA-LABEL: define {{[^@]+}}@func_call_external
-; AKF_HSA-SAME: () #[[ATTR3]] {
-; AKF_HSA-NEXT: call void @external.func()
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_call_external
; ATTRIBUTOR_HSA-SAME: () #[[ATTR15:[0-9]+]] {
; ATTRIBUTOR_HSA-NEXT: call void @external.func()
@@ -667,11 +451,6 @@ define void @func_call_external() #3 {
}
define void @func_call_defined() #3 {
-; AKF_HSA-LABEL: define {{[^@]+}}@func_call_defined
-; AKF_HSA-SAME: () #[[ATTR3]] {
-; AKF_HSA-NEXT: call void @defined.func()
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_call_defined
; ATTRIBUTOR_HSA-SAME: () #[[ATTR16]] {
; ATTRIBUTOR_HSA-NEXT: call void @defined.func()
@@ -681,11 +460,6 @@ define void @func_call_defined() #3 {
ret void
}
define void @func_call_asm() #3 {
-; AKF_HSA-LABEL: define {{[^@]+}}@func_call_asm
-; AKF_HSA-SAME: () #[[ATTR3]] {
-; AKF_HSA-NEXT: call void asm sideeffect "", ""() #[[ATTR3]]
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_call_asm
; ATTRIBUTOR_HSA-SAME: () #[[ATTR16]] {
; ATTRIBUTOR_HSA-NEXT: call void asm sideeffect "", ""() #[[ATTR26:[0-9]+]]
@@ -696,11 +470,6 @@ define void @func_call_asm() #3 {
}
define amdgpu_kernel void @kern_call_external() #3 {
-; AKF_HSA-LABEL: define {{[^@]+}}@kern_call_external
-; AKF_HSA-SAME: () #[[ATTR4:[0-9]+]] {
-; AKF_HSA-NEXT: call void @external.func()
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@kern_call_external
; ATTRIBUTOR_HSA-SAME: () #[[ATTR15]] {
; ATTRIBUTOR_HSA-NEXT: call void @external.func()
@@ -711,11 +480,6 @@ define amdgpu_kernel void @kern_call_external() #3 {
}
define amdgpu_kernel void @func_kern_defined() #3 {
-; AKF_HSA-LABEL: define {{[^@]+}}@func_kern_defined
-; AKF_HSA-SAME: () #[[ATTR4]] {
-; AKF_HSA-NEXT: call void @defined.func()
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_kern_defined
; ATTRIBUTOR_HSA-SAME: () #[[ATTR17:[0-9]+]] {
; ATTRIBUTOR_HSA-NEXT: call void @defined.func()
@@ -726,12 +490,6 @@ define amdgpu_kernel void @func_kern_defined() #3 {
}
define i32 @use_dispatch_ptr_ret_type() #1 {
-; AKF_HSA-LABEL: define {{[^@]+}}@use_dispatch_ptr_ret_type
-; AKF_HSA-SAME: () #[[ATTR1]] {
-; AKF_HSA-NEXT: [[DISPATCH_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
-; AKF_HSA-NEXT: store volatile ptr addrspace(4) [[DISPATCH_PTR]], ptr addrspace(1) undef, align 8
-; AKF_HSA-NEXT: ret i32 0
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_dispatch_ptr_ret_type
; ATTRIBUTOR_HSA-SAME: () #[[ATTR7]] {
; ATTRIBUTOR_HSA-NEXT: [[DISPATCH_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
@@ -744,12 +502,6 @@ define i32 @use_dispatch_ptr_ret_type() #1 {
}
define float @func_indirect_use_dispatch_ptr_constexpr_cast_func() #1 {
-; AKF_HSA-LABEL: define {{[^@]+}}@func_indirect_use_dispatch_ptr_constexpr_cast_func
-; AKF_HSA-SAME: () #[[ATTR1]] {
-; AKF_HSA-NEXT: [[F:%.*]] = call float @use_dispatch_ptr_ret_type()
-; AKF_HSA-NEXT: [[FADD:%.*]] = fadd float [[F]], 1.000000e+00
-; AKF_HSA-NEXT: ret float [[FADD]]
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_indirect_use_dispatch_ptr_constexpr_cast_func
; ATTRIBUTOR_HSA-SAME: () #[[ATTR7]] {
; ATTRIBUTOR_HSA-NEXT: [[F:%.*]] = call float @use_dispatch_ptr_ret_type()
@@ -762,12 +514,6 @@ define float @func_indirect_use_dispatch_ptr_constexpr_cast_func() #1 {
}
define float @func_indirect_call(ptr %fptr) #3 {
-; AKF_HSA-LABEL: define {{[^@]+}}@func_indirect_call
-; AKF_HSA-SAME: (ptr [[FPTR:%.*]]) #[[ATTR3]] {
-; AKF_HSA-NEXT: [[F:%.*]] = call float [[FPTR]]()
-; AKF_HSA-NEXT: [[FADD:%.*]] = fadd float [[F]], 1.000000e+00
-; AKF_HSA-NEXT: ret float [[FADD]]
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_indirect_call
; ATTRIBUTOR_HSA-SAME: (ptr [[FPTR:%.*]]) #[[ATTR15]] {
; ATTRIBUTOR_HSA-NEXT: [[F:%.*]] = call float [[FPTR]]()
@@ -781,12 +527,6 @@ define float @func_indirect_call(ptr %fptr) #3 {
declare float @extern() #3
define float @func_extern_call() #3 {
-; AKF_HSA-LABEL: define {{[^@]+}}@func_extern_call
-; AKF_HSA-SAME: () #[[ATTR3]] {
-; AKF_HSA-NEXT: [[F:%.*]] = call float @extern()
-; AKF_HSA-NEXT: [[FADD:%.*]] = fadd float [[F]], 1.000000e+00
-; AKF_HSA-NEXT: ret float [[FADD]]
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_extern_call
; ATTRIBUTOR_HSA-SAME: () #[[ATTR15]] {
; ATTRIBUTOR_HSA-NEXT: [[F:%.*]] = call float @extern()
@@ -799,12 +539,6 @@ define float @func_extern_call() #3 {
}
define float @func_null_call(ptr %fptr) #3 {
-; AKF_HSA-LABEL: define {{[^@]+}}@func_null_call
-; AKF_HSA-SAME: (ptr [[FPTR:%.*]]) #[[ATTR3]] {
-; AKF_HSA-NEXT: [[F:%.*]] = call float null()
-; AKF_HSA-NEXT: [[FADD:%.*]] = fadd float [[F]], 1.000000e+00
-; AKF_HSA-NEXT: ret float [[FADD]]
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_null_call
; ATTRIBUTOR_HSA-SAME: (ptr [[FPTR:%.*]]) #[[ATTR15]] {
; ATTRIBUTOR_HSA-NEXT: [[F:%.*]] = call float null()
@@ -820,12 +554,6 @@ declare float @llvm.amdgcn.rcp.f32(float) #0
; Calls some other recognized intrinsic
define float @func_other_intrinsic_call(float %arg) #3 {
-; AKF_HSA-LABEL: define {{[^@]+}}@func_other_intrinsic_call
-; AKF_HSA-SAME: (float [[ARG:%.*]]) #[[ATTR3]] {
-; AKF_HSA-NEXT: [[F:%.*]] = call float @llvm.amdgcn.rcp.f32(float [[ARG]])
-; AKF_HSA-NEXT: [[FADD:%.*]] = fadd float [[F]], 1.000000e+00
-; AKF_HSA-NEXT: ret float [[FADD]]
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_other_intrinsic_call
; ATTRIBUTOR_HSA-SAME: (float [[ARG:%.*]]) #[[ATTR16]] {
; ATTRIBUTOR_HSA-NEXT: [[F:%.*]] = call float @llvm.amdgcn.rcp.f32(float [[ARG]])
@@ -839,11 +567,6 @@ define float @func_other_intrinsic_call(float %arg) #3 {
; Hostcall needs to be enabled for sanitizers
define amdgpu_kernel void @kern_sanitize_address() #4 {
-; AKF_HSA-LABEL: define {{[^@]+}}@kern_sanitize_address
-; AKF_HSA-SAME: () #[[ATTR5:[0-9]+]] {
-; AKF_HSA-NEXT: store volatile i32 0, ptr addrspace(1) null, align 4
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@kern_sanitize_address
; ATTRIBUTOR_HSA-SAME: () #[[ATTR18:[0-9]+]] {
; ATTRIBUTOR_HSA-NEXT: store volatile i32 0, ptr addrspace(1) null, align 4
@@ -855,11 +578,6 @@ define amdgpu_kernel void @kern_sanitize_address() #4 {
; Hostcall needs to be enabled for sanitizers
define void @func_sanitize_address() #4 {
-; AKF_HSA-LABEL: define {{[^@]+}}@func_sanitize_address
-; AKF_HSA-SAME: () #[[ATTR5]] {
-; AKF_HSA-NEXT: store volatile i32 0, ptr addrspace(1) null, align 4
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_sanitize_address
; ATTRIBUTOR_HSA-SAME: () #[[ATTR18]] {
; ATTRIBUTOR_HSA-NEXT: store volatile i32 0, ptr addrspace(1) null, align 4
@@ -871,11 +589,6 @@ define void @func_sanitize_address() #4 {
; Hostcall needs to be enabled for sanitizers
define void @func_indirect_sanitize_address() #3 {
-; AKF_HSA-LABEL: define {{[^@]+}}@func_indirect_sanitize_address
-; AKF_HSA-SAME: () #[[ATTR3]] {
-; AKF_HSA-NEXT: call void @func_sanitize_address()
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@func_indirect_sanitize_address
; ATTRIBUTOR_HSA-SAME: () #[[ATTR19:[0-9]+]] {
; ATTRIBUTOR_HSA-NEXT: call void @func_sanitize_address()
@@ -887,11 +600,6 @@ define void @func_indirect_sanitize_address() #3 {
; Hostcall needs to be enabled for sanitizers
define amdgpu_kernel void @kern_indirect_sanitize_address() #3 {
-; AKF_HSA-LABEL: define {{[^@]+}}@kern_indirect_sanitize_address
-; AKF_HSA-SAME: () #[[ATTR4]] {
-; AKF_HSA-NEXT: call void @func_sanitize_address()
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@kern_indirect_sanitize_address
; ATTRIBUTOR_HSA-SAME: () #[[ATTR19]] {
; ATTRIBUTOR_HSA-NEXT: call void @func_sanitize_address()
@@ -906,11 +614,6 @@ define amdgpu_kernel void @kern_indirect_sanitize_address() #3 {
declare void @extern_func_sanitize_address() #5
define amdgpu_kernel void @kern_decl_sanitize_address() #3 {
-; AKF_HSA-LABEL: define {{[^@]+}}@kern_decl_sanitize_address
-; AKF_HSA-SAME: () #[[ATTR4]] {
-; AKF_HSA-NEXT: call void @extern_func_sanitize_address()
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@kern_decl_sanitize_address
; ATTRIBUTOR_HSA-SAME: () #[[ATTR15]] {
; ATTRIBUTOR_HSA-NEXT: call void @extern_func_sanitize_address()
@@ -923,10 +626,6 @@ define amdgpu_kernel void @kern_decl_sanitize_address() #3 {
declare void @enqueue_block_decl() #6
define internal void @enqueue_block_def() #6 {
-; AKF_HSA-LABEL: define {{[^@]+}}@enqueue_block_def
-; AKF_HSA-SAME: () #[[ATTR7:[0-9]+]] {
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@enqueue_block_def
; ATTRIBUTOR_HSA-SAME: () #[[ATTR22:[0-9]+]] {
; ATTRIBUTOR_HSA-NEXT: ret void
@@ -935,11 +634,6 @@ define internal void @enqueue_block_def() #6 {
}
define amdgpu_kernel void @kern_call_enqueued_block_decl() {
-; AKF_HSA-LABEL: define {{[^@]+}}@kern_call_enqueued_block_decl
-; AKF_HSA-SAME: () #[[ATTR8:[0-9]+]] {
-; AKF_HSA-NEXT: call void @enqueue_block_decl()
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@kern_call_enqueued_block_decl
; ATTRIBUTOR_HSA-SAME: () #[[ATTR23:[0-9]+]] {
; ATTRIBUTOR_HSA-NEXT: call void @enqueue_block_decl()
@@ -950,11 +644,6 @@ define amdgpu_kernel void @kern_call_enqueued_block_decl() {
}
define amdgpu_kernel void @kern_call_enqueued_block_def() {
-; AKF_HSA-LABEL: define {{[^@]+}}@kern_call_enqueued_block_def
-; AKF_HSA-SAME: () #[[ATTR8]] {
-; AKF_HSA-NEXT: call void @enqueue_block_def()
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@kern_call_enqueued_block_def
; ATTRIBUTOR_HSA-SAME: () #[[ATTR24:[0-9]+]] {
; ATTRIBUTOR_HSA-NEXT: call void @enqueue_block_def()
@@ -965,9 +654,6 @@ define amdgpu_kernel void @kern_call_enqueued_block_def() {
}
define void @unused_enqueue_block() {
-; AKF_HSA-LABEL: define {{[^@]+}}@unused_enqueue_block() {
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@unused_enqueue_block
; ATTRIBUTOR_HSA-SAME: () #[[ATTR25:[0-9]+]] {
; ATTRIBUTOR_HSA-NEXT: ret void
@@ -976,9 +662,6 @@ define void @unused_enqueue_block() {
}
define internal void @known_func() {
-; AKF_HSA-LABEL: define {{[^@]+}}@known_func() {
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@known_func
; ATTRIBUTOR_HSA-SAME: () #[[ATTR25]] {
; ATTRIBUTOR_HSA-NEXT: ret void
@@ -988,11 +671,6 @@ define internal void @known_func() {
; Should never happen
define amdgpu_kernel void @kern_callsite_enqueue_block() {
-; AKF_HSA-LABEL: define {{[^@]+}}@kern_callsite_enqueue_block
-; AKF_HSA-SAME: () #[[ATTR8]] {
-; AKF_HSA-NEXT: call void @known_func() #[[ATTR7]]
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@kern_callsite_enqueue_block
; ATTRIBUTOR_HSA-SAME: () #[[ATTR24]] {
; ATTRIBUTOR_HSA-NEXT: call void @known_func() #[[ATTR27:[0-9]+]]
@@ -1014,15 +692,6 @@ attributes #6 = { "enqueued-block" }
!llvm.module.flags = !{!0}
!0 = !{i32 1, !"amdhsa_code_object_version", i32 500}
;.
-; AKF_HSA: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
-; AKF_HSA: attributes #[[ATTR1]] = { nounwind "target-cpu"="fiji" }
-; AKF_HSA: attributes #[[ATTR2]] = { nounwind "target-cpu"="gfx900" }
-; AKF_HSA: attributes #[[ATTR3]] = { nounwind }
-; AKF_HSA: attributes #[[ATTR4]] = { nounwind "amdgpu-calls" }
-; AKF_HSA: attributes #[[ATTR5]] = { nounwind sanitize_address }
-; AKF_HSA: attributes #[[ATTR6:[0-9]+]] = { nounwind sanitize_address "amdgpu-no-implicitarg-ptr" }
-; AKF_HSA: attributes #[[ATTR7]] = { "enqueued-block" }
-; AKF_HSA: attributes #[[ATTR8]] = { "amdgpu-calls" }
;.
; ATTRIBUTOR_HSA: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
; ATTRIBUTOR_HSA: attributes #[[ATTR1]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="fiji" "uniform-work-group-size"="false" }
diff --git a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll
index 6896ac8d2e5db..2aad55643707e 100644
--- a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll
+++ b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll
@@ -1,5 +1,4 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-globals
-; RUN: opt -mtriple=amdgcn-unknown-amdhsa -S -amdgpu-annotate-kernel-features < %s | FileCheck -check-prefixes=HSA,AKF_HSA %s
; RUN: opt -mtriple=amdgcn-unknown-amdhsa -S -passes=amdgpu-attributor < %s | FileCheck -check-prefixes=HSA,ATTRIBUTOR_HSA %s
target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5"
@@ -33,12 +32,6 @@ define amdgpu_kernel void @use_tgid_x(ptr addrspace(1) %ptr) #1 {
}
define amdgpu_kernel void @use_tgid_y(ptr addrspace(1) %ptr) #1 {
-; AKF_HSA-LABEL: define {{[^@]+}}@use_tgid_y
-; AKF_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] {
-; AKF_HSA-NEXT: [[VAL:%.*]] = call i32 @llvm.amdgcn.workgroup.id.y()
-; AKF_HSA-NEXT: store i32 [[VAL]], ptr addrspace(1) [[PTR]], align 4
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_tgid_y
; ATTRIBUTOR_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR2:[0-9]+]] {
; ATTRIBUTOR_HSA-NEXT: [[VAL:%.*]] = call i32 @llvm.amdgcn.workgroup.id.y()
@@ -51,14 +44,6 @@ define amdgpu_kernel void @use_tgid_y(ptr addrspace(1) %ptr) #1 {
}
define amdgpu_kernel void @multi_use_tgid_y(ptr addrspace(1) %ptr) #1 {
-; AKF_HSA-LABEL: define {{[^@]+}}@multi_use_tgid_y
-; AKF_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] {
-; AKF_HSA-NEXT: [[VAL0:%.*]] = call i32 @llvm.amdgcn.workgroup.id.y()
-; AKF_HSA-NEXT: store volatile i32 [[VAL0]], ptr addrspace(1) [[PTR]], align 4
-; AKF_HSA-NEXT: [[VAL1:%.*]] = call i32 @llvm.amdgcn.workgroup.id.y()
-; AKF_HSA-NEXT: store volatile i32 [[VAL1]], ptr addrspace(1) [[PTR]], align 4
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@multi_use_tgid_y
; ATTRIBUTOR_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR2]] {
; ATTRIBUTOR_HSA-NEXT: [[VAL0:%.*]] = call i32 @llvm.amdgcn.workgroup.id.y()
@@ -75,14 +60,6 @@ define amdgpu_kernel void @multi_use_tgid_y(ptr addrspace(1) %ptr) #1 {
}
define amdgpu_kernel void @use_tgid_x_y(ptr addrspace(1) %ptr) #1 {
-; AKF_HSA-LABEL: define {{[^@]+}}@use_tgid_x_y
-; AKF_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] {
-; AKF_HSA-NEXT: [[VAL0:%.*]] = call i32 @llvm.amdgcn.workgroup.id.x()
-; AKF_HSA-NEXT: [[VAL1:%.*]] = call i32 @llvm.amdgcn.workgroup.id.y()
-; AKF_HSA-NEXT: store volatile i32 [[VAL0]], ptr addrspace(1) [[PTR]], align 4
-; AKF_HSA-NEXT: store volatile i32 [[VAL1]], ptr addrspace(1) [[PTR]], align 4
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_tgid_x_y
; ATTRIBUTOR_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR2]] {
; ATTRIBUTOR_HSA-NEXT: [[VAL0:%.*]] = call i32 @llvm.amdgcn.workgroup.id.x()
@@ -99,12 +76,6 @@ define amdgpu_kernel void @use_tgid_x_y(ptr addrspace(1) %ptr) #1 {
}
define amdgpu_kernel void @use_tgid_z(ptr addrspace(1) %ptr) #1 {
-; AKF_HSA-LABEL: define {{[^@]+}}@use_tgid_z
-; AKF_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] {
-; AKF_HSA-NEXT: [[VAL:%.*]] = call i32 @llvm.amdgcn.workgroup.id.z()
-; AKF_HSA-NEXT: store i32 [[VAL]], ptr addrspace(1) [[PTR]], align 4
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_tgid_z
; ATTRIBUTOR_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR3:[0-9]+]] {
; ATTRIBUTOR_HSA-NEXT: [[VAL:%.*]] = call i32 @llvm.amdgcn.workgroup.id.z()
@@ -117,14 +88,6 @@ define amdgpu_kernel void @use_tgid_z(ptr addrspace(1) %ptr) #1 {
}
define amdgpu_kernel void @use_tgid_x_z(ptr addrspace(1) %ptr) #1 {
-; AKF_HSA-LABEL: define {{[^@]+}}@use_tgid_x_z
-; AKF_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] {
-; AKF_HSA-NEXT: [[VAL0:%.*]] = call i32 @llvm.amdgcn.workgroup.id.x()
-; AKF_HSA-NEXT: [[VAL1:%.*]] = call i32 @llvm.amdgcn.workgroup.id.z()
-; AKF_HSA-NEXT: store volatile i32 [[VAL0]], ptr addrspace(1) [[PTR]], align 4
-; AKF_HSA-NEXT: store volatile i32 [[VAL1]], ptr addrspace(1) [[PTR]], align 4
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_tgid_x_z
; ATTRIBUTOR_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR3]] {
; ATTRIBUTOR_HSA-NEXT: [[VAL0:%.*]] = call i32 @llvm.amdgcn.workgroup.id.x()
@@ -141,14 +104,6 @@ define amdgpu_kernel void @use_tgid_x_z(ptr addrspace(1) %ptr) #1 {
}
define amdgpu_kernel void @use_tgid_y_z(ptr addrspace(1) %ptr) #1 {
-; AKF_HSA-LABEL: define {{[^@]+}}@use_tgid_y_z
-; AKF_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] {
-; AKF_HSA-NEXT: [[VAL0:%.*]] = call i32 @llvm.amdgcn.workgroup.id.y()
-; AKF_HSA-NEXT: [[VAL1:%.*]] = call i32 @llvm.amdgcn.workgroup.id.z()
-; AKF_HSA-NEXT: store volatile i32 [[VAL0]], ptr addrspace(1) [[PTR]], align 4
-; AKF_HSA-NEXT: store volatile i32 [[VAL1]], ptr addrspace(1) [[PTR]], align 4
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_tgid_y_z
; ATTRIBUTOR_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR4:[0-9]+]] {
; ATTRIBUTOR_HSA-NEXT: [[VAL0:%.*]] = call i32 @llvm.amdgcn.workgroup.id.y()
@@ -165,16 +120,6 @@ define amdgpu_kernel void @use_tgid_y_z(ptr addrspace(1) %ptr) #1 {
}
define amdgpu_kernel void @use_tgid_x_y_z(ptr addrspace(1) %ptr) #1 {
-; AKF_HSA-LABEL: define {{[^@]+}}@use_tgid_x_y_z
-; AKF_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] {
-; AKF_HSA-NEXT: [[VAL0:%.*]] = call i32 @llvm.amdgcn.workgroup.id.x()
-; AKF_HSA-NEXT: [[VAL1:%.*]] = call i32 @llvm.amdgcn.workgroup.id.y()
-; AKF_HSA-NEXT: [[VAL2:%.*]] = call i32 @llvm.amdgcn.workgroup.id.z()
-; AKF_HSA-NEXT: store volatile i32 [[VAL0]], ptr addrspace(1) [[PTR]], align 4
-; AKF_HSA-NEXT: store volatile i32 [[VAL1]], ptr addrspace(1) [[PTR]], align 4
-; AKF_HSA-NEXT: store volatile i32 [[VAL2]], ptr addrspace(1) [[PTR]], align 4
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_tgid_x_y_z
; ATTRIBUTOR_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR4]] {
; ATTRIBUTOR_HSA-NEXT: [[VAL0:%.*]] = call i32 @llvm.amdgcn.workgroup.id.x()
@@ -207,12 +152,6 @@ define amdgpu_kernel void @use_tidig_x(ptr addrspace(1) %ptr) #1 {
}
define amdgpu_kernel void @use_tidig_y(ptr addrspace(1) %ptr) #1 {
-; AKF_HSA-LABEL: define {{[^@]+}}@use_tidig_y
-; AKF_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] {
-; AKF_HSA-NEXT: [[VAL:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
-; AKF_HSA-NEXT: store i32 [[VAL]], ptr addrspace(1) [[PTR]], align 4
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_tidig_y
; ATTRIBUTOR_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR5:[0-9]+]] {
; ATTRIBUTOR_HSA-NEXT: [[VAL:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
@@ -225,12 +164,6 @@ define amdgpu_kernel void @use_tidig_y(ptr addrspace(1) %ptr) #1 {
}
define amdgpu_kernel void @use_tidig_z(ptr addrspace(1) %ptr) #1 {
-; AKF_HSA-LABEL: define {{[^@]+}}@use_tidig_z
-; AKF_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] {
-; AKF_HSA-NEXT: [[VAL:%.*]] = call i32 @llvm.amdgcn.workitem.id.z()
-; AKF_HSA-NEXT: store i32 [[VAL]], ptr addrspace(1) [[PTR]], align 4
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_tidig_z
; ATTRIBUTOR_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR6:[0-9]+]] {
; ATTRIBUTOR_HSA-NEXT: [[VAL:%.*]] = call i32 @llvm.amdgcn.workitem.id.z()
@@ -259,14 +192,6 @@ define amdgpu_kernel void @use_tidig_x_tgid_x(ptr addrspace(1) %ptr) #1 {
}
define amdgpu_kernel void @use_tidig_y_tgid_y(ptr addrspace(1) %ptr) #1 {
-; AKF_HSA-LABEL: define {{[^@]+}}@use_tidig_y_tgid_y
-; AKF_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] {
-; AKF_HSA-NEXT: [[VAL0:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
-; AKF_HSA-NEXT: [[VAL1:%.*]] = call i32 @llvm.amdgcn.workgroup.id.y()
-; AKF_HSA-NEXT: store volatile i32 [[VAL0]], ptr addrspace(1) [[PTR]], align 4
-; AKF_HSA-NEXT: store volatile i32 [[VAL1]], ptr addrspace(1) [[PTR]], align 4
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_tidig_y_tgid_y
; ATTRIBUTOR_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR7:[0-9]+]] {
; ATTRIBUTOR_HSA-NEXT: [[VAL0:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
@@ -283,16 +208,6 @@ define amdgpu_kernel void @use_tidig_y_tgid_y(ptr addrspace(1) %ptr) #1 {
}
define amdgpu_kernel void @use_tidig_x_y_z(ptr addrspace(1) %ptr) #1 {
-; AKF_HSA-LABEL: define {{[^@]+}}@use_tidig_x_y_z
-; AKF_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] {
-; AKF_HSA-NEXT: [[VAL0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
-; AKF_HSA-NEXT: [[VAL1:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
-; AKF_HSA-NEXT: [[VAL2:%.*]] = call i32 @llvm.amdgcn.workitem.id.z()
-; AKF_HSA-NEXT: store volatile i32 [[VAL0]], ptr addrspace(1) [[PTR]], align 4
-; AKF_HSA-NEXT: store volatile i32 [[VAL1]], ptr addrspace(1) [[PTR]], align 4
-; AKF_HSA-NEXT: store volatile i32 [[VAL2]], ptr addrspace(1) [[PTR]], align 4
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_tidig_x_y_z
; ATTRIBUTOR_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR8:[0-9]+]] {
; ATTRIBUTOR_HSA-NEXT: [[VAL0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
@@ -313,22 +228,6 @@ define amdgpu_kernel void @use_tidig_x_y_z(ptr addrspace(1) %ptr) #1 {
}
define amdgpu_kernel void @use_all_workitems(ptr addrspace(1) %ptr) #1 {
-; AKF_HSA-LABEL: define {{[^@]+}}@use_all_workitems
-; AKF_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] {
-; AKF_HSA-NEXT: [[VAL0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
-; AKF_HSA-NEXT: [[VAL1:%.*]] = call i32 @llvm.amdgcn.workitem.id.y()
-; AKF_HSA-NEXT: [[VAL2:%.*]] = call i32 @llvm.amdgcn.workitem.id.z()
-; AKF_HSA-NEXT: [[VAL3:%.*]] = call i32 @llvm.amdgcn.workgroup.id.x()
-; AKF_HSA-NEXT: [[VAL4:%.*]] = call i32 @llvm.amdgcn.workgroup.id.y()
-; AKF_HSA-NEXT: [[VAL5:%.*]] = call i32 @llvm.amdgcn.workgroup.id.z()
-; AKF_HSA-NEXT: store volatile i32 [[VAL0]], ptr addrspace(1) [[PTR]], align 4
-; AKF_HSA-NEXT: store volatile i32 [[VAL1]], ptr addrspace(1) [[PTR]], align 4
-; AKF_HSA-NEXT: store volatile i32 [[VAL2]], ptr addrspace(1) [[PTR]], align 4
-; AKF_HSA-NEXT: store volatile i32 [[VAL3]], ptr addrspace(1) [[PTR]], align 4
-; AKF_HSA-NEXT: store volatile i32 [[VAL4]], ptr addrspace(1) [[PTR]], align 4
-; AKF_HSA-NEXT: store volatile i32 [[VAL5]], ptr addrspace(1) [[PTR]], align 4
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_all_workitems
; ATTRIBUTOR_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR9:[0-9]+]] {
; ATTRIBUTOR_HSA-NEXT: [[VAL0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
@@ -361,13 +260,6 @@ define amdgpu_kernel void @use_all_workitems(ptr addrspace(1) %ptr) #1 {
}
define amdgpu_kernel void @use_dispatch_ptr(ptr addrspace(1) %ptr) #1 {
-; AKF_HSA-LABEL: define {{[^@]+}}@use_dispatch_ptr
-; AKF_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] {
-; AKF_HSA-NEXT: [[DISPATCH_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
-; AKF_HSA-NEXT: [[VAL:%.*]] = load i32, ptr addrspace(4) [[DISPATCH_PTR]], align 4
-; AKF_HSA-NEXT: store i32 [[VAL]], ptr addrspace(1) [[PTR]], align 4
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_dispatch_ptr
; ATTRIBUTOR_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR10:[0-9]+]] {
; ATTRIBUTOR_HSA-NEXT: [[DISPATCH_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
@@ -382,13 +274,6 @@ define amdgpu_kernel void @use_dispatch_ptr(ptr addrspace(1) %ptr) #1 {
}
define amdgpu_kernel void @use_queue_ptr(ptr addrspace(1) %ptr) #1 {
-; AKF_HSA-LABEL: define {{[^@]+}}@use_queue_ptr
-; AKF_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] {
-; AKF_HSA-NEXT: [[DISPATCH_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.queue.ptr()
-; AKF_HSA-NEXT: [[VAL:%.*]] = load i32, ptr addrspace(4) [[DISPATCH_PTR]], align 4
-; AKF_HSA-NEXT: store i32 [[VAL]], ptr addrspace(1) [[PTR]], align 4
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_queue_ptr
; ATTRIBUTOR_HSA-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR11:[0-9]+]] {
; ATTRIBUTOR_HSA-NEXT: [[DISPATCH_PTR:%.*]] = call ptr addrspace(4) @llvm.amdgcn.queue.ptr()
@@ -417,12 +302,6 @@ define amdgpu_kernel void @use_kernarg_segment_ptr(ptr addrspace(1) %ptr) #1 {
}
define amdgpu_kernel void @use_group_to_flat_addrspacecast(ptr addrspace(3) %ptr) #1 {
-; AKF_HSA-LABEL: define {{[^@]+}}@use_group_to_flat_addrspacecast
-; AKF_HSA-SAME: (ptr addrspace(3) [[PTR:%.*]]) #[[ATTR1]] {
-; AKF_HSA-NEXT: [[STOF:%.*]] = addrspacecast ptr addrspace(3) [[PTR]] to ptr
-; AKF_HSA-NEXT: store volatile i32 0, ptr [[STOF]], align 4
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_group_to_flat_addrspacecast
; ATTRIBUTOR_HSA-SAME: (ptr addrspace(3) [[PTR:%.*]]) #[[ATTR12:[0-9]+]] {
; ATTRIBUTOR_HSA-NEXT: [[STOF:%.*]] = addrspacecast ptr addrspace(3) [[PTR]] to ptr
@@ -435,12 +314,6 @@ define amdgpu_kernel void @use_group_to_flat_addrspacecast(ptr addrspace(3) %ptr
}
define amdgpu_kernel void @use_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) #1 {
-; AKF_HSA-LABEL: define {{[^@]+}}@use_private_to_flat_addrspacecast
-; AKF_HSA-SAME: (ptr addrspace(5) [[PTR:%.*]]) #[[ATTR1]] {
-; AKF_HSA-NEXT: [[STOF:%.*]] = addrspacecast ptr addrspace(5) [[PTR]] to ptr
-; AKF_HSA-NEXT: store volatile i32 0, ptr [[STOF]], align 4
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_private_to_flat_addrspacecast
; ATTRIBUTOR_HSA-SAME: (ptr addrspace(5) [[PTR:%.*]]) #[[ATTR13:[0-9]+]] {
; ATTRIBUTOR_HSA-NEXT: [[STOF:%.*]] = addrspacecast ptr addrspace(5) [[PTR]] to ptr
@@ -526,13 +399,6 @@ define amdgpu_kernel void @use_flat_to_constant_addrspacecast(ptr %ptr) #1 {
}
define amdgpu_kernel void @use_is_shared(ptr %ptr) #1 {
-; AKF_HSA-LABEL: define {{[^@]+}}@use_is_shared
-; AKF_HSA-SAME: (ptr [[PTR:%.*]]) #[[ATTR1]] {
-; AKF_HSA-NEXT: [[IS_SHARED:%.*]] = call i1 @llvm.amdgcn.is.shared(ptr [[PTR]])
-; AKF_HSA-NEXT: [[EXT:%.*]] = zext i1 [[IS_SHARED]] to i32
-; AKF_HSA-NEXT: store i32 [[EXT]], ptr addrspace(1) undef, align 4
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_is_shared
; ATTRIBUTOR_HSA-SAME: (ptr [[PTR:%.*]]) #[[ATTR12]] {
; ATTRIBUTOR_HSA-NEXT: [[IS_SHARED:%.*]] = call i1 @llvm.amdgcn.is.shared(ptr [[PTR]])
@@ -547,13 +413,6 @@ define amdgpu_kernel void @use_is_shared(ptr %ptr) #1 {
}
define amdgpu_kernel void @use_is_private(ptr %ptr) #1 {
-; AKF_HSA-LABEL: define {{[^@]+}}@use_is_private
-; AKF_HSA-SAME: (ptr [[PTR:%.*]]) #[[ATTR1]] {
-; AKF_HSA-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[PTR]])
-; AKF_HSA-NEXT: [[EXT:%.*]] = zext i1 [[IS_PRIVATE]] to i32
-; AKF_HSA-NEXT: store i32 [[EXT]], ptr addrspace(1) undef, align 4
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_is_private
; ATTRIBUTOR_HSA-SAME: (ptr [[PTR:%.*]]) #[[ATTR12]] {
; ATTRIBUTOR_HSA-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[PTR]])
@@ -568,12 +427,6 @@ define amdgpu_kernel void @use_is_private(ptr %ptr) #1 {
}
define amdgpu_kernel void @use_alloca() #1 {
-; AKF_HSA-LABEL: define {{[^@]+}}@use_alloca
-; AKF_HSA-SAME: () #[[ATTR2:[0-9]+]] {
-; AKF_HSA-NEXT: [[ALLOCA:%.*]] = alloca i32, align 4, addrspace(5)
-; AKF_HSA-NEXT: store i32 0, ptr addrspace(5) [[ALLOCA]], align 4
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_alloca
; ATTRIBUTOR_HSA-SAME: () #[[ATTR1]] {
; ATTRIBUTOR_HSA-NEXT: [[ALLOCA:%.*]] = alloca i32, align 4, addrspace(5)
@@ -586,15 +439,6 @@ define amdgpu_kernel void @use_alloca() #1 {
}
define amdgpu_kernel void @use_alloca_non_entry_block() #1 {
-; AKF_HSA-LABEL: define {{[^@]+}}@use_alloca_non_entry_block
-; AKF_HSA-SAME: () #[[ATTR2]] {
-; AKF_HSA-NEXT: entry:
-; AKF_HSA-NEXT: br label [[BB:%.*]]
-; AKF_HSA: bb:
-; AKF_HSA-NEXT: [[ALLOCA:%.*]] = alloca i32, align 4, addrspace(5)
-; AKF_HSA-NEXT: store i32 0, ptr addrspace(5) [[ALLOCA]], align 4
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_alloca_non_entry_block
; ATTRIBUTOR_HSA-SAME: () #[[ATTR1]] {
; ATTRIBUTOR_HSA-NEXT: entry:
@@ -614,12 +458,6 @@ bb:
}
define void @use_alloca_func() #1 {
-; AKF_HSA-LABEL: define {{[^@]+}}@use_alloca_func
-; AKF_HSA-SAME: () #[[ATTR2]] {
-; AKF_HSA-NEXT: [[ALLOCA:%.*]] = alloca i32, align 4, addrspace(5)
-; AKF_HSA-NEXT: store i32 0, ptr addrspace(5) [[ALLOCA]], align 4
-; AKF_HSA-NEXT: ret void
-;
; ATTRIBUTOR_HSA-LABEL: define {{[^@]+}}@use_alloca_func
; ATTRIBUTOR_HSA-SAME: () #[[ATTR1]] {
; ATTRIBUTOR_HSA-NEXT: [[ALLOCA:%.*]] = alloca i32, align 4, addrspace(5)
@@ -638,9 +476,6 @@ attributes #1 = { nounwind }
!0 = !{i32 1, !"amdhsa_code_object_version", i32 500}
;.
-; AKF_HSA: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
-; AKF_HSA: attributes #[[ATTR1]] = { nounwind }
-; AKF_HSA: attributes #[[ATTR2]] = { nounwind "amdgpu-stack-objects" }
;.
; ATTRIBUTOR_HSA: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
; ATTRIBUTOR_HSA: attributes #[[ATTR1]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
diff --git a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features.ll b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features.ll
index 89fe46d975309..b241bf50242ce 100644
--- a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features.ll
+++ b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features.ll
@@ -1,5 +1,4 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-globals
-; RUN: opt -S -mtriple=amdgcn-unknown-unknown -amdgpu-annotate-kernel-features < %s | FileCheck -check-prefixes=CHECK,AKF_CHECK %s
; RUN: opt -S -mtriple=amdgcn-unknown-unknown -passes=amdgpu-attributor < %s | FileCheck -check-prefixes=CHECK,ATTRIBUTOR_CHECK %s
declare i32 @llvm.r600.read.tgid.x() #0
@@ -27,12 +26,6 @@ define amdgpu_kernel void @use_tgid_x(ptr addrspace(1) %ptr) #1 {
}
define amdgpu_kernel void @use_tgid_y(ptr addrspace(1) %ptr) #1 {
-; AKF_CHECK-LABEL: define {{[^@]+}}@use_tgid_y
-; AKF_CHECK-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] {
-; AKF_CHECK-NEXT: [[VAL:%.*]] = call i32 @llvm.r600.read.tgid.y()
-; AKF_CHECK-NEXT: store i32 [[VAL]], ptr addrspace(1) [[PTR]], align 4
-; AKF_CHECK-NEXT: ret void
-;
; ATTRIBUTOR_CHECK-LABEL: define {{[^@]+}}@use_tgid_y
; ATTRIBUTOR_CHECK-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR2:[0-9]+]] {
; ATTRIBUTOR_CHECK-NEXT: [[VAL:%.*]] = call i32 @llvm.r600.read.tgid.y()
@@ -45,14 +38,6 @@ define amdgpu_kernel void @use_tgid_y(ptr addrspace(1) %ptr) #1 {
}
define amdgpu_kernel void @multi_use_tgid_y(ptr addrspace(1) %ptr) #1 {
-; AKF_CHECK-LABEL: define {{[^@]+}}@multi_use_tgid_y
-; AKF_CHECK-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] {
-; AKF_CHECK-NEXT: [[VAL0:%.*]] = call i32 @llvm.r600.read.tgid.y()
-; AKF_CHECK-NEXT: store volatile i32 [[VAL0]], ptr addrspace(1) [[PTR]], align 4
-; AKF_CHECK-NEXT: [[VAL1:%.*]] = call i32 @llvm.r600.read.tgid.y()
-; AKF_CHECK-NEXT: store volatile i32 [[VAL1]], ptr addrspace(1) [[PTR]], align 4
-; AKF_CHECK-NEXT: ret void
-;
; ATTRIBUTOR_CHECK-LABEL: define {{[^@]+}}@multi_use_tgid_y
; ATTRIBUTOR_CHECK-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR2]] {
; ATTRIBUTOR_CHECK-NEXT: [[VAL0:%.*]] = call i32 @llvm.r600.read.tgid.y()
@@ -69,14 +54,6 @@ define amdgpu_kernel void @multi_use_tgid_y(ptr addrspace(1) %ptr) #1 {
}
define amdgpu_kernel void @use_tgid_x_y(ptr addrspace(1) %ptr) #1 {
-; AKF_CHECK-LABEL: define {{[^@]+}}@use_tgid_x_y
-; AKF_CHECK-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] {
-; AKF_CHECK-NEXT: [[VAL0:%.*]] = call i32 @llvm.r600.read.tgid.x()
-; AKF_CHECK-NEXT: [[VAL1:%.*]] = call i32 @llvm.r600.read.tgid.y()
-; AKF_CHECK-NEXT: store volatile i32 [[VAL0]], ptr addrspace(1) [[PTR]], align 4
-; AKF_CHECK-NEXT: store volatile i32 [[VAL1]], ptr addrspace(1) [[PTR]], align 4
-; AKF_CHECK-NEXT: ret void
-;
; ATTRIBUTOR_CHECK-LABEL: define {{[^@]+}}@use_tgid_x_y
; ATTRIBUTOR_CHECK-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR2]] {
; ATTRIBUTOR_CHECK-NEXT: [[VAL0:%.*]] = call i32 @llvm.r600.read.tgid.x()
@@ -93,12 +70,6 @@ define amdgpu_kernel void @use_tgid_x_y(ptr addrspace(1) %ptr) #1 {
}
define amdgpu_kernel void @use_tgid_z(ptr addrspace(1) %ptr) #1 {
-; AKF_CHECK-LABEL: define {{[^@]+}}@use_tgid_z
-; AKF_CHECK-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] {
-; AKF_CHECK-NEXT: [[VAL:%.*]] = call i32 @llvm.r600.read.tgid.z()
-; AKF_CHECK-NEXT: store i32 [[VAL]], ptr addrspace(1) [[PTR]], align 4
-; AKF_CHECK-NEXT: ret void
-;
; ATTRIBUTOR_CHECK-LABEL: define {{[^@]+}}@use_tgid_z
; ATTRIBUTOR_CHECK-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR3:[0-9]+]] {
; ATTRIBUTOR_CHECK-NEXT: [[VAL:%.*]] = call i32 @llvm.r600.read.tgid.z()
@@ -111,14 +82,6 @@ define amdgpu_kernel void @use_tgid_z(ptr addrspace(1) %ptr) #1 {
}
define amdgpu_kernel void @use_tgid_x_z(ptr addrspace(1) %ptr) #1 {
-; AKF_CHECK-LABEL: define {{[^@]+}}@use_tgid_x_z
-; AKF_CHECK-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] {
-; AKF_CHECK-NEXT: [[VAL0:%.*]] = call i32 @llvm.r600.read.tgid.x()
-; AKF_CHECK-NEXT: [[VAL1:%.*]] = call i32 @llvm.r600.read.tgid.z()
-; AKF_CHECK-NEXT: store volatile i32 [[VAL0]], ptr addrspace(1) [[PTR]], align 4
-; AKF_CHECK-NEXT: store volatile i32 [[VAL1]], ptr addrspace(1) [[PTR]], align 4
-; AKF_CHECK-NEXT: ret void
-;
; ATTRIBUTOR_CHECK-LABEL: define {{[^@]+}}@use_tgid_x_z
; ATTRIBUTOR_CHECK-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR3]] {
; ATTRIBUTOR_CHECK-NEXT: [[VAL0:%.*]] = call i32 @llvm.r600.read.tgid.x()
@@ -135,14 +98,6 @@ define amdgpu_kernel void @use_tgid_x_z(ptr addrspace(1) %ptr) #1 {
}
define amdgpu_kernel void @use_tgid_y_z(ptr addrspace(1) %ptr) #1 {
-; AKF_CHECK-LABEL: define {{[^@]+}}@use_tgid_y_z
-; AKF_CHECK-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] {
-; AKF_CHECK-NEXT: [[VAL0:%.*]] = call i32 @llvm.r600.read.tgid.y()
-; AKF_CHECK-NEXT: [[VAL1:%.*]] = call i32 @llvm.r600.read.tgid.z()
-; AKF_CHECK-NEXT: store volatile i32 [[VAL0]], ptr addrspace(1) [[PTR]], align 4
-; AKF_CHECK-NEXT: store volatile i32 [[VAL1]], ptr addrspace(1) [[PTR]], align 4
-; AKF_CHECK-NEXT: ret void
-;
; ATTRIBUTOR_CHECK-LABEL: define {{[^@]+}}@use_tgid_y_z
; ATTRIBUTOR_CHECK-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR4:[0-9]+]] {
; ATTRIBUTOR_CHECK-NEXT: [[VAL0:%.*]] = call i32 @llvm.r600.read.tgid.y()
@@ -159,16 +114,6 @@ define amdgpu_kernel void @use_tgid_y_z(ptr addrspace(1) %ptr) #1 {
}
define amdgpu_kernel void @use_tgid_x_y_z(ptr addrspace(1) %ptr) #1 {
-; AKF_CHECK-LABEL: define {{[^@]+}}@use_tgid_x_y_z
-; AKF_CHECK-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] {
-; AKF_CHECK-NEXT: [[VAL0:%.*]] = call i32 @llvm.r600.read.tgid.x()
-; AKF_CHECK-NEXT: [[VAL1:%.*]] = call i32 @llvm.r600.read.tgid.y()
-; AKF_CHECK-NEXT: [[VAL2:%.*]] = call i32 @llvm.r600.read.tgid.z()
-; AKF_CHECK-NEXT: store volatile i32 [[VAL0]], ptr addrspace(1) [[PTR]], align 4
-; AKF_CHECK-NEXT: store volatile i32 [[VAL1]], ptr addrspace(1) [[PTR]], align 4
-; AKF_CHECK-NEXT: store volatile i32 [[VAL2]], ptr addrspace(1) [[PTR]], align 4
-; AKF_CHECK-NEXT: ret void
-;
; ATTRIBUTOR_CHECK-LABEL: define {{[^@]+}}@use_tgid_x_y_z
; ATTRIBUTOR_CHECK-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR4]] {
; ATTRIBUTOR_CHECK-NEXT: [[VAL0:%.*]] = call i32 @llvm.r600.read.tgid.x()
@@ -201,12 +146,6 @@ define amdgpu_kernel void @use_tidig_x(ptr addrspace(1) %ptr) #1 {
}
define amdgpu_kernel void @use_tidig_y(ptr addrspace(1) %ptr) #1 {
-; AKF_CHECK-LABEL: define {{[^@]+}}@use_tidig_y
-; AKF_CHECK-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] {
-; AKF_CHECK-NEXT: [[VAL:%.*]] = call i32 @llvm.r600.read.tidig.y()
-; AKF_CHECK-NEXT: store i32 [[VAL]], ptr addrspace(1) [[PTR]], align 4
-; AKF_CHECK-NEXT: ret void
-;
; ATTRIBUTOR_CHECK-LABEL: define {{[^@]+}}@use_tidig_y
; ATTRIBUTOR_CHECK-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR5:[0-9]+]] {
; ATTRIBUTOR_CHECK-NEXT: [[VAL:%.*]] = call i32 @llvm.r600.read.tidig.y()
@@ -219,12 +158,6 @@ define amdgpu_kernel void @use_tidig_y(ptr addrspace(1) %ptr) #1 {
}
define amdgpu_kernel void @use_tidig_z(ptr addrspace(1) %ptr) #1 {
-; AKF_CHECK-LABEL: define {{[^@]+}}@use_tidig_z
-; AKF_CHECK-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] {
-; AKF_CHECK-NEXT: [[VAL:%.*]] = call i32 @llvm.r600.read.tidig.z()
-; AKF_CHECK-NEXT: store i32 [[VAL]], ptr addrspace(1) [[PTR]], align 4
-; AKF_CHECK-NEXT: ret void
-;
; ATTRIBUTOR_CHECK-LABEL: define {{[^@]+}}@use_tidig_z
; ATTRIBUTOR_CHECK-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR6:[0-9]+]] {
; ATTRIBUTOR_CHECK-NEXT: [[VAL:%.*]] = call i32 @llvm.r600.read.tidig.z()
@@ -253,14 +186,6 @@ define amdgpu_kernel void @use_tidig_x_tgid_x(ptr addrspace(1) %ptr) #1 {
}
define amdgpu_kernel void @use_tidig_y_tgid_y(ptr addrspace(1) %ptr) #1 {
-; AKF_CHECK-LABEL: define {{[^@]+}}@use_tidig_y_tgid_y
-; AKF_CHECK-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] {
-; AKF_CHECK-NEXT: [[VAL0:%.*]] = call i32 @llvm.r600.read.tidig.y()
-; AKF_CHECK-NEXT: [[VAL1:%.*]] = call i32 @llvm.r600.read.tgid.y()
-; AKF_CHECK-NEXT: store volatile i32 [[VAL0]], ptr addrspace(1) [[PTR]], align 4
-; AKF_CHECK-NEXT: store volatile i32 [[VAL1]], ptr addrspace(1) [[PTR]], align 4
-; AKF_CHECK-NEXT: ret void
-;
; ATTRIBUTOR_CHECK-LABEL: define {{[^@]+}}@use_tidig_y_tgid_y
; ATTRIBUTOR_CHECK-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR7:[0-9]+]] {
; ATTRIBUTOR_CHECK-NEXT: [[VAL0:%.*]] = call i32 @llvm.r600.read.tidig.y()
@@ -277,16 +202,6 @@ define amdgpu_kernel void @use_tidig_y_tgid_y(ptr addrspace(1) %ptr) #1 {
}
define amdgpu_kernel void @use_tidig_x_y_z(ptr addrspace(1) %ptr) #1 {
-; AKF_CHECK-LABEL: define {{[^@]+}}@use_tidig_x_y_z
-; AKF_CHECK-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] {
-; AKF_CHECK-NEXT: [[VAL0:%.*]] = call i32 @llvm.r600.read.tidig.x()
-; AKF_CHECK-NEXT: [[VAL1:%.*]] = call i32 @llvm.r600.read.tidig.y()
-; AKF_CHECK-NEXT: [[VAL2:%.*]] = call i32 @llvm.r600.read.tidig.z()
-; AKF_CHECK-NEXT: store volatile i32 [[VAL0]], ptr addrspace(1) [[PTR]], align 4
-; AKF_CHECK-NEXT: store volatile i32 [[VAL1]], ptr addrspace(1) [[PTR]], align 4
-; AKF_CHECK-NEXT: store volatile i32 [[VAL2]], ptr addrspace(1) [[PTR]], align 4
-; AKF_CHECK-NEXT: ret void
-;
; ATTRIBUTOR_CHECK-LABEL: define {{[^@]+}}@use_tidig_x_y_z
; ATTRIBUTOR_CHECK-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR8:[0-9]+]] {
; ATTRIBUTOR_CHECK-NEXT: [[VAL0:%.*]] = call i32 @llvm.r600.read.tidig.x()
@@ -307,22 +222,6 @@ define amdgpu_kernel void @use_tidig_x_y_z(ptr addrspace(1) %ptr) #1 {
}
define amdgpu_kernel void @use_all_workitems(ptr addrspace(1) %ptr) #1 {
-; AKF_CHECK-LABEL: define {{[^@]+}}@use_all_workitems
-; AKF_CHECK-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR1]] {
-; AKF_CHECK-NEXT: [[VAL0:%.*]] = call i32 @llvm.r600.read.tidig.x()
-; AKF_CHECK-NEXT: [[VAL1:%.*]] = call i32 @llvm.r600.read.tidig.y()
-; AKF_CHECK-NEXT: [[VAL2:%.*]] = call i32 @llvm.r600.read.tidig.z()
-; AKF_CHECK-NEXT: [[VAL3:%.*]] = call i32 @llvm.r600.read.tgid.x()
-; AKF_CHECK-NEXT: [[VAL4:%.*]] = call i32 @llvm.r600.read.tgid.y()
-; AKF_CHECK-NEXT: [[VAL5:%.*]] = call i32 @llvm.r600.read.tgid.z()
-; AKF_CHECK-NEXT: store volatile i32 [[VAL0]], ptr addrspace(1) [[PTR]], align 4
-; AKF_CHECK-NEXT: store volatile i32 [[VAL1]], ptr addrspace(1) [[PTR]], align 4
-; AKF_CHECK-NEXT: store volatile i32 [[VAL2]], ptr addrspace(1) [[PTR]], align 4
-; AKF_CHECK-NEXT: store volatile i32 [[VAL3]], ptr addrspace(1) [[PTR]], align 4
-; AKF_CHECK-NEXT: store volatile i32 [[VAL4]], ptr addrspace(1) [[PTR]], align 4
-; AKF_CHECK-NEXT: store volatile i32 [[VAL5]], ptr addrspace(1) [[PTR]], align 4
-; AKF_CHECK-NEXT: ret void
-;
; ATTRIBUTOR_CHECK-LABEL: define {{[^@]+}}@use_all_workitems
; ATTRIBUTOR_CHECK-SAME: (ptr addrspace(1) [[PTR:%.*]]) #[[ATTR9:[0-9]+]] {
; ATTRIBUTOR_CHECK-NEXT: [[VAL0:%.*]] = call i32 @llvm.r600.read.tidig.x()
@@ -394,8 +293,6 @@ attributes #0 = { nounwind readnone }
attributes #1 = { nounwind }
;.
-; AKF_CHECK: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
-; AKF_CHECK: attributes #[[ATTR1]] = { nounwind }
;.
; ATTRIBUTOR_CHECK: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
; ATTRIBUTOR_CHECK: attributes #[[ATTR1]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
diff --git a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll
index fc13b86566f76..22cc5af30da66 100644
--- a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll
+++ b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll
@@ -35,9 +35,9 @@ entry:
attributes #2 = {"amdgpu-flat-work-group-size"="128,128"}
; CHECK-LABEL: {{^}}min_1024_max_1024
-; CHECK: SGPRBlocks: 0
+; CHECK: SGPRBlocks: 2
; CHECK: VGPRBlocks: 10
-; CHECK: NumSGPRsForWavesPerEU: 2{{$}}
+; CHECK: NumSGPRsForWavesPerEU: 24{{$}}
; CHECK: NumVGPRsForWavesPerEU: 43
@var = addrspace(1) global float 0.0
define amdgpu_kernel void @min_1024_max_1024() #3 {
diff --git a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll
index ed045107d354d..3a34aec9a2fd3 100644
--- a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll
@@ -4,8 +4,8 @@
; ALL-LABEL: {{^}}max_10_sgprs:
-; ALL: SGPRBlocks: 1
-; ALL: NumSGPRsForWavesPerEU: 10
+; ALL: SGPRBlocks: 2
+; ALL: NumSGPRsForWavesPerEU: 24
define amdgpu_kernel void @max_10_sgprs() #0 {
%one = load volatile i32, ptr addrspace(4) undef
%two = load volatile i32, ptr addrspace(4) undef
diff --git a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll
index 14519f5a5e77c..4507fd5865989 100644
--- a/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll
+++ b/llvm/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll
@@ -116,9 +116,9 @@ attributes #8 = {"amdgpu-waves-per-eu"="5,10"}
; Exactly 10 waves per execution unit.
; CHECK-LABEL: {{^}}exactly_10:
-; CHECK: SGPRBlocks: 2
+; CHECK: SGPRBlocks: 3
; CHECK: VGPRBlocks: 5
-; CHECK: NumSGPRsForWavesPerEU: 20
+; CHECK: NumSGPRsForWavesPerEU: 30
; CHECK: NumVGPRsForWavesPerEU: 24
define amdgpu_kernel void @exactly_10() #9 {
%val0 = load volatile float, ptr addrspace(1) @var
diff --git a/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-globalisel.ll b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-globalisel.ll
index 682a57571d11e..35f0ccf5ba62f 100644
--- a/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-globalisel.ll
+++ b/llvm/test/CodeGen/AMDGPU/attributor-flatscratchinit-globalisel.ll
@@ -392,7 +392,8 @@ define amdgpu_kernel void @call_use_intrinsic_workitem_id_x_cc_kernel() {
; GFX10: argumentInfo:
; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
; GFX10-NEXT: kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
-; GFX10-NEXT: workGroupIDX: { reg: '$sgpr6' }
+; GFX10-NEXT: flatScratchInit: { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT: workGroupIDX: { reg: '$sgpr8' }
;
; GFX10: name: call_without_private_to_flat_addrspacecast
; GFX10: argumentInfo:
@@ -420,7 +421,8 @@ define amdgpu_kernel void @call_use_intrinsic_workitem_id_x_cc_kernel() {
; GFX10: argumentInfo:
; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
; GFX10-NEXT: kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
-; GFX10-NEXT: workGroupIDX: { reg: '$sgpr6' }
+; GFX10-NEXT: flatScratchInit: { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT: workGroupIDX: { reg: '$sgpr8' }
;
; GFX10: name: call_both_with_and_without_private_to_flat_addrspacecast
; GFX10: argumentInfo:
@@ -434,7 +436,8 @@ define amdgpu_kernel void @call_use_intrinsic_workitem_id_x_cc_kernel() {
; GFX10: argumentInfo:
; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
; GFX10-NEXT: kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
-; GFX10-NEXT: workGroupIDX: { reg: '$sgpr6' }
+; GFX10-NEXT: flatScratchInit: { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT: workGroupIDX: { reg: '$sgpr8' }
;
; GFX10: name: call_call_without_private_to_flat_addrspacecast
; GFX10: argumentInfo:
@@ -462,7 +465,8 @@ define amdgpu_kernel void @call_use_intrinsic_workitem_id_x_cc_kernel() {
; GFX10: argumentInfo:
; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
; GFX10-NEXT: kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
-; GFX10-NEXT: workGroupIDX: { reg: '$sgpr6' }
+; GFX10-NEXT: flatScratchInit: { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT: workGroupIDX: { reg: '$sgpr8' }
;
; GFX10: name: call_call_both_with_and_without_private_to_flat_addrspacecast
; GFX10: argumentInfo:
@@ -476,7 +480,8 @@ define amdgpu_kernel void @call_use_intrinsic_workitem_id_x_cc_kernel() {
; GFX10: argumentInfo:
; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
; GFX10-NEXT: kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
-; GFX10-NEXT: workGroupIDX: { reg: '$sgpr6' }
+; GFX10-NEXT: flatScratchInit: { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT: workGroupIDX: { reg: '$sgpr8' }
;
; GFX10: name: with_cast_call_without_private_to_flat_addrspacecast
; GFX10: argumentInfo:
@@ -490,7 +495,8 @@ define amdgpu_kernel void @call_use_intrinsic_workitem_id_x_cc_kernel() {
; GFX10: argumentInfo:
; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
; GFX10-NEXT: kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
-; GFX10-NEXT: workGroupIDX: { reg: '$sgpr6' }
+; GFX10-NEXT: flatScratchInit: { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT: workGroupIDX: { reg: '$sgpr8' }
;
; GFX10: name: with_cast_call_with_private_to_flat_addrspacecast
; GFX10: argumentInfo:
@@ -504,7 +510,8 @@ define amdgpu_kernel void @call_use_intrinsic_workitem_id_x_cc_kernel() {
; GFX10: argumentInfo:
; GFX10-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
; GFX10-NEXT: kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
-; GFX10-NEXT: workGroupIDX: { reg: '$sgpr6' }
+; GFX10-NEXT: flatScratchInit: { reg: '$sgpr6_sgpr7' }
+; GFX10-NEXT: workGroupIDX: { reg: '$sgpr8' }
;
; GFX10: name: with_indirect_call
; GFX10: argumentInfo:
diff --git a/llvm/test/CodeGen/AMDGPU/attributor-noopt.ll b/llvm/test/CodeGen/AMDGPU/attributor-noopt.ll
index 55ed11ac62972..748596d51c4ae 100644
--- a/llvm/test/CodeGen/AMDGPU/attributor-noopt.ll
+++ b/llvm/test/CodeGen/AMDGPU/attributor-noopt.ll
@@ -22,7 +22,7 @@
; NOOPT: .amdhsa_user_sgpr_queue_ptr 1
; NOOPT: .amdhsa_user_sgpr_kernarg_segment_ptr 1
; NOOPT: .amdhsa_user_sgpr_dispatch_id 1
-; NOOPT: .amdhsa_user_sgpr_flat_scratch_init 0
+; NOOPT: .amdhsa_user_sgpr_flat_scratch_init 1
; NOOPT: .amdhsa_user_sgpr_private_segment_size 0
; NOOPT: .amdhsa_system_sgpr_private_segment_wavefront_offset 0
; NOOPT: .amdhsa_system_sgpr_workgroup_id_x 1
diff --git a/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll b/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll
index 5d4db904fe6ea..7bbba44d55659 100644
--- a/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll
+++ b/llvm/test/CodeGen/AMDGPU/callee-special-input-sgprs-fixed-abi.ll
@@ -198,11 +198,11 @@ define hidden void @use_workgroup_id_yz() #1 {
; GCN-LABEL: {{^}}kern_indirect_use_workgroup_id_x:
; GCN-NOT: s6
-; GCN: s_getpc_b64 s[4:5]
-; GCN-NEXT: s_add_u32 s4, s4, use_workgroup_id_x@rel32@lo+4
-; GCN-NEXT: s_addc_u32 s5, s5, use_workgroup_id_x@rel32@hi+12
+; GCN: s_getpc_b64 s[6:7]
+; GCN-NEXT: s_add_u32 s6, s6, use_workgroup_id_x@rel32@lo+4
+; GCN-NEXT: s_addc_u32 s7, s7, use_workgroup_id_x@rel32@hi+12
; GCN-NOT: s6
-; GCN: s_mov_b32 s12, s6
+; GCN: s_mov_b32 s12, s4
; GCN: s_mov_b32 s32, 0
; GCN: s_swappc_b64
; GCN-NEXT: s_endpgm
@@ -217,7 +217,7 @@ define amdgpu_kernel void @kern_indirect_use_workgroup_id_x() #1 {
; GCN-LABEL: {{^}}kern_indirect_use_workgroup_id_y:
; GCN-NOT: s12
-; GCN: s_mov_b32 s13, s7
+; GCN: s_mov_b32 s13, s5
; GCN-NOT: s12
; GCN: s_mov_b32 s32, 0
; GCN: s_swappc_b64
@@ -233,7 +233,7 @@ define amdgpu_kernel void @kern_indirect_use_workgroup_id_y() #1 {
; GCN-LABEL: {{^}}kern_indirect_use_workgroup_id_z:
; GCN-NOT: s12
; GCN-NOT: s13
-; GCN: s_mov_b32 s14, s7
+; GCN: s_mov_b32 s14, s5
; GCN-NOT: s12
; GCN-NOT: s13
@@ -250,8 +250,8 @@ define amdgpu_kernel void @kern_indirect_use_workgroup_id_z() #1 {
; GCN-LABEL: {{^}}kern_indirect_use_workgroup_id_xy:
; GCN-NOT: s14
-; GCN: s_mov_b32 s12, s6
-; GCN-NEXT: s_mov_b32 s13, s7
+; GCN: s_mov_b32 s12, s4
+; GCN-NEXT: s_mov_b32 s13, s5
; GCN-NOT: s14
; GCN: s_mov_b32 s32, 0
@@ -266,9 +266,9 @@ define amdgpu_kernel void @kern_indirect_use_workgroup_id_xy() #1 {
}
; GCN-LABEL: {{^}}kern_indirect_use_workgroup_id_xyz:
-; GCN: s_mov_b32 s12, s6
-; GCN: s_mov_b32 s13, s7
-; GCN: s_mov_b32 s14, s8
+; GCN: s_mov_b32 s12, s4
+; GCN: s_mov_b32 s13, s5
+; GCN: s_mov_b32 s14, s6
; GCN: s_mov_b32 s32, 0
; GCN: s_swappc_b64
@@ -283,8 +283,8 @@ define amdgpu_kernel void @kern_indirect_use_workgroup_id_xyz() #1 {
; GCN-LABEL: {{^}}kern_indirect_use_workgroup_id_xz:
; GCN-NOT: s13
-; GCN: s_mov_b32 s12, s6
-; GCN-NEXT: s_mov_b32 s14, s7
+; GCN: s_mov_b32 s12, s4
+; GCN-NEXT: s_mov_b32 s14, s5
; GCN-NOT: s13
; GCN: s_mov_b32 s32, 0
@@ -300,8 +300,8 @@ define amdgpu_kernel void @kern_indirect_use_workgroup_id_xz() #1 {
; GCN-LABEL: {{^}}kern_indirect_use_workgroup_id_yz:
-; GCN: s_mov_b32 s13, s7
-; GCN: s_mov_b32 s14, s8
+; GCN: s_mov_b32 s13, s5
+; GCN: s_mov_b32 s14, s6
; GCN: s_mov_b32 s32, 0
; GCN: s_swappc_b64
@@ -382,7 +382,7 @@ define hidden void @other_arg_use_workgroup_id_z(i32 %arg0) #1 {
; GCN-NOT: s13
; GCN-NOT: s14
-; GCN-DAG: s_mov_b32 s12, s6
+; GCN-DAG: s_mov_b32 s12, s4
; GCN-DAG: v_mov_b32_e32 v0, 0x22b
; GCN-NOT: s13
; GCN-NOT: s14
@@ -400,7 +400,7 @@ define amdgpu_kernel void @kern_indirect_other_arg_use_workgroup_id_x() #1 {
; GCN-LABEL: {{^}}kern_indirect_other_arg_use_workgroup_id_y:
; GCN-DAG: v_mov_b32_e32 v0, 0x22b
-; GCN-DAG: s_mov_b32 s13, s7
+; GCN-DAG: s_mov_b32 s13, s5
; GCN-DAG: s_mov_b32 s32, 0
; GCN: s_swappc_b64
@@ -415,7 +415,7 @@ define amdgpu_kernel void @kern_indirect_other_arg_use_workgroup_id_y() #1 {
; GCN-LABEL: {{^}}kern_indirect_other_arg_use_workgroup_id_z:
; GCN-DAG: v_mov_b32_e32 v0, 0x22b
-; GCN-DAG: s_mov_b32 s14, s7
+; GCN-DAG: s_mov_b32 s14, s5
; GCN: s_mov_b32 s32, 0
; GCN: s_swappc_b64
@@ -474,7 +474,7 @@ define hidden void @use_every_sgpr_input() #1 {
; GCN: .amdhsa_user_sgpr_queue_ptr 1
; GCN: .amdhsa_user_sgpr_kernarg_segment_ptr 1
; GCN: .amdhsa_user_sgpr_dispatch_id 1
-; GCN: .amdhsa_user_sgpr_flat_scratch_init 1
+; GCN: .amdhsa_user_sgpr_flat_scratch_init 0
; GCN: .amdhsa_user_sgpr_private_segment_size 0
; GCN: .amdhsa_system_sgpr_private_segment_wavefront_offset 1
; GCN: .amdhsa_system_sgpr_workgroup_id_x 1
@@ -499,7 +499,7 @@ define amdgpu_kernel void @kern_indirect_use_every_sgpr_input(i8) #1 {
; GCN: .amdhsa_user_sgpr_queue_ptr 1
; GCN: .amdhsa_user_sgpr_kernarg_segment_ptr 0
; GCN: .amdhsa_user_sgpr_dispatch_id 1
-; GCN: .amdhsa_user_sgpr_flat_scratch_init 1
+; GCN: .amdhsa_user_sgpr_flat_scratch_init 0
; GCN: .amdhsa_user_sgpr_private_segment_size 0
; GCN: .amdhsa_system_sgpr_private_segment_wavefront_offset 1
; GCN: .amdhsa_system_sgpr_workgroup_id_x 1
diff --git a/llvm/test/CodeGen/AMDGPU/code-object-v3.ll b/llvm/test/CodeGen/AMDGPU/code-object-v3.ll
index ee4a2ed883b63..18f1e8e1dbd4b 100644
--- a/llvm/test/CodeGen/AMDGPU/code-object-v3.ll
+++ b/llvm/test/CodeGen/AMDGPU/code-object-v3.ll
@@ -12,13 +12,13 @@
; OSABI-AMDHSA-ASM: .section .rodata,"a"
; OSABI-AMDHSA-ASM: .p2align 6
; OSABI-AMDHSA-ASM: .amdhsa_kernel fadd
-; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_count 12
+; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_count 14
; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_private_segment_buffer 1
; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_kernarg_segment_ptr 1
; OSABI-AMDHSA-ASM: .amdhsa_next_free_vgpr 3
-; OSABI-AMDHSA-ASM: .amdhsa_next_free_sgpr 10
+; OSABI-AMDHSA-ASM: .amdhsa_next_free_sgpr 18
; OSABI-AMDHSA-ASM: .amdhsa_reserve_vcc 0
-; OSABI-AMDHSA-ASM: .amdhsa_reserve_flat_scratch 0
+; OSABI-AMDHSA-ASM: .amdhsa_reserve_flat_scratch 1
; OSABI-AMDHSA-ASM: .end_amdhsa_kernel
; OSABI-AMDHSA-ASM: .text
@@ -31,13 +31,13 @@
; OSABI-AMDHSA-ASM: .section .rodata,"a"
; OSABI-AMDHSA-ASM: .p2align 6
; OSABI-AMDHSA-ASM: .amdhsa_kernel fsub
-; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_count 12
+; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_count 14
; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_private_segment_buffer 1
; OSABI-AMDHSA-ASM: .amdhsa_user_sgpr_kernarg_segment_ptr 1
; OSABI-AMDHSA-ASM: .amdhsa_next_free_vgpr 3
-; OSABI-AMDHSA-ASM: .amdhsa_next_free_sgpr 10
+; OSABI-AMDHSA-ASM: .amdhsa_next_free_sgpr 18
; OSABI-AMDHSA-ASM: .amdhsa_reserve_vcc 0
-; OSABI-AMDHSA-ASM: .amdhsa_reserve_flat_scratch 0
+; OSABI-AMDHSA-ASM: .amdhsa_reserve_flat_scratch 1
; OSABI-AMDHSA-ASM: .end_amdhsa_kernel
; OSABI-AMDHSA-ASM: .text
diff --git a/llvm/test/CodeGen/AMDGPU/combine-reg-or-const.ll b/llvm/test/CodeGen/AMDGPU/combine-reg-or-const.ll
index c17cf1cd6bca4..c167834470e3b 100644
--- a/llvm/test/CodeGen/AMDGPU/combine-reg-or-const.ll
+++ b/llvm/test/CodeGen/AMDGPU/combine-reg-or-const.ll
@@ -5,6 +5,9 @@
define protected amdgpu_kernel void @_Z11test_kernelPii(ptr addrspace(1) nocapture %Ad.coerce, i32 %s) local_unnamed_addr #5 {
; CHECK-LABEL: _Z11test_kernelPii:
; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CHECK-NEXT: s_add_i32 s12, s12, s17
+; CHECK-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CHECK-NEXT: s_load_dword s0, s[8:9], 0x2
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_cmp_lg_u32 s0, 3
diff --git a/llvm/test/CodeGen/AMDGPU/dagcomb-extract-vec-elt-different-sizes.ll b/llvm/test/CodeGen/AMDGPU/dagcomb-extract-vec-elt-different-sizes.ll
index fcb8fa5997b7e..fc17d9288bf40 100644
--- a/llvm/test/CodeGen/AMDGPU/dagcomb-extract-vec-elt-different-sizes.ll
+++ b/llvm/test/CodeGen/AMDGPU/dagcomb-extract-vec-elt-different-sizes.ll
@@ -6,6 +6,8 @@
define amdgpu_kernel void @eggs(i1 %arg, ptr addrspace(1) %arg1, ptr %arg2, ptr %arg3, ptr %arg4, ptr %arg5, ptr %arg6, ptr %arg7, ptr %arg8, ptr %arg9) {
; CHECK-LABEL: eggs:
; CHECK: ; %bb.0: ; %bb
+; CHECK-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; CHECK-NEXT: s_load_dword s0, s[8:9], 0x0
; CHECK-NEXT: s_load_dwordx16 s[12:27], s[8:9], 0x8
; CHECK-NEXT: v_mov_b32_e32 v0, 0
diff --git a/llvm/test/CodeGen/AMDGPU/duplicate-attribute-indirect.ll b/llvm/test/CodeGen/AMDGPU/duplicate-attribute-indirect.ll
index c430c41f59143..217b0ea2d9708 100644
--- a/llvm/test/CodeGen/AMDGPU/duplicate-attribute-indirect.ll
+++ b/llvm/test/CodeGen/AMDGPU/duplicate-attribute-indirect.ll
@@ -1,11 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-globals
-; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-annotate-kernel-features %s | FileCheck -check-prefix=AKF_GCN %s
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-attributor %s | FileCheck -check-prefix=ATTRIBUTOR_GCN %s
define internal void @indirect() {
-; AKF_GCN-LABEL: define {{[^@]+}}@indirect() {
-; AKF_GCN-NEXT: ret void
-;
; ATTRIBUTOR_GCN-LABEL: define {{[^@]+}}@indirect
; ATTRIBUTOR_GCN-SAME: () #[[ATTR0:[0-9]+]] {
; ATTRIBUTOR_GCN-NEXT: ret void
@@ -14,14 +10,6 @@ define internal void @indirect() {
}
define amdgpu_kernel void @test_simple_indirect_call() #0 {
-; AKF_GCN-LABEL: define {{[^@]+}}@test_simple_indirect_call
-; AKF_GCN-SAME: () #[[ATTR0:[0-9]+]] {
-; AKF_GCN-NEXT: [[FPTR:%.*]] = alloca ptr, align 8, addrspace(5)
-; AKF_GCN-NEXT: store ptr @indirect, ptr addrspace(5) [[FPTR]], align 8
-; AKF_GCN-NEXT: [[FP:%.*]] = load ptr, ptr addrspace(5) [[FPTR]], align 8
-; AKF_GCN-NEXT: call void [[FP]]()
-; AKF_GCN-NEXT: ret void
-;
; ATTRIBUTOR_GCN-LABEL: define {{[^@]+}}@test_simple_indirect_call
; ATTRIBUTOR_GCN-SAME: () #[[ATTR1:[0-9]+]] {
; ATTRIBUTOR_GCN-NEXT: [[FPTR:%.*]] = alloca ptr, align 8, addrspace(5)
@@ -40,7 +28,6 @@ define amdgpu_kernel void @test_simple_indirect_call() #0 {
attributes #0 = { "amdgpu-no-dispatch-id" }
;.
-; AKF_GCN: attributes #[[ATTR0]] = { "amdgpu-calls" "amdgpu-no-dispatch-id" "amdgpu-stack-objects" }
;.
; ATTRIBUTOR_GCN: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
; ATTRIBUTOR_GCN: attributes #[[ATTR1]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
diff --git a/llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll b/llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll
index 9104dc68eb9b4..72913d2596ebf 100644
--- a/llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll
+++ b/llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll
@@ -9,7 +9,7 @@ define amdgpu_kernel void @cannot_create_empty_or_backwards_segment(i1 %arg, i1
; CHECK-NEXT: s_load_dword s2, s[8:9], 0x0
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; CHECK-NEXT: s_load_dword s6, s[8:9], 0x4
-; CHECK-NEXT: s_add_u32 s24, s24, s15
+; CHECK-NEXT: s_add_u32 s24, s24, s17
; CHECK-NEXT: s_addc_u32 s25, s25, 0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_bitcmp1_b32 s2, 0
diff --git a/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll b/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll
index 598cdddaa53d1..edc1baf5a0980 100644
--- a/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll
+++ b/llvm/test/CodeGen/AMDGPU/expand-scalar-carry-out-select-user.ll
@@ -94,6 +94,9 @@ define amdgpu_kernel void @s_add_co_br_user(i32 %i) {
; GFX7-LABEL: s_add_co_br_user:
; GFX7: ; %bb.0: ; %bb
; GFX7-NEXT: s_load_dword s2, s[8:9], 0x0
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_add_i32 s0, s2, s2
; GFX7-NEXT: s_cmp_lt_u32 s0, s2
diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i8.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i8.ll
index 39649922bd5d9..7c30bcd6828ff 100644
--- a/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract_vector_elt-i8.ll
@@ -5,6 +5,9 @@
define amdgpu_kernel void @extract_vector_elt_v1i8(ptr addrspace(1) %out, <1 x i8> %foo) #0 {
; SI-LABEL: extract_vector_elt_v1i8:
; SI: ; %bb.0:
+; SI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; SI-NEXT: s_add_i32 s12, s12, s17
+; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; SI-NEXT: s_load_dword s2, s[8:9], 0x2
; SI-NEXT: s_waitcnt lgkmcnt(0)
@@ -18,6 +21,9 @@ define amdgpu_kernel void @extract_vector_elt_v1i8(ptr addrspace(1) %out, <1 x i
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
@@ -32,6 +38,9 @@ define amdgpu_kernel void @extract_vector_elt_v1i8(ptr addrspace(1) %out, <1 x i
define amdgpu_kernel void @extract_vector_elt_v2i8(ptr addrspace(1) %out, <2 x i8> %foo) #0 {
; SI-LABEL: extract_vector_elt_v2i8:
; SI: ; %bb.0:
+; SI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; SI-NEXT: s_add_i32 s12, s12, s17
+; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; SI-NEXT: s_load_dword s2, s[8:9], 0x2
; SI-NEXT: s_waitcnt lgkmcnt(0)
@@ -54,6 +63,9 @@ define amdgpu_kernel void @extract_vector_elt_v2i8(ptr addrspace(1) %out, <2 x i
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s3, s2, 8
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -80,6 +92,9 @@ define amdgpu_kernel void @extract_vector_elt_v2i8(ptr addrspace(1) %out, <2 x i
define amdgpu_kernel void @extract_vector_elt_v3i8(ptr addrspace(1) %out, <3 x i8> %foo) #0 {
; SI-LABEL: extract_vector_elt_v3i8:
; SI: ; %bb.0:
+; SI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; SI-NEXT: s_add_i32 s12, s12, s17
+; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; SI-NEXT: s_load_dword s2, s[8:9], 0x2
; SI-NEXT: s_waitcnt lgkmcnt(0)
@@ -102,6 +117,9 @@ define amdgpu_kernel void @extract_vector_elt_v3i8(ptr addrspace(1) %out, <3 x i
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s3, s2, 16
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -128,6 +146,9 @@ define amdgpu_kernel void @extract_vector_elt_v3i8(ptr addrspace(1) %out, <3 x i
define amdgpu_kernel void @extract_vector_elt_v4i8(ptr addrspace(1) %out, <4 x i8> %foo) #0 {
; SI-LABEL: extract_vector_elt_v4i8:
; SI: ; %bb.0:
+; SI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; SI-NEXT: s_add_i32 s12, s12, s17
+; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; SI-NEXT: s_load_dword s2, s[8:9], 0x2
; SI-NEXT: s_waitcnt lgkmcnt(0)
@@ -150,6 +171,9 @@ define amdgpu_kernel void @extract_vector_elt_v4i8(ptr addrspace(1) %out, <4 x i
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s3, s2, 16
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -176,6 +200,9 @@ define amdgpu_kernel void @extract_vector_elt_v4i8(ptr addrspace(1) %out, <4 x i
define amdgpu_kernel void @extract_vector_elt_v8i8(<8 x i8> %foo) #0 {
; SI-LABEL: extract_vector_elt_v8i8:
; SI: ; %bb.0:
+; SI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; SI-NEXT: s_add_i32 s12, s12, s17
+; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; SI-NEXT: s_load_dword s0, s[8:9], 0x0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_lshr_b32 s1, s0, 16
@@ -192,10 +219,13 @@ define amdgpu_kernel void @extract_vector_elt_v8i8(<8 x i8> %foo) #0 {
; VI-LABEL: extract_vector_elt_v8i8:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s0, s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
; VI-NEXT: v_mov_b32_e32 v0, 0
-; VI-NEXT: v_mov_b32_e32 v1, 0
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s1, s0, 16
+; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_byte v[0:1], v3
@@ -213,6 +243,9 @@ define amdgpu_kernel void @extract_vector_elt_v8i8(<8 x i8> %foo) #0 {
define amdgpu_kernel void @extract_vector_elt_v16i8(ptr addrspace(1) %out, <16 x i8> %foo) #0 {
; SI-LABEL: extract_vector_elt_v16i8:
; SI: ; %bb.0:
+; SI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; SI-NEXT: s_add_i32 s12, s12, s17
+; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; SI-NEXT: s_load_dword s2, s[8:9], 0x4
; SI-NEXT: s_waitcnt lgkmcnt(0)
@@ -235,6 +268,9 @@ define amdgpu_kernel void @extract_vector_elt_v16i8(ptr addrspace(1) %out, <16 x
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x10
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s3, s2, 16
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -261,6 +297,9 @@ define amdgpu_kernel void @extract_vector_elt_v16i8(ptr addrspace(1) %out, <16 x
define amdgpu_kernel void @extract_vector_elt_v32i8(<32 x i8> %foo) #0 {
; SI-LABEL: extract_vector_elt_v32i8:
; SI: ; %bb.0:
+; SI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; SI-NEXT: s_add_i32 s12, s12, s17
+; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; SI-NEXT: s_load_dword s0, s[8:9], 0x0
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_lshr_b32 s1, s0, 16
@@ -277,10 +316,13 @@ define amdgpu_kernel void @extract_vector_elt_v32i8(<32 x i8> %foo) #0 {
; VI-LABEL: extract_vector_elt_v32i8:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s0, s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
; VI-NEXT: v_mov_b32_e32 v0, 0
-; VI-NEXT: v_mov_b32_e32 v1, 0
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s1, s0, 16
+; VI-NEXT: v_mov_b32_e32 v1, 0
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_byte v[0:1], v3
@@ -298,6 +340,9 @@ define amdgpu_kernel void @extract_vector_elt_v32i8(<32 x i8> %foo) #0 {
define amdgpu_kernel void @extract_vector_elt_v64i8(ptr addrspace(1) %out, <64 x i8> %foo) #0 {
; SI-LABEL: extract_vector_elt_v64i8:
; SI: ; %bb.0:
+; SI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; SI-NEXT: s_add_i32 s12, s12, s17
+; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; SI-NEXT: s_load_dword s2, s[8:9], 0x10
; SI-NEXT: s_waitcnt lgkmcnt(0)
@@ -320,6 +365,9 @@ define amdgpu_kernel void @extract_vector_elt_v64i8(ptr addrspace(1) %out, <64 x
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x40
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s3, s2, 16
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -351,6 +399,9 @@ define amdgpu_kernel void @extract_vector_elt_v64i8(ptr addrspace(1) %out, <64 x
define amdgpu_kernel void @dynamic_extract_vector_elt_v2i8(ptr addrspace(1) %out, [8 x i32], <2 x i8> %foo, [8 x i32], i32 %idx) #0 {
; SI-LABEL: dynamic_extract_vector_elt_v2i8:
; SI: ; %bb.0:
+; SI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; SI-NEXT: s_add_i32 s12, s12, s17
+; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; SI-NEXT: s_load_dword s2, s[8:9], 0xa
; SI-NEXT: s_load_dword s3, s[8:9], 0x13
; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
@@ -370,11 +421,14 @@ define amdgpu_kernel void @dynamic_extract_vector_elt_v2i8(ptr addrspace(1) %out
; VI-NEXT: s_load_dword s2, s[8:9], 0x4c
; VI-NEXT: s_load_dword s3, s[8:9], 0x28
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshl_b32 s2, s2, 3
; VI-NEXT: s_and_b32 s3, s3, 0xffff
; VI-NEXT: s_lshr_b32 s2, s3, s2
; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: flat_store_byte v[0:1], v2
@@ -388,6 +442,9 @@ define amdgpu_kernel void @dynamic_extract_vector_elt_v2i8(ptr addrspace(1) %out
define amdgpu_kernel void @dynamic_extract_vector_elt_v3i8(ptr addrspace(1) %out, [8 x i32], <3 x i8> %foo, [8 x i32], i32 %idx) #0 {
; SI-LABEL: dynamic_extract_vector_elt_v3i8:
; SI: ; %bb.0:
+; SI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; SI-NEXT: s_add_i32 s12, s12, s17
+; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; SI-NEXT: s_load_dword s2, s[8:9], 0x13
; SI-NEXT: s_load_dword s3, s[8:9], 0xa
; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
@@ -406,10 +463,13 @@ define amdgpu_kernel void @dynamic_extract_vector_elt_v3i8(ptr addrspace(1) %out
; VI-NEXT: s_load_dword s2, s[8:9], 0x4c
; VI-NEXT: s_load_dword s3, s[8:9], 0x28
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshl_b32 s2, s2, 3
; VI-NEXT: s_lshr_b32 s2, s3, s2
; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: flat_store_byte v[0:1], v2
@@ -424,6 +484,9 @@ define amdgpu_kernel void @dynamic_extract_vector_elt_v3i8(ptr addrspace(1) %out
define amdgpu_kernel void @dynamic_extract_vector_elt_v4i8(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr, [8 x i32], i32 %idx) #0 {
; SI-LABEL: dynamic_extract_vector_elt_v4i8:
; SI: ; %bb.0:
+; SI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; SI-NEXT: s_add_i32 s12, s12, s17
+; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; SI-NEXT: s_load_dword s4, s[8:9], 0xc
; SI-NEXT: s_waitcnt lgkmcnt(0)
@@ -442,6 +505,9 @@ define amdgpu_kernel void @dynamic_extract_vector_elt_v4i8(ptr addrspace(1) %out
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dword s4, s[8:9], 0x30
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dword s2, s[2:3], 0x0
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -463,6 +529,9 @@ define amdgpu_kernel void @dynamic_extract_vector_elt_v4i8(ptr addrspace(1) %out
define amdgpu_kernel void @dynamic_extract_vector_elt_v8i8(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr, i32 %idx) #0 {
; SI-LABEL: dynamic_extract_vector_elt_v8i8:
; SI: ; %bb.0:
+; SI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; SI-NEXT: s_add_i32 s12, s12, s17
+; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; SI-NEXT: s_load_dword s4, s[8:9], 0x4
; SI-NEXT: s_waitcnt lgkmcnt(0)
@@ -481,6 +550,9 @@ define amdgpu_kernel void @dynamic_extract_vector_elt_v8i8(ptr addrspace(1) %out
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dword s4, s[8:9], 0x10
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -502,6 +574,9 @@ define amdgpu_kernel void @dynamic_extract_vector_elt_v8i8(ptr addrspace(1) %out
define amdgpu_kernel void @reduce_load_vector_v8i8_extract_0123() #0 {
; SI-LABEL: reduce_load_vector_v8i8_extract_0123:
; SI: ; %bb.0:
+; SI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; SI-NEXT: s_add_i32 s12, s12, s17
+; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; SI-NEXT: s_mov_b64 s[0:1], 0
; SI-NEXT: s_load_dword s0, s[0:1], 0x0
; SI-NEXT: s_waitcnt lgkmcnt(0)
@@ -526,6 +601,9 @@ define amdgpu_kernel void @reduce_load_vector_v8i8_extract_0123() #0 {
; VI: ; %bb.0:
; VI-NEXT: s_mov_b64 s[0:1], 0
; VI-NEXT: s_load_dword s0, s[0:1], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s1, s0, 8
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -558,6 +636,9 @@ define amdgpu_kernel void @reduce_load_vector_v8i8_extract_0123() #0 {
define amdgpu_kernel void @reduce_load_vector_v8i8_extract_0145() #0 {
; SI-LABEL: reduce_load_vector_v8i8_extract_0145:
; SI: ; %bb.0:
+; SI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; SI-NEXT: s_add_i32 s12, s12, s17
+; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; SI-NEXT: s_mov_b64 s[0:1], 0
; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
; SI-NEXT: s_waitcnt lgkmcnt(0)
@@ -581,6 +662,9 @@ define amdgpu_kernel void @reduce_load_vector_v8i8_extract_0145() #0 {
; VI: ; %bb.0:
; VI-NEXT: s_mov_b64 s[0:1], 0
; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s2, s0, 8
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -612,6 +696,9 @@ define amdgpu_kernel void @reduce_load_vector_v8i8_extract_0145() #0 {
define amdgpu_kernel void @reduce_load_vector_v8i8_extract_45() #0 {
; SI-LABEL: reduce_load_vector_v8i8_extract_45:
; SI: ; %bb.0:
+; SI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; SI-NEXT: s_add_i32 s12, s12, s17
+; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; SI-NEXT: s_mov_b64 s[0:1], 4
; SI-NEXT: s_load_dword s0, s[0:1], 0x0
; SI-NEXT: s_waitcnt lgkmcnt(0)
@@ -628,6 +715,9 @@ define amdgpu_kernel void @reduce_load_vector_v8i8_extract_45() #0 {
; VI: ; %bb.0:
; VI-NEXT: s_mov_b64 s[0:1], 4
; VI-NEXT: s_load_dword s0, s[0:1], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s1, s0, 8
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -649,6 +739,9 @@ define amdgpu_kernel void @reduce_load_vector_v8i8_extract_45() #0 {
define amdgpu_kernel void @reduce_load_vector_v16i8_extract_0145() #0 {
; SI-LABEL: reduce_load_vector_v16i8_extract_0145:
; SI: ; %bb.0:
+; SI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; SI-NEXT: s_add_i32 s12, s12, s17
+; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; SI-NEXT: s_mov_b64 s[0:1], 0
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
; SI-NEXT: s_waitcnt lgkmcnt(0)
@@ -672,6 +765,9 @@ define amdgpu_kernel void @reduce_load_vector_v16i8_extract_0145() #0 {
; VI: ; %bb.0:
; VI-NEXT: s_mov_b64 s[0:1], 0
; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s2, s0, 8
; VI-NEXT: v_mov_b32_e32 v0, s0
diff --git a/llvm/test/CodeGen/AMDGPU/fabs.f16.ll b/llvm/test/CodeGen/AMDGPU/fabs.f16.ll
index 7a81af5243ee0..49c781c3a3e3a 100644
--- a/llvm/test/CodeGen/AMDGPU/fabs.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fabs.f16.ll
@@ -13,6 +13,9 @@ define amdgpu_kernel void @s_fabs_free_f16(ptr addrspace(1) %out, i16 %in) {
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s2, s[8:9], 0x2
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_and_b32 s2, s2, 0x7fff
; CI-NEXT: v_mov_b32_e32 v0, s0
@@ -25,6 +28,9 @@ define amdgpu_kernel void @s_fabs_free_f16(ptr addrspace(1) %out, i16 %in) {
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_and_b32 s2, s2, 0x7fff
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -66,6 +72,9 @@ define amdgpu_kernel void @s_fabs_f16(ptr addrspace(1) %out, half %in) {
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s2, s[8:9], 0x2
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_and_b32 s2, s2, 0x7fff
; CI-NEXT: v_mov_b32_e32 v0, s0
@@ -78,6 +87,9 @@ define amdgpu_kernel void @s_fabs_f16(ptr addrspace(1) %out, half %in) {
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_and_b32 s2, s2, 0x7fff
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -118,6 +130,9 @@ define amdgpu_kernel void @s_fabs_v2f16(ptr addrspace(1) %out, <2 x half> %in) {
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s2, s[8:9], 0x2
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_and_b32 s2, s2, 0x7fff7fff
; CI-NEXT: v_mov_b32_e32 v0, s0
@@ -130,6 +145,9 @@ define amdgpu_kernel void @s_fabs_v2f16(ptr addrspace(1) %out, <2 x half> %in) {
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_and_b32 s2, s2, 0x7fff7fff
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -169,6 +187,9 @@ define amdgpu_kernel void @s_fabs_v4f16(ptr addrspace(1) %out, <4 x half> %in) {
; CI-LABEL: s_fabs_v4f16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_and_b32 s3, s3, 0x7fff7fff
; CI-NEXT: s_and_b32 s2, s2, 0x7fff7fff
@@ -182,6 +203,9 @@ define amdgpu_kernel void @s_fabs_v4f16(ptr addrspace(1) %out, <4 x half> %in) {
; VI-LABEL: s_fabs_v4f16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_and_b32 s3, s3, 0x7fff7fff
; VI-NEXT: s_and_b32 s2, s2, 0x7fff7fff
@@ -224,6 +248,9 @@ define amdgpu_kernel void @fabs_fold_f16(ptr addrspace(1) %out, half %in0, half
; CI-LABEL: fabs_fold_f16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s0, s[8:9], 0x2
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_cvt_f32_f16_e64 v0, |s0|
; CI-NEXT: s_lshr_b32 s0, s0, 16
@@ -241,6 +268,9 @@ define amdgpu_kernel void @fabs_fold_f16(ptr addrspace(1) %out, half %in0, half
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s3, s2, 16
; VI-NEXT: v_mov_b32_e32 v0, s3
@@ -285,6 +315,9 @@ define amdgpu_kernel void @v_fabs_v2f16(ptr addrspace(1) %out, ptr addrspace(1)
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x2
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v0
@@ -299,6 +332,9 @@ define amdgpu_kernel void @v_fabs_v2f16(ptr addrspace(1) %out, ptr addrspace(1)
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x8
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
@@ -346,6 +382,9 @@ define amdgpu_kernel void @fabs_free_v2f16(ptr addrspace(1) %out, i32 %in) #0 {
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s2, s[8:9], 0x2
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_and_b32 s2, s2, 0x7fff7fff
; CI-NEXT: v_mov_b32_e32 v0, s0
@@ -358,6 +397,9 @@ define amdgpu_kernel void @fabs_free_v2f16(ptr addrspace(1) %out, i32 %in) #0 {
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_and_b32 s2, s2, 0x7fff7fff
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -401,6 +443,9 @@ define amdgpu_kernel void @v_fabs_fold_self_v2f16(ptr addrspace(1) %out, ptr add
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v0
@@ -427,6 +472,9 @@ define amdgpu_kernel void @v_fabs_fold_self_v2f16(ptr addrspace(1) %out, ptr add
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
@@ -483,9 +531,12 @@ define amdgpu_kernel void @v_fabs_fold_v2f16(ptr addrspace(1) %out, ptr addrspac
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dword s4, s[8:9], 0x4
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v0
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: flat_load_dword v0, v[0:1]
; CI-NEXT: s_lshr_b32 s2, s4, 16
@@ -511,9 +562,12 @@ define amdgpu_kernel void @v_fabs_fold_v2f16(ptr addrspace(1) %out, ptr addrspac
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dword s4, s[8:9], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v2, v[0:1]
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -570,6 +624,9 @@ define amdgpu_kernel void @v_extract_fabs_fold_v2f16(ptr addrspace(1) %in) #0 {
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v0
@@ -593,6 +650,9 @@ define amdgpu_kernel void @v_extract_fabs_fold_v2f16(ptr addrspace(1) %in) #0 {
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
@@ -661,6 +721,9 @@ define amdgpu_kernel void @v_extract_fabs_no_fold_v2f16(ptr addrspace(1) %in) #0
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v0
@@ -679,6 +742,9 @@ define amdgpu_kernel void @v_extract_fabs_no_fold_v2f16(ptr addrspace(1) %in) #0
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0
diff --git a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll
index 6496b70b4d697..60334e46a4454 100644
--- a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll
+++ b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll
@@ -74,6 +74,9 @@ define amdgpu_kernel void @global_store_2xi16_align2(ptr addrspace(1) %p, ptr ad
; GFX7-ALIGNED-LABEL: global_store_2xi16_align2:
; GFX7-ALIGNED: ; %bb.0:
; GFX7-ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x2
+; GFX7-ALIGNED-NEXT: s_add_i32 s12, s12, s17
+; GFX7-ALIGNED-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-ALIGNED-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v2, 1
; GFX7-ALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v0, s0
@@ -90,6 +93,9 @@ define amdgpu_kernel void @global_store_2xi16_align2(ptr addrspace(1) %p, ptr ad
; GFX7-UNALIGNED-LABEL: global_store_2xi16_align2:
; GFX7-UNALIGNED: ; %bb.0:
; GFX7-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x2
+; GFX7-UNALIGNED-NEXT: s_add_i32 s12, s12, s17
+; GFX7-UNALIGNED-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-UNALIGNED-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-UNALIGNED-NEXT: v_mov_b32_e32 v2, 0x20001
; GFX7-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-UNALIGNED-NEXT: v_mov_b32_e32 v0, s0
@@ -216,8 +222,10 @@ define amdgpu_kernel void @global_store_2xi16_align1(ptr addrspace(1) %p, ptr ad
; GFX7-ALIGNED-LABEL: global_store_2xi16_align1:
; GFX7-ALIGNED: ; %bb.0:
; GFX7-ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x2
+; GFX7-ALIGNED-NEXT: s_add_i32 s12, s12, s17
+; GFX7-ALIGNED-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-ALIGNED-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v2, 1
-; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v3, 0
; GFX7-ALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-ALIGNED-NEXT: s_add_u32 s2, s0, 2
; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v0, s0
@@ -227,6 +235,7 @@ define amdgpu_kernel void @global_store_2xi16_align1(ptr addrspace(1) %p, ptr ad
; GFX7-ALIGNED-NEXT: flat_store_byte v[0:1], v2
; GFX7-ALIGNED-NEXT: s_addc_u32 s5, s1, 0
; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v0, s4
+; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v3, 0
; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v1, s5
; GFX7-ALIGNED-NEXT: s_add_u32 s0, s0, 3
; GFX7-ALIGNED-NEXT: flat_store_byte v[0:1], v3
@@ -243,6 +252,9 @@ define amdgpu_kernel void @global_store_2xi16_align1(ptr addrspace(1) %p, ptr ad
; GFX7-UNALIGNED-LABEL: global_store_2xi16_align1:
; GFX7-UNALIGNED: ; %bb.0:
; GFX7-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x2
+; GFX7-UNALIGNED-NEXT: s_add_i32 s12, s12, s17
+; GFX7-UNALIGNED-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-UNALIGNED-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-UNALIGNED-NEXT: v_mov_b32_e32 v2, 0x20001
; GFX7-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-UNALIGNED-NEXT: v_mov_b32_e32 v0, s0
@@ -351,6 +363,9 @@ define amdgpu_kernel void @global_store_2xi16_align4(ptr addrspace(1) %p, ptr ad
; GFX7-ALIGNED-LABEL: global_store_2xi16_align4:
; GFX7-ALIGNED: ; %bb.0:
; GFX7-ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x2
+; GFX7-ALIGNED-NEXT: s_add_i32 s12, s12, s17
+; GFX7-ALIGNED-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-ALIGNED-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v2, 0x20001
; GFX7-ALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v0, s0
@@ -361,6 +376,9 @@ define amdgpu_kernel void @global_store_2xi16_align4(ptr addrspace(1) %p, ptr ad
; GFX7-UNALIGNED-LABEL: global_store_2xi16_align4:
; GFX7-UNALIGNED: ; %bb.0:
; GFX7-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x2
+; GFX7-UNALIGNED-NEXT: s_add_i32 s12, s12, s17
+; GFX7-UNALIGNED-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-UNALIGNED-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-UNALIGNED-NEXT: v_mov_b32_e32 v2, 0x20001
; GFX7-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-UNALIGNED-NEXT: v_mov_b32_e32 v0, s0
diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll
index 4e12a30c6f6f4..9919497acea73 100644
--- a/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll
@@ -24,6 +24,9 @@ define amdgpu_kernel void @v_test_canonicalize_var_f32(ptr addrspace(1) %out) #1
; GFX678-LABEL: v_test_canonicalize_var_f32:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX678-NEXT: s_add_i32 s12, s12, s17
+; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v0, s0
; GFX678-NEXT: v_mov_b32_e32 v1, s1
@@ -76,6 +79,9 @@ define amdgpu_kernel void @s_test_canonicalize_var_f32(ptr addrspace(1) %out, fl
; GFX6: ; %bb.0:
; GFX6-NEXT: s_load_dword s2, s[8:9], 0x2
; GFX6-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX6-NEXT: s_add_i32 s12, s12, s17
+; GFX6-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX6-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_mul_f32_e64 v2, 1.0, s2
; GFX6-NEXT: v_mov_b32_e32 v0, s0
@@ -87,6 +93,9 @@ define amdgpu_kernel void @s_test_canonicalize_var_f32(ptr addrspace(1) %out, fl
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dword s2, s[8:9], 0x8
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX8-NEXT: s_add_i32 s12, s12, s17
+; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mul_f32_e64 v2, 1.0, s2
; GFX8-NEXT: v_mov_b32_e32 v0, s0
@@ -132,6 +141,9 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_f32(ptr addrspace(1) %ou
; GFX678-LABEL: v_test_canonicalize_fabs_var_f32:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX678-NEXT: s_add_i32 s12, s12, s17
+; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v0, s0
; GFX678-NEXT: v_mov_b32_e32 v1, s1
@@ -184,6 +196,9 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f32(ptr addrspace(1
; GFX678-LABEL: v_test_canonicalize_fneg_fabs_var_f32:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX678-NEXT: s_add_i32 s12, s12, s17
+; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v0, s0
; GFX678-NEXT: v_mov_b32_e32 v1, s1
@@ -237,6 +252,9 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_f32(ptr addrspace(1) %ou
; GFX678-LABEL: v_test_canonicalize_fneg_var_f32:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX678-NEXT: s_add_i32 s12, s12, s17
+; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v0, s0
; GFX678-NEXT: v_mov_b32_e32 v1, s1
@@ -289,6 +307,9 @@ define amdgpu_kernel void @test_fold_canonicalize_undef_f32(ptr addrspace(1) %ou
; GFX678-LABEL: test_fold_canonicalize_undef_f32:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX678-NEXT: s_add_i32 s12, s12, s17
+; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: v_mov_b32_e32 v2, 0
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v0, s0
@@ -328,6 +349,9 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_f32(ptr addrspace(1) %out)
; GFX678-LABEL: test_fold_canonicalize_p0_f32:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX678-NEXT: s_add_i32 s12, s12, s17
+; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: v_mov_b32_e32 v2, 0
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v0, s0
@@ -367,6 +391,9 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_f32(ptr addrspace(1) %out)
; GFX678-LABEL: test_fold_canonicalize_n0_f32:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX678-NEXT: s_add_i32 s12, s12, s17
+; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: v_bfrev_b32_e32 v2, 1
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v0, s0
@@ -409,6 +436,9 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_f32(ptr addrspace(1) %out)
; GFX678-LABEL: test_fold_canonicalize_p1_f32:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX678-NEXT: s_add_i32 s12, s12, s17
+; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: v_mov_b32_e32 v2, 1.0
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v0, s0
@@ -449,6 +479,9 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_f32(ptr addrspace(1) %out)
; GFX678-LABEL: test_fold_canonicalize_n1_f32:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX678-NEXT: s_add_i32 s12, s12, s17
+; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: v_mov_b32_e32 v2, -1.0
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v0, s0
@@ -489,6 +522,9 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_f32(ptr addrspace(1) %
; GFX678-LABEL: test_fold_canonicalize_literal_f32:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX678-NEXT: s_add_i32 s12, s12, s17
+; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: v_mov_b32_e32 v2, 0x41800000
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v0, s0
@@ -529,6 +565,9 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32(ptr
; GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX678-NEXT: s_add_i32 s12, s12, s17
+; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: v_mov_b32_e32 v2, 0
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v0, s0
@@ -568,10 +607,13 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dyn
; GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX678-NEXT: s_add_i32 s12, s12, s17
; GFX678-NEXT: s_mov_b32 s2, 0x7fffff
-; GFX678-NEXT: v_mul_f32_e64 v2, 1.0, s2
+; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v0, s0
+; GFX678-NEXT: v_mul_f32_e64 v2, 1.0, s2
; GFX678-NEXT: v_mov_b32_e32 v1, s1
; GFX678-NEXT: flat_store_dword v[0:1], v2
; GFX678-NEXT: s_endpgm
@@ -612,10 +654,13 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dyn
; GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic_out:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX678-NEXT: s_add_i32 s12, s12, s17
; GFX678-NEXT: s_mov_b32 s2, 0x7fffff
-; GFX678-NEXT: v_mul_f32_e64 v2, 1.0, s2
+; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v0, s0
+; GFX678-NEXT: v_mul_f32_e64 v2, 1.0, s2
; GFX678-NEXT: v_mov_b32_e32 v1, s1
; GFX678-NEXT: flat_store_dword v[0:1], v2
; GFX678-NEXT: s_endpgm
@@ -656,10 +701,13 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dyn
; GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic_in:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX678-NEXT: s_add_i32 s12, s12, s17
; GFX678-NEXT: s_mov_b32 s2, 0x7fffff
-; GFX678-NEXT: v_mul_f32_e64 v2, 1.0, s2
+; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v0, s0
+; GFX678-NEXT: v_mul_f32_e64 v2, 1.0, s2
; GFX678-NEXT: v_mov_b32_e32 v1, s1
; GFX678-NEXT: flat_store_dword v[0:1], v2
; GFX678-NEXT: s_endpgm
@@ -700,6 +748,9 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f32(ptr ad
; GFX678-LABEL: test_denormals_fold_canonicalize_denormal0_f32:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX678-NEXT: s_add_i32 s12, s12, s17
+; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: v_mov_b32_e32 v2, 0x7fffff
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v0, s0
@@ -740,6 +791,9 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_f32(ptr
; GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal1_f32:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX678-NEXT: s_add_i32 s12, s12, s17
+; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: v_bfrev_b32_e32 v2, 1
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v0, s0
@@ -782,6 +836,9 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f32(ptr ad
; GFX678-LABEL: test_denormals_fold_canonicalize_denormal1_f32:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX678-NEXT: s_add_i32 s12, s12, s17
+; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: v_mov_b32_e32 v2, 0x807fffff
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v0, s0
@@ -822,6 +879,9 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_f32(ptr addrspace(1) %out
; GFX678-LABEL: test_fold_canonicalize_qnan_f32:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX678-NEXT: s_add_i32 s12, s12, s17
+; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: v_mov_b32_e32 v2, 0x7fc00000
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v0, s0
@@ -862,6 +922,9 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f32(ptr addrsp
; GFX678-LABEL: test_fold_canonicalize_qnan_value_neg1_f32:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX678-NEXT: s_add_i32 s12, s12, s17
+; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: v_mov_b32_e32 v2, 0x7fc00000
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v0, s0
@@ -902,6 +965,9 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f32(ptr addrsp
; GFX678-LABEL: test_fold_canonicalize_qnan_value_neg2_f32:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX678-NEXT: s_add_i32 s12, s12, s17
+; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: v_mov_b32_e32 v2, 0x7fc00000
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v0, s0
@@ -942,6 +1008,9 @@ define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f32(ptr addrspace(
; GFX678-LABEL: test_fold_canonicalize_snan0_value_f32:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX678-NEXT: s_add_i32 s12, s12, s17
+; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: v_mov_b32_e32 v2, 0x7fc00000
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v0, s0
@@ -982,6 +1051,9 @@ define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f32(ptr addrspace(
; GFX678-LABEL: test_fold_canonicalize_snan1_value_f32:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX678-NEXT: s_add_i32 s12, s12, s17
+; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: v_mov_b32_e32 v2, 0x7fc00000
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v0, s0
@@ -1022,6 +1094,9 @@ define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f32(ptr addrspace(
; GFX678-LABEL: test_fold_canonicalize_snan2_value_f32:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX678-NEXT: s_add_i32 s12, s12, s17
+; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: v_mov_b32_e32 v2, 0x7fc00000
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v0, s0
@@ -1062,6 +1137,9 @@ define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f32(ptr addrspace(
; GFX678-LABEL: test_fold_canonicalize_snan3_value_f32:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX678-NEXT: s_add_i32 s12, s12, s17
+; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: v_mov_b32_e32 v2, 0x7fc00000
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v0, s0
@@ -1102,6 +1180,9 @@ define amdgpu_kernel void @v_test_canonicalize_var_f64(ptr addrspace(1) %out) #1
; GFX678-LABEL: v_test_canonicalize_var_f64:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX678-NEXT: s_add_i32 s12, s12, s17
+; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v0, s0
; GFX678-NEXT: v_mov_b32_e32 v1, s1
@@ -1153,6 +1234,9 @@ define amdgpu_kernel void @s_test_canonicalize_var_f64(ptr addrspace(1) %out, do
; GFX6-LABEL: s_test_canonicalize_var_f64:
; GFX6: ; %bb.0:
; GFX6-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX6-NEXT: s_add_i32 s12, s12, s17
+; GFX6-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX6-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_max_f64 v[2:3], s[2:3], s[2:3]
; GFX6-NEXT: v_mov_b32_e32 v0, s0
@@ -1163,6 +1247,9 @@ define amdgpu_kernel void @s_test_canonicalize_var_f64(ptr addrspace(1) %out, do
; GFX8-LABEL: s_test_canonicalize_var_f64:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX8-NEXT: s_add_i32 s12, s12, s17
+; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_max_f64 v[0:1], s[2:3], s[2:3]
; GFX8-NEXT: v_mov_b32_e32 v2, s0
@@ -1205,6 +1292,9 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_f64(ptr addrspace(1) %ou
; GFX678-LABEL: v_test_canonicalize_fabs_var_f64:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX678-NEXT: s_add_i32 s12, s12, s17
+; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v0, s0
; GFX678-NEXT: v_mov_b32_e32 v1, s1
@@ -1257,6 +1347,9 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f64(ptr addrspace(1
; GFX678-LABEL: v_test_canonicalize_fneg_fabs_var_f64:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX678-NEXT: s_add_i32 s12, s12, s17
+; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v0, s0
; GFX678-NEXT: v_mov_b32_e32 v1, s1
@@ -1310,6 +1403,9 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_f64(ptr addrspace(1) %ou
; GFX678-LABEL: v_test_canonicalize_fneg_var_f64:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX678-NEXT: s_add_i32 s12, s12, s17
+; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v0, s0
; GFX678-NEXT: v_mov_b32_e32 v1, s1
@@ -1362,10 +1458,13 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_f64(ptr addrspace(1) %out)
; GFX678-LABEL: test_fold_canonicalize_p0_f64:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX678-NEXT: s_add_i32 s12, s12, s17
; GFX678-NEXT: v_mov_b32_e32 v0, 0
-; GFX678-NEXT: v_mov_b32_e32 v1, v0
+; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v3, s1
+; GFX678-NEXT: v_mov_b32_e32 v1, v0
; GFX678-NEXT: v_mov_b32_e32 v2, s0
; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX678-NEXT: s_endpgm
@@ -1407,10 +1506,13 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_f64(ptr addrspace(1) %out)
; GFX678-LABEL: test_fold_canonicalize_n0_f64:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX678-NEXT: s_add_i32 s12, s12, s17
+; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: v_mov_b32_e32 v0, 0
-; GFX678-NEXT: v_bfrev_b32_e32 v1, 1
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v3, s1
+; GFX678-NEXT: v_bfrev_b32_e32 v1, 1
; GFX678-NEXT: v_mov_b32_e32 v2, s0
; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX678-NEXT: s_endpgm
@@ -1450,10 +1552,13 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_f64(ptr addrspace(1) %out)
; GFX678-LABEL: test_fold_canonicalize_p1_f64:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX678-NEXT: s_add_i32 s12, s12, s17
+; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: v_mov_b32_e32 v0, 0
-; GFX678-NEXT: v_mov_b32_e32 v1, 0x3ff00000
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v3, s1
+; GFX678-NEXT: v_mov_b32_e32 v1, 0x3ff00000
; GFX678-NEXT: v_mov_b32_e32 v2, s0
; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX678-NEXT: s_endpgm
@@ -1491,10 +1596,13 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_f64(ptr addrspace(1) %out)
; GFX678-LABEL: test_fold_canonicalize_n1_f64:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX678-NEXT: s_add_i32 s12, s12, s17
+; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: v_mov_b32_e32 v0, 0
-; GFX678-NEXT: v_mov_b32_e32 v1, 0xbff00000
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v3, s1
+; GFX678-NEXT: v_mov_b32_e32 v1, 0xbff00000
; GFX678-NEXT: v_mov_b32_e32 v2, s0
; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX678-NEXT: s_endpgm
@@ -1532,10 +1640,13 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_f64(ptr addrspace(1) %
; GFX678-LABEL: test_fold_canonicalize_literal_f64:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX678-NEXT: s_add_i32 s12, s12, s17
+; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: v_mov_b32_e32 v0, 0
-; GFX678-NEXT: v_mov_b32_e32 v1, 0x40300000
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v3, s1
+; GFX678-NEXT: v_mov_b32_e32 v1, 0x40300000
; GFX678-NEXT: v_mov_b32_e32 v2, s0
; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX678-NEXT: s_endpgm
@@ -1573,10 +1684,13 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f64(ptr
; GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal0_f64:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX678-NEXT: s_add_i32 s12, s12, s17
; GFX678-NEXT: v_mov_b32_e32 v0, 0
-; GFX678-NEXT: v_mov_b32_e32 v1, v0
+; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v3, s1
+; GFX678-NEXT: v_mov_b32_e32 v1, v0
; GFX678-NEXT: v_mov_b32_e32 v2, s0
; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX678-NEXT: s_endpgm
@@ -1618,10 +1732,13 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f64(ptr ad
; GFX678-LABEL: test_denormals_fold_canonicalize_denormal0_f64:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX678-NEXT: s_add_i32 s12, s12, s17
+; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: v_mov_b32_e32 v0, -1
-; GFX678-NEXT: v_mov_b32_e32 v1, 0xfffff
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v3, s1
+; GFX678-NEXT: v_mov_b32_e32 v1, 0xfffff
; GFX678-NEXT: v_mov_b32_e32 v2, s0
; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX678-NEXT: s_endpgm
@@ -1662,10 +1779,13 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_f64(ptr
; GFX678-LABEL: test_no_denormals_fold_canonicalize_denormal1_f64:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX678-NEXT: s_add_i32 s12, s12, s17
+; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: v_mov_b32_e32 v0, 0
-; GFX678-NEXT: v_bfrev_b32_e32 v1, 1
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v3, s1
+; GFX678-NEXT: v_bfrev_b32_e32 v1, 1
; GFX678-NEXT: v_mov_b32_e32 v2, s0
; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX678-NEXT: s_endpgm
@@ -1705,10 +1825,13 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f64(ptr ad
; GFX678-LABEL: test_denormals_fold_canonicalize_denormal1_f64:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX678-NEXT: s_add_i32 s12, s12, s17
+; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: v_mov_b32_e32 v0, -1
-; GFX678-NEXT: v_mov_b32_e32 v1, 0x800fffff
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v3, s1
+; GFX678-NEXT: v_mov_b32_e32 v1, 0x800fffff
; GFX678-NEXT: v_mov_b32_e32 v2, s0
; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX678-NEXT: s_endpgm
@@ -1749,10 +1872,13 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_f64(ptr addrspace(1) %out
; GFX678-LABEL: test_fold_canonicalize_qnan_f64:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX678-NEXT: s_add_i32 s12, s12, s17
+; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: v_mov_b32_e32 v0, 0
-; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v3, s1
+; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000
; GFX678-NEXT: v_mov_b32_e32 v2, s0
; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX678-NEXT: s_endpgm
@@ -1790,10 +1916,13 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f64(ptr addrsp
; GFX678-LABEL: test_fold_canonicalize_qnan_value_neg1_f64:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX678-NEXT: s_add_i32 s12, s12, s17
+; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: v_mov_b32_e32 v0, 0
-; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v3, s1
+; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000
; GFX678-NEXT: v_mov_b32_e32 v2, s0
; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX678-NEXT: s_endpgm
@@ -1831,10 +1960,13 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f64(ptr addrsp
; GFX678-LABEL: test_fold_canonicalize_qnan_value_neg2_f64:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX678-NEXT: s_add_i32 s12, s12, s17
+; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: v_mov_b32_e32 v0, 0
-; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v3, s1
+; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000
; GFX678-NEXT: v_mov_b32_e32 v2, s0
; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX678-NEXT: s_endpgm
@@ -1872,10 +2004,13 @@ define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f64(ptr addrspace(
; GFX678-LABEL: test_fold_canonicalize_snan0_value_f64:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX678-NEXT: s_add_i32 s12, s12, s17
+; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: v_mov_b32_e32 v0, 0
-; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v3, s1
+; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000
; GFX678-NEXT: v_mov_b32_e32 v2, s0
; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX678-NEXT: s_endpgm
@@ -1913,10 +2048,13 @@ define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f64(ptr addrspace(
; GFX678-LABEL: test_fold_canonicalize_snan1_value_f64:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX678-NEXT: s_add_i32 s12, s12, s17
+; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: v_mov_b32_e32 v0, 0
-; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v3, s1
+; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000
; GFX678-NEXT: v_mov_b32_e32 v2, s0
; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX678-NEXT: s_endpgm
@@ -1954,10 +2092,13 @@ define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f64(ptr addrspace(
; GFX678-LABEL: test_fold_canonicalize_snan2_value_f64:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX678-NEXT: s_add_i32 s12, s12, s17
+; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: v_mov_b32_e32 v0, 0
-; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v3, s1
+; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000
; GFX678-NEXT: v_mov_b32_e32 v2, s0
; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX678-NEXT: s_endpgm
@@ -1995,10 +2136,13 @@ define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f64(ptr addrspace(
; GFX678-LABEL: test_fold_canonicalize_snan3_value_f64:
; GFX678: ; %bb.0:
; GFX678-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX678-NEXT: s_add_i32 s12, s12, s17
+; GFX678-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX678-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX678-NEXT: v_mov_b32_e32 v0, 0
-; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000
; GFX678-NEXT: s_waitcnt lgkmcnt(0)
; GFX678-NEXT: v_mov_b32_e32 v3, s1
+; GFX678-NEXT: v_mov_b32_e32 v1, 0x7ff80000
; GFX678-NEXT: v_mov_b32_e32 v2, s0
; GFX678-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; GFX678-NEXT: s_endpgm
@@ -2037,6 +2181,9 @@ define amdgpu_kernel void @test_canonicalize_value_f64_flush(ptr addrspace(1) %a
; GFX6: ; %bb.0:
; GFX6-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX6-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; GFX6-NEXT: s_add_i32 s12, s12, s17
+; GFX6-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX6-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v1, s1
; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v2
@@ -2054,6 +2201,9 @@ define amdgpu_kernel void @test_canonicalize_value_f64_flush(ptr addrspace(1) %a
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; GFX8-NEXT: s_add_i32 s12, s12, s17
+; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
@@ -2117,6 +2267,9 @@ define amdgpu_kernel void @test_canonicalize_value_f32_flush(ptr addrspace(1) %a
; GFX6: ; %bb.0:
; GFX6-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX6-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GFX6-NEXT: s_add_i32 s12, s12, s17
+; GFX6-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX6-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v1, s1
; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v2
@@ -2134,6 +2287,9 @@ define amdgpu_kernel void @test_canonicalize_value_f32_flush(ptr addrspace(1) %a
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GFX8-NEXT: s_add_i32 s12, s12, s17
+; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
@@ -2197,6 +2353,9 @@ define amdgpu_kernel void @test_canonicalize_value_f16_flush(ptr addrspace(1) %a
; GFX6: ; %bb.0:
; GFX6-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX6-NEXT: v_lshlrev_b32_e32 v2, 1, v0
+; GFX6-NEXT: s_add_i32 s12, s12, s17
+; GFX6-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX6-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v1, s1
; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v2
@@ -2215,6 +2374,9 @@ define amdgpu_kernel void @test_canonicalize_value_f16_flush(ptr addrspace(1) %a
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0
+; GFX8-NEXT: s_add_i32 s12, s12, s17
+; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
@@ -2279,6 +2441,9 @@ define amdgpu_kernel void @test_canonicalize_value_v2f16_flush(ptr addrspace(1)
; GFX6: ; %bb.0:
; GFX6-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX6-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GFX6-NEXT: s_add_i32 s12, s12, s17
+; GFX6-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX6-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v1, s1
; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v2
@@ -2302,6 +2467,9 @@ define amdgpu_kernel void @test_canonicalize_value_v2f16_flush(ptr addrspace(1)
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GFX8-NEXT: s_add_i32 s12, s12, s17
+; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
@@ -2368,6 +2536,9 @@ define amdgpu_kernel void @test_canonicalize_value_f64_denorm(ptr addrspace(1) %
; GFX6: ; %bb.0:
; GFX6-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX6-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; GFX6-NEXT: s_add_i32 s12, s12, s17
+; GFX6-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX6-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v1, s1
; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v2
@@ -2385,6 +2556,9 @@ define amdgpu_kernel void @test_canonicalize_value_f64_denorm(ptr addrspace(1) %
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; GFX8-NEXT: s_add_i32 s12, s12, s17
+; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
@@ -2448,6 +2622,9 @@ define amdgpu_kernel void @test_canonicalize_value_f32_denorm(ptr addrspace(1) %
; GFX6: ; %bb.0:
; GFX6-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX6-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GFX6-NEXT: s_add_i32 s12, s12, s17
+; GFX6-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX6-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v1, s1
; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v2
@@ -2465,6 +2642,9 @@ define amdgpu_kernel void @test_canonicalize_value_f32_denorm(ptr addrspace(1) %
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GFX8-NEXT: s_add_i32 s12, s12, s17
+; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
@@ -2529,6 +2709,9 @@ define amdgpu_kernel void @test_canonicalize_value_f16_denorm(ptr addrspace(1) %
; GFX6: ; %bb.0:
; GFX6-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX6-NEXT: v_lshlrev_b32_e32 v2, 1, v0
+; GFX6-NEXT: s_add_i32 s12, s12, s17
+; GFX6-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX6-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v1, s1
; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v2
@@ -2547,6 +2730,9 @@ define amdgpu_kernel void @test_canonicalize_value_f16_denorm(ptr addrspace(1) %
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 1, v0
+; GFX8-NEXT: s_add_i32 s12, s12, s17
+; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
@@ -2612,6 +2798,9 @@ define amdgpu_kernel void @test_canonicalize_value_v2f16_denorm(ptr addrspace(1)
; GFX6: ; %bb.0:
; GFX6-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX6-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GFX6-NEXT: s_add_i32 s12, s12, s17
+; GFX6-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX6-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v1, s1
; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v2
@@ -2635,6 +2824,9 @@ define amdgpu_kernel void @test_canonicalize_value_v2f16_denorm(ptr addrspace(1)
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GFX8-NEXT: s_add_i32 s12, s12, s17
+; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
@@ -2700,6 +2892,9 @@ define amdgpu_kernel void @v_test_canonicalize_var_v2f64(ptr addrspace(1) %out)
; GFX6: ; %bb.0:
; GFX6-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX6-NEXT: v_lshlrev_b32_e32 v0, 4, v0
+; GFX6-NEXT: s_add_i32 s12, s12, s17
+; GFX6-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX6-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: v_mov_b32_e32 v1, s1
; GFX6-NEXT: v_add_i32_e32 v0, vcc, s0, v0
@@ -2717,6 +2912,9 @@ define amdgpu_kernel void @v_test_canonicalize_var_v2f64(ptr addrspace(1) %out)
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 4, v0
+; GFX8-NEXT: s_add_i32 s12, s12, s17
+; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
diff --git a/llvm/test/CodeGen/AMDGPU/flat-for-global-subtarget-feature.ll b/llvm/test/CodeGen/AMDGPU/flat-for-global-subtarget-feature.ll
index fee6540f43c64..513befe6e19e5 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-for-global-subtarget-feature.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-for-global-subtarget-feature.ll
@@ -11,13 +11,14 @@
; ALL-LABEL: {{^}}test:
-; ALL-NOT: flat_scr
+; HSA-DEFAULT: flat_scr
+; HSA-NODEFAULT-NOT: flat_scr
; HSA-DEFAULT: flat_store_dword
; HSA-NODEFAULT: buffer_store_dword
; HSA-NOADDR64: flat_store_dword
-; HSA: .amdhsa_user_sgpr_flat_scratch_init 0
+; HSA: .amdhsa_user_sgpr_flat_scratch_init 1
; NOHSA-DEFAULT: buffer_store_dword
; NOHSA-NODEFAULT: flat_store_dword
@@ -28,6 +29,8 @@ entry:
ret void
}
+; ALL-LABEL: {{^}}test_addr64:
+
; HSA-DEFAULT: flat_store_dword
; HSA-NODEFAULT: buffer_store_dword
; HSA-NOADDR64: flat_store_dword
diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch-reg.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch-reg.ll
index 45223a24e021a..a59382ba20dc5 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-scratch-reg.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-scratch-reg.ll
@@ -8,28 +8,34 @@
; RUN: llc < %s -mtriple=amdgcn -mcpu=stoney -mattr=+xnack | FileCheck -check-prefix=VI-XNACK -check-prefix=GCN %s
; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=kaveri | FileCheck -check-prefixes=GCN %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=carrizo -mattr=-xnack | FileCheck -check-prefixes=VI-NOXNACK,HSA-VI-NOXNACK,GCN %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=carrizo -mattr=+xnack | FileCheck -check-prefixes=VI-XNACK,HSA-VI-XNACK,GCN %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=carrizo -mattr=-xnack | FileCheck -check-prefixes=HSA-VI-NOXNACK,GCN %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=carrizo -mattr=+xnack | FileCheck -check-prefixes=HSA-VI-XNACK,GCN %s
; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx900 -mattr=+architected-flat-scratch | FileCheck -check-prefixes=GCN %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx900 -mattr=+architected-flat-scratch,-xnack | FileCheck -check-prefixes=HSA-VI-NOXNACK,GFX9-ARCH-FLAT,GCN %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx900 -mattr=+architected-flat-scratch,+xnack | FileCheck -check-prefixes=HSA-VI-XNACK,GFX9-ARCH-FLAT,GCN %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx900 -mattr=+architected-flat-scratch,-xnack | FileCheck -check-prefixes=GFX9-ARCH-FLAT-NOXNACK,GCN %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx900 -mattr=+architected-flat-scratch,+xnack | FileCheck -check-prefixes=GFX9-ARCH-FLAT-XNACK,GCN %s
; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -mattr=+architected-flat-scratch | FileCheck -check-prefixes=GCN %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -mattr=+architected-flat-scratch,-xnack | FileCheck -check-prefixes=HSA-VI-NOXNACK,GFX10-ARCH-FLAT,GCN %s
-; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -mattr=+architected-flat-scratch,+xnack | FileCheck -check-prefixes=HSA-VI-XNACK,GFX10-ARCH-FLAT,GCN %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -mattr=+architected-flat-scratch,-xnack | FileCheck -check-prefixes=GFX10-ARCH-FLAT-NOXNACK,GCN %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -mattr=+architected-flat-scratch,+xnack | FileCheck -check-prefixes=GFX10-ARCH-FLAT-XNACK,GCN %s
; GCN-LABEL: {{^}}no_vcc_no_flat:
; NOT-HSA-CI: .amdhsa_reserve_xnack_mask
; HSA-VI-NOXNACK: .amdhsa_reserve_xnack_mask 0
; HSA-VI-XNACK: .amdhsa_reserve_xnack_mask 1
+; GFX9-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1
+; GFX10-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1
; CI: ; TotalNumSgprs: 8
; VI-NOXNACK: ; TotalNumSgprs: 8
+; HSA-VI-NOXNACK: ; TotalNumSgprs: 8
; VI-XNACK: ; TotalNumSgprs: 12
-; GFX9-ARCH-FLAT: ; TotalNumSgprs: 14
-; GFX10-ARCH-FLAT: ; TotalNumSgprs: 8
+; HSA-VI-XNACK: ; TotalNumSgprs: 12
+; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 14
+; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 14
+; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 8
+; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 8
define amdgpu_kernel void @no_vcc_no_flat() {
entry:
call void asm sideeffect "", "~{s7}"()
@@ -41,12 +47,18 @@ entry:
; NOT-HSA-CI: .amdhsa_reserve_xnack_mask
; HSA-VI-NOXNACK: .amdhsa_reserve_xnack_mask 0
; HSA-VI-XNACK: .amdhsa_reserve_xnack_mask 1
+; GFX9-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1
+; GFX10-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1
; CI: ; TotalNumSgprs: 10
; VI-NOXNACK: ; TotalNumSgprs: 10
+; HSA-VI-NOXNACK: ; TotalNumSgprs: 10
; VI-XNACK: ; TotalNumSgprs: 12
-; GFX9-ARCH-FLAT: ; TotalNumSgprs: 14
-; GFX10-ARCH-FLAT: ; TotalNumSgprs: 10
+; HSA-VI-XNACK: ; TotalNumSgprs: 12
+; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 14
+; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 14
+; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 10
+; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 10
define amdgpu_kernel void @vcc_no_flat() {
entry:
call void asm sideeffect "", "~{s7},~{vcc}"()
@@ -58,12 +70,18 @@ entry:
; NOT-HSA-CI: .amdhsa_reserve_xnack_mask
; HSA-VI-NOXNACK: .amdhsa_reserve_xnack_mask 0
; HSA-VI-XNACK: .amdhsa_reserve_xnack_mask 1
+; GFX9-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1
+; GFX10-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1
; CI: ; TotalNumSgprs: 12
; VI-NOXNACK: ; TotalNumSgprs: 14
+; HSA-VI-NOXNACK: ; TotalNumSgprs: 24
; VI-XNACK: ; TotalNumSgprs: 14
-; GFX9-ARCH-FLAT: ; TotalNumSgprs: 14
-; GFX10-ARCH-FLAT: ; TotalNumSgprs: 8
+; HSA-VI-XNACK: ; TotalNumSgprs: 24
+; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 14
+; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 14
+; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 8
+; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 8
define amdgpu_kernel void @no_vcc_flat() {
entry:
call void asm sideeffect "", "~{s7},~{flat_scratch}"()
@@ -75,12 +93,18 @@ entry:
; NOT-HSA-CI: .amdhsa_reserve_xnack_mask
; HSA-VI-NOXNACK: .amdhsa_reserve_xnack_mask 0
; HSA-VI-XNACK: .amdhsa_reserve_xnack_mask 1
+; GFX9-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1
+; GFX10-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1
; CI: ; TotalNumSgprs: 12
; VI-NOXNACK: ; TotalNumSgprs: 14
+; HSA-VI-NOXNACK: ; TotalNumSgprs: 24
; VI-XNACK: ; TotalNumSgprs: 14
-; GFX9-ARCH-FLAT: ; TotalNumSgprs: 14
-; GFX10-ARCH-FLAT: ; TotalNumSgprs: 10
+; HSA-VI-XNACK: ; TotalNumSgprs: 24
+; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 14
+; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 14
+; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 10
+; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 10
define amdgpu_kernel void @vcc_flat() {
entry:
call void asm sideeffect "", "~{s7},~{vcc},~{flat_scratch}"()
@@ -95,12 +119,18 @@ entry:
; NOT-HSA-CI: .amdhsa_reserve_xnack_mask
; HSA-VI-NOXNACK: .amdhsa_reserve_xnack_mask 0
; HSA-VI-XNACK: .amdhsa_reserve_xnack_mask 1
+; GFX9-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1
+; GFX10-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1
; CI: NumSgprs: 4
; VI-NOXNACK: NumSgprs: 6
+; HSA-VI-NOXNACK: NumSgprs: 24
; VI-XNACK: NumSgprs: 6
-; GFX9-ARCH-FLAT: ; TotalNumSgprs: 6
-; GFX10-ARCH-FLAT: ; TotalNumSgprs: 0
+; HSA-VI-XNACK: NumSgprs: 24
+; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 6
+; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 6
+; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 0
+; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 0
define amdgpu_kernel void @use_flat_scr() #0 {
entry:
call void asm sideeffect "; clobber ", "~{flat_scratch}"()
@@ -115,9 +145,13 @@ entry:
; CI: NumSgprs: 4
; VI-NOXNACK: NumSgprs: 6
+; HSA-VI-NOXNACK: NumSgprs: 24
; VI-XNACK: NumSgprs: 6
-; GFX9-ARCH-FLAT: ; TotalNumSgprs: 6
-; GFX10-ARCH-FLAT: ; TotalNumSgprs: 0
+; HSA-VI-XNACK: NumSgprs: 24
+; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 6
+; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 6
+; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 0
+; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 0
define amdgpu_kernel void @use_flat_scr_lo() #0 {
entry:
call void asm sideeffect "; clobber ", "~{flat_scratch_lo}"()
@@ -129,12 +163,18 @@ entry:
; NOT-HSA-CI: .amdhsa_reserve_xnack_mask
; HSA-VI-NOXNACK: .amdhsa_reserve_xnack_mask 0
; HSA-VI-XNACK: .amdhsa_reserve_xnack_mask 1
+; GFX9-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1
+; GFX10-ARCH-FLAT-XNACK: .amdhsa_reserve_xnack_mask 1
; CI: NumSgprs: 4
; VI-NOXNACK: NumSgprs: 6
+; HSA-VI-NOXNACK: NumSgprs: 24
; VI-XNACK: NumSgprs: 6
-; GFX9-ARCH-FLAT: ; TotalNumSgprs: 6
-; GFX10-ARCH-FLAT: ; TotalNumSgprs: 0
+; HSA-VI-XNACK: NumSgprs: 24
+; GFX9-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 6
+; GFX9-ARCH-FLAT-XNACK: ; TotalNumSgprs: 6
+; GFX10-ARCH-FLAT-NOXNACK: ; TotalNumSgprs: 0
+; GFX10-ARCH-FLAT-XNACK: ; TotalNumSgprs: 0
define amdgpu_kernel void @use_flat_scr_hi() #0 {
entry:
call void asm sideeffect "; clobber ", "~{flat_scratch_hi}"()
diff --git a/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll b/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll
index 64be9cb72a6ee..fb2448fb80744 100644
--- a/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll
@@ -16,6 +16,9 @@ define amdgpu_kernel void @multiple_fadd_use_test_f32(ptr addrspace(1) %out, flo
; VI-LABEL: multiple_fadd_use_test_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_add_f32_e64 v0, s3, -1.0
; VI-NEXT: v_add_f32_e64 v1, s2, -1.0
@@ -80,8 +83,11 @@ define amdgpu_kernel void @multiple_use_fadd_fmac_f32(ptr addrspace(1) %out, flo
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT: s_load_dword s4, s[8:9], 0x8
; VI-NEXT: s_load_dword s3, s[8:9], 0x2c
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_add_u32 s2, s0, 4
; VI-NEXT: v_add_f32_e64 v2, s4, s4
; VI-NEXT: v_mov_b32_e32 v1, s1
@@ -139,6 +145,9 @@ define amdgpu_kernel void @multiple_use_fadd_fmad_f32(ptr addrspace(1) %out, flo
; VI-LABEL: multiple_use_fadd_fmad_f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: s_add_u32 s4, s0, 4
@@ -194,6 +203,9 @@ define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f32(ptr addrspace(1) %ou
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x8
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s6, s4, 4
; VI-NEXT: v_mov_b32_e32 v0, s1
@@ -255,6 +267,9 @@ define amdgpu_kernel void @fmul_x2_xn2_f32(ptr addrspace(1) %out, float %x, floa
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mul_f32_e64 v0, s2, -4.0
; VI-NEXT: v_mul_f32_e32 v2, s2, v0
@@ -303,10 +318,13 @@ define amdgpu_kernel void @fmul_x2_xn3_f32(ptr addrspace(1) %out, float %x, floa
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT: v_mov_b32_e32 v0, 0xc0c00000
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mul_f32_e32 v0, s2, v0
; VI-NEXT: v_mul_f32_e32 v2, s2, v0
; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_waitcnt vmcnt(0)
@@ -350,6 +368,9 @@ define amdgpu_kernel void @multiple_fadd_use_test_f16(ptr addrspace(1) %out, i16
; VI-DENORM: ; %bb.0:
; VI-DENORM-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-DENORM-NEXT: s_add_i32 s12, s12, s17
+; VI-DENORM-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-DENORM-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0)
; VI-DENORM-NEXT: s_lshr_b32 s3, s2, 16
; VI-DENORM-NEXT: v_add_f16_e64 v0, s2, -1.0
@@ -368,6 +389,9 @@ define amdgpu_kernel void @multiple_fadd_use_test_f16(ptr addrspace(1) %out, i16
; VI-FLUSH: ; %bb.0:
; VI-FLUSH-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-FLUSH-NEXT: s_add_i32 s12, s12, s17
+; VI-FLUSH-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-FLUSH-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
; VI-FLUSH-NEXT: s_lshr_b32 s3, s2, 16
; VI-FLUSH-NEXT: v_add_f16_e64 v0, s2, -1.0
@@ -482,6 +506,9 @@ define amdgpu_kernel void @multiple_use_fadd_fmac_f16(ptr addrspace(1) %out, i16
; VI-DENORM: ; %bb.0:
; VI-DENORM-NEXT: s_load_dword s4, s[8:9], 0x8
; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-DENORM-NEXT: s_add_i32 s12, s12, s17
+; VI-DENORM-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-DENORM-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0)
; VI-DENORM-NEXT: s_lshr_b32 s3, s4, 16
; VI-DENORM-NEXT: v_mov_b32_e32 v0, s3
@@ -503,6 +530,9 @@ define amdgpu_kernel void @multiple_use_fadd_fmac_f16(ptr addrspace(1) %out, i16
; VI-FLUSH: ; %bb.0:
; VI-FLUSH-NEXT: s_load_dword s4, s[8:9], 0x8
; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-FLUSH-NEXT: s_add_i32 s12, s12, s17
+; VI-FLUSH-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-FLUSH-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
; VI-FLUSH-NEXT: s_lshr_b32 s3, s4, 16
; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s0
@@ -599,6 +629,9 @@ define amdgpu_kernel void @multiple_use_fadd_fmad_f16(ptr addrspace(1) %out, i16
; VI-DENORM: ; %bb.0:
; VI-DENORM-NEXT: s_load_dword s4, s[8:9], 0x8
; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-DENORM-NEXT: s_add_i32 s12, s12, s17
+; VI-DENORM-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-DENORM-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0)
; VI-DENORM-NEXT: s_lshr_b32 s3, s4, 16
; VI-DENORM-NEXT: v_mov_b32_e32 v0, s3
@@ -620,6 +653,9 @@ define amdgpu_kernel void @multiple_use_fadd_fmad_f16(ptr addrspace(1) %out, i16
; VI-FLUSH: ; %bb.0:
; VI-FLUSH-NEXT: s_load_dword s4, s[8:9], 0x8
; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-FLUSH-NEXT: s_add_i32 s12, s12, s17
+; VI-FLUSH-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-FLUSH-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
; VI-FLUSH-NEXT: s_lshr_b32 s3, s4, 16
; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s3
@@ -718,6 +754,8 @@ define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f16(ptr addrspace(1) %ou
; VI-DENORM-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x8
; VI-DENORM-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0
; VI-DENORM-NEXT: s_load_dword s6, s[8:9], 0x8
+; VI-DENORM-NEXT: s_add_i32 s12, s12, s17
+; VI-DENORM-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-DENORM-NEXT: s_waitcnt lgkmcnt(0)
; VI-DENORM-NEXT: s_lshr_b32 s0, s0, 16
; VI-DENORM-NEXT: v_mov_b32_e32 v0, s0
@@ -725,6 +763,7 @@ define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f16(ptr addrspace(1) %ou
; VI-DENORM-NEXT: v_mov_b32_e32 v0, s1
; VI-DENORM-NEXT: v_fma_f16 v3, |s6|, 2.0, v0
; VI-DENORM-NEXT: v_mov_b32_e32 v0, s2
+; VI-DENORM-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-DENORM-NEXT: s_add_u32 s4, s2, 2
; VI-DENORM-NEXT: v_mov_b32_e32 v1, s3
; VI-DENORM-NEXT: s_addc_u32 s5, s3, 0
@@ -741,6 +780,8 @@ define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f16(ptr addrspace(1) %ou
; VI-FLUSH-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x8
; VI-FLUSH-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0
; VI-FLUSH-NEXT: s_load_dword s6, s[8:9], 0x8
+; VI-FLUSH-NEXT: s_add_i32 s12, s12, s17
+; VI-FLUSH-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-FLUSH-NEXT: s_waitcnt lgkmcnt(0)
; VI-FLUSH-NEXT: s_lshr_b32 s0, s0, 16
; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s0
@@ -748,6 +789,7 @@ define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f16(ptr addrspace(1) %ou
; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s1
; VI-FLUSH-NEXT: v_mad_f16 v3, |s6|, 2.0, v0
; VI-FLUSH-NEXT: v_mov_b32_e32 v0, s2
+; VI-FLUSH-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-FLUSH-NEXT: s_add_u32 s4, s2, 2
; VI-FLUSH-NEXT: v_mov_b32_e32 v1, s3
; VI-FLUSH-NEXT: s_addc_u32 s5, s3, 0
@@ -847,6 +889,9 @@ define amdgpu_kernel void @fmul_x2_xn2_f16(ptr addrspace(1) %out, i16 zeroext %x
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mul_f16_e64 v0, s2, -4.0
; VI-NEXT: v_mul_f16_e32 v2, s2, v0
@@ -898,10 +943,13 @@ define amdgpu_kernel void @fmul_x2_xn3_f16(ptr addrspace(1) %out, i16 zeroext %x
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT: v_mov_b32_e32 v0, 0xc600
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mul_f16_e32 v0, s2, v0
; VI-NEXT: v_mul_f16_e32 v2, s2, v0
; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_short v[0:1], v2
; VI-NEXT: s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll
index d6f6d440f9a83..a0179a1cae1e2 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-fabs.f16.ll
@@ -8,6 +8,9 @@ define amdgpu_kernel void @fneg_fabs_fadd_f16(ptr addrspace(1) %out, half %x, ha
; CI-LABEL: fneg_fabs_fadd_f16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s0, s[8:9], 0x2
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_cvt_f32_f16_e64 v0, |s0|
; CI-NEXT: s_lshr_b32 s0, s0, 16
@@ -25,6 +28,9 @@ define amdgpu_kernel void @fneg_fabs_fadd_f16(ptr addrspace(1) %out, half %x, ha
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s3, s2, 16
; VI-NEXT: v_mov_b32_e32 v0, s2
@@ -69,6 +75,9 @@ define amdgpu_kernel void @fneg_fabs_fmul_f16(ptr addrspace(1) %out, half %x, ha
; CI-LABEL: fneg_fabs_fmul_f16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s0, s[8:9], 0x2
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_and_b32 s1, s0, 0x7fff
; CI-NEXT: s_lshr_b32 s0, s0, 16
@@ -87,6 +96,9 @@ define amdgpu_kernel void @fneg_fabs_fmul_f16(ptr addrspace(1) %out, half %x, ha
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s3, s2, 16
; VI-NEXT: v_mov_b32_e32 v0, s2
@@ -135,6 +147,9 @@ define amdgpu_kernel void @fneg_fabs_free_f16(ptr addrspace(1) %out, i16 %in) {
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s2, s[8:9], 0x2
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_bitset1_b32 s2, 15
; CI-NEXT: v_mov_b32_e32 v0, s0
@@ -147,6 +162,9 @@ define amdgpu_kernel void @fneg_fabs_free_f16(ptr addrspace(1) %out, i16 %in) {
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_bitset1_b32 s2, 15
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -189,6 +207,9 @@ define amdgpu_kernel void @fneg_fabs_f16(ptr addrspace(1) %out, half %in) {
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s2, s[8:9], 0x2
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_bitset1_b32 s2, 15
; CI-NEXT: v_mov_b32_e32 v0, s0
@@ -201,6 +222,9 @@ define amdgpu_kernel void @fneg_fabs_f16(ptr addrspace(1) %out, half %in) {
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_bitset1_b32 s2, 15
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -241,6 +265,9 @@ define amdgpu_kernel void @v_fneg_fabs_f16(ptr addrspace(1) %out, ptr addrspace(
; CIVI-LABEL: v_fneg_fabs_f16:
; CIVI: ; %bb.0:
; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CIVI-NEXT: s_add_i32 s12, s12, s17
+; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CIVI-NEXT: s_waitcnt lgkmcnt(0)
; CIVI-NEXT: v_mov_b32_e32 v0, s2
; CIVI-NEXT: v_mov_b32_e32 v1, s3
@@ -284,6 +311,9 @@ define amdgpu_kernel void @s_fneg_fabs_v2f16_non_bc_src(ptr addrspace(1) %out, <
; CI-LABEL: s_fneg_fabs_v2f16_non_bc_src:
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s0, s[8:9], 0x2
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_lshr_b32 s1, s0, 16
; CI-NEXT: v_cvt_f32_f16_e32 v1, s1
@@ -306,7 +336,9 @@ define amdgpu_kernel void @s_fneg_fabs_v2f16_non_bc_src(ptr addrspace(1) %out, <
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
; VI-NEXT: v_mov_b32_e32 v0, 0x4000
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s3, s2, 16
; VI-NEXT: v_mov_b32_e32 v2, s3
@@ -315,6 +347,7 @@ define amdgpu_kernel void @s_fneg_fabs_v2f16_non_bc_src(ptr addrspace(1) %out, <
; VI-NEXT: v_or_b32_e32 v0, v1, v0
; VI-NEXT: v_or_b32_e32 v2, 0x80008000, v0
; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
@@ -357,6 +390,9 @@ define amdgpu_kernel void @s_fneg_fabs_v2f16_bc_src(ptr addrspace(1) %out, <2 x
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s2, s[8:9], 0x2
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_or_b32 s2, s2, 0x80008000
; CI-NEXT: v_mov_b32_e32 v0, s0
@@ -369,6 +405,9 @@ define amdgpu_kernel void @s_fneg_fabs_v2f16_bc_src(ptr addrspace(1) %out, <2 x
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_or_b32 s2, s2, 0x80008000
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -409,6 +448,9 @@ define amdgpu_kernel void @fneg_fabs_v4f16(ptr addrspace(1) %out, <4 x half> %in
; CIVI-LABEL: fneg_fabs_v4f16:
; CIVI: ; %bb.0:
; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CIVI-NEXT: s_add_i32 s12, s12, s17
+; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CIVI-NEXT: s_waitcnt lgkmcnt(0)
; CIVI-NEXT: s_or_b32 s3, s3, 0x80008000
; CIVI-NEXT: s_or_b32 s2, s2, 0x80008000
@@ -452,6 +494,9 @@ define amdgpu_kernel void @fold_user_fneg_fabs_v2f16(ptr addrspace(1) %out, <2 x
; CI-LABEL: fold_user_fneg_fabs_v2f16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s0, s[8:9], 0x2
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_lshr_b32 s1, s0, 16
; CI-NEXT: v_cvt_f32_f16_e64 v1, |s1|
@@ -473,7 +518,9 @@ define amdgpu_kernel void @fold_user_fneg_fabs_v2f16(ptr addrspace(1) %out, <2 x
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
; VI-NEXT: v_mov_b32_e32 v0, 0xc400
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s3, s2, 16
; VI-NEXT: v_mov_b32_e32 v2, s3
@@ -481,6 +528,7 @@ define amdgpu_kernel void @fold_user_fneg_fabs_v2f16(ptr addrspace(1) %out, <2 x
; VI-NEXT: v_mul_f16_sdwa v0, |v2|, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_or_b32_e32 v2, v1, v0
; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: flat_store_dword v[0:1], v2
; VI-NEXT: s_endpgm
@@ -520,6 +568,9 @@ define amdgpu_kernel void @s_fneg_multi_use_fabs_v2f16(ptr addrspace(1) %out0, p
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dword s4, s[8:9], 0x4
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: s_and_b32 s0, s4, 0x7fff7fff
@@ -537,6 +588,9 @@ define amdgpu_kernel void @s_fneg_multi_use_fabs_v2f16(ptr addrspace(1) %out0, p
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dword s4, s[8:9], 0x10
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: s_and_b32 s0, s4, 0x7fff7fff
@@ -591,6 +645,9 @@ define amdgpu_kernel void @s_fneg_multi_use_fabs_foldable_neg_v2f16(ptr addrspac
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dword s4, s[8:9], 0x4
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: s_bfe_u32 s0, s4, 0xf0010
@@ -615,7 +672,9 @@ define amdgpu_kernel void @s_fneg_multi_use_fabs_foldable_neg_v2f16(ptr addrspac
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dword s4, s[8:9], 0x10
+; VI-NEXT: s_add_i32 s12, s12, s17
; VI-NEXT: v_mov_b32_e32 v5, 0xc400
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: s_lshr_b32 s1, s4, 16
@@ -624,6 +683,7 @@ define amdgpu_kernel void @s_fneg_multi_use_fabs_foldable_neg_v2f16(ptr addrspac
; VI-NEXT: s_and_b32 s0, s4, 0x7fff7fff
; VI-NEXT: v_mul_f16_sdwa v4, |v4|, v5 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
; VI-NEXT: v_mul_f16_e64 v5, |s4|, -4.0
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_or_b32_e32 v4, v5, v4
; VI-NEXT: v_mov_b32_e32 v5, s0
; VI-NEXT: v_mov_b32_e32 v2, s2
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll b/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll
index e1791daa3aa0c..50d8580b6af79 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll
@@ -1477,6 +1477,8 @@ define amdgpu_kernel void @multiple_uses_fneg_select_f64(double %x, double %y, i
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x4
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x6
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_bitcmp1_b32 s6, 0
; GFX7-NEXT: s_cselect_b64 vcc, -1, 0
@@ -1488,6 +1490,7 @@ define amdgpu_kernel void @multiple_uses_fneg_select_f64(double %x, double %y, i
; GFX7-NEXT: s_cselect_b32 s0, s0, s2
; GFX7-NEXT: v_mov_b32_e32 v1, s1
; GFX7-NEXT: v_mov_b32_e32 v2, s4
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, -v0, vcc
; GFX7-NEXT: v_mov_b32_e32 v0, s0
; GFX7-NEXT: v_mov_b32_e32 v3, s5
diff --git a/llvm/test/CodeGen/AMDGPU/fneg.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg.f16.ll
index 16150da4063e6..81e8dcd070c55 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg.f16.ll
@@ -10,6 +10,9 @@ define amdgpu_kernel void @s_fneg_f16(ptr addrspace(1) %out, half %in) #0 {
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s2, s[8:9], 0x2
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_xor_b32 s2, s2, 0x8000
; CI-NEXT: v_mov_b32_e32 v0, s0
@@ -22,6 +25,9 @@ define amdgpu_kernel void @s_fneg_f16(ptr addrspace(1) %out, half %in) #0 {
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dword s2, s[8:9], 0x8
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX8-NEXT: s_add_i32 s12, s12, s17
+; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_xor_b32 s2, s2, 0x8000
; GFX8-NEXT: v_mov_b32_e32 v0, s0
@@ -64,6 +70,9 @@ define amdgpu_kernel void @v_fneg_f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x2
; CI-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v0
@@ -78,6 +87,9 @@ define amdgpu_kernel void @v_fneg_f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x8
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 1, v0
+; GFX8-NEXT: s_add_i32 s12, s12, s17
+; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
@@ -125,6 +137,9 @@ define amdgpu_kernel void @s_fneg_free_f16(ptr addrspace(1) %out, i16 %in) #0 {
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s2, s[8:9], 0x2
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_xor_b32 s2, s2, 0x8000
; CI-NEXT: v_mov_b32_e32 v0, s0
@@ -137,6 +152,9 @@ define amdgpu_kernel void @s_fneg_free_f16(ptr addrspace(1) %out, i16 %in) #0 {
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dword s2, s[8:9], 0x8
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX8-NEXT: s_add_i32 s12, s12, s17
+; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_xor_b32 s2, s2, 0x8000
; GFX8-NEXT: v_mov_b32_e32 v0, s0
@@ -177,6 +195,9 @@ define amdgpu_kernel void @v_fneg_fold_f16(ptr addrspace(1) %out, ptr addrspace(
; CI-LABEL: v_fneg_fold_f16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: v_mov_b32_e32 v1, s3
@@ -194,6 +215,9 @@ define amdgpu_kernel void @v_fneg_fold_f16(ptr addrspace(1) %out, ptr addrspace(
; GFX8-LABEL: v_fneg_fold_f16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX8-NEXT: s_add_i32 s12, s12, s17
+; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
@@ -238,6 +262,9 @@ define amdgpu_kernel void @s_fneg_v2f16(ptr addrspace(1) %out, <2 x half> %in) #
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s2, s[8:9], 0x2
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_xor_b32 s2, s2, 0x80008000
; CI-NEXT: v_mov_b32_e32 v0, s0
@@ -250,6 +277,9 @@ define amdgpu_kernel void @s_fneg_v2f16(ptr addrspace(1) %out, <2 x half> %in) #
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dword s2, s[8:9], 0x8
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX8-NEXT: s_add_i32 s12, s12, s17
+; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_xor_b32 s2, s2, 0x80008000
; GFX8-NEXT: v_mov_b32_e32 v0, s0
@@ -289,14 +319,17 @@ define amdgpu_kernel void @s_fneg_v2f16_nonload(ptr addrspace(1) %out) #0 {
; CIVI-LABEL: s_fneg_v2f16_nonload:
; CIVI: ; %bb.0:
; CIVI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CIVI-NEXT: s_add_i32 s12, s12, s17
; CIVI-NEXT: ;;#ASMSTART
; CIVI-NEXT: ; def s2
; CIVI-NEXT: ;;#ASMEND
; CIVI-NEXT: s_xor_b32 s2, s2, 0x80008000
-; CIVI-NEXT: v_mov_b32_e32 v2, s2
+; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CIVI-NEXT: s_waitcnt lgkmcnt(0)
; CIVI-NEXT: v_mov_b32_e32 v0, s0
+; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CIVI-NEXT: v_mov_b32_e32 v1, s1
+; CIVI-NEXT: v_mov_b32_e32 v2, s2
; CIVI-NEXT: flat_store_dword v[0:1], v2
; CIVI-NEXT: s_endpgm
;
@@ -337,6 +370,9 @@ define amdgpu_kernel void @v_fneg_v2f16(ptr addrspace(1) %out, ptr addrspace(1)
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x2
; CI-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v0
@@ -351,6 +387,9 @@ define amdgpu_kernel void @v_fneg_v2f16(ptr addrspace(1) %out, ptr addrspace(1)
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x8
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX8-NEXT: s_add_i32 s12, s12, s17
+; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
@@ -398,6 +437,9 @@ define amdgpu_kernel void @fneg_free_v2f16(ptr addrspace(1) %out, i32 %in) #0 {
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s2, s[8:9], 0x2
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_xor_b32 s2, s2, 0x80008000
; CI-NEXT: v_mov_b32_e32 v0, s0
@@ -410,6 +452,9 @@ define amdgpu_kernel void @fneg_free_v2f16(ptr addrspace(1) %out, i32 %in) #0 {
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dword s2, s[8:9], 0x8
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX8-NEXT: s_add_i32 s12, s12, s17
+; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_xor_b32 s2, s2, 0x80008000
; GFX8-NEXT: v_mov_b32_e32 v0, s0
@@ -450,6 +495,9 @@ define amdgpu_kernel void @v_fneg_fold_v2f16(ptr addrspace(1) %out, ptr addrspac
; CI-LABEL: v_fneg_fold_v2f16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: v_mov_b32_e32 v1, s3
@@ -476,6 +524,9 @@ define amdgpu_kernel void @v_fneg_fold_v2f16(ptr addrspace(1) %out, ptr addrspac
; GFX8-LABEL: v_fneg_fold_v2f16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX8-NEXT: s_add_i32 s12, s12, s17
+; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
@@ -521,6 +572,9 @@ define amdgpu_kernel void @v_extract_fneg_fold_v2f16(ptr addrspace(1) %in) #0 {
; CI-LABEL: v_extract_fneg_fold_v2f16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
@@ -542,6 +596,9 @@ define amdgpu_kernel void @v_extract_fneg_fold_v2f16(ptr addrspace(1) %in) #0 {
; GFX8-LABEL: v_extract_fneg_fold_v2f16:
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX8-NEXT: s_add_i32 s12, s12, s17
+; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
@@ -604,6 +661,9 @@ define amdgpu_kernel void @v_extract_fneg_no_fold_v2f16(ptr addrspace(1) %in) #0
; CIVI-LABEL: v_extract_fneg_no_fold_v2f16:
; CIVI: ; %bb.0:
; CIVI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CIVI-NEXT: s_add_i32 s12, s12, s17
+; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CIVI-NEXT: s_waitcnt lgkmcnt(0)
; CIVI-NEXT: v_mov_b32_e32 v0, s0
; CIVI-NEXT: v_mov_b32_e32 v1, s1
diff --git a/llvm/test/CodeGen/AMDGPU/half.ll b/llvm/test/CodeGen/AMDGPU/half.ll
index a2fca33af1046..10573aad38a51 100644
--- a/llvm/test/CodeGen/AMDGPU/half.ll
+++ b/llvm/test/CodeGen/AMDGPU/half.ll
@@ -10,6 +10,9 @@ define amdgpu_kernel void @load_f16_arg(ptr addrspace(1) %out, half %arg) #0 {
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; CI-NEXT: s_load_dword s2, s[8:9], 0x2
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
@@ -21,6 +24,9 @@ define amdgpu_kernel void @load_f16_arg(ptr addrspace(1) %out, half %arg) #0 {
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
@@ -46,6 +52,9 @@ define amdgpu_kernel void @load_v2f16_arg(ptr addrspace(1) %out, <2 x half> %arg
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; CI-NEXT: s_load_dword s2, s[8:9], 0x2
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s1
@@ -57,6 +66,9 @@ define amdgpu_kernel void @load_v2f16_arg(ptr addrspace(1) %out, <2 x half> %arg
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s1
@@ -81,6 +93,9 @@ define amdgpu_kernel void @load_v3f16_arg(ptr addrspace(1) %out, <3 x half> %arg
; CIVI-LABEL: load_v3f16_arg:
; CIVI: ; %bb.0:
; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CIVI-NEXT: s_add_i32 s12, s12, s17
+; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CIVI-NEXT: s_waitcnt lgkmcnt(0)
; CIVI-NEXT: s_add_u32 s4, s0, 4
; CIVI-NEXT: s_addc_u32 s5, s1, 0
@@ -114,6 +129,9 @@ define amdgpu_kernel void @load_v4f16_arg(ptr addrspace(1) %out, <4 x half> %arg
; CIVI-LABEL: load_v4f16_arg:
; CIVI: ; %bb.0:
; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CIVI-NEXT: s_add_i32 s12, s12, s17
+; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CIVI-NEXT: s_waitcnt lgkmcnt(0)
; CIVI-NEXT: v_mov_b32_e32 v0, s0
; CIVI-NEXT: v_mov_b32_e32 v2, s2
@@ -139,6 +157,9 @@ define amdgpu_kernel void @load_v8f16_arg(ptr addrspace(1) %out, <8 x half> %arg
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x4
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v4, s4
; CI-NEXT: v_mov_b32_e32 v0, s0
@@ -153,6 +174,9 @@ define amdgpu_kernel void @load_v8f16_arg(ptr addrspace(1) %out, <8 x half> %arg
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x10
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v4, s4
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -183,6 +207,9 @@ define amdgpu_kernel void @extload_v2f16_arg(ptr addrspace(1) %out, <2 x half> %
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s2, s[8:9], 0x2
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_lshr_b32 s3, s2, 16
; CI-NEXT: v_cvt_f32_f16_e32 v1, s3
@@ -196,6 +223,9 @@ define amdgpu_kernel void @extload_v2f16_arg(ptr addrspace(1) %out, <2 x half> %
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s3, s2, 16
; VI-NEXT: v_cvt_f32_f16_e32 v1, s3
@@ -227,6 +257,9 @@ define amdgpu_kernel void @extload_f16_to_f32_arg(ptr addrspace(1) %out, half %a
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s2, s[8:9], 0x2
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_cvt_f32_f16_e32 v2, s2
; CI-NEXT: v_mov_b32_e32 v0, s0
@@ -238,6 +271,9 @@ define amdgpu_kernel void @extload_f16_to_f32_arg(ptr addrspace(1) %out, half %a
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_cvt_f32_f16_e32 v2, s2
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -265,6 +301,9 @@ define amdgpu_kernel void @extload_v2f16_to_v2f32_arg(ptr addrspace(1) %out, <2
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s2, s[8:9], 0x2
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_lshr_b32 s3, s2, 16
; CI-NEXT: v_cvt_f32_f16_e32 v1, s3
@@ -278,6 +317,9 @@ define amdgpu_kernel void @extload_v2f16_to_v2f32_arg(ptr addrspace(1) %out, <2
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s3, s2, 16
; VI-NEXT: v_cvt_f32_f16_e32 v1, s3
@@ -308,6 +350,9 @@ define amdgpu_kernel void @extload_v3f16_to_v3f32_arg(ptr addrspace(1) %out, <3
; CI-LABEL: extload_v3f16_to_v3f32_arg:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_lshr_b32 s4, s2, 16
; CI-NEXT: v_cvt_f32_f16_e32 v2, s3
@@ -321,6 +366,9 @@ define amdgpu_kernel void @extload_v3f16_to_v3f32_arg(ptr addrspace(1) %out, <3
; VI-LABEL: extload_v3f16_to_v3f32_arg:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s4, s2, 16
; VI-NEXT: v_cvt_f32_f16_e32 v0, s2
@@ -351,6 +399,9 @@ define amdgpu_kernel void @extload_v4f16_to_v4f32_arg(ptr addrspace(1) %out, <4
; CI-LABEL: extload_v4f16_to_v4f32_arg:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_lshr_b32 s4, s3, 16
; CI-NEXT: s_lshr_b32 s5, s2, 16
@@ -366,6 +417,9 @@ define amdgpu_kernel void @extload_v4f16_to_v4f32_arg(ptr addrspace(1) %out, <4
; VI-LABEL: extload_v4f16_to_v4f32_arg:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s4, s3, 16
; VI-NEXT: s_lshr_b32 s5, s2, 16
@@ -401,6 +455,9 @@ define amdgpu_kernel void @extload_v8f16_to_v8f32_arg(ptr addrspace(1) %out, <8
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x4
; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_lshr_b32 s6, s1, 16
; CI-NEXT: s_lshr_b32 s7, s0, 16
@@ -429,6 +486,9 @@ define amdgpu_kernel void @extload_v8f16_to_v8f32_arg(ptr addrspace(1) %out, <8
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x10
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s6, s1, 16
; VI-NEXT: s_lshr_b32 s7, s0, 16
@@ -485,6 +545,9 @@ define amdgpu_kernel void @extload_f16_to_f64_arg(ptr addrspace(1) %out, half %a
; CI-LABEL: extload_f16_to_f64_arg:
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s0, s[8:9], 0x2
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_cvt_f32_f16_e32 v0, s0
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
@@ -498,6 +561,9 @@ define amdgpu_kernel void @extload_f16_to_f64_arg(ptr addrspace(1) %out, half %a
; VI-LABEL: extload_f16_to_f64_arg:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s0, s[8:9], 0x8
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_cvt_f32_f16_e32 v0, s0
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
@@ -529,6 +595,9 @@ define amdgpu_kernel void @extload_v2f16_to_v2f64_arg(ptr addrspace(1) %out, <2
; CI-LABEL: extload_v2f16_to_v2f64_arg:
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s0, s[8:9], 0x2
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_lshr_b32 s1, s0, 16
; CI-NEXT: v_cvt_f32_f16_e32 v0, s1
@@ -545,6 +614,9 @@ define amdgpu_kernel void @extload_v2f16_to_v2f64_arg(ptr addrspace(1) %out, <2
; VI-LABEL: extload_v2f16_to_v2f64_arg:
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s0, s[8:9], 0x8
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s1, s0, 16
; VI-NEXT: v_cvt_f32_f16_e32 v0, s1
@@ -582,6 +654,9 @@ define amdgpu_kernel void @extload_v3f16_to_v3f64_arg(ptr addrspace(1) %out, <3
; CI-LABEL: extload_v3f16_to_v3f64_arg:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_cvt_f32_f16_e32 v0, s3
; CI-NEXT: s_lshr_b32 s4, s2, 16
@@ -603,6 +678,9 @@ define amdgpu_kernel void @extload_v3f16_to_v3f64_arg(ptr addrspace(1) %out, <3
; VI-LABEL: extload_v3f16_to_v3f64_arg:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_cvt_f32_f16_e32 v1, s3
; VI-NEXT: s_lshr_b32 s4, s2, 16
@@ -648,6 +726,9 @@ define amdgpu_kernel void @extload_v4f16_to_v4f64_arg(ptr addrspace(1) %out, <4
; CI-LABEL: extload_v4f16_to_v4f64_arg:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_lshr_b32 s4, s3, 16
; CI-NEXT: v_cvt_f32_f16_e32 v0, s3
@@ -673,6 +754,9 @@ define amdgpu_kernel void @extload_v4f16_to_v4f64_arg(ptr addrspace(1) %out, <4
; VI-LABEL: extload_v4f16_to_v4f64_arg:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s5, s3, 16
; VI-NEXT: v_cvt_f32_f16_e32 v0, s3
@@ -726,6 +810,9 @@ define amdgpu_kernel void @extload_v8f16_to_v8f64_arg(ptr addrspace(1) %out, <8
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x4
; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_lshr_b32 s6, s3, 16
; CI-NEXT: v_cvt_f32_f16_e32 v0, s6
@@ -773,6 +860,9 @@ define amdgpu_kernel void @extload_v8f16_to_v8f64_arg(ptr addrspace(1) %out, <8
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x10
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s6, s0, 16
; VI-NEXT: s_lshr_b32 s8, s2, 16
@@ -858,6 +948,9 @@ define amdgpu_kernel void @global_load_store_f16(ptr addrspace(1) %out, ptr addr
; CIVI-LABEL: global_load_store_f16:
; CIVI: ; %bb.0:
; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CIVI-NEXT: s_add_i32 s12, s12, s17
+; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CIVI-NEXT: s_waitcnt lgkmcnt(0)
; CIVI-NEXT: v_mov_b32_e32 v0, s2
; CIVI-NEXT: v_mov_b32_e32 v1, s3
@@ -886,6 +979,9 @@ define amdgpu_kernel void @global_load_store_v2f16(ptr addrspace(1) %out, ptr ad
; CIVI-LABEL: global_load_store_v2f16:
; CIVI: ; %bb.0:
; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CIVI-NEXT: s_add_i32 s12, s12, s17
+; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CIVI-NEXT: s_waitcnt lgkmcnt(0)
; CIVI-NEXT: v_mov_b32_e32 v0, s2
; CIVI-NEXT: v_mov_b32_e32 v1, s3
@@ -914,6 +1010,9 @@ define amdgpu_kernel void @global_load_store_v4f16(ptr addrspace(1) %in, ptr add
; CIVI-LABEL: global_load_store_v4f16:
; CIVI: ; %bb.0:
; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CIVI-NEXT: s_add_i32 s12, s12, s17
+; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CIVI-NEXT: s_waitcnt lgkmcnt(0)
; CIVI-NEXT: v_mov_b32_e32 v0, s0
; CIVI-NEXT: v_mov_b32_e32 v1, s1
@@ -942,6 +1041,9 @@ define amdgpu_kernel void @global_load_store_v8f16(ptr addrspace(1) %out, ptr ad
; CIVI-LABEL: global_load_store_v8f16:
; CIVI: ; %bb.0:
; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CIVI-NEXT: s_add_i32 s12, s12, s17
+; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CIVI-NEXT: s_waitcnt lgkmcnt(0)
; CIVI-NEXT: v_mov_b32_e32 v0, s2
; CIVI-NEXT: v_mov_b32_e32 v1, s3
@@ -970,6 +1072,9 @@ define amdgpu_kernel void @global_extload_f16_to_f32(ptr addrspace(1) %out, ptr
; CIVI-LABEL: global_extload_f16_to_f32:
; CIVI: ; %bb.0:
; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CIVI-NEXT: s_add_i32 s12, s12, s17
+; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CIVI-NEXT: s_waitcnt lgkmcnt(0)
; CIVI-NEXT: v_mov_b32_e32 v0, s2
; CIVI-NEXT: v_mov_b32_e32 v1, s3
@@ -1001,6 +1106,9 @@ define amdgpu_kernel void @global_extload_v2f16_to_v2f32(ptr addrspace(1) %out,
; CI-LABEL: global_extload_v2f16_to_v2f32:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: v_mov_b32_e32 v1, s3
@@ -1017,6 +1125,9 @@ define amdgpu_kernel void @global_extload_v2f16_to_v2f32(ptr addrspace(1) %out,
; VI-LABEL: global_extload_v2f16_to_v2f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
@@ -1052,6 +1163,9 @@ define amdgpu_kernel void @global_extload_v3f16_to_v3f32(ptr addrspace(1) %out,
; CI-LABEL: global_extload_v3f16_to_v3f32:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: v_mov_b32_e32 v1, s3
@@ -1069,6 +1183,9 @@ define amdgpu_kernel void @global_extload_v3f16_to_v3f32(ptr addrspace(1) %out,
; VI-LABEL: global_extload_v3f16_to_v3f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
@@ -1106,6 +1223,9 @@ define amdgpu_kernel void @global_extload_v4f16_to_v4f32(ptr addrspace(1) %out,
; CI-LABEL: global_extload_v4f16_to_v4f32:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: v_mov_b32_e32 v1, s3
@@ -1125,6 +1245,9 @@ define amdgpu_kernel void @global_extload_v4f16_to_v4f32(ptr addrspace(1) %out,
; VI-LABEL: global_extload_v4f16_to_v4f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
@@ -1165,6 +1288,9 @@ define amdgpu_kernel void @global_extload_v8f16_to_v8f32(ptr addrspace(1) %out,
; CI-LABEL: global_extload_v8f16_to_v8f32:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: v_mov_b32_e32 v1, s3
@@ -1195,6 +1321,9 @@ define amdgpu_kernel void @global_extload_v8f16_to_v8f32(ptr addrspace(1) %out,
; VI-LABEL: global_extload_v8f16_to_v8f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
@@ -1251,6 +1380,9 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f32(ptr addrspace(1) %out
; CI-LABEL: global_extload_v16f16_to_v16f32:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_add_u32 s4, s2, 16
; CI-NEXT: v_mov_b32_e32 v5, s3
@@ -1309,6 +1441,9 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f32(ptr addrspace(1) %out
; VI-LABEL: global_extload_v16f16_to_v16f32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
@@ -1406,6 +1541,9 @@ define amdgpu_kernel void @global_extload_f16_to_f64(ptr addrspace(1) %out, ptr
; CIVI-LABEL: global_extload_f16_to_f64:
; CIVI: ; %bb.0:
; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CIVI-NEXT: s_add_i32 s12, s12, s17
+; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CIVI-NEXT: s_waitcnt lgkmcnt(0)
; CIVI-NEXT: v_mov_b32_e32 v0, s2
; CIVI-NEXT: v_mov_b32_e32 v1, s3
@@ -1440,6 +1578,9 @@ define amdgpu_kernel void @global_extload_v2f16_to_v2f64(ptr addrspace(1) %out,
; CI-LABEL: global_extload_v2f16_to_v2f64:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: v_mov_b32_e32 v1, s3
@@ -1458,6 +1599,9 @@ define amdgpu_kernel void @global_extload_v2f16_to_v2f64(ptr addrspace(1) %out,
; VI-LABEL: global_extload_v2f16_to_v2f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
@@ -1498,6 +1642,9 @@ define amdgpu_kernel void @global_extload_v3f16_to_v3f64(ptr addrspace(1) %out,
; CI-LABEL: global_extload_v3f16_to_v3f64:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: v_mov_b32_e32 v1, s3
@@ -1523,6 +1670,9 @@ define amdgpu_kernel void @global_extload_v3f16_to_v3f64(ptr addrspace(1) %out,
; VI-LABEL: global_extload_v3f16_to_v3f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
@@ -1574,6 +1724,9 @@ define amdgpu_kernel void @global_extload_v4f16_to_v4f64(ptr addrspace(1) %out,
; CI-LABEL: global_extload_v4f16_to_v4f64:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: v_mov_b32_e32 v1, s3
@@ -1602,6 +1755,9 @@ define amdgpu_kernel void @global_extload_v4f16_to_v4f64(ptr addrspace(1) %out,
; VI-LABEL: global_extload_v4f16_to_v4f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
@@ -1659,6 +1815,9 @@ define amdgpu_kernel void @global_extload_v8f16_to_v8f64(ptr addrspace(1) %out,
; CI-LABEL: global_extload_v8f16_to_v8f64:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: v_mov_b32_e32 v1, s3
@@ -1707,6 +1866,9 @@ define amdgpu_kernel void @global_extload_v8f16_to_v8f64(ptr addrspace(1) %out,
; VI-LABEL: global_extload_v8f16_to_v8f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
@@ -1791,6 +1953,9 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f64(ptr addrspace(1) %out
; CI-LABEL: global_extload_v16f16_to_v16f64:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: v_mov_b32_e32 v1, s3
@@ -1885,6 +2050,9 @@ define amdgpu_kernel void @global_extload_v16f16_to_v16f64(ptr addrspace(1) %out
; VI-LABEL: global_extload_v16f16_to_v16f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
@@ -2039,6 +2207,9 @@ define amdgpu_kernel void @global_truncstore_f32_to_f16(ptr addrspace(1) %out, p
; CIVI-LABEL: global_truncstore_f32_to_f16:
; CIVI: ; %bb.0:
; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CIVI-NEXT: s_add_i32 s12, s12, s17
+; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CIVI-NEXT: s_waitcnt lgkmcnt(0)
; CIVI-NEXT: v_mov_b32_e32 v0, s2
; CIVI-NEXT: v_mov_b32_e32 v1, s3
@@ -2070,6 +2241,9 @@ define amdgpu_kernel void @global_truncstore_v2f32_to_v2f16(ptr addrspace(1) %ou
; CI-LABEL: global_truncstore_v2f32_to_v2f16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: v_mov_b32_e32 v1, s3
@@ -2087,6 +2261,9 @@ define amdgpu_kernel void @global_truncstore_v2f32_to_v2f16(ptr addrspace(1) %ou
; VI-LABEL: global_truncstore_v2f32_to_v2f16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
@@ -2123,6 +2300,9 @@ define amdgpu_kernel void @global_truncstore_v3f32_to_v3f16(ptr addrspace(1) %ou
; CI-LABEL: global_truncstore_v3f32_to_v3f16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: v_mov_b32_e32 v1, s3
@@ -2146,6 +2326,9 @@ define amdgpu_kernel void @global_truncstore_v3f32_to_v3f16(ptr addrspace(1) %ou
; VI-LABEL: global_truncstore_v3f32_to_v3f16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
@@ -2191,6 +2374,9 @@ define amdgpu_kernel void @global_truncstore_v4f32_to_v4f16(ptr addrspace(1) %ou
; CI-LABEL: global_truncstore_v4f32_to_v4f16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: v_mov_b32_e32 v1, s3
@@ -2212,6 +2398,9 @@ define amdgpu_kernel void @global_truncstore_v4f32_to_v4f16(ptr addrspace(1) %ou
; VI-LABEL: global_truncstore_v4f32_to_v4f16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
@@ -2254,6 +2443,9 @@ define amdgpu_kernel void @global_truncstore_v8f32_to_v8f16(ptr addrspace(1) %ou
; CI-LABEL: global_truncstore_v8f32_to_v8f16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: v_mov_b32_e32 v1, s3
@@ -2289,6 +2481,9 @@ define amdgpu_kernel void @global_truncstore_v8f32_to_v8f16(ptr addrspace(1) %ou
; VI-LABEL: global_truncstore_v8f32_to_v8f16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
@@ -2352,6 +2547,9 @@ define amdgpu_kernel void @global_truncstore_v16f32_to_v16f16(ptr addrspace(1) %
; CI-LABEL: global_truncstore_v16f32_to_v16f16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_add_u32 s4, s2, 32
; CI-NEXT: s_addc_u32 s5, s3, 0
@@ -2420,6 +2618,9 @@ define amdgpu_kernel void @global_truncstore_v16f32_to_v16f16(ptr addrspace(1) %
; VI-LABEL: global_truncstore_v16f32_to_v16f16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_add_u32 s4, s2, 32
; VI-NEXT: s_addc_u32 s5, s3, 0
@@ -2530,6 +2731,9 @@ define amdgpu_kernel void @fadd_f16(ptr addrspace(1) %out, half %a, half %b) #0
; CI-LABEL: fadd_f16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s0, s[8:9], 0x2
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_cvt_f32_f16_e32 v0, s0
; CI-NEXT: s_lshr_b32 s0, s0, 16
@@ -2547,6 +2751,9 @@ define amdgpu_kernel void @fadd_f16(ptr addrspace(1) %out, half %a, half %b) #0
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s3, s2, 16
; VI-NEXT: v_mov_b32_e32 v0, s3
@@ -2577,6 +2784,9 @@ define amdgpu_kernel void @fadd_v2f16(ptr addrspace(1) %out, <2 x half> %a, <2 x
; CI-LABEL: fadd_v2f16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_lshr_b32 s4, s2, 16
; CI-NEXT: v_cvt_f32_f16_e32 v0, s2
@@ -2598,6 +2808,9 @@ define amdgpu_kernel void @fadd_v2f16(ptr addrspace(1) %out, <2 x half> %a, <2 x
; VI-LABEL: fadd_v2f16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s4, s3, 16
; VI-NEXT: s_lshr_b32 s5, s2, 16
@@ -2629,6 +2842,9 @@ define amdgpu_kernel void @fadd_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; CI-LABEL: fadd_v4f16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s2
; CI-NEXT: v_mov_b32_e32 v1, s3
@@ -2666,6 +2882,9 @@ define amdgpu_kernel void @fadd_v4f16(ptr addrspace(1) %out, ptr addrspace(1) %i
; VI-LABEL: fadd_v4f16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s2
; VI-NEXT: v_mov_b32_e32 v1, s3
@@ -2706,6 +2925,9 @@ define amdgpu_kernel void @fadd_v8f16(ptr addrspace(1) %out, <8 x half> %a, <8 x
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x4
; CI-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_lshr_b32 s10, s0, 16
; CI-NEXT: v_cvt_f32_f16_e32 v4, s0
@@ -2764,6 +2986,9 @@ define amdgpu_kernel void @fadd_v8f16(ptr addrspace(1) %out, <8 x half> %a, <8 x
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x10
; VI-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s10, s7, 16
; VI-NEXT: s_lshr_b32 s11, s3, 16
@@ -2824,6 +3049,9 @@ define amdgpu_kernel void @test_bitcast_from_half(ptr addrspace(1) %in, ptr addr
; CIVI-LABEL: test_bitcast_from_half:
; CIVI: ; %bb.0:
; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CIVI-NEXT: s_add_i32 s12, s12, s17
+; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CIVI-NEXT: s_waitcnt lgkmcnt(0)
; CIVI-NEXT: v_mov_b32_e32 v0, s0
; CIVI-NEXT: v_mov_b32_e32 v1, s1
@@ -2853,6 +3081,9 @@ define amdgpu_kernel void @test_bitcast_to_half(ptr addrspace(1) %out, ptr addrs
; CIVI-LABEL: test_bitcast_to_half:
; CIVI: ; %bb.0:
; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CIVI-NEXT: s_add_i32 s12, s12, s17
+; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CIVI-NEXT: s_waitcnt lgkmcnt(0)
; CIVI-NEXT: v_mov_b32_e32 v0, s2
; CIVI-NEXT: v_mov_b32_e32 v1, s3
diff --git a/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll b/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll
index 8c017fa5ec263..741ea419c2a45 100644
--- a/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll
+++ b/llvm/test/CodeGen/AMDGPU/hsa-metadata-kernel-code-props.ll
@@ -15,7 +15,8 @@
; CHECK: .max_flat_workgroup_size: 1024
; CHECK: .name: test
; CHECK: .private_segment_fixed_size: 0
-; CHECK: .sgpr_count: 10
+; GFX700: .sgpr_count: 22
+; GFX803: .sgpr_count: 24
; CHECK: .symbol: test.kd
; CHECK: .vgpr_count: {{3|6}}
; WAVE64: .wavefront_size: 64
@@ -48,8 +49,8 @@ entry:
; CHECK: .name: num_spilled_sgprs
; GFX700: .sgpr_spill_count: 10
-; GFX803: .sgpr_spill_count: 10
-; GFX900: .sgpr_spill_count: 62
+; GFX803: .sgpr_spill_count: 0
+; GFX900: .sgpr_spill_count: 0
; GFX1010: .sgpr_spill_count: 60
; CHECK: .symbol: num_spilled_sgprs.kd
define amdgpu_kernel void @num_spilled_sgprs(
diff --git a/llvm/test/CodeGen/AMDGPU/hsa.ll b/llvm/test/CodeGen/AMDGPU/hsa.ll
index 37476203fbfad..2c38e201d326f 100644
--- a/llvm/test/CodeGen/AMDGPU/hsa.ll
+++ b/llvm/test/CodeGen/AMDGPU/hsa.ll
@@ -43,7 +43,7 @@
; ELF: 00E0: 6E616D65 A673696D 706C65BB 2E707269
; ELF: 00F0: 76617465 5F736567 6D656E74 5F666978
; ELF: 0100: 65645F73 697A6500 AB2E7367 70725F63
-; ELF: 0110: 6F756E74 06B12E73 6770725F 7370696C
+; ELF: 0110: 6F756E74 0EB12E73 6770725F 7370696C
; ELF: 0120: 6C5F636F 756E7400 A72E7379 6D626F6C
; ELF: 0130: A973696D 706C652E 6B64AB2E 76677072
; ELF: 0140: 5F636F75 6E7403B1 2E766770 725F7370
@@ -59,7 +59,7 @@
; ELF: 01E0: 73696D70 6C655F6E 6F5F6B65 726E6172
; ELF: 01F0: 6773BB2E 70726976 6174655F 7365676D
; ELF: 0200: 656E745F 66697865 645F7369 7A6500AB
-; ELF: 0210: 2E736770 725F636F 756E7400 B12E7367
+; ELF: 0210: 2E736770 725F636F 756E740C B12E7367
; ELF: 0220: 70725F73 70696C6C 5F636F75 6E7400A7
; ELF: 0230: 2E73796D 626F6CB5 73696D70 6C655F6E
; ELF: 0240: 6F5F6B65 726E6172 67732E6B 64AB2E76
diff --git a/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll b/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll
index 3d27b5fe7f30b..c9b2d0bba93df 100644
--- a/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll
+++ b/llvm/test/CodeGen/AMDGPU/implicit-kernarg-backend-usage.ll
@@ -12,7 +12,9 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
; GFX8V4: ; %bb.0:
; GFX8V4-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX8V4-NEXT: s_load_dwordx2 s[2:3], s[6:7], 0x40
-; GFX8V4-NEXT: v_mov_b32_e32 v4, 1
+; GFX8V4-NEXT: s_add_i32 s12, s12, s17
+; GFX8V4-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX8V4-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX8V4-NEXT: s_waitcnt lgkmcnt(0)
; GFX8V4-NEXT: s_cmp_lg_u32 s0, -1
; GFX8V4-NEXT: s_cselect_b32 s3, s3, 0
@@ -22,6 +24,7 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
; GFX8V4-NEXT: v_mov_b32_e32 v1, s3
; GFX8V4-NEXT: s_cselect_b32 s0, s2, 0
; GFX8V4-NEXT: s_cselect_b32 s1, s1, 0
+; GFX8V4-NEXT: v_mov_b32_e32 v4, 1
; GFX8V4-NEXT: v_mov_b32_e32 v2, s1
; GFX8V4-NEXT: v_mov_b32_e32 v3, s0
; GFX8V4-NEXT: flat_store_dword v[0:1], v4
@@ -35,7 +38,9 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
; GFX8V5: ; %bb.0:
; GFX8V5-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX8V5-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0xc8
-; GFX8V5-NEXT: v_mov_b32_e32 v4, 1
+; GFX8V5-NEXT: s_add_i32 s12, s12, s17
+; GFX8V5-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX8V5-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX8V5-NEXT: s_waitcnt lgkmcnt(0)
; GFX8V5-NEXT: s_cmp_lg_u32 s0, -1
; GFX8V5-NEXT: s_cselect_b32 s2, s2, 0
@@ -45,6 +50,7 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
; GFX8V5-NEXT: v_mov_b32_e32 v1, s2
; GFX8V5-NEXT: s_cselect_b32 s0, s3, 0
; GFX8V5-NEXT: s_cselect_b32 s1, s1, 0
+; GFX8V5-NEXT: v_mov_b32_e32 v4, 1
; GFX8V5-NEXT: v_mov_b32_e32 v2, s1
; GFX8V5-NEXT: v_mov_b32_e32 v3, s0
; GFX8V5-NEXT: flat_store_dword v[0:1], v4
@@ -57,9 +63,10 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
; GFX9V4-LABEL: addrspacecast:
; GFX9V4: ; %bb.0:
; GFX9V4-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX9V4-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX9V4-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX9V4-NEXT: s_mov_b64 s[2:3], src_private_base
; GFX9V4-NEXT: s_mov_b64 s[4:5], src_shared_base
-; GFX9V4-NEXT: v_mov_b32_e32 v4, 1
; GFX9V4-NEXT: s_waitcnt lgkmcnt(0)
; GFX9V4-NEXT: s_cmp_lg_u32 s0, -1
; GFX9V4-NEXT: s_cselect_b32 s2, s3, 0
@@ -69,6 +76,7 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
; GFX9V4-NEXT: v_mov_b32_e32 v1, s2
; GFX9V4-NEXT: s_cselect_b32 s0, s5, 0
; GFX9V4-NEXT: s_cselect_b32 s1, s1, 0
+; GFX9V4-NEXT: v_mov_b32_e32 v4, 1
; GFX9V4-NEXT: v_mov_b32_e32 v2, s1
; GFX9V4-NEXT: v_mov_b32_e32 v3, s0
; GFX9V4-NEXT: flat_store_dword v[0:1], v4
@@ -81,9 +89,10 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
; GFX9V5-LABEL: addrspacecast:
; GFX9V5: ; %bb.0:
; GFX9V5-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GFX9V5-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX9V5-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX9V5-NEXT: s_mov_b64 s[2:3], src_private_base
; GFX9V5-NEXT: s_mov_b64 s[4:5], src_shared_base
-; GFX9V5-NEXT: v_mov_b32_e32 v4, 1
; GFX9V5-NEXT: s_waitcnt lgkmcnt(0)
; GFX9V5-NEXT: s_cmp_lg_u32 s0, -1
; GFX9V5-NEXT: s_cselect_b32 s2, s3, 0
@@ -93,6 +102,7 @@ define amdgpu_kernel void @addrspacecast(ptr addrspace(5) %ptr.private, ptr addr
; GFX9V5-NEXT: v_mov_b32_e32 v1, s2
; GFX9V5-NEXT: s_cselect_b32 s0, s5, 0
; GFX9V5-NEXT: s_cselect_b32 s1, s1, 0
+; GFX9V5-NEXT: v_mov_b32_e32 v4, 1
; GFX9V5-NEXT: v_mov_b32_e32 v2, s1
; GFX9V5-NEXT: v_mov_b32_e32 v3, s0
; GFX9V5-NEXT: flat_store_dword v[0:1], v4
@@ -114,6 +124,9 @@ define amdgpu_kernel void @llvm_amdgcn_is_shared(ptr %ptr) {
; GFX8V4: ; %bb.0:
; GFX8V4-NEXT: s_load_dword s0, s[6:7], 0x40
; GFX8V4-NEXT: s_load_dword s1, s[8:9], 0x4
+; GFX8V4-NEXT: s_add_i32 s12, s12, s17
+; GFX8V4-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX8V4-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX8V4-NEXT: s_waitcnt lgkmcnt(0)
; GFX8V4-NEXT: s_cmp_eq_u32 s1, s0
; GFX8V4-NEXT: s_cselect_b64 s[0:1], -1, 0
@@ -126,6 +139,9 @@ define amdgpu_kernel void @llvm_amdgcn_is_shared(ptr %ptr) {
; GFX8V5: ; %bb.0:
; GFX8V5-NEXT: s_load_dword s0, s[8:9], 0xcc
; GFX8V5-NEXT: s_load_dword s1, s[8:9], 0x4
+; GFX8V5-NEXT: s_add_i32 s12, s12, s17
+; GFX8V5-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX8V5-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX8V5-NEXT: s_waitcnt lgkmcnt(0)
; GFX8V5-NEXT: s_cmp_eq_u32 s1, s0
; GFX8V5-NEXT: s_cselect_b64 s[0:1], -1, 0
@@ -168,6 +184,9 @@ define amdgpu_kernel void @llvm_amdgcn_is_private(ptr %ptr) {
; GFX8V4: ; %bb.0:
; GFX8V4-NEXT: s_load_dword s0, s[6:7], 0x44
; GFX8V4-NEXT: s_load_dword s1, s[8:9], 0x4
+; GFX8V4-NEXT: s_add_i32 s12, s12, s17
+; GFX8V4-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX8V4-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX8V4-NEXT: s_waitcnt lgkmcnt(0)
; GFX8V4-NEXT: s_cmp_eq_u32 s1, s0
; GFX8V4-NEXT: s_cselect_b64 s[0:1], -1, 0
@@ -180,6 +199,9 @@ define amdgpu_kernel void @llvm_amdgcn_is_private(ptr %ptr) {
; GFX8V5: ; %bb.0:
; GFX8V5-NEXT: s_load_dword s0, s[8:9], 0xc8
; GFX8V5-NEXT: s_load_dword s1, s[8:9], 0x4
+; GFX8V5-NEXT: s_add_i32 s12, s12, s17
+; GFX8V5-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX8V5-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX8V5-NEXT: s_waitcnt lgkmcnt(0)
; GFX8V5-NEXT: s_cmp_eq_u32 s1, s0
; GFX8V5-NEXT: s_cselect_b64 s[0:1], -1, 0
@@ -263,7 +285,10 @@ define amdgpu_kernel void @llvm_debugtrap() {
define amdgpu_kernel void @llvm_amdgcn_queue_ptr(ptr addrspace(1) %ptr) {
; GFX8V4-LABEL: llvm_amdgcn_queue_ptr:
; GFX8V4: ; %bb.0:
+; GFX8V4-NEXT: s_add_i32 s12, s12, s17
; GFX8V4-NEXT: v_mov_b32_e32 v0, s6
+; GFX8V4-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX8V4-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8V4-NEXT: v_mov_b32_e32 v1, s7
; GFX8V4-NEXT: s_add_u32 s0, s8, 8
; GFX8V4-NEXT: flat_load_ubyte v0, v[0:1] glc
@@ -288,7 +313,10 @@ define amdgpu_kernel void @llvm_amdgcn_queue_ptr(ptr addrspace(1) %ptr) {
;
; GFX8V5-LABEL: llvm_amdgcn_queue_ptr:
; GFX8V5: ; %bb.0:
+; GFX8V5-NEXT: s_add_i32 s12, s12, s17
; GFX8V5-NEXT: v_mov_b32_e32 v0, s6
+; GFX8V5-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX8V5-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8V5-NEXT: v_mov_b32_e32 v1, s7
; GFX8V5-NEXT: s_add_u32 s0, s8, 8
; GFX8V5-NEXT: flat_load_ubyte v0, v[0:1] glc
diff --git a/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll b/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll
index 2ceaca3497ece..696ea98254086 100644
--- a/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/inline-asm.i128.ll
@@ -8,15 +8,15 @@
define amdgpu_kernel void @s_input_output_i128() {
; GFX908-LABEL: name: s_input_output_i128
; GFX908: bb.0 (%ir-block.0):
- ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7471114 /* regdef:SGPR_128 */, def %12
- ; GFX908-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY %12
+ ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7471114 /* regdef:SGPR_128 */, def %13
+ ; GFX908-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY %13
; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 7471113 /* reguse:SGPR_128 */, [[COPY]]
; GFX908-NEXT: S_ENDPGM 0
;
; GFX90A-LABEL: name: s_input_output_i128
; GFX90A: bb.0 (%ir-block.0):
- ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7471114 /* regdef:SGPR_128 */, def %10
- ; GFX90A-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY %10
+ ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 7471114 /* regdef:SGPR_128 */, def %11
+ ; GFX90A-NEXT: [[COPY:%[0-9]+]]:sgpr_128 = COPY %11
; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 7471113 /* reguse:SGPR_128 */, [[COPY]]
; GFX90A-NEXT: S_ENDPGM 0
%val = tail call i128 asm sideeffect "; def $0", "=s"()
@@ -27,15 +27,15 @@ define amdgpu_kernel void @s_input_output_i128() {
define amdgpu_kernel void @v_input_output_i128() {
; GFX908-LABEL: name: v_input_output_i128
; GFX908: bb.0 (%ir-block.0):
- ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6291466 /* regdef:VReg_128 */, def %12
- ; GFX908-NEXT: [[COPY:%[0-9]+]]:vreg_128 = COPY %12
+ ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6291466 /* regdef:VReg_128 */, def %13
+ ; GFX908-NEXT: [[COPY:%[0-9]+]]:vreg_128 = COPY %13
; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 6291465 /* reguse:VReg_128 */, [[COPY]]
; GFX908-NEXT: S_ENDPGM 0
;
; GFX90A-LABEL: name: v_input_output_i128
; GFX90A: bb.0 (%ir-block.0):
- ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6619146 /* regdef:VReg_128_Align2 */, def %10
- ; GFX90A-NEXT: [[COPY:%[0-9]+]]:vreg_128_align2 = COPY %10
+ ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6619146 /* regdef:VReg_128_Align2 */, def %11
+ ; GFX90A-NEXT: [[COPY:%[0-9]+]]:vreg_128_align2 = COPY %11
; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 6619145 /* reguse:VReg_128_Align2 */, [[COPY]]
; GFX90A-NEXT: S_ENDPGM 0
%val = tail call i128 asm sideeffect "; def $0", "=v"()
@@ -46,15 +46,15 @@ define amdgpu_kernel void @v_input_output_i128() {
define amdgpu_kernel void @a_input_output_i128() {
; GFX908-LABEL: name: a_input_output_i128
; GFX908: bb.0 (%ir-block.0):
- ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6225930 /* regdef:AReg_128 */, def %12
- ; GFX908-NEXT: [[COPY:%[0-9]+]]:areg_128 = COPY %12
+ ; GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6225930 /* regdef:AReg_128 */, def %13
+ ; GFX908-NEXT: [[COPY:%[0-9]+]]:areg_128 = COPY %13
; GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 6225929 /* reguse:AReg_128 */, [[COPY]]
; GFX908-NEXT: S_ENDPGM 0
;
; GFX90A-LABEL: name: a_input_output_i128
; GFX90A: bb.0 (%ir-block.0):
- ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6488074 /* regdef:AReg_128_Align2 */, def %10
- ; GFX90A-NEXT: [[COPY:%[0-9]+]]:areg_128_align2 = COPY %10
+ ; GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6488074 /* regdef:AReg_128_Align2 */, def %11
+ ; GFX90A-NEXT: [[COPY:%[0-9]+]]:areg_128_align2 = COPY %11
; GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 6488073 /* reguse:AReg_128_Align2 */, [[COPY]]
; GFX90A-NEXT: S_ENDPGM 0
%val = call i128 asm sideeffect "; def $0", "=a"()
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll
index 75db7571444bc..b51cb9df8d784 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll
@@ -22,6 +22,9 @@ define amdgpu_kernel void @s_insertelement_v2bf16_0(ptr addrspace(1) %out, ptr a
; VI-LABEL: s_insertelement_v2bf16_0:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dword s2, s[2:3], 0x0
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -82,6 +85,9 @@ define amdgpu_kernel void @s_insertelement_v2bf16_1(ptr addrspace(1) %out, ptr a
; VI-LABEL: s_insertelement_v2bf16_1:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dword s2, s[2:3], 0x0
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -144,6 +150,9 @@ define amdgpu_kernel void @v_insertelement_v2bf16_0(ptr addrspace(1) %out, ptr a
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
@@ -216,6 +225,9 @@ define amdgpu_kernel void @v_insertelement_v2bf16_0_inlineimm(ptr addrspace(1) %
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
@@ -286,6 +298,9 @@ define amdgpu_kernel void @v_insertelement_v2bf16_1(ptr addrspace(1) %out, ptr a
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
@@ -358,6 +373,9 @@ define amdgpu_kernel void @v_insertelement_v2bf16_1_inlineimm(ptr addrspace(1) %
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
@@ -435,11 +453,14 @@ define amdgpu_kernel void @v_insertelement_v2bf16_dynamic_vgpr(ptr addrspace(1)
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_load_dword v4, v[0:1]
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
@@ -531,14 +552,17 @@ define amdgpu_kernel void @v_insertelement_v4bf16_0(ptr addrspace(1) %out, ptr a
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dword s4, s[8:9], 0x30
; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
-; VI-NEXT: v_mov_b32_e32 v4, 0x3020504
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT: v_mov_b32_e32 v4, 0x3020504
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_perm_b32 v0, s4, v0, v4
@@ -611,14 +635,17 @@ define amdgpu_kernel void @v_insertelement_v4bf16_1(ptr addrspace(1) %out, ptr a
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dword s4, s[8:9], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
-; VI-NEXT: v_mov_b32_e32 v4, 0x1000504
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT: v_mov_b32_e32 v4, 0x1000504
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_perm_b32 v0, v0, s4, v4
@@ -689,14 +716,17 @@ define amdgpu_kernel void @v_insertelement_v4bf16_2(ptr addrspace(1) %out, ptr a
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dword s4, s[8:9], 0x30
; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
-; VI-NEXT: v_mov_b32_e32 v4, 0x3020504
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT: v_mov_b32_e32 v4, 0x3020504
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_perm_b32 v1, s4, v1, v4
@@ -769,14 +799,17 @@ define amdgpu_kernel void @v_insertelement_v4bf16_3(ptr addrspace(1) %out, ptr a
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dword s4, s[8:9], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
-; VI-NEXT: v_mov_b32_e32 v4, 0x1000504
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT: v_mov_b32_e32 v4, 0x1000504
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_perm_b32 v1, v1, s4, v4
@@ -853,9 +886,12 @@ define amdgpu_kernel void @v_insertelement_v4bf16_dynamic_sgpr(ptr addrspace(1)
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: v_mov_b32_e32 v3, s1
@@ -948,9 +984,12 @@ define amdgpu_kernel void @v_insertelement_v8bf16_3(ptr addrspace(1) %out, ptr a
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dword s4, s[8:9], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v4, 4, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; VI-NEXT: v_add_u32_e32 v4, vcc, s0, v4
@@ -1065,9 +1104,12 @@ define amdgpu_kernel void @v_insertelement_v8bf16_dynamic(ptr addrspace(1) %out,
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v4, 4, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; VI-NEXT: v_mov_b32_e32 v5, s1
@@ -1245,11 +1287,14 @@ define amdgpu_kernel void @v_insertelement_v16bf16_3(ptr addrspace(1) %out, ptr
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dword s4, s[8:9], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v8, 5, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v8
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_add_u32_e32 v4, vcc, 16, v0
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; VI-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
@@ -1417,11 +1462,14 @@ define amdgpu_kernel void @v_insertelement_v16bf16_dynamic(ptr addrspace(1) %out
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v8, 5, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s3
; VI-NEXT: v_add_u32_e32 v4, vcc, s2, v8
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v0, vcc
; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v4
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc
; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; VI-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
index 12b4b2b372ef8..82e24ee129c5c 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
@@ -20,6 +20,9 @@ define amdgpu_kernel void @s_insertelement_v2i16_0(ptr addrspace(1) %out, ptr ad
; CIVI-LABEL: s_insertelement_v2i16_0:
; CIVI: ; %bb.0:
; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CIVI-NEXT: s_add_i32 s12, s12, s17
+; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CIVI-NEXT: s_waitcnt lgkmcnt(0)
; CIVI-NEXT: s_load_dword s2, s[2:3], 0x0
; CIVI-NEXT: v_mov_b32_e32 v0, s0
@@ -67,6 +70,9 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reg(ptr addrspace(1) %out, pt
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dword s4, s[8:9], 0x30
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dword s2, s[2:3], 0x0
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -83,6 +89,9 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reg(ptr addrspace(1) %out, pt
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dword s4, s[8:9], 0xc
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_load_dword s2, s[2:3], 0x0
; CI-NEXT: v_mov_b32_e32 v0, s0
@@ -136,6 +145,9 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_multi_use_hi_reg(ptr addrspac
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dword s4, s[8:9], 0x30
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dword s2, s[2:3], 0x0
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -156,6 +168,9 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_multi_use_hi_reg(ptr addrspac
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dword s4, s[8:9], 0xc
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_load_dword s2, s[2:3], 0x0
; CI-NEXT: v_mov_b32_e32 v0, s0
@@ -216,6 +231,9 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi(ptr addrspace(1) %out,
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dword s4, s[8:9], 0x30
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dword s2, s[2:3], 0x0
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -231,6 +249,9 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi(ptr addrspace(1) %out,
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dword s4, s[8:9], 0xc
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_load_dword s2, s[2:3], 0x0
; CI-NEXT: v_mov_b32_e32 v0, s0
@@ -285,6 +306,9 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_multi_use_1(ptr addrspa
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dword s4, s[8:9], 0x10
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dword s2, s[2:3], 0x0
; VI-NEXT: v_mov_b32_e32 v1, s1
@@ -304,6 +328,9 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_multi_use_1(ptr addrspa
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dword s4, s[8:9], 0x4
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_load_dword s2, s[2:3], 0x0
; CI-NEXT: v_mov_b32_e32 v1, s1
@@ -372,6 +399,9 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_both_multi_use_1(ptr ad
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dword s4, s[8:9], 0x10
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dword s2, s[2:3], 0x0
; VI-NEXT: v_mov_b32_e32 v1, s1
@@ -394,6 +424,9 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_both_multi_use_1(ptr ad
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dword s4, s[8:9], 0x4
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_load_dword s2, s[2:3], 0x0
; CI-NEXT: v_mov_b32_e32 v1, s1
@@ -463,6 +496,9 @@ define amdgpu_kernel void @s_insertelement_v2i16_1(ptr addrspace(1) %out, ptr ad
; CIVI-LABEL: s_insertelement_v2i16_1:
; CIVI: ; %bb.0:
; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CIVI-NEXT: s_add_i32 s12, s12, s17
+; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CIVI-NEXT: s_waitcnt lgkmcnt(0)
; CIVI-NEXT: s_load_dword s2, s[2:3], 0x0
; CIVI-NEXT: v_mov_b32_e32 v0, s0
@@ -509,6 +545,9 @@ define amdgpu_kernel void @s_insertelement_v2i16_1_reg(ptr addrspace(1) %out, pt
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dword s4, s[8:9], 0x30
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dword s2, s[2:3], 0x0
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -525,6 +564,9 @@ define amdgpu_kernel void @s_insertelement_v2i16_1_reg(ptr addrspace(1) %out, pt
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dword s4, s[8:9], 0xc
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_load_dword s2, s[2:3], 0x0
; CI-NEXT: v_mov_b32_e32 v0, s0
@@ -573,6 +615,9 @@ define amdgpu_kernel void @s_insertelement_v2f16_0(ptr addrspace(1) %out, ptr ad
; CIVI-LABEL: s_insertelement_v2f16_0:
; CIVI: ; %bb.0:
; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CIVI-NEXT: s_add_i32 s12, s12, s17
+; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CIVI-NEXT: s_waitcnt lgkmcnt(0)
; CIVI-NEXT: s_load_dword s2, s[2:3], 0x0
; CIVI-NEXT: v_mov_b32_e32 v0, s0
@@ -618,6 +663,9 @@ define amdgpu_kernel void @s_insertelement_v2f16_1(ptr addrspace(1) %out, ptr ad
; CIVI-LABEL: s_insertelement_v2f16_1:
; CIVI: ; %bb.0:
; CIVI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CIVI-NEXT: s_add_i32 s12, s12, s17
+; CIVI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CIVI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CIVI-NEXT: s_waitcnt lgkmcnt(0)
; CIVI-NEXT: s_load_dword s2, s[2:3], 0x0
; CIVI-NEXT: v_mov_b32_e32 v0, s0
@@ -664,6 +712,9 @@ define amdgpu_kernel void @v_insertelement_v2i16_0(ptr addrspace(1) %out, ptr ad
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
@@ -682,6 +733,9 @@ define amdgpu_kernel void @v_insertelement_v2i16_0(ptr addrspace(1) %out, ptr ad
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
@@ -738,9 +792,12 @@ define amdgpu_kernel void @v_insertelement_v2i16_0_reghi(ptr addrspace(1) %out,
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dword s4, s[8:9], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v3, v[0:1]
; VI-NEXT: v_mov_b32_e32 v1, s1
@@ -757,9 +814,12 @@ define amdgpu_kernel void @v_insertelement_v2i16_0_reghi(ptr addrspace(1) %out,
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dword s4, s[8:9], 0x4
; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: flat_load_dword v3, v[0:1]
; CI-NEXT: v_mov_b32_e32 v1, s1
@@ -813,6 +873,9 @@ define amdgpu_kernel void @v_insertelement_v2i16_0_inlineimm(ptr addrspace(1) %o
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
@@ -831,6 +894,9 @@ define amdgpu_kernel void @v_insertelement_v2i16_0_inlineimm(ptr addrspace(1) %o
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
@@ -886,6 +952,9 @@ define amdgpu_kernel void @v_insertelement_v2i16_1(ptr addrspace(1) %out, ptr ad
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
@@ -904,6 +973,9 @@ define amdgpu_kernel void @v_insertelement_v2i16_1(ptr addrspace(1) %out, ptr ad
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
@@ -958,6 +1030,9 @@ define amdgpu_kernel void @v_insertelement_v2i16_1_inlineimm(ptr addrspace(1) %o
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
@@ -976,6 +1051,9 @@ define amdgpu_kernel void @v_insertelement_v2i16_1_inlineimm(ptr addrspace(1) %o
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
@@ -1030,6 +1108,9 @@ define amdgpu_kernel void @v_insertelement_v2f16_0(ptr addrspace(1) %out, ptr ad
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
@@ -1048,6 +1129,9 @@ define amdgpu_kernel void @v_insertelement_v2f16_0(ptr addrspace(1) %out, ptr ad
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
@@ -1102,6 +1186,9 @@ define amdgpu_kernel void @v_insertelement_v2f16_0_inlineimm(ptr addrspace(1) %o
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
@@ -1120,6 +1207,9 @@ define amdgpu_kernel void @v_insertelement_v2f16_0_inlineimm(ptr addrspace(1) %o
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
@@ -1174,6 +1264,9 @@ define amdgpu_kernel void @v_insertelement_v2f16_1(ptr addrspace(1) %out, ptr ad
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
@@ -1192,6 +1285,9 @@ define amdgpu_kernel void @v_insertelement_v2f16_1(ptr addrspace(1) %out, ptr ad
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
@@ -1246,6 +1342,9 @@ define amdgpu_kernel void @v_insertelement_v2f16_1_inlineimm(ptr addrspace(1) %o
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
@@ -1264,6 +1363,9 @@ define amdgpu_kernel void @v_insertelement_v2f16_1_inlineimm(ptr addrspace(1) %o
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
@@ -1324,6 +1426,9 @@ define amdgpu_kernel void @s_insertelement_v2i16_dynamic(ptr addrspace(1) %out,
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dword s4, s[4:5], 0x0
; VI-NEXT: s_load_dword s2, s[2:3], 0x0
@@ -1343,6 +1448,9 @@ define amdgpu_kernel void @s_insertelement_v2i16_dynamic(ptr addrspace(1) %out,
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_load_dword s4, s[4:5], 0x0
; CI-NEXT: s_load_dword s2, s[2:3], 0x0
@@ -1405,9 +1513,12 @@ define amdgpu_kernel void @v_insertelement_v2i16_dynamic_sgpr(ptr addrspace(1) %
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dword s4, s[8:9], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dword v3, v[0:1]
; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v2
@@ -1426,9 +1537,12 @@ define amdgpu_kernel void @v_insertelement_v2i16_dynamic_sgpr(ptr addrspace(1) %
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dword s4, s[8:9], 0x4
; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: flat_load_dword v3, v[0:1]
; CI-NEXT: v_add_i32_e32 v0, vcc, s0, v2
@@ -1491,11 +1605,14 @@ define amdgpu_kernel void @v_insertelement_v2f16_dynamic_vgpr(ptr addrspace(1) %
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
-; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: v_mov_b32_e32 v1, s5
; VI-NEXT: v_add_u32_e32 v0, vcc, s4, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: v_mov_b32_e32 v3, s3
; VI-NEXT: flat_load_dword v4, v[0:1]
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
@@ -1518,11 +1635,14 @@ define amdgpu_kernel void @v_insertelement_v2f16_dynamic_vgpr(ptr addrspace(1) %
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4
; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: v_mov_b32_e32 v3, s3
; CI-NEXT: v_mov_b32_e32 v1, s5
; CI-NEXT: v_add_i32_e32 v0, vcc, s4, v2
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; CI-NEXT: v_mov_b32_e32 v3, s3
; CI-NEXT: flat_load_dword v4, v[0:1]
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
@@ -1591,14 +1711,17 @@ define amdgpu_kernel void @v_insertelement_v4f16_0(ptr addrspace(1) %out, ptr ad
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dword s4, s[8:9], 0x30
; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
-; VI-NEXT: v_mov_b32_e32 v4, 0x3020504
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT: v_mov_b32_e32 v4, 0x3020504
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_perm_b32 v0, s4, v0, v4
@@ -1610,9 +1733,12 @@ define amdgpu_kernel void @v_insertelement_v4f16_0(ptr addrspace(1) %out, ptr ad
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dword s4, s[8:9], 0xc
; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; CI-NEXT: v_mov_b32_e32 v3, s1
@@ -1669,14 +1795,17 @@ define amdgpu_kernel void @v_insertelement_v4f16_1(ptr addrspace(1) %out, ptr ad
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dword s4, s[8:9], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
-; VI-NEXT: v_mov_b32_e32 v4, 0x1000504
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT: v_mov_b32_e32 v4, 0x1000504
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_perm_b32 v0, v0, s4, v4
@@ -1688,9 +1817,12 @@ define amdgpu_kernel void @v_insertelement_v4f16_1(ptr addrspace(1) %out, ptr ad
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dword s4, s[8:9], 0x4
; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; CI-NEXT: v_mov_b32_e32 v3, s1
@@ -1748,14 +1880,17 @@ define amdgpu_kernel void @v_insertelement_v4f16_2(ptr addrspace(1) %out, ptr ad
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dword s4, s[8:9], 0x30
; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
-; VI-NEXT: v_mov_b32_e32 v4, 0x3020504
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT: v_mov_b32_e32 v4, 0x3020504
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_perm_b32 v1, s4, v1, v4
@@ -1767,9 +1902,12 @@ define amdgpu_kernel void @v_insertelement_v4f16_2(ptr addrspace(1) %out, ptr ad
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dword s4, s[8:9], 0xc
; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; CI-NEXT: v_mov_b32_e32 v3, s1
@@ -1826,14 +1964,17 @@ define amdgpu_kernel void @v_insertelement_v4f16_3(ptr addrspace(1) %out, ptr ad
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dword s4, s[8:9], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
-; VI-NEXT: v_mov_b32_e32 v4, 0x1000504
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT: v_mov_b32_e32 v4, 0x1000504
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_perm_b32 v1, v1, s4, v4
@@ -1845,9 +1986,12 @@ define amdgpu_kernel void @v_insertelement_v4f16_3(ptr addrspace(1) %out, ptr ad
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dword s4, s[8:9], 0x4
; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; CI-NEXT: v_mov_b32_e32 v3, s1
@@ -1905,14 +2049,17 @@ define amdgpu_kernel void @v_insertelement_v4i16_2(ptr addrspace(1) %out, ptr ad
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dword s4, s[8:9], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
-; VI-NEXT: v_mov_b32_e32 v4, 0x3020504
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: v_mov_b32_e32 v3, s1
; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2
+; VI-NEXT: v_mov_b32_e32 v4, 0x3020504
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_perm_b32 v1, s4, v1, v4
@@ -1924,9 +2071,12 @@ define amdgpu_kernel void @v_insertelement_v4i16_2(ptr addrspace(1) %out, ptr ad
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dword s4, s[8:9], 0x4
; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; CI-NEXT: v_mov_b32_e32 v3, s1
@@ -1989,6 +2139,9 @@ define amdgpu_kernel void @v_insertelement_v4i16_dynamic_vgpr(ptr addrspace(1) %
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dword s4, s[8:9], 0x10
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: flat_load_dword v4, v[0:1] glc
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
@@ -2016,6 +2169,9 @@ define amdgpu_kernel void @v_insertelement_v4i16_dynamic_vgpr(ptr addrspace(1) %
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dword s4, s[8:9], 0x4
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: flat_load_dword v4, v[0:1] glc
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
@@ -2096,9 +2252,12 @@ define amdgpu_kernel void @v_insertelement_v4f16_dynamic_sgpr(ptr addrspace(1) %
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v2
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: v_mov_b32_e32 v3, s1
@@ -2122,9 +2281,12 @@ define amdgpu_kernel void @v_insertelement_v4f16_dynamic_sgpr(ptr addrspace(1) %
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4
; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; CI-NEXT: v_mov_b32_e32 v3, s1
@@ -2191,9 +2353,12 @@ define amdgpu_kernel void @v_insertelement_v8f16_3(ptr addrspace(1) %out, ptr ad
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dword s4, s[8:9], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v4, 4, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; VI-NEXT: v_add_u32_e32 v4, vcc, s0, v4
@@ -2211,9 +2376,12 @@ define amdgpu_kernel void @v_insertelement_v8f16_3(ptr addrspace(1) %out, ptr ad
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dword s4, s[8:9], 0x4
; CI-NEXT: v_lshlrev_b32_e32 v4, 4, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v4
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; CI-NEXT: v_mov_b32_e32 v5, s1
@@ -2271,9 +2439,12 @@ define amdgpu_kernel void @v_insertelement_v8i16_6(ptr addrspace(1) %out, ptr ad
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dword s4, s[8:9], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v4, 4, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; VI-NEXT: v_mov_b32_e32 v5, s1
@@ -2291,9 +2462,12 @@ define amdgpu_kernel void @v_insertelement_v8i16_6(ptr addrspace(1) %out, ptr ad
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dword s4, s[8:9], 0x4
; CI-NEXT: v_lshlrev_b32_e32 v4, 4, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v4
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; CI-NEXT: v_mov_b32_e32 v5, s1
@@ -2382,9 +2556,12 @@ define amdgpu_kernel void @v_insertelement_v8f16_dynamic(ptr addrspace(1) %out,
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v4, 4, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; VI-NEXT: v_mov_b32_e32 v5, s1
@@ -2436,9 +2613,12 @@ define amdgpu_kernel void @v_insertelement_v8f16_dynamic(ptr addrspace(1) %out,
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4
; CI-NEXT: v_lshlrev_b32_e32 v4, 4, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v4
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; CI-NEXT: v_mov_b32_e32 v5, s1
@@ -2580,11 +2760,14 @@ define amdgpu_kernel void @v_insertelement_v16f16_3(ptr addrspace(1) %out, ptr a
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dword s4, s[8:9], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v8, 5, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v8
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_add_u32_e32 v4, vcc, 16, v0
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; VI-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
@@ -2607,9 +2790,12 @@ define amdgpu_kernel void @v_insertelement_v16f16_3(ptr addrspace(1) %out, ptr a
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dword s4, s[8:9], 0x4
; CI-NEXT: v_lshlrev_b32_e32 v8, 5, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s3
; CI-NEXT: v_add_i32_e32 v4, vcc, s2, v8
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_addc_u32_e32 v5, vcc, 0, v0, vcc
; CI-NEXT: flat_load_dwordx4 v[0:3], v[4:5]
; CI-NEXT: v_add_i32_e32 v4, vcc, 16, v4
@@ -2682,12 +2868,14 @@ define amdgpu_kernel void @v_insertelement_v16i16_6(ptr addrspace(1) %out, ptr a
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dword s4, s[8:9], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v8, 5, v0
-; VI-NEXT: v_mov_b32_e32 v12, 0x3020504
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v8
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_add_u32_e32 v4, vcc, 16, v0
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; VI-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
@@ -2695,6 +2883,7 @@ define amdgpu_kernel void @v_insertelement_v16i16_6(ptr addrspace(1) %out, ptr a
; VI-NEXT: v_add_u32_e32 v8, vcc, s0, v8
; VI-NEXT: v_addc_u32_e32 v9, vcc, 0, v9, vcc
; VI-NEXT: v_add_u32_e32 v10, vcc, 16, v8
+; VI-NEXT: v_mov_b32_e32 v12, 0x3020504
; VI-NEXT: v_addc_u32_e32 v11, vcc, 0, v9, vcc
; VI-NEXT: s_waitcnt vmcnt(1)
; VI-NEXT: v_perm_b32 v3, s4, v3, v12
@@ -2708,11 +2897,14 @@ define amdgpu_kernel void @v_insertelement_v16i16_6(ptr addrspace(1) %out, ptr a
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dword s4, s[8:9], 0x4
; CI-NEXT: v_lshlrev_b32_e32 v8, 5, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v8
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: v_add_i32_e32 v4, vcc, 16, v0
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; CI-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
@@ -2846,11 +3038,14 @@ define amdgpu_kernel void @v_insertelement_v16f16_dynamic(ptr addrspace(1) %out,
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v8, 5, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s3
; VI-NEXT: v_add_u32_e32 v4, vcc, s2, v8
; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v0, vcc
; VI-NEXT: v_add_u32_e32 v0, vcc, 16, v4
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v5, vcc
; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
; VI-NEXT: flat_load_dwordx4 v[4:7], v[4:5]
@@ -2943,11 +3138,14 @@ define amdgpu_kernel void @v_insertelement_v16f16_dynamic(ptr addrspace(1) %out,
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4
; CI-NEXT: v_lshlrev_b32_e32 v4, 5, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v4
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: v_add_i32_e32 v2, vcc, 16, v0
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc
; CI-NEXT: flat_load_dwordx4 v[7:10], v[2:3]
; CI-NEXT: flat_load_dwordx4 v[0:3], v[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/invalid-addrspacecast.ll b/llvm/test/CodeGen/AMDGPU/invalid-addrspacecast.ll
index f0609f62a9024..5dff7372ab561 100644
--- a/llvm/test/CodeGen/AMDGPU/invalid-addrspacecast.ll
+++ b/llvm/test/CodeGen/AMDGPU/invalid-addrspacecast.ll
@@ -7,6 +7,9 @@
define amdgpu_kernel void @use_group_to_global_addrspacecast(ptr addrspace(3) %ptr) {
; CHECK-LABEL: use_group_to_global_addrspacecast:
; CHECK: ; %bb.0:
+; CHECK-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CHECK-NEXT: s_add_i32 s12, s12, s17
+; CHECK-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: flat_store_dword v[0:1], v0
; CHECK-NEXT: s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/invalid-cast-load-i1.ll b/llvm/test/CodeGen/AMDGPU/invalid-cast-load-i1.ll
index 621187100f323..55a5d50f06bbd 100644
--- a/llvm/test/CodeGen/AMDGPU/invalid-cast-load-i1.ll
+++ b/llvm/test/CodeGen/AMDGPU/invalid-cast-load-i1.ll
@@ -6,6 +6,8 @@ define amdgpu_kernel void @load_idx_idy(ptr addrspace(4) %disp, ptr %g) {
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_load_dword s6, s[4:5], 0x4
; CHECK-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CHECK-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; CHECK-NEXT: v_mov_b32_e32 v0, 0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_lshr_b32 s4, s6, 16
diff --git a/llvm/test/CodeGen/AMDGPU/kernarg-size.ll b/llvm/test/CodeGen/AMDGPU/kernarg-size.ll
index 496a1c652da25..1a32953305bbc 100644
--- a/llvm/test/CodeGen/AMDGPU/kernarg-size.ll
+++ b/llvm/test/CodeGen/AMDGPU/kernarg-size.ll
@@ -7,7 +7,7 @@ declare void @llvm.trap() #0
; DOORBELL-NEXT: .amdhsa_group_segment_fixed_size 0
; DOORBELL-NEXT: .amdhsa_private_segment_fixed_size 0
; DOORBELL-NEXT: .amdhsa_kernarg_size 8
-; DOORBELL-NEXT: .amdhsa_user_sgpr_count 12
+; DOORBELL-NEXT: .amdhsa_user_sgpr_count 14
; DOORBELL-NEXT: .amdhsa_user_sgpr_private_segment_buffer 1
; DOORBELL: .end_amdhsa_kernel
diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
index d7f54f3b8e9e2..93ddcfefd770f 100644
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
@@ -47,11 +47,7 @@
; GCN-O0-NEXT: Instrument function entry/exit with calls to e.g. mcount() (post inlining)
; GCN-O0-NEXT: Scalarize Masked Memory Intrinsics
; GCN-O0-NEXT: Expand reduction intrinsics
-; GCN-O0-NEXT: CallGraph Construction
-; GCN-O0-NEXT: Call Graph SCC Pass Manager
-; GCN-O0-NEXT: AMDGPU Annotate Kernel Features
-; GCN-O0-NEXT: FunctionPass Manager
-; GCN-O0-NEXT: AMDGPU Lower Kernel Arguments
+; GCN-O0-NEXT: AMDGPU Lower Kernel Arguments
; GCN-O0-NEXT: Lower buffer fat pointer operations to buffer resources
; GCN-O0-NEXT: CallGraph Construction
; GCN-O0-NEXT: Call Graph SCC Pass Manager
@@ -232,11 +228,7 @@
; GCN-O1-NEXT: Instrument function entry/exit with calls to e.g. mcount() (post inlining)
; GCN-O1-NEXT: Scalarize Masked Memory Intrinsics
; GCN-O1-NEXT: Expand reduction intrinsics
-; GCN-O1-NEXT: CallGraph Construction
-; GCN-O1-NEXT: Call Graph SCC Pass Manager
-; GCN-O1-NEXT: AMDGPU Annotate Kernel Features
-; GCN-O1-NEXT: FunctionPass Manager
-; GCN-O1-NEXT: AMDGPU Lower Kernel Arguments
+; GCN-O1-NEXT: AMDGPU Lower Kernel Arguments
; GCN-O1-NEXT: Lower buffer fat pointer operations to buffer resources
; GCN-O1-NEXT: CallGraph Construction
; GCN-O1-NEXT: Call Graph SCC Pass Manager
@@ -531,11 +523,7 @@
; GCN-O1-OPTS-NEXT: Scalarize Masked Memory Intrinsics
; GCN-O1-OPTS-NEXT: Expand reduction intrinsics
; GCN-O1-OPTS-NEXT: Early CSE
-; GCN-O1-OPTS-NEXT: CallGraph Construction
-; GCN-O1-OPTS-NEXT: Call Graph SCC Pass Manager
-; GCN-O1-OPTS-NEXT: AMDGPU Annotate Kernel Features
-; GCN-O1-OPTS-NEXT: FunctionPass Manager
-; GCN-O1-OPTS-NEXT: AMDGPU Lower Kernel Arguments
+; GCN-O1-OPTS-NEXT: AMDGPU Lower Kernel Arguments
; GCN-O1-OPTS-NEXT: Lower buffer fat pointer operations to buffer resources
; GCN-O1-OPTS-NEXT: CallGraph Construction
; GCN-O1-OPTS-NEXT: Call Graph SCC Pass Manager
@@ -848,11 +836,7 @@
; GCN-O2-NEXT: Scalarize Masked Memory Intrinsics
; GCN-O2-NEXT: Expand reduction intrinsics
; GCN-O2-NEXT: Early CSE
-; GCN-O2-NEXT: CallGraph Construction
-; GCN-O2-NEXT: Call Graph SCC Pass Manager
-; GCN-O2-NEXT: AMDGPU Annotate Kernel Features
-; GCN-O2-NEXT: FunctionPass Manager
-; GCN-O2-NEXT: AMDGPU Lower Kernel Arguments
+; GCN-O2-NEXT: AMDGPU Lower Kernel Arguments
; GCN-O2-NEXT: Lower buffer fat pointer operations to buffer resources
; GCN-O2-NEXT: CallGraph Construction
; GCN-O2-NEXT: Call Graph SCC Pass Manager
@@ -1180,11 +1164,7 @@
; GCN-O3-NEXT: Lazy Block Frequency Analysis
; GCN-O3-NEXT: Optimization Remark Emitter
; GCN-O3-NEXT: Global Value Numbering
-; GCN-O3-NEXT: CallGraph Construction
-; GCN-O3-NEXT: Call Graph SCC Pass Manager
-; GCN-O3-NEXT: AMDGPU Annotate Kernel Features
-; GCN-O3-NEXT: FunctionPass Manager
-; GCN-O3-NEXT: AMDGPU Lower Kernel Arguments
+; GCN-O3-NEXT: AMDGPU Lower Kernel Arguments
; GCN-O3-NEXT: Lower buffer fat pointer operations to buffer resources
; GCN-O3-NEXT: CallGraph Construction
; GCN-O3-NEXT: Call Graph SCC Pass Manager
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll
index a585b49ef8d9a..b16e34ea14925 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll
@@ -30,9 +30,12 @@ define amdgpu_kernel void @is_private_vgpr(ptr addrspace(1) %ptr.ptr) {
; CI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; CI-SDAG-NEXT: s_load_dword s2, s[8:9], 0x32
; CI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; CI-SDAG-NEXT: s_add_i32 s12, s12, s17
+; CI-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; CI-SDAG-NEXT: v_mov_b32_e32 v1, s1
; CI-SDAG-NEXT: v_add_i32_e32 v0, vcc, s0, v0
+; CI-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-SDAG-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
; CI-SDAG-NEXT: s_waitcnt vmcnt(0)
@@ -59,10 +62,13 @@ define amdgpu_kernel void @is_private_vgpr(ptr addrspace(1) %ptr.ptr) {
; CI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; CI-GISEL-NEXT: s_load_dword s2, s[8:9], 0x32
; CI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; CI-GISEL-NEXT: s_add_i32 s12, s12, s17
+; CI-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; CI-GISEL-NEXT: v_mov_b32_e32 v0, s0
; CI-GISEL-NEXT: v_mov_b32_e32 v1, s1
; CI-GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2
+; CI-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-GISEL-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
; CI-GISEL-NEXT: s_waitcnt vmcnt(0)
@@ -133,6 +139,9 @@ define amdgpu_kernel void @is_private_sgpr(ptr %ptr) {
; CI-SDAG: ; %bb.0:
; CI-SDAG-NEXT: s_load_dword s0, s[8:9], 0x1
; CI-SDAG-NEXT: s_load_dword s1, s[8:9], 0x32
+; CI-SDAG-NEXT: s_add_i32 s12, s12, s17
+; CI-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; CI-SDAG-NEXT: s_cmp_eq_u32 s0, s1
; CI-SDAG-NEXT: s_cselect_b64 s[0:1], -1, 0
@@ -166,6 +175,9 @@ define amdgpu_kernel void @is_private_sgpr(ptr %ptr) {
; CI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; CI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; CI-GISEL-NEXT: s_load_dword s0, s[8:9], 0x32
+; CI-GISEL-NEXT: s_add_i32 s12, s12, s17
+; CI-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; CI-GISEL-NEXT: s_cmp_lg_u32 s1, s0
; CI-GISEL-NEXT: s_cbranch_scc1 .LBB1_2
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll
index dc621f15709fd..bb422ed5b9457 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll
@@ -63,9 +63,12 @@ define amdgpu_kernel void @is_local_vgpr(ptr addrspace(1) %ptr.ptr) {
; CI-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; CI-SDAG-NEXT: s_load_dword s2, s[8:9], 0x33
; CI-SDAG-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; CI-SDAG-NEXT: s_add_i32 s12, s12, s17
+; CI-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; CI-SDAG-NEXT: v_mov_b32_e32 v1, s1
; CI-SDAG-NEXT: v_add_i32_e32 v0, vcc, s0, v0
+; CI-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-SDAG-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
; CI-SDAG-NEXT: s_waitcnt vmcnt(0)
@@ -92,10 +95,13 @@ define amdgpu_kernel void @is_local_vgpr(ptr addrspace(1) %ptr.ptr) {
; CI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; CI-GISEL-NEXT: s_load_dword s2, s[8:9], 0x33
; CI-GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; CI-GISEL-NEXT: s_add_i32 s12, s12, s17
+; CI-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; CI-GISEL-NEXT: v_mov_b32_e32 v0, s0
; CI-GISEL-NEXT: v_mov_b32_e32 v1, s1
; CI-GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2
+; CI-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-GISEL-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc
; CI-GISEL-NEXT: s_waitcnt vmcnt(0)
@@ -200,6 +206,9 @@ define amdgpu_kernel void @is_local_sgpr(ptr %ptr) {
; CI-SDAG: ; %bb.0:
; CI-SDAG-NEXT: s_load_dword s0, s[8:9], 0x1
; CI-SDAG-NEXT: s_load_dword s1, s[8:9], 0x33
+; CI-SDAG-NEXT: s_add_i32 s12, s12, s17
+; CI-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; CI-SDAG-NEXT: s_cmp_eq_u32 s0, s1
; CI-SDAG-NEXT: s_cselect_b64 s[0:1], -1, 0
@@ -233,6 +242,9 @@ define amdgpu_kernel void @is_local_sgpr(ptr %ptr) {
; CI-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; CI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; CI-GISEL-NEXT: s_load_dword s0, s[8:9], 0x33
+; CI-GISEL-NEXT: s_add_i32 s12, s12, s17
+; CI-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; CI-GISEL-NEXT: s_cmp_lg_u32 s1, s0
; CI-GISEL-NEXT: s_cbranch_scc1 .LBB1_2
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll
index 97219a8f143ce..0fe371c1b51fe 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.lds.kernel.id.ll
@@ -23,8 +23,11 @@ define void @function_lds_id(ptr addrspace(1) %out) {
define amdgpu_kernel void @kernel_lds_id(ptr addrspace(1) %out) !llvm.amdgcn.lds.kernel.id !0 {
; GCN-LABEL: kernel_lds_id:
; GCN: ; %bb.0:
+; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-NEXT: s_add_i32 s12, s12, s17
+; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; GCN-NEXT: s_add_i32 s2, s12, 42
+; GCN-NEXT: s_add_i32 s2, s14, 42
; GCN-NEXT: v_mov_b32_e32 v2, s2
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s0
@@ -74,6 +77,9 @@ define amdgpu_kernel void @indirect_lds_id(ptr addrspace(1) %out) !llvm.amdgcn.l
define amdgpu_kernel void @doesnt_use_it(ptr addrspace(1) %out) !llvm.amdgcn.lds.kernel.id !0 {
; GCN-LABEL: doesnt_use_it:
; GCN: ; %bb.0:
+; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-NEXT: s_add_i32 s12, s12, s17
+; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GCN-NEXT: v_mov_b32_e32 v2, 0x64
; GCN-NEXT: s_waitcnt lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
index a3bd0aabd5c3f..338c5d06797ff 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
@@ -284,6 +284,9 @@ define amdgpu_kernel void @test_readfirstlane_imm_fold_i32(ptr addrspace(1) %out
; CHECK-SDAG-LABEL: test_readfirstlane_imm_fold_i32:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17
+; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, 32
; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s0
@@ -294,6 +297,9 @@ define amdgpu_kernel void @test_readfirstlane_imm_fold_i32(ptr addrspace(1) %out
; CHECK-GISEL-LABEL: test_readfirstlane_imm_fold_i32:
; CHECK-GISEL: ; %bb.0:
; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17
+; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, 32
; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s0
@@ -309,10 +315,13 @@ define amdgpu_kernel void @test_readfirstlane_imm_fold_i64(ptr addrspace(1) %out
; CHECK-SDAG-LABEL: test_readfirstlane_imm_fold_i64:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17
+; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, 32
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, 0
; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, 0
; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0
; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CHECK-SDAG-NEXT: s_endpgm
@@ -321,10 +330,13 @@ define amdgpu_kernel void @test_readfirstlane_imm_fold_i64(ptr addrspace(1) %out
; CHECK-GISEL: ; %bb.0:
; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; CHECK-GISEL-NEXT: s_mov_b64 s[2:3], 32
+; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17
; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1
+; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3
; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0
; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CHECK-GISEL-NEXT: s_endpgm
@@ -337,10 +349,13 @@ define amdgpu_kernel void @test_readfirstlane_imm_fold_f64(ptr addrspace(1) %out
; CHECK-SDAG-LABEL: test_readfirstlane_imm_fold_f64:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17
+; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, 0
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, 0x40400000
; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, 0x40400000
; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0
; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CHECK-SDAG-NEXT: s_endpgm
@@ -349,11 +364,14 @@ define amdgpu_kernel void @test_readfirstlane_imm_fold_f64(ptr addrspace(1) %out
; CHECK-GISEL: ; %bb.0:
; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; CHECK-GISEL-NEXT: s_mov_b32 s2, 0
+; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17
; CHECK-GISEL-NEXT: s_mov_b32 s3, 0x40400000
; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3
; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1
+; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3
; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0
; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CHECK-GISEL-NEXT: s_endpgm
@@ -366,12 +384,15 @@ define amdgpu_kernel void @test_readfirstlane_m0(ptr addrspace(1) %out) {
; CHECK-SDAG-LABEL: test_readfirstlane_m0:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17
+; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CHECK-SDAG-NEXT: ;;#ASMSTART
; CHECK-SDAG-NEXT: s_mov_b32 m0, -1
; CHECK-SDAG-NEXT: ;;#ASMEND
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, m0
; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s0
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, m0
; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s1
; CHECK-SDAG-NEXT: flat_store_dword v[0:1], v2
; CHECK-SDAG-NEXT: s_endpgm
@@ -379,12 +400,15 @@ define amdgpu_kernel void @test_readfirstlane_m0(ptr addrspace(1) %out) {
; CHECK-GISEL-LABEL: test_readfirstlane_m0:
; CHECK-GISEL: ; %bb.0:
; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17
+; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CHECK-GISEL-NEXT: ;;#ASMSTART
; CHECK-GISEL-NEXT: s_mov_b32 m0, -1
; CHECK-GISEL-NEXT: ;;#ASMEND
-; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, m0
; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, m0
; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s1
; CHECK-GISEL-NEXT: flat_store_dword v[0:1], v2
; CHECK-GISEL-NEXT: s_endpgm
@@ -398,25 +422,31 @@ define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_i32(ptr addrspace(1
; CHECK-SDAG-LABEL: test_readfirstlane_copy_from_sgpr_i32:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17
+; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CHECK-SDAG-NEXT: ;;#ASMSTART
; CHECK-SDAG-NEXT: s_mov_b32 s2, 0
; CHECK-SDAG-NEXT: ;;#ASMEND
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s2
; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s0
; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s2
; CHECK-SDAG-NEXT: flat_store_dword v[0:1], v2
; CHECK-SDAG-NEXT: s_endpgm
;
; CHECK-GISEL-LABEL: test_readfirstlane_copy_from_sgpr_i32:
; CHECK-GISEL: ; %bb.0:
; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17
+; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CHECK-GISEL-NEXT: ;;#ASMSTART
; CHECK-GISEL-NEXT: s_mov_b32 s2, 0
; CHECK-GISEL-NEXT: ;;#ASMEND
-; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s2
; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s2
; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s1
; CHECK-GISEL-NEXT: flat_store_dword v[0:1], v2
; CHECK-GISEL-NEXT: s_endpgm
@@ -430,13 +460,16 @@ define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_i64(ptr addrspace(1
; CHECK-SDAG-LABEL: test_readfirstlane_copy_from_sgpr_i64:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17
+; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CHECK-SDAG-NEXT: ;;#ASMSTART
; CHECK-SDAG-NEXT: s_mov_b64 s[2:3], 0
; CHECK-SDAG-NEXT: ;;#ASMEND
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s2
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s3
; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s3
; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0
; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CHECK-SDAG-NEXT: s_endpgm
@@ -444,13 +477,16 @@ define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_i64(ptr addrspace(1
; CHECK-GISEL-LABEL: test_readfirstlane_copy_from_sgpr_i64:
; CHECK-GISEL: ; %bb.0:
; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17
; CHECK-GISEL-NEXT: ;;#ASMSTART
; CHECK-GISEL-NEXT: s_mov_b64 s[2:3], 0
; CHECK-GISEL-NEXT: ;;#ASMEND
; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1
+; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3
; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0
; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CHECK-GISEL-NEXT: s_endpgm
@@ -464,13 +500,16 @@ define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_f64(ptr addrspace(1
; CHECK-SDAG-LABEL: test_readfirstlane_copy_from_sgpr_f64:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17
+; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CHECK-SDAG-NEXT: ;;#ASMSTART
; CHECK-SDAG-NEXT: s_mov_b64 s[2:3], 0
; CHECK-SDAG-NEXT: ;;#ASMEND
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s2
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s3
; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s3
; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0
; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CHECK-SDAG-NEXT: s_endpgm
@@ -478,13 +517,16 @@ define amdgpu_kernel void @test_readfirstlane_copy_from_sgpr_f64(ptr addrspace(1
; CHECK-GISEL-LABEL: test_readfirstlane_copy_from_sgpr_f64:
; CHECK-GISEL: ; %bb.0:
; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17
; CHECK-GISEL-NEXT: ;;#ASMSTART
; CHECK-GISEL-NEXT: s_mov_b64 s[2:3], 0
; CHECK-GISEL-NEXT: ;;#ASMEND
; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1
+; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3
; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0
; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CHECK-GISEL-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll
index edb6ebcee1325..f2b0959cc706e 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll
@@ -179,6 +179,9 @@ define amdgpu_kernel void @test_readlane_imm_sreg_i32(ptr addrspace(1) %out, i32
; CHECK-SDAG-LABEL: test_readlane_imm_sreg_i32:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17
+; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, 32
; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s0
@@ -189,6 +192,9 @@ define amdgpu_kernel void @test_readlane_imm_sreg_i32(ptr addrspace(1) %out, i32
; CHECK-GISEL-LABEL: test_readlane_imm_sreg_i32:
; CHECK-GISEL: ; %bb.0:
; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17
+; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, 32
; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s0
@@ -204,10 +210,13 @@ define amdgpu_kernel void @test_readlane_imm_sreg_i64(ptr addrspace(1) %out, i32
; CHECK-SDAG-LABEL: test_readlane_imm_sreg_i64:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17
+; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, 32
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, 0
; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, 0
; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0
; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CHECK-SDAG-NEXT: s_endpgm
@@ -216,10 +225,13 @@ define amdgpu_kernel void @test_readlane_imm_sreg_i64(ptr addrspace(1) %out, i32
; CHECK-GISEL: ; %bb.0:
; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; CHECK-GISEL-NEXT: s_mov_b64 s[2:3], 32
+; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17
; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1
+; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3
; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0
; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CHECK-GISEL-NEXT: s_endpgm
@@ -232,10 +244,13 @@ define amdgpu_kernel void @test_readlane_imm_sreg_f64(ptr addrspace(1) %out, i32
; CHECK-SDAG-LABEL: test_readlane_imm_sreg_f64:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17
+; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, 0
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, 0x40400000
; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, 0x40400000
; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0
; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CHECK-SDAG-NEXT: s_endpgm
@@ -244,11 +259,14 @@ define amdgpu_kernel void @test_readlane_imm_sreg_f64(ptr addrspace(1) %out, i32
; CHECK-GISEL: ; %bb.0:
; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; CHECK-GISEL-NEXT: s_mov_b32 s2, 0
+; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17
; CHECK-GISEL-NEXT: s_mov_b32 s3, 0x40400000
; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3
; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1
+; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3
; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0
; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CHECK-GISEL-NEXT: s_endpgm
@@ -262,6 +280,9 @@ define amdgpu_kernel void @test_readlane_vregs_i32(ptr addrspace(1) %out, ptr ad
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CHECK-SDAG-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17
+; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s3
; CHECK-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v0
@@ -281,6 +302,9 @@ define amdgpu_kernel void @test_readlane_vregs_i32(ptr addrspace(1) %out, ptr ad
; CHECK-GISEL: ; %bb.0:
; CHECK-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CHECK-GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17
+; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2
; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3
@@ -311,6 +335,9 @@ define amdgpu_kernel void @test_readlane_vregs_i64(ptr addrspace(1) %out, ptr ad
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CHECK-SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0
+; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17
+; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s3
; CHECK-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v0
@@ -332,6 +359,9 @@ define amdgpu_kernel void @test_readlane_vregs_i64(ptr addrspace(1) %out, ptr ad
; CHECK-GISEL: ; %bb.0:
; CHECK-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CHECK-GISEL-NEXT: v_lshlrev_b32_e32 v2, 4, v0
+; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17
+; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2
; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3
@@ -365,6 +395,9 @@ define amdgpu_kernel void @test_readlane_vregs_f64(ptr addrspace(1) %out, ptr ad
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CHECK-SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0
+; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17
+; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s3
; CHECK-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v0
@@ -386,6 +419,9 @@ define amdgpu_kernel void @test_readlane_vregs_f64(ptr addrspace(1) %out, ptr ad
; CHECK-GISEL: ; %bb.0:
; CHECK-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CHECK-GISEL-NEXT: v_lshlrev_b32_e32 v2, 4, v0
+; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17
+; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2
; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3
@@ -419,12 +455,15 @@ define amdgpu_kernel void @test_readlane_m0_sreg(ptr addrspace(1) %out, i32 %src
; CHECK-SDAG-LABEL: test_readlane_m0_sreg:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17
+; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CHECK-SDAG-NEXT: ;;#ASMSTART
; CHECK-SDAG-NEXT: s_mov_b32 m0, -1
; CHECK-SDAG-NEXT: ;;#ASMEND
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, m0
; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s0
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, m0
; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s1
; CHECK-SDAG-NEXT: flat_store_dword v[0:1], v2
; CHECK-SDAG-NEXT: s_endpgm
@@ -432,12 +471,15 @@ define amdgpu_kernel void @test_readlane_m0_sreg(ptr addrspace(1) %out, i32 %src
; CHECK-GISEL-LABEL: test_readlane_m0_sreg:
; CHECK-GISEL: ; %bb.0:
; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17
+; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CHECK-GISEL-NEXT: ;;#ASMSTART
; CHECK-GISEL-NEXT: s_mov_b32 m0, -1
; CHECK-GISEL-NEXT: ;;#ASMEND
-; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, m0
; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, m0
; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s1
; CHECK-GISEL-NEXT: flat_store_dword v[0:1], v2
; CHECK-GISEL-NEXT: s_endpgm
@@ -454,11 +496,14 @@ define amdgpu_kernel void @test_readlane_vgpr_imm_i32(ptr addrspace(1) %out) #1
; CHECK-SDAG-NEXT: ;;#ASMSTART
; CHECK-SDAG-NEXT: ; def v0
; CHECK-SDAG-NEXT: ;;#ASMEND
+; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17
; CHECK-SDAG-NEXT: v_readlane_b32 s2, v0, 32
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s2
+; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s0
+; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s2
; CHECK-SDAG-NEXT: flat_store_dword v[0:1], v2
; CHECK-SDAG-NEXT: s_endpgm
;
@@ -468,10 +513,13 @@ define amdgpu_kernel void @test_readlane_vgpr_imm_i32(ptr addrspace(1) %out) #1
; CHECK-GISEL-NEXT: ;;#ASMSTART
; CHECK-GISEL-NEXT: ; def v0
; CHECK-GISEL-NEXT: ;;#ASMEND
+; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17
; CHECK-GISEL-NEXT: v_readlane_b32 s2, v0, 32
-; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s2
; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s1
; CHECK-GISEL-NEXT: flat_store_dword v[0:1], v2
; CHECK-GISEL-NEXT: s_endpgm
@@ -485,14 +533,17 @@ define amdgpu_kernel void @test_readlane_vgpr_imm_i64(ptr addrspace(1) %out) #1
; CHECK-SDAG-LABEL: test_readlane_vgpr_imm_i64:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17
; CHECK-SDAG-NEXT: ;;#ASMSTART
; CHECK-SDAG-NEXT: ; def v[0:1]
; CHECK-SDAG-NEXT: ;;#ASMEND
; CHECK-SDAG-NEXT: v_readlane_b32 s2, v1, 32
; CHECK-SDAG-NEXT: v_readlane_b32 s3, v0, 32
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s3
; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s3
; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s2
; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0
; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
@@ -505,10 +556,13 @@ define amdgpu_kernel void @test_readlane_vgpr_imm_i64(ptr addrspace(1) %out) #1
; CHECK-GISEL-NEXT: ; def v[0:1]
; CHECK-GISEL-NEXT: ;;#ASMEND
; CHECK-GISEL-NEXT: v_readlane_b32 s2, v0, 32
+; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17
; CHECK-GISEL-NEXT: v_readlane_b32 s3, v1, 32
; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2
; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1
+; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3
; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0
; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
@@ -523,14 +577,17 @@ define amdgpu_kernel void @test_readlane_vgpr_imm_f64(ptr addrspace(1) %out) #1
; CHECK-SDAG-LABEL: test_readlane_vgpr_imm_f64:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17
; CHECK-SDAG-NEXT: ;;#ASMSTART
; CHECK-SDAG-NEXT: ; def v[0:1]
; CHECK-SDAG-NEXT: ;;#ASMEND
; CHECK-SDAG-NEXT: v_readlane_b32 s2, v1, 32
; CHECK-SDAG-NEXT: v_readlane_b32 s3, v0, 32
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s3
; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s3
; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s2
; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0
; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
@@ -543,10 +600,13 @@ define amdgpu_kernel void @test_readlane_vgpr_imm_f64(ptr addrspace(1) %out) #1
; CHECK-GISEL-NEXT: ; def v[0:1]
; CHECK-GISEL-NEXT: ;;#ASMEND
; CHECK-GISEL-NEXT: v_readlane_b32 s2, v0, 32
+; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17
; CHECK-GISEL-NEXT: v_readlane_b32 s3, v1, 32
; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2
; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1
+; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3
; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0
; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
@@ -561,25 +621,31 @@ define amdgpu_kernel void @test_readlane_copy_from_sgpr_i32(ptr addrspace(1) %ou
; CHECK-SDAG-LABEL: test_readlane_copy_from_sgpr_i32:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17
+; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CHECK-SDAG-NEXT: ;;#ASMSTART
; CHECK-SDAG-NEXT: s_mov_b32 s2, 0
; CHECK-SDAG-NEXT: ;;#ASMEND
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s2
; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s0
; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s2
; CHECK-SDAG-NEXT: flat_store_dword v[0:1], v2
; CHECK-SDAG-NEXT: s_endpgm
;
; CHECK-GISEL-LABEL: test_readlane_copy_from_sgpr_i32:
; CHECK-GISEL: ; %bb.0:
; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17
+; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CHECK-GISEL-NEXT: ;;#ASMSTART
; CHECK-GISEL-NEXT: s_mov_b32 s2, 0
; CHECK-GISEL-NEXT: ;;#ASMEND
-; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s2
; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s2
; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s1
; CHECK-GISEL-NEXT: flat_store_dword v[0:1], v2
; CHECK-GISEL-NEXT: s_endpgm
@@ -593,13 +659,16 @@ define amdgpu_kernel void @test_readlane_copy_from_sgpr_i64(ptr addrspace(1) %ou
; CHECK-SDAG-LABEL: test_readlane_copy_from_sgpr_i64:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17
+; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CHECK-SDAG-NEXT: ;;#ASMSTART
; CHECK-SDAG-NEXT: s_mov_b64 s[2:3], 0
; CHECK-SDAG-NEXT: ;;#ASMEND
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s2
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s3
; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s3
; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0
; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CHECK-SDAG-NEXT: s_endpgm
@@ -607,13 +676,16 @@ define amdgpu_kernel void @test_readlane_copy_from_sgpr_i64(ptr addrspace(1) %ou
; CHECK-GISEL-LABEL: test_readlane_copy_from_sgpr_i64:
; CHECK-GISEL: ; %bb.0:
; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17
; CHECK-GISEL-NEXT: ;;#ASMSTART
; CHECK-GISEL-NEXT: s_mov_b64 s[2:3], 0
; CHECK-GISEL-NEXT: ;;#ASMEND
; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1
+; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3
; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0
; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CHECK-GISEL-NEXT: s_endpgm
@@ -627,13 +699,16 @@ define amdgpu_kernel void @test_readlane_copy_from_sgpr_f64(ptr addrspace(1) %ou
; CHECK-SDAG-LABEL: test_readlane_copy_from_sgpr_f64:
; CHECK-SDAG: ; %bb.0:
; CHECK-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CHECK-SDAG-NEXT: s_add_i32 s12, s12, s17
+; CHECK-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CHECK-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CHECK-SDAG-NEXT: ;;#ASMSTART
; CHECK-SDAG-NEXT: s_mov_b64 s[2:3], 0
; CHECK-SDAG-NEXT: ;;#ASMEND
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s2
-; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s3
; CHECK-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v0, s2
+; CHECK-SDAG-NEXT: v_mov_b32_e32 v1, s3
; CHECK-SDAG-NEXT: v_mov_b32_e32 v2, s0
; CHECK-SDAG-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CHECK-SDAG-NEXT: s_endpgm
@@ -641,13 +716,16 @@ define amdgpu_kernel void @test_readlane_copy_from_sgpr_f64(ptr addrspace(1) %ou
; CHECK-GISEL-LABEL: test_readlane_copy_from_sgpr_f64:
; CHECK-GISEL: ; %bb.0:
; CHECK-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CHECK-GISEL-NEXT: s_add_i32 s12, s12, s17
; CHECK-GISEL-NEXT: ;;#ASMSTART
; CHECK-GISEL-NEXT: s_mov_b64 s[2:3], 0
; CHECK-GISEL-NEXT: ;;#ASMEND
; CHECK-GISEL-NEXT: v_mov_b32_e32 v0, s2
-; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3
+; CHECK-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
; CHECK-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-GISEL-NEXT: v_mov_b32_e32 v3, s1
+; CHECK-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CHECK-GISEL-NEXT: v_mov_b32_e32 v1, s3
; CHECK-GISEL-NEXT: v_mov_b32_e32 v2, s0
; CHECK-GISEL-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
; CHECK-GISEL-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll
index 6646818b7b36f..87a46c4366ea7 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll
@@ -15,6 +15,9 @@ define amdgpu_kernel void @test_writelane_sreg_i32(ptr addrspace(1) %out, i32 %s
; GFX802-SDAG-LABEL: test_writelane_sreg_i32:
; GFX802-SDAG: ; %bb.0:
; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX802-SDAG-NEXT: s_add_i32 s12, s12, s17
+; GFX802-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-SDAG-NEXT: s_mov_b32 m0, s3
; GFX802-SDAG-NEXT: s_load_dword s3, s[0:1], 0x0
@@ -53,6 +56,9 @@ define amdgpu_kernel void @test_writelane_sreg_i32(ptr addrspace(1) %out, i32 %s
; GFX802-GISEL-LABEL: test_writelane_sreg_i32:
; GFX802-GISEL: ; %bb.0:
; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX802-GISEL-NEXT: s_add_i32 s12, s12, s17
+; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-GISEL-NEXT: s_mov_b32 m0, s3
; GFX802-GISEL-NEXT: s_load_dword s3, s[0:1], 0x0
@@ -98,6 +104,9 @@ define amdgpu_kernel void @test_writelane_sreg_i64(ptr addrspace(1) %out, i64 %s
; GFX802-SDAG: ; %bb.0:
; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX802-SDAG-NEXT: s_load_dword s6, s[8:9], 0x10
+; GFX802-SDAG-NEXT: s_add_i32 s12, s12, s17
+; GFX802-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX802-SDAG-NEXT: s_mov_b32 m0, s6
@@ -147,6 +156,9 @@ define amdgpu_kernel void @test_writelane_sreg_i64(ptr addrspace(1) %out, i64 %s
; GFX802-GISEL: ; %bb.0:
; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX802-GISEL-NEXT: s_load_dword s6, s[8:9], 0x10
+; GFX802-GISEL-NEXT: s_add_i32 s12, s12, s17
+; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX802-GISEL-NEXT: s_mov_b32 m0, s6
@@ -202,6 +214,9 @@ define amdgpu_kernel void @test_writelane_sreg_f64(ptr addrspace(1) %out, double
; GFX802-SDAG: ; %bb.0:
; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX802-SDAG-NEXT: s_load_dword s6, s[8:9], 0x10
+; GFX802-SDAG-NEXT: s_add_i32 s12, s12, s17
+; GFX802-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX802-SDAG-NEXT: s_mov_b32 m0, s6
@@ -251,6 +266,9 @@ define amdgpu_kernel void @test_writelane_sreg_f64(ptr addrspace(1) %out, double
; GFX802-GISEL: ; %bb.0:
; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX802-GISEL-NEXT: s_load_dword s6, s[8:9], 0x10
+; GFX802-GISEL-NEXT: s_add_i32 s12, s12, s17
+; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX802-GISEL-NEXT: s_mov_b32 m0, s6
@@ -306,6 +324,9 @@ define amdgpu_kernel void @test_writelane_imm_sreg_i32(ptr addrspace(1) %out, i3
; GFX802-SDAG: ; %bb.0:
; GFX802-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX802-SDAG-NEXT: s_load_dword s2, s[8:9], 0x8
+; GFX802-SDAG-NEXT: s_add_i32 s12, s12, s17
+; GFX802-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-SDAG-NEXT: s_load_dword s3, s[0:1], 0x0
; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s0
@@ -348,6 +369,9 @@ define amdgpu_kernel void @test_writelane_imm_sreg_i32(ptr addrspace(1) %out, i3
; GFX802-GISEL: ; %bb.0:
; GFX802-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX802-GISEL-NEXT: s_load_dword s2, s[8:9], 0x8
+; GFX802-GISEL-NEXT: s_add_i32 s12, s12, s17
+; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-GISEL-NEXT: s_load_dword s3, s[0:1], 0x0
; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s0
@@ -396,6 +420,9 @@ define amdgpu_kernel void @test_writelane_imm_sreg_i64(ptr addrspace(1) %out, i3
; GFX802-SDAG: ; %bb.0:
; GFX802-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX802-SDAG-NEXT: s_load_dword s4, s[8:9], 0x8
+; GFX802-SDAG-NEXT: s_add_i32 s12, s12, s17
+; GFX802-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX802-SDAG-NEXT: v_mov_b32_e32 v3, s1
@@ -444,6 +471,9 @@ define amdgpu_kernel void @test_writelane_imm_sreg_i64(ptr addrspace(1) %out, i3
; GFX802-GISEL: ; %bb.0:
; GFX802-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX802-GISEL-NEXT: s_load_dword s4, s[8:9], 0x8
+; GFX802-GISEL-NEXT: s_add_i32 s12, s12, s17
+; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX802-GISEL-NEXT: v_mov_b32_e32 v3, s1
@@ -498,11 +528,14 @@ define amdgpu_kernel void @test_writelane_imm_sreg_f64(ptr addrspace(1) %out, i3
; GFX802-SDAG: ; %bb.0:
; GFX802-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX802-SDAG-NEXT: s_load_dword s4, s[8:9], 0x8
+; GFX802-SDAG-NEXT: s_add_i32 s12, s12, s17
; GFX802-SDAG-NEXT: s_mov_b32 s5, 0x40400000
+; GFX802-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX802-SDAG-NEXT: s_mov_b32 m0, s4
; GFX802-SDAG-NEXT: v_mov_b32_e32 v3, s1
+; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, s0
; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s3
@@ -551,11 +584,14 @@ define amdgpu_kernel void @test_writelane_imm_sreg_f64(ptr addrspace(1) %out, i3
; GFX802-GISEL: ; %bb.0:
; GFX802-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX802-GISEL-NEXT: s_load_dword s4, s[8:9], 0x8
+; GFX802-GISEL-NEXT: s_add_i32 s12, s12, s17
; GFX802-GISEL-NEXT: s_mov_b32 s5, 0x40400000
+; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
; GFX802-GISEL-NEXT: s_mov_b32 m0, s4
; GFX802-GISEL-NEXT: v_mov_b32_e32 v3, s1
+; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, s0
; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s2
@@ -609,6 +645,9 @@ define amdgpu_kernel void @test_writelane_vreg_lane_i32(ptr addrspace(1) %out, p
; GFX802-SDAG: ; %bb.0:
; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX802-SDAG-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; GFX802-SDAG-NEXT: s_add_i32 s12, s12, s17
+; GFX802-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s3
; GFX802-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v0
@@ -668,6 +707,9 @@ define amdgpu_kernel void @test_writelane_vreg_lane_i32(ptr addrspace(1) %out, p
; GFX802-GISEL: ; %bb.0:
; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX802-GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0
+; GFX802-GISEL-NEXT: s_add_i32 s12, s12, s17
+; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s2
; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, s3
@@ -738,6 +780,9 @@ define amdgpu_kernel void @test_writelane_vreg_lane_i64(ptr addrspace(1) %out, p
; GFX802-SDAG: ; %bb.0:
; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX802-SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0
+; GFX802-SDAG-NEXT: s_add_i32 s12, s12, s17
+; GFX802-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s3
; GFX802-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v0
@@ -803,6 +848,9 @@ define amdgpu_kernel void @test_writelane_vreg_lane_i64(ptr addrspace(1) %out, p
; GFX802-GISEL: ; %bb.0:
; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX802-GISEL-NEXT: v_lshlrev_b32_e32 v2, 4, v0
+; GFX802-GISEL-NEXT: s_add_i32 s12, s12, s17
+; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s2
; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, s3
@@ -877,7 +925,9 @@ define amdgpu_kernel void @test_writelane_vreg_lane_f64(ptr addrspace(1) %out, p
; GFX802-SDAG: ; %bb.0:
; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX802-SDAG-NEXT: v_lshlrev_b32_e32 v0, 4, v0
-; GFX802-SDAG-NEXT: s_mov_b32 s4, 0x40280000
+; GFX802-SDAG-NEXT: s_add_i32 s12, s12, s17
+; GFX802-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s3
; GFX802-SDAG-NEXT: v_add_u32_e32 v0, vcc, s2, v0
@@ -886,6 +936,7 @@ define amdgpu_kernel void @test_writelane_vreg_lane_f64(ptr addrspace(1) %out, p
; GFX802-SDAG-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX802-SDAG-NEXT: flat_load_dword v2, v[0:1]
; GFX802-SDAG-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX802-SDAG-NEXT: s_mov_b32 s4, 0x40280000
; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s2
; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s3
@@ -946,7 +997,9 @@ define amdgpu_kernel void @test_writelane_vreg_lane_f64(ptr addrspace(1) %out, p
; GFX802-GISEL: ; %bb.0:
; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX802-GISEL-NEXT: v_lshlrev_b32_e32 v2, 4, v0
-; GFX802-GISEL-NEXT: s_mov_b32 s4, 0x40280000
+; GFX802-GISEL-NEXT: s_add_i32 s12, s12, s17
+; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s2
; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, s3
@@ -956,6 +1009,7 @@ define amdgpu_kernel void @test_writelane_vreg_lane_f64(ptr addrspace(1) %out, p
; GFX802-GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; GFX802-GISEL-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; GFX802-GISEL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX802-GISEL-NEXT: s_mov_b32 s4, 0x40280000
; GFX802-GISEL-NEXT: v_mov_b32_e32 v4, s1
; GFX802-GISEL-NEXT: v_mov_b32_e32 v3, s0
; GFX802-GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
@@ -1028,15 +1082,18 @@ define amdgpu_kernel void @test_writelane_m0_sreg_i32(ptr addrspace(1) %out, i32
; GFX802-SDAG-NEXT: ;;#ASMSTART
; GFX802-SDAG-NEXT: s_mov_b32 m0, -1
; GFX802-SDAG-NEXT: ;;#ASMEND
+; GFX802-SDAG-NEXT: s_add_i32 s12, s12, s17
; GFX802-SDAG-NEXT: s_mov_b32 s4, m0
; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-SDAG-NEXT: s_load_dword s3, s[0:1], 0x0
; GFX802-SDAG-NEXT: s_mov_b32 m0, s2
; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s0
-; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s1
+; GFX802-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, s3
; GFX802-SDAG-NEXT: v_writelane_b32 v2, s4, m0
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s1
; GFX802-SDAG-NEXT: flat_store_dword v[0:1], v2
; GFX802-SDAG-NEXT: s_endpgm
;
@@ -1081,15 +1138,18 @@ define amdgpu_kernel void @test_writelane_m0_sreg_i32(ptr addrspace(1) %out, i32
; GFX802-GISEL-NEXT: ;;#ASMSTART
; GFX802-GISEL-NEXT: s_mov_b32 m0, -1
; GFX802-GISEL-NEXT: ;;#ASMEND
+; GFX802-GISEL-NEXT: s_add_i32 s12, s12, s17
; GFX802-GISEL-NEXT: s_mov_b32 s4, m0
; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-GISEL-NEXT: s_load_dword s3, s[0:1], 0x0
; GFX802-GISEL-NEXT: s_mov_b32 m0, s2
; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s0
-; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, s3
; GFX802-GISEL-NEXT: v_writelane_b32 v2, s4, m0
+; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX802-GISEL-NEXT: flat_store_dword v[0:1], v2
; GFX802-GISEL-NEXT: s_endpgm
;
@@ -1138,6 +1198,9 @@ define amdgpu_kernel void @test_writelane_imm_i32(ptr addrspace(1) %out, i32 %sr
; GFX802-SDAG: ; %bb.0:
; GFX802-SDAG-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX802-SDAG-NEXT: s_load_dword s2, s[8:9], 0x8
+; GFX802-SDAG-NEXT: s_add_i32 s12, s12, s17
+; GFX802-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-SDAG-NEXT: s_load_dword s3, s[0:1], 0x0
; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s0
@@ -1180,6 +1243,9 @@ define amdgpu_kernel void @test_writelane_imm_i32(ptr addrspace(1) %out, i32 %sr
; GFX802-GISEL: ; %bb.0:
; GFX802-GISEL-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX802-GISEL-NEXT: s_load_dword s2, s[8:9], 0x8
+; GFX802-GISEL-NEXT: s_add_i32 s12, s12, s17
+; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-GISEL-NEXT: s_load_dword s3, s[0:1], 0x0
; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s0
@@ -1227,6 +1293,9 @@ define amdgpu_kernel void @test_writelane_imm_i64(ptr addrspace(1) %out, i64 %sr
; GFX802-SDAG-LABEL: test_writelane_imm_i64:
; GFX802-SDAG: ; %bb.0:
; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX802-SDAG-NEXT: s_add_i32 s12, s12, s17
+; GFX802-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX802-SDAG-NEXT: v_mov_b32_e32 v3, s1
@@ -1270,6 +1339,9 @@ define amdgpu_kernel void @test_writelane_imm_i64(ptr addrspace(1) %out, i64 %sr
; GFX802-GISEL-LABEL: test_writelane_imm_i64:
; GFX802-GISEL: ; %bb.0:
; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX802-GISEL-NEXT: s_add_i32 s12, s12, s17
+; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX802-GISEL-NEXT: v_mov_b32_e32 v3, s1
@@ -1319,6 +1391,9 @@ define amdgpu_kernel void @test_writelane_imm_f64(ptr addrspace(1) %out, double
; GFX802-SDAG-LABEL: test_writelane_imm_f64:
; GFX802-SDAG: ; %bb.0:
; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX802-SDAG-NEXT: s_add_i32 s12, s12, s17
+; GFX802-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX802-SDAG-NEXT: v_mov_b32_e32 v3, s1
@@ -1362,6 +1437,9 @@ define amdgpu_kernel void @test_writelane_imm_f64(ptr addrspace(1) %out, double
; GFX802-GISEL-LABEL: test_writelane_imm_f64:
; GFX802-GISEL: ; %bb.0:
; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX802-GISEL-NEXT: s_add_i32 s12, s12, s17
+; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
; GFX802-GISEL-NEXT: v_mov_b32_e32 v3, s1
@@ -1412,6 +1490,9 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_i32(i32 inreg %oldval, ptr
; GFX802-SDAG: ; %bb.0:
; GFX802-SDAG-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x8
+; GFX802-SDAG-NEXT: s_add_i32 s12, s12, s17
+; GFX802-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, s4
; GFX802-SDAG-NEXT: s_mov_b32 m0, s3
@@ -1449,6 +1530,9 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_i32(i32 inreg %oldval, ptr
; GFX802-GISEL: ; %bb.0:
; GFX802-GISEL-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x8
+; GFX802-GISEL-NEXT: s_add_i32 s12, s12, s17
+; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, s4
; GFX802-GISEL-NEXT: s_mov_b32 m0, s3
@@ -1492,10 +1576,13 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_i64(i64 inreg %oldval, ptr
; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX802-SDAG-NEXT: s_load_dword s6, s[8:9], 0x18
; GFX802-SDAG-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
+; GFX802-SDAG-NEXT: s_add_i32 s12, s12, s17
+; GFX802-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-SDAG-NEXT: v_mov_b32_e32 v3, s1
; GFX802-SDAG-NEXT: s_mov_b32 m0, s6
; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, s0
+; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s2
; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s3
; GFX802-SDAG-NEXT: v_writelane_b32 v3, s5, m0
@@ -1538,11 +1625,14 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_i64(i64 inreg %oldval, ptr
; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX802-GISEL-NEXT: s_load_dword s6, s[8:9], 0x18
; GFX802-GISEL-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
+; GFX802-GISEL-NEXT: s_add_i32 s12, s12, s17
+; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX802-GISEL-NEXT: s_mov_b32 m0, s6
; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX802-GISEL-NEXT: v_writelane_b32 v0, s4, m0
; GFX802-GISEL-NEXT: v_writelane_b32 v1, s5, m0
; GFX802-GISEL-NEXT: v_mov_b32_e32 v3, s3
@@ -1589,10 +1679,13 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_f64(double inreg %oldval,
; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX802-SDAG-NEXT: s_load_dword s6, s[8:9], 0x18
; GFX802-SDAG-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
+; GFX802-SDAG-NEXT: s_add_i32 s12, s12, s17
+; GFX802-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-SDAG-NEXT: v_mov_b32_e32 v3, s1
; GFX802-SDAG-NEXT: s_mov_b32 m0, s6
; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, s0
+; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s2
; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, s3
; GFX802-SDAG-NEXT: v_writelane_b32 v3, s5, m0
@@ -1635,11 +1728,14 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_f64(double inreg %oldval,
; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX802-GISEL-NEXT: s_load_dword s6, s[8:9], 0x18
; GFX802-GISEL-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
+; GFX802-GISEL-NEXT: s_add_i32 s12, s12, s17
+; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s0
; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, s1
; GFX802-GISEL-NEXT: s_mov_b32 m0, s6
; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, s2
+; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX802-GISEL-NEXT: v_writelane_b32 v0, s4, m0
; GFX802-GISEL-NEXT: v_writelane_b32 v1, s5, m0
; GFX802-GISEL-NEXT: v_mov_b32_e32 v3, s3
@@ -1684,7 +1780,10 @@ define amdgpu_kernel void @test_writelane_imm_oldval_i32(ptr addrspace(1) %out,
; GFX802-SDAG-LABEL: test_writelane_imm_oldval_i32:
; GFX802-SDAG: ; %bb.0:
; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX802-SDAG-NEXT: s_add_i32 s12, s12, s17
; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, 42
+; GFX802-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-SDAG-NEXT: s_mov_b32 m0, s3
; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, s0
@@ -1716,7 +1815,10 @@ define amdgpu_kernel void @test_writelane_imm_oldval_i32(ptr addrspace(1) %out,
; GFX802-GISEL-LABEL: test_writelane_imm_oldval_i32:
; GFX802-GISEL: ; %bb.0:
; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX802-GISEL-NEXT: s_add_i32 s12, s12, s17
; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, 42
+; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-GISEL-NEXT: s_mov_b32 m0, s3
; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, s0
@@ -1754,11 +1856,14 @@ define amdgpu_kernel void @test_writelane_imm_oldval_i64(ptr addrspace(1) %out,
; GFX802-SDAG: ; %bb.0:
; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX802-SDAG-NEXT: s_load_dword s4, s[8:9], 0x10
+; GFX802-SDAG-NEXT: s_add_i32 s12, s12, s17
; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, 0
; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, 42
+; GFX802-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, s0
; GFX802-SDAG-NEXT: s_mov_b32 m0, s4
+; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, s0
; GFX802-SDAG-NEXT: v_mov_b32_e32 v3, s1
; GFX802-SDAG-NEXT: v_writelane_b32 v1, s3, m0
; GFX802-SDAG-NEXT: v_writelane_b32 v0, s2, m0
@@ -1797,11 +1902,14 @@ define amdgpu_kernel void @test_writelane_imm_oldval_i64(ptr addrspace(1) %out,
; GFX802-GISEL: ; %bb.0:
; GFX802-GISEL-NEXT: s_load_dword s4, s[8:9], 0x10
; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX802-GISEL-NEXT: s_add_i32 s12, s12, s17
; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, 42
; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, 0
; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-GISEL-NEXT: s_mov_b32 m0, s4
; GFX802-GISEL-NEXT: v_mov_b32_e32 v3, s1
+; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX802-GISEL-NEXT: v_writelane_b32 v0, s2, m0
; GFX802-GISEL-NEXT: v_writelane_b32 v1, s3, m0
; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, s0
@@ -1845,11 +1953,14 @@ define amdgpu_kernel void @test_writelane_imm_oldval_f64(ptr addrspace(1) %out,
; GFX802-SDAG: ; %bb.0:
; GFX802-SDAG-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX802-SDAG-NEXT: s_load_dword s4, s[8:9], 0x10
+; GFX802-SDAG-NEXT: s_add_i32 s12, s12, s17
; GFX802-SDAG-NEXT: v_mov_b32_e32 v1, 0x40450000
; GFX802-SDAG-NEXT: v_mov_b32_e32 v0, 0
+; GFX802-SDAG-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX802-SDAG-NEXT: s_waitcnt lgkmcnt(0)
-; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, s0
; GFX802-SDAG-NEXT: s_mov_b32 m0, s4
+; GFX802-SDAG-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX802-SDAG-NEXT: v_mov_b32_e32 v2, s0
; GFX802-SDAG-NEXT: v_mov_b32_e32 v3, s1
; GFX802-SDAG-NEXT: v_writelane_b32 v1, s3, m0
; GFX802-SDAG-NEXT: v_writelane_b32 v0, s2, m0
@@ -1888,11 +1999,14 @@ define amdgpu_kernel void @test_writelane_imm_oldval_f64(ptr addrspace(1) %out,
; GFX802-GISEL: ; %bb.0:
; GFX802-GISEL-NEXT: s_load_dword s4, s[8:9], 0x10
; GFX802-GISEL-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX802-GISEL-NEXT: s_add_i32 s12, s12, s17
; GFX802-GISEL-NEXT: v_mov_b32_e32 v0, 0
; GFX802-GISEL-NEXT: v_mov_b32_e32 v1, 0x40450000
; GFX802-GISEL-NEXT: s_waitcnt lgkmcnt(0)
; GFX802-GISEL-NEXT: s_mov_b32 m0, s4
; GFX802-GISEL-NEXT: v_mov_b32_e32 v3, s1
+; GFX802-GISEL-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX802-GISEL-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX802-GISEL-NEXT: v_writelane_b32 v0, s2, m0
; GFX802-GISEL-NEXT: v_writelane_b32 v1, s3, m0
; GFX802-GISEL-NEXT: v_mov_b32_e32 v2, s0
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-f64.ll b/llvm/test/CodeGen/AMDGPU/load-constant-f64.ll
index 6f95364ac3644..919c1dfd4694e 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-f64.ll
@@ -22,6 +22,9 @@ define amdgpu_kernel void @constant_load_f64(ptr addrspace(1) %out, ptr addrspac
; GFX7-HSA-LABEL: constant_load_f64:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0
@@ -90,7 +93,10 @@ define amdgpu_kernel void @constant_load_2v4f64(ptr addrspace(4) noalias nocaptu
;
; GFX7-HSA-LABEL: constant_load_2v4f64:
; GFX7-HSA: ; %bb.0: ; %entry
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
; GFX7-HSA-NEXT: s_load_dwordx4 s[16:19], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx2 s[20:21], s[18:19], 0x0
; GFX7-HSA-NEXT: s_load_dwordx16 s[0:15], s[16:17], 0x0
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
index 1af026a48b906..bd76a19a9861b 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll
@@ -26,6 +26,9 @@ define amdgpu_kernel void @constant_load_i16(ptr addrspace(1) %out, ptr addrspac
; GCN-HSA-LABEL: constant_load_i16:
; GCN-HSA: ; %bb.0: ; %entry
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -106,6 +109,9 @@ define amdgpu_kernel void @constant_load_v2i16(ptr addrspace(1) %out, ptr addrsp
; GCN-HSA-LABEL: constant_load_v2i16:
; GCN-HSA: ; %bb.0: ; %entry
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_load_dword s2, s[2:3], 0x0
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0
@@ -177,6 +183,9 @@ define amdgpu_kernel void @constant_load_v3i16(ptr addrspace(1) %out, ptr addrsp
; GCN-HSA-LABEL: constant_load_v3i16:
; GCN-HSA: ; %bb.0: ; %entry
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; GCN-HSA-NEXT: s_add_u32 s4, s0, 4
@@ -275,6 +284,9 @@ define amdgpu_kernel void @constant_load_v4i16(ptr addrspace(1) %out, ptr addrsp
; GCN-HSA-LABEL: constant_load_v4i16:
; GCN-HSA: ; %bb.0: ; %entry
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0
@@ -349,6 +361,9 @@ define amdgpu_kernel void @constant_load_v8i16(ptr addrspace(1) %out, ptr addrsp
; GCN-HSA-LABEL: constant_load_v8i16:
; GCN-HSA: ; %bb.0: ; %entry
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0
@@ -434,6 +449,9 @@ define amdgpu_kernel void @constant_load_v16i16(ptr addrspace(1) %out, ptr addrs
; GCN-HSA-LABEL: constant_load_v16i16:
; GCN-HSA: ; %bb.0: ; %entry
; GCN-HSA-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0
; GCN-HSA-NEXT: s_add_u32 s10, s8, 16
@@ -573,6 +591,9 @@ define amdgpu_kernel void @constant_load_v16i16_align2(ptr addrspace(4) %ptr0) #
; GCN-HSA-LABEL: constant_load_v16i16_align2:
; GCN-HSA: ; %bb.0: ; %entry
; GCN-HSA-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1
@@ -784,6 +805,9 @@ define amdgpu_kernel void @constant_zextload_i16_to_i32(ptr addrspace(1) %out, p
; GCN-HSA-LABEL: constant_zextload_i16_to_i32:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -859,6 +883,9 @@ define amdgpu_kernel void @constant_sextload_i16_to_i32(ptr addrspace(1) %out, p
; GCN-HSA-LABEL: constant_sextload_i16_to_i32:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -935,6 +962,9 @@ define amdgpu_kernel void @constant_zextload_v1i16_to_v1i32(ptr addrspace(1) %ou
; GCN-HSA-LABEL: constant_zextload_v1i16_to_v1i32:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -1010,6 +1040,9 @@ define amdgpu_kernel void @constant_sextload_v1i16_to_v1i32(ptr addrspace(1) %ou
; GCN-HSA-LABEL: constant_sextload_v1i16_to_v1i32:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -1084,6 +1117,9 @@ define amdgpu_kernel void @constant_zextload_v2i16_to_v2i32(ptr addrspace(1) %ou
; GCN-HSA-LABEL: constant_zextload_v2i16_to_v2i32:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_load_dword s2, s[2:3], 0x0
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0
@@ -1169,6 +1205,9 @@ define amdgpu_kernel void @constant_sextload_v2i16_to_v2i32(ptr addrspace(1) %ou
; GCN-HSA-LABEL: constant_sextload_v2i16_to_v2i32:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_load_dword s2, s[2:3], 0x0
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0
@@ -1258,6 +1297,9 @@ define amdgpu_kernel void @constant_zextload_v3i16_to_v3i32(ptr addrspace(1) %ou
; GCN-HSA-LABEL: constant_zextload_v3i16_to_v3i32:
; GCN-HSA: ; %bb.0: ; %entry
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; GCN-HSA-NEXT: v_mov_b32_e32 v3, s0
@@ -1355,6 +1397,9 @@ define amdgpu_kernel void @constant_sextload_v3i16_to_v3i32(ptr addrspace(1) %ou
; GCN-HSA-LABEL: constant_sextload_v3i16_to_v3i32:
; GCN-HSA: ; %bb.0: ; %entry
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; GCN-HSA-NEXT: v_mov_b32_e32 v3, s0
@@ -1457,6 +1502,9 @@ define amdgpu_kernel void @constant_zextload_v4i16_to_v4i32(ptr addrspace(1) %ou
; GCN-HSA-LABEL: constant_zextload_v4i16_to_v4i32:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0
@@ -1563,6 +1611,9 @@ define amdgpu_kernel void @constant_sextload_v4i16_to_v4i32(ptr addrspace(1) %ou
; GCN-HSA-LABEL: constant_sextload_v4i16_to_v4i32:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0
@@ -1680,6 +1731,9 @@ define amdgpu_kernel void @constant_zextload_v8i16_to_v8i32(ptr addrspace(1) %ou
; GCN-HSA-LABEL: constant_zextload_v8i16_to_v8i32:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
@@ -1838,6 +1892,9 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i32(ptr addrspace(1) %ou
; GCN-HSA-LABEL: constant_sextload_v8i16_to_v8i32:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
@@ -2015,6 +2072,9 @@ define amdgpu_kernel void @constant_zextload_v16i16_to_v16i32(ptr addrspace(1) %
; GCN-HSA-LABEL: constant_zextload_v16i16_to_v16i32:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
@@ -2277,6 +2337,9 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i32(ptr addrspace(1) %
; GCN-HSA-LABEL: constant_sextload_v16i16_to_v16i32:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
@@ -2584,7 +2647,10 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i32(ptr addrspace(1) %
;
; GCN-HSA-LABEL: constant_zextload_v32i16_to_v32i32:
; GCN-HSA: ; %bb.0:
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
; GCN-HSA-NEXT: s_load_dwordx4 s[16:19], s[8:9], 0x0
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
@@ -3065,7 +3131,10 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i32(ptr addrspace(1) %
;
; GCN-HSA-LABEL: constant_sextload_v32i16_to_v32i32:
; GCN-HSA: ; %bb.0:
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
; GCN-HSA-NEXT: s_load_dwordx4 s[16:19], s[8:9], 0x0
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
@@ -3633,7 +3702,10 @@ define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(ptr addrspace(1) %
;
; GCN-HSA-LABEL: constant_zextload_v64i16_to_v64i32:
; GCN-HSA: ; %bb.0:
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
; GCN-HSA-NEXT: s_load_dwordx4 s[16:19], s[8:9], 0x0
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
@@ -4549,7 +4621,10 @@ define amdgpu_kernel void @constant_sextload_v64i16_to_v64i32(ptr addrspace(1) %
;
; GCN-HSA-LABEL: constant_sextload_v64i16_to_v64i32:
; GCN-HSA: ; %bb.0:
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
; GCN-HSA-NEXT: s_load_dwordx4 s[16:19], s[8:9], 0x0
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
@@ -5336,6 +5411,9 @@ define amdgpu_kernel void @constant_zextload_i16_to_i64(ptr addrspace(1) %out, p
; GCN-HSA-LABEL: constant_zextload_i16_to_i64:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -5422,6 +5500,9 @@ define amdgpu_kernel void @constant_sextload_i16_to_i64(ptr addrspace(1) %out, p
; GCN-HSA-LABEL: constant_sextload_i16_to_i64:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -5507,6 +5588,9 @@ define amdgpu_kernel void @constant_zextload_v1i16_to_v1i64(ptr addrspace(1) %ou
; GCN-HSA-LABEL: constant_zextload_v1i16_to_v1i64:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -5588,6 +5672,9 @@ define amdgpu_kernel void @constant_sextload_v1i16_to_v1i64(ptr addrspace(1) %ou
; GCN-HSA-LABEL: constant_sextload_v1i16_to_v1i64:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -5672,12 +5759,15 @@ define amdgpu_kernel void @constant_zextload_v2i16_to_v2i64(ptr addrspace(1) %ou
; GCN-HSA-LABEL: constant_zextload_v2i16_to_v2i64:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_load_dword s2, s[2:3], 0x0
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1
+; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_lshr_b32 s0, s2, 16
; GCN-HSA-NEXT: s_and_b32 s1, s2, 0xffff
@@ -5766,6 +5856,9 @@ define amdgpu_kernel void @constant_sextload_v2i16_to_v2i64(ptr addrspace(1) %ou
; GCN-HSA-LABEL: constant_sextload_v2i16_to_v2i64:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_load_dword s2, s[2:3], 0x0
; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0
@@ -5869,10 +5962,13 @@ define amdgpu_kernel void @constant_zextload_v4i16_to_v4i64(ptr addrspace(1) %ou
; GCN-HSA-LABEL: constant_zextload_v4i16_to_v4i64:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_lshr_b32 s4, s3, 16
; GCN-HSA-NEXT: s_lshr_b32 s5, s2, 16
@@ -6004,6 +6100,9 @@ define amdgpu_kernel void @constant_sextload_v4i16_to_v4i64(ptr addrspace(1) %ou
; GCN-HSA-LABEL: constant_sextload_v4i16_to_v4i64:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
@@ -6160,10 +6259,13 @@ define amdgpu_kernel void @constant_zextload_v8i16_to_v8i64(ptr addrspace(1) %ou
; GCN-HSA-LABEL: constant_zextload_v8i16_to_v8i64:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
+; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_lshr_b32 s8, s5, 16
; GCN-HSA-NEXT: s_lshr_b32 s2, s7, 16
@@ -6378,6 +6480,9 @@ define amdgpu_kernel void @constant_sextload_v8i16_to_v8i64(ptr addrspace(1) %ou
; GCN-HSA-LABEL: constant_sextload_v8i16_to_v8i64:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
@@ -6639,10 +6744,13 @@ define amdgpu_kernel void @constant_zextload_v16i16_to_v16i64(ptr addrspace(1) %
; GCN-HSA-LABEL: constant_zextload_v16i16_to_v16i64:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0
+; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_lshr_b32 s12, s5, 16
; GCN-HSA-NEXT: s_lshr_b32 s13, s7, 16
@@ -7024,6 +7132,9 @@ define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(ptr addrspace(1) %
; GCN-HSA-LABEL: constant_sextload_v16i16_to_v16i64:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_load_dwordx8 s[12:19], s[2:3], 0x0
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
@@ -7499,7 +7610,10 @@ define amdgpu_kernel void @constant_zextload_v32i16_to_v32i64(ptr addrspace(1) %
;
; GCN-HSA-LABEL: constant_zextload_v32i16_to_v32i64:
; GCN-HSA: ; %bb.0:
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
; GCN-HSA-NEXT: s_load_dwordx4 s[16:19], s[8:9], 0x0
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0
; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
@@ -8222,7 +8336,10 @@ define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(ptr addrspace(1) %
;
; GCN-HSA-LABEL: constant_sextload_v32i16_to_v32i64:
; GCN-HSA: ; %bb.0:
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
; GCN-HSA-NEXT: s_load_dwordx4 s[16:19], s[8:9], 0x0
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll
index 120f47a277ee6..68a6a148819e8 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll
@@ -23,6 +23,9 @@ define amdgpu_kernel void @constant_load_i32(ptr addrspace(1) %out, ptr addrspac
; GFX7-HSA-LABEL: constant_load_i32:
; GFX7-HSA: ; %bb.0: ; %entry
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0
@@ -103,6 +106,9 @@ define amdgpu_kernel void @constant_load_v2i32(ptr addrspace(1) %out, ptr addrsp
; GFX7-HSA-LABEL: constant_load_v2i32:
; GFX7-HSA: ; %bb.0: ; %entry
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0
@@ -190,6 +196,9 @@ define amdgpu_kernel void @constant_load_v3i32(ptr addrspace(1) %out, ptr addrsp
; GFX7-HSA-LABEL: constant_load_v3i32:
; GFX7-HSA: ; %bb.0: ; %entry
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s0
@@ -284,6 +293,9 @@ define amdgpu_kernel void @constant_load_v4i32(ptr addrspace(1) %out, ptr addrsp
; GFX7-HSA-LABEL: constant_load_v4i32:
; GFX7-HSA: ; %bb.0: ; %entry
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0
@@ -383,6 +395,9 @@ define amdgpu_kernel void @constant_load_v8i32(ptr addrspace(1) %out, ptr addrsp
; GFX7-HSA-LABEL: constant_load_v8i32:
; GFX7-HSA: ; %bb.0: ; %entry
; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0
; GFX7-HSA-NEXT: s_add_u32 s10, s8, 16
@@ -517,6 +532,9 @@ define amdgpu_kernel void @constant_load_v9i32(ptr addrspace(1) %out, ptr addrsp
; GFX7-HSA-LABEL: constant_load_v9i32:
; GFX7-HSA: ; %bb.0: ; %entry
; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dword s12, s[10:11], 0x8
; GFX7-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0
@@ -678,6 +696,9 @@ define amdgpu_kernel void @constant_load_v10i32(ptr addrspace(1) %out, ptr addrs
; GFX7-HSA-LABEL: constant_load_v10i32:
; GFX7-HSA: ; %bb.0: ; %entry
; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx2 s[12:13], s[10:11], 0x8
; GFX7-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0
@@ -847,6 +868,9 @@ define amdgpu_kernel void @constant_load_v11i32(ptr addrspace(1) %out, ptr addrs
; GFX7-HSA-LABEL: constant_load_v11i32:
; GFX7-HSA: ; %bb.0: ; %entry
; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx4 s[12:15], s[10:11], 0x8
; GFX7-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0
@@ -1023,6 +1047,9 @@ define amdgpu_kernel void @constant_load_v12i32(ptr addrspace(1) %out, ptr addrs
; GFX7-HSA-LABEL: constant_load_v12i32:
; GFX7-HSA: ; %bb.0: ; %entry
; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx4 s[12:15], s[10:11], 0x8
; GFX7-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0
@@ -1202,7 +1229,10 @@ define amdgpu_kernel void @constant_load_v16i32(ptr addrspace(1) %out, ptr addrs
;
; GFX7-HSA-LABEL: constant_load_v16i32:
; GFX7-HSA: ; %bb.0: ; %entry
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
; GFX7-HSA-NEXT: s_load_dwordx4 s[16:19], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0
; GFX7-HSA-NEXT: s_add_u32 s18, s16, 48
@@ -1389,6 +1419,9 @@ define amdgpu_kernel void @constant_zextload_i32_to_i64(ptr addrspace(1) %out, p
; GFX7-HSA-LABEL: constant_zextload_i32_to_i64:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: v_mov_b32_e32 v3, 0
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0
@@ -1473,6 +1506,9 @@ define amdgpu_kernel void @constant_sextload_i32_to_i64(ptr addrspace(1) %out, p
; GFX7-HSA-LABEL: constant_sextload_i32_to_i64:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0
@@ -1563,6 +1599,9 @@ define amdgpu_kernel void @constant_zextload_v1i32_to_v1i64(ptr addrspace(1) %ou
; GFX7-HSA-LABEL: constant_zextload_v1i32_to_v1i64:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: v_mov_b32_e32 v3, 0
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0
@@ -1647,6 +1686,9 @@ define amdgpu_kernel void @constant_sextload_v1i32_to_v1i64(ptr addrspace(1) %ou
; GFX7-HSA-LABEL: constant_sextload_v1i32_to_v1i64:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0
@@ -1739,12 +1781,15 @@ define amdgpu_kernel void @constant_zextload_v2i32_to_v2i64(ptr addrspace(1) %ou
; GFX7-HSA-LABEL: constant_zextload_v2i32_to_v2i64:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0
; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s1
+; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2
; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s3
@@ -1837,6 +1882,9 @@ define amdgpu_kernel void @constant_sextload_v2i32_to_v2i64(ptr addrspace(1) %ou
; GFX7-HSA-LABEL: constant_sextload_v2i32_to_v2i64:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0
@@ -1949,13 +1997,16 @@ define amdgpu_kernel void @constant_zextload_v4i32_to_v4i64(ptr addrspace(1) %ou
; GFX7-HSA-LABEL: constant_zextload_v4i32_to_v4i64:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
; GFX7-HSA-NEXT: s_add_u32 s2, s0, 16
; GFX7-HSA-NEXT: s_addc_u32 s3, s1, 0
; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3
+; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1
; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s6
@@ -2082,6 +2133,9 @@ define amdgpu_kernel void @constant_sextload_v4i32_to_v4i64(ptr addrspace(1) %ou
; GFX7-HSA-LABEL: constant_sextload_v4i32_to_v4i64:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
@@ -2244,8 +2298,10 @@ define amdgpu_kernel void @constant_zextload_v8i32_to_v8i64(ptr addrspace(1) %ou
; GFX7-HSA-LABEL: constant_zextload_v8i32_to_v8i64:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0
; GFX7-HSA-NEXT: s_add_u32 s2, s0, 48
@@ -2253,6 +2309,7 @@ define amdgpu_kernel void @constant_zextload_v8i32_to_v8i64(ptr addrspace(1) %ou
; GFX7-HSA-NEXT: v_mov_b32_e32 v5, s3
; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s2
; GFX7-HSA-NEXT: s_add_u32 s2, s0, 32
+; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s10
; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s11
@@ -2452,6 +2509,9 @@ define amdgpu_kernel void @constant_sextload_v8i32_to_v8i64(ptr addrspace(1) %ou
; GFX7-HSA-LABEL: constant_sextload_v8i32_to_v8i64:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
@@ -2748,7 +2808,10 @@ define amdgpu_kernel void @constant_sextload_v16i32_to_v16i64(ptr addrspace(1) %
;
; GFX7-HSA-LABEL: constant_sextload_v16i32_to_v16i64:
; GFX7-HSA: ; %bb.0:
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
; GFX7-HSA-NEXT: s_load_dwordx4 s[16:19], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
@@ -3196,7 +3259,10 @@ define amdgpu_kernel void @constant_zextload_v16i32_to_v16i64(ptr addrspace(1) %
;
; GFX7-HSA-LABEL: constant_zextload_v16i32_to_v16i64:
; GFX7-HSA: ; %bb.0:
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
; GFX7-HSA-NEXT: s_load_dwordx4 s[16:19], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, 0
; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
@@ -3628,7 +3694,10 @@ define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) %
;
; GFX7-HSA-LABEL: constant_sextload_v32i32_to_v32i64:
; GFX7-HSA: ; %bb.0:
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
; GFX7-HSA-NEXT: s_load_dwordx4 s[16:19], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x10
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
@@ -4479,8 +4548,10 @@ define amdgpu_kernel void @constant_zextload_v32i32_to_v32i64(ptr addrspace(1) %
; GFX7-HSA-LABEL: constant_zextload_v32i32_to_v32i64:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[36:39], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_add_u32 s0, s36, 0xf0
; GFX7-HSA-NEXT: s_addc_u32 s1, s37, 0
@@ -4509,6 +4580,7 @@ define amdgpu_kernel void @constant_zextload_v32i32_to_v32i64(ptr addrspace(1) %
; GFX7-HSA-NEXT: v_mov_b32_e32 v14, s0
; GFX7-HSA-NEXT: s_add_u32 s0, s36, 0x90
; GFX7-HSA-NEXT: s_addc_u32 s1, s37, 0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s30
; GFX7-HSA-NEXT: v_mov_b32_e32 v2, s31
@@ -5097,7 +5169,10 @@ define amdgpu_kernel void @constant_load_v32i32(ptr addrspace(1) %out, ptr addrs
;
; GFX7-HSA-LABEL: constant_load_v32i32:
; GFX7-HSA: ; %bb.0:
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
; GFX7-HSA-NEXT: s_load_dwordx4 s[16:19], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x10
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll
index b3e75e767ae64..2219ceea7ec9b 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i64.ll
@@ -22,6 +22,9 @@ define amdgpu_kernel void @constant_load_i64(ptr addrspace(1) %out, ptr addrspac
; GFX7-LABEL: constant_load_i64:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX7-NEXT: v_mov_b32_e32 v0, s0
@@ -95,6 +98,9 @@ define amdgpu_kernel void @constant_load_v2i64(ptr addrspace(1) %out, ptr addrsp
; GFX7-LABEL: constant_load_v2i64:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
; GFX7-NEXT: v_mov_b32_e32 v4, s0
@@ -179,6 +185,9 @@ define amdgpu_kernel void @constant_load_v3i64(ptr addrspace(1) %out, ptr addrsp
; GFX7-LABEL: constant_load_v3i64:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[2:3], 0x4
; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
@@ -294,6 +303,9 @@ define amdgpu_kernel void @constant_load_v4i64(ptr addrspace(1) %out, ptr addrsp
; GFX7-LABEL: constant_load_v4i64:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x0
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0
; GFX7-NEXT: s_add_u32 s10, s8, 16
@@ -421,7 +433,10 @@ define amdgpu_kernel void @constant_load_v8i64(ptr addrspace(1) %out, ptr addrsp
;
; GFX7-LABEL: constant_load_v8i64:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_i32 s12, s12, s17
; GFX7-NEXT: s_load_dwordx4 s[16:19], s[8:9], 0x0
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0
; GFX7-NEXT: s_add_u32 s18, s16, 48
@@ -638,7 +653,10 @@ define amdgpu_kernel void @constant_load_v16i64(ptr addrspace(1) %out, ptr addrs
;
; GFX7-LABEL: constant_load_v16i64:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_add_i32 s12, s12, s17
; GFX7-NEXT: s_load_dwordx4 s[16:19], s[8:9], 0x0
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x10
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll
index c608bef3f726e..4031be65fab61 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i8.ll
@@ -27,6 +27,9 @@ define amdgpu_kernel void @constant_load_i8(ptr addrspace(1) %out, ptr addrspace
; GFX7-HSA-LABEL: constant_load_i8:
; GFX7-HSA: ; %bb.0: ; %entry
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -112,6 +115,9 @@ define amdgpu_kernel void @constant_load_v2i8(ptr addrspace(1) %out, ptr addrspa
; GFX7-HSA-LABEL: constant_load_v2i8:
; GFX7-HSA: ; %bb.0: ; %entry
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -195,6 +201,9 @@ define amdgpu_kernel void @constant_load_v3i8(ptr addrspace(1) %out, ptr addrspa
; GFX7-HSA-LABEL: constant_load_v3i8:
; GFX7-HSA: ; %bb.0: ; %entry
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0
@@ -305,6 +314,9 @@ define amdgpu_kernel void @constant_load_v4i8(ptr addrspace(1) %out, ptr addrspa
; GFX7-HSA-LABEL: constant_load_v4i8:
; GFX7-HSA: ; %bb.0: ; %entry
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0
@@ -374,6 +386,9 @@ define amdgpu_kernel void @constant_load_v8i8(ptr addrspace(1) %out, ptr addrspa
; GFX7-HSA-LABEL: constant_load_v8i8:
; GFX7-HSA: ; %bb.0: ; %entry
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0
@@ -448,6 +463,9 @@ define amdgpu_kernel void @constant_load_v16i8(ptr addrspace(1) %out, ptr addrsp
; GFX7-HSA-LABEL: constant_load_v16i8:
; GFX7-HSA: ; %bb.0: ; %entry
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0
@@ -529,6 +547,9 @@ define amdgpu_kernel void @constant_zextload_i8_to_i32(ptr addrspace(1) %out, pt
; GFX7-HSA-LABEL: constant_zextload_i8_to_i32:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -604,6 +625,9 @@ define amdgpu_kernel void @constant_sextload_i8_to_i32(ptr addrspace(1) %out, pt
; GFX7-HSA-LABEL: constant_sextload_i8_to_i32:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -680,6 +704,9 @@ define amdgpu_kernel void @constant_zextload_v1i8_to_v1i32(ptr addrspace(1) %out
; GFX7-HSA-LABEL: constant_zextload_v1i8_to_v1i32:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -755,6 +782,9 @@ define amdgpu_kernel void @constant_sextload_v1i8_to_v1i32(ptr addrspace(1) %out
; GFX7-HSA-LABEL: constant_sextload_v1i8_to_v1i32:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -834,6 +864,9 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i32(ptr addrspace(1) %out
; GFX7-HSA-LABEL: constant_zextload_v2i8_to_v2i32:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -933,6 +966,9 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i32(ptr addrspace(1) %out
; GFX7-HSA-LABEL: constant_sextload_v2i8_to_v2i32:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -1030,6 +1066,9 @@ define amdgpu_kernel void @constant_zextload_v3i8_to_v3i32(ptr addrspace(1) %out
; GFX7-HSA-LABEL: constant_zextload_v3i8_to_v3i32:
; GFX7-HSA: ; %bb.0: ; %entry
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s0
@@ -1131,6 +1170,9 @@ define amdgpu_kernel void @constant_sextload_v3i8_to_v3i32(ptr addrspace(1) %out
; GFX7-HSA-LABEL: constant_sextload_v3i8_to_v3i32:
; GFX7-HSA: ; %bb.0: ; %entry
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX7-HSA-NEXT: v_mov_b32_e32 v3, s0
@@ -1232,6 +1274,9 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i32(ptr addrspace(1) %out
; GFX7-HSA-LABEL: constant_zextload_v4i8_to_v4i32:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0
@@ -1336,6 +1381,9 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i32(ptr addrspace(1) %out
; GFX7-HSA-LABEL: constant_sextload_v4i8_to_v4i32:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0
@@ -1453,6 +1501,9 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i32(ptr addrspace(1) %out
; GFX7-HSA-LABEL: constant_zextload_v8i8_to_v8i32:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
@@ -1612,6 +1663,9 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i32(ptr addrspace(1) %out
; GFX7-HSA-LABEL: constant_sextload_v8i8_to_v8i32:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
@@ -1794,6 +1848,9 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i32(ptr addrspace(1) %o
; GFX7-HSA-LABEL: constant_zextload_v16i8_to_v16i32:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
@@ -2060,6 +2117,9 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i32(ptr addrspace(1) %o
; GFX7-HSA-LABEL: constant_sextload_v16i8_to_v16i32:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
@@ -2374,6 +2434,9 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i32(ptr addrspace(1) %o
; GFX7-HSA-LABEL: constant_zextload_v32i8_to_v32i32:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
@@ -2856,6 +2919,9 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i32(ptr addrspace(1) %o
; GFX7-HSA-LABEL: constant_sextload_v32i8_to_v32i32:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x0
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
@@ -3437,7 +3503,10 @@ define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(ptr addrspace(1) %o
;
; GFX7-HSA-LABEL: constant_zextload_v64i8_to_v64i32:
; GFX7-HSA: ; %bb.0:
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
; GFX7-HSA-NEXT: s_load_dwordx4 s[16:19], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
@@ -4353,7 +4422,10 @@ define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(ptr addrspace(1) %o
;
; GFX7-HSA-LABEL: constant_sextload_v64i8_to_v64i32:
; GFX7-HSA: ; %bb.0:
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
; GFX7-HSA-NEXT: s_load_dwordx4 s[16:19], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
@@ -5161,6 +5233,9 @@ define amdgpu_kernel void @constant_zextload_i8_to_i64(ptr addrspace(1) %out, pt
; GFX7-HSA-LABEL: constant_zextload_i8_to_i64:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -5243,6 +5318,9 @@ define amdgpu_kernel void @constant_sextload_i8_to_i64(ptr addrspace(1) %out, pt
; GFX7-HSA-LABEL: constant_sextload_i8_to_i64:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -5328,6 +5406,9 @@ define amdgpu_kernel void @constant_zextload_v1i8_to_v1i64(ptr addrspace(1) %out
; GFX7-HSA-LABEL: constant_zextload_v1i8_to_v1i64:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -5408,6 +5489,9 @@ define amdgpu_kernel void @constant_sextload_v1i8_to_v1i64(ptr addrspace(1) %out
; GFX7-HSA-LABEL: constant_sextload_v1i8_to_v1i64:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -5496,6 +5580,9 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i64(ptr addrspace(1) %out
; GFX7-HSA-LABEL: constant_zextload_v2i8_to_v2i64:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -5603,6 +5690,9 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i64(ptr addrspace(1) %out
; GFX7-HSA-LABEL: constant_sextload_v2i8_to_v2i64:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -5716,10 +5806,13 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i64(ptr addrspace(1) %out
; GFX7-HSA-LABEL: constant_zextload_v4i8_to_v4i64:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_bfe_u32 s4, s2, 0x80008
; GFX7-HSA-NEXT: s_lshr_b32 s3, s2, 24
@@ -5854,6 +5947,9 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i64(ptr addrspace(1) %out
; GFX7-HSA-LABEL: constant_sextload_v4i8_to_v4i64:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
@@ -6013,10 +6109,13 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i64(ptr addrspace(1) %out
; GFX7-HSA-LABEL: constant_zextload_v8i8_to_v8i64:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_lshr_b32 s4, s2, 24
; GFX7-HSA-NEXT: s_lshr_b32 s5, s3, 24
@@ -6235,6 +6334,9 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i64(ptr addrspace(1) %out
; GFX7-HSA-LABEL: constant_sextload_v8i8_to_v8i64:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
@@ -6504,10 +6606,13 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i64(ptr addrspace(1) %o
; GFX7-HSA-LABEL: constant_zextload_v16i8_to_v16i64:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_lshr_b32 s8, s5, 24
; GFX7-HSA-NEXT: s_lshr_b32 s9, s4, 24
@@ -6898,6 +7003,9 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(ptr addrspace(1) %o
; GFX7-HSA-LABEL: constant_sextload_v16i8_to_v16i64:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
@@ -7387,10 +7495,13 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i64(ptr addrspace(1) %o
; GFX7-HSA-LABEL: constant_zextload_v32i8_to_v32i64:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, 0
-; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx8 s[8:15], s[2:3], 0x0
+; GFX7-HSA-NEXT: v_mov_b32_e32 v3, v1
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_lshr_b32 s16, s8, 24
; GFX7-HSA-NEXT: s_lshr_b32 s17, s9, 24
@@ -8128,6 +8239,9 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(ptr addrspace(1) %o
; GFX7-HSA-LABEL: constant_sextload_v32i8_to_v32i64:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
@@ -8898,6 +9012,9 @@ define amdgpu_kernel void @constant_zextload_i8_to_i16(ptr addrspace(1) %out, pt
; GFX7-HSA-LABEL: constant_zextload_i8_to_i16:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -8982,6 +9099,9 @@ define amdgpu_kernel void @constant_sextload_i8_to_i16(ptr addrspace(1) %out, pt
; GFX7-HSA-LABEL: constant_sextload_i8_to_i16:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -9068,6 +9188,9 @@ define amdgpu_kernel void @constant_zextload_v1i8_to_v1i16(ptr addrspace(1) %out
; GFX7-HSA-LABEL: constant_zextload_v1i8_to_v1i16:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -9152,6 +9275,9 @@ define amdgpu_kernel void @constant_sextload_v1i8_to_v1i16(ptr addrspace(1) %out
; GFX7-HSA-LABEL: constant_sextload_v1i8_to_v1i16:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -9241,6 +9367,9 @@ define amdgpu_kernel void @constant_zextload_v2i8_to_v2i16(ptr addrspace(1) %out
; GFX7-HSA-LABEL: constant_zextload_v2i8_to_v2i16:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -9340,6 +9469,9 @@ define amdgpu_kernel void @constant_sextload_v2i8_to_v2i16(ptr addrspace(1) %out
; GFX7-HSA-LABEL: constant_sextload_v2i8_to_v2i16:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s2
; GFX7-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -9452,6 +9584,9 @@ define amdgpu_kernel void @constant_zextload_v4i8_to_v4i16(ptr addrspace(1) %out
; GFX7-HSA-LABEL: constant_zextload_v4i8_to_v4i16:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0
@@ -9560,6 +9695,9 @@ define amdgpu_kernel void @constant_sextload_v4i8_to_v4i16(ptr addrspace(1) %out
; GFX7-HSA-LABEL: constant_sextload_v4i8_to_v4i16:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dword s2, s[2:3], 0x0
; GFX7-HSA-NEXT: v_mov_b32_e32 v0, s0
@@ -9683,6 +9821,9 @@ define amdgpu_kernel void @constant_zextload_v8i8_to_v8i16(ptr addrspace(1) %out
; GFX7-HSA-LABEL: constant_zextload_v8i8_to_v8i16:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0
@@ -9832,6 +9973,9 @@ define amdgpu_kernel void @constant_sextload_v8i8_to_v8i16(ptr addrspace(1) %out
; GFX7-HSA-LABEL: constant_sextload_v8i8_to_v8i16:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0
; GFX7-HSA-NEXT: v_mov_b32_e32 v4, s0
@@ -10014,6 +10158,9 @@ define amdgpu_kernel void @constant_zextload_v16i8_to_v16i16(ptr addrspace(1) %o
; GFX7-HSA-LABEL: constant_zextload_v16i8_to_v16i16:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
@@ -10261,6 +10408,9 @@ define amdgpu_kernel void @constant_sextload_v16i8_to_v16i16(ptr addrspace(1) %o
; GFX7-HSA-LABEL: constant_sextload_v16i8_to_v16i16:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x0
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
@@ -10574,6 +10724,9 @@ define amdgpu_kernel void @constant_zextload_v32i8_to_v32i16(ptr addrspace(1) %o
; GFX7-HSA-LABEL: constant_zextload_v32i8_to_v32i16:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
@@ -11018,6 +11171,9 @@ define amdgpu_kernel void @constant_sextload_v32i8_to_v32i16(ptr addrspace(1) %o
; GFX7-HSA-LABEL: constant_sextload_v32i8_to_v32i16:
; GFX7-HSA: ; %bb.0:
; GFX7-HSA-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x0
+; GFX7-HSA-NEXT: s_add_i32 s12, s12, s17
+; GFX7-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-HSA-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0
; GFX7-HSA-NEXT: s_waitcnt lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
index c5771bc73b945..9054e509cde8e 100644
--- a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll
@@ -28,6 +28,9 @@ define amdgpu_kernel void @global_load_i16(ptr addrspace(1) %out, ptr addrspace(
; GCN-HSA-LABEL: global_load_i16:
; GCN-HSA: ; %bb.0: ; %entry
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -133,6 +136,9 @@ define amdgpu_kernel void @global_load_v2i16(ptr addrspace(1) %out, ptr addrspac
; GCN-HSA-LABEL: global_load_v2i16:
; GCN-HSA: ; %bb.0: ; %entry
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -219,6 +225,9 @@ define amdgpu_kernel void @global_load_v3i16(ptr addrspace(1) %out, ptr addrspac
; GCN-HSA-LABEL: global_load_v3i16:
; GCN-HSA: ; %bb.0: ; %entry
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -339,6 +348,9 @@ define amdgpu_kernel void @global_load_v4i16(ptr addrspace(1) %out, ptr addrspac
; GCN-HSA-LABEL: global_load_v4i16:
; GCN-HSA: ; %bb.0: ; %entry
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -424,6 +436,9 @@ define amdgpu_kernel void @global_load_v8i16(ptr addrspace(1) %out, ptr addrspac
; GCN-HSA-LABEL: global_load_v8i16:
; GCN-HSA: ; %bb.0: ; %entry
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -512,6 +527,9 @@ define amdgpu_kernel void @global_load_v16i16(ptr addrspace(1) %out, ptr addrspa
; GCN-HSA-LABEL: global_load_v16i16:
; GCN-HSA: ; %bb.0: ; %entry
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_add_u32 s4, s0, 16
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3
@@ -662,6 +680,9 @@ define amdgpu_kernel void @global_load_v16i16_align2(ptr addrspace(1) %in, ptr a
; GCN-HSA-LABEL: global_load_v16i16_align2:
; GCN-HSA: ; %bb.0: ; %entry
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_add_u32 s4, s2, 16
; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1
@@ -811,6 +832,9 @@ define amdgpu_kernel void @global_zextload_i16_to_i32(ptr addrspace(1) %out, ptr
; GCN-HSA-LABEL: global_zextload_i16_to_i32:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -896,6 +920,9 @@ define amdgpu_kernel void @global_sextload_i16_to_i32(ptr addrspace(1) %out, ptr
; GCN-HSA-LABEL: global_sextload_i16_to_i32:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -984,6 +1011,9 @@ define amdgpu_kernel void @global_zextload_v1i16_to_v1i32(ptr addrspace(1) %out,
; GCN-HSA-LABEL: global_zextload_v1i16_to_v1i32:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -1069,6 +1099,9 @@ define amdgpu_kernel void @global_sextload_v1i16_to_v1i32(ptr addrspace(1) %out,
; GCN-HSA-LABEL: global_sextload_v1i16_to_v1i32:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -1159,6 +1192,9 @@ define amdgpu_kernel void @global_zextload_v2i16_to_v2i32(ptr addrspace(1) %out,
; GCN-HSA-LABEL: global_zextload_v2i16_to_v2i32:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -1258,6 +1294,9 @@ define amdgpu_kernel void @global_sextload_v2i16_to_v2i32(ptr addrspace(1) %out,
; GCN-HSA-LABEL: global_sextload_v2i16_to_v2i32:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -1359,6 +1398,9 @@ define amdgpu_kernel void @global_zextload_v3i16_to_v3i32(ptr addrspace(1) %out,
; GCN-HSA-LABEL: global_zextload_v3i16_to_v3i32:
; GCN-HSA: ; %bb.0: ; %entry
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -1469,6 +1511,9 @@ define amdgpu_kernel void @global_sextload_v3i16_to_v3i32(ptr addrspace(1) %out,
; GCN-HSA-LABEL: global_sextload_v3i16_to_v3i32:
; GCN-HSA: ; %bb.0: ; %entry
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -1586,6 +1631,9 @@ define amdgpu_kernel void @global_zextload_v4i16_to_v4i32(ptr addrspace(1) %out,
; GCN-HSA-LABEL: global_zextload_v4i16_to_v4i32:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -1701,6 +1749,9 @@ define amdgpu_kernel void @global_sextload_v4i16_to_v4i32(ptr addrspace(1) %out,
; GCN-HSA-LABEL: global_sextload_v4i16_to_v4i32:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -1823,6 +1874,9 @@ define amdgpu_kernel void @global_zextload_v8i16_to_v8i32(ptr addrspace(1) %out,
; GCN-HSA-LABEL: global_zextload_v8i16_to_v8i32:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -1972,6 +2026,9 @@ define amdgpu_kernel void @global_sextload_v8i16_to_v8i32(ptr addrspace(1) %out,
; GCN-HSA-LABEL: global_sextload_v8i16_to_v8i32:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -2136,6 +2193,9 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i32(ptr addrspace(1) %ou
; GCN-HSA-LABEL: global_zextload_v16i16_to_v16i32:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_add_u32 s4, s2, 16
; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0
@@ -2372,6 +2432,9 @@ define amdgpu_kernel void @global_sextload_v16i16_to_v16i32(ptr addrspace(1) %ou
; GCN-HSA-LABEL: global_sextload_v16i16_to_v16i32:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -2643,6 +2706,9 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i32(ptr addrspace(1) %ou
; GCN-HSA-LABEL: global_zextload_v32i16_to_v32i32:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_add_u32 s4, s2, 16
; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0
@@ -3054,6 +3120,9 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i32(ptr addrspace(1) %ou
; GCN-HSA-LABEL: global_sextload_v32i16_to_v32i32:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -3573,6 +3642,9 @@ define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(ptr addrspace(1) %ou
; GCN-HSA-LABEL: global_zextload_v64i16_to_v64i32:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -4377,6 +4449,9 @@ define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(ptr addrspace(1) %ou
; GCN-HSA-LABEL: global_sextload_v64i16_to_v64i32:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -5142,6 +5217,9 @@ define amdgpu_kernel void @global_zextload_i16_to_i64(ptr addrspace(1) %out, ptr
; GCN-HSA-LABEL: global_zextload_i16_to_i64:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -5239,6 +5317,9 @@ define amdgpu_kernel void @global_sextload_i16_to_i64(ptr addrspace(1) %out, ptr
; GCN-HSA-LABEL: global_sextload_i16_to_i64:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -5334,6 +5415,9 @@ define amdgpu_kernel void @global_zextload_v1i16_to_v1i64(ptr addrspace(1) %out,
; GCN-HSA-LABEL: global_zextload_v1i16_to_v1i64:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -5426,6 +5510,9 @@ define amdgpu_kernel void @global_sextload_v1i16_to_v1i64(ptr addrspace(1) %out,
; GCN-HSA-LABEL: global_sextload_v1i16_to_v1i64:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -5524,6 +5611,9 @@ define amdgpu_kernel void @global_zextload_v2i16_to_v2i64(ptr addrspace(1) %out,
; GCN-HSA-LABEL: global_zextload_v2i16_to_v2i64:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -5633,6 +5723,9 @@ define amdgpu_kernel void @global_sextload_v2i16_to_v2i64(ptr addrspace(1) %out,
; GCN-HSA-LABEL: global_sextload_v2i16_to_v2i64:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -5751,6 +5844,9 @@ define amdgpu_kernel void @global_zextload_v4i16_to_v4i64(ptr addrspace(1) %out,
; GCN-HSA-LABEL: global_zextload_v4i16_to_v4i64:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -5896,6 +5992,9 @@ define amdgpu_kernel void @global_sextload_v4i16_to_v4i64(ptr addrspace(1) %out,
; GCN-HSA-LABEL: global_sextload_v4i16_to_v4i64:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -6056,10 +6155,10 @@ define amdgpu_kernel void @global_zextload_v8i16_to_v8i64(ptr addrspace(1) %out,
; GCN-HSA-LABEL: global_zextload_v8i16_to_v8i64:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: v_mov_b32_e32 v4, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v6, v4
-; GCN-HSA-NEXT: v_mov_b32_e32 v8, v4
-; GCN-HSA-NEXT: v_mov_b32_e32 v10, v4
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -6074,8 +6173,11 @@ define amdgpu_kernel void @global_zextload_v8i16_to_v8i64(ptr addrspace(1) %out,
; GCN-HSA-NEXT: v_mov_b32_e32 v19, s0
; GCN-HSA-NEXT: s_add_u32 s0, s0, 32
; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0
+; GCN-HSA-NEXT: v_mov_b32_e32 v6, v4
; GCN-HSA-NEXT: v_mov_b32_e32 v24, s3
; GCN-HSA-NEXT: v_mov_b32_e32 v26, s1
+; GCN-HSA-NEXT: v_mov_b32_e32 v8, v4
+; GCN-HSA-NEXT: v_mov_b32_e32 v10, v4
; GCN-HSA-NEXT: v_mov_b32_e32 v12, v4
; GCN-HSA-NEXT: v_mov_b32_e32 v14, v4
; GCN-HSA-NEXT: v_mov_b32_e32 v16, v4
@@ -6275,6 +6377,9 @@ define amdgpu_kernel void @global_sextload_v8i16_to_v8i64(ptr addrspace(1) %out,
; GCN-HSA-LABEL: global_sextload_v8i16_to_v8i64:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -6525,10 +6630,10 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i64(ptr addrspace(1) %ou
; GCN-HSA-LABEL: global_zextload_v16i16_to_v16i64:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-HSA-NEXT: v_mov_b32_e32 v8, 0
-; GCN-HSA-NEXT: v_mov_b32_e32 v12, v8
-; GCN-HSA-NEXT: v_mov_b32_e32 v14, v8
-; GCN-HSA-NEXT: v_mov_b32_e32 v15, v8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -6545,7 +6650,10 @@ define amdgpu_kernel void @global_zextload_v16i16_to_v16i64(ptr addrspace(1) %ou
; GCN-HSA-NEXT: v_mov_b32_e32 v22, s5
; GCN-HSA-NEXT: v_mov_b32_e32 v21, s4
; GCN-HSA-NEXT: s_add_u32 s4, s0, 0x50
+; GCN-HSA-NEXT: v_mov_b32_e32 v12, v8
+; GCN-HSA-NEXT: v_mov_b32_e32 v14, v8
; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0
+; GCN-HSA-NEXT: v_mov_b32_e32 v15, v8
; GCN-HSA-NEXT: v_mov_b32_e32 v17, v8
; GCN-HSA-NEXT: v_mov_b32_e32 v18, v8
; GCN-HSA-NEXT: v_mov_b32_e32 v20, v8
@@ -6905,6 +7013,9 @@ define amdgpu_kernel void @global_sextload_v16i16_to_v16i64(ptr addrspace(1) %ou
; GCN-HSA-LABEL: global_sextload_v16i16_to_v16i64:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -7376,6 +7487,9 @@ define amdgpu_kernel void @global_zextload_v32i16_to_v32i64(ptr addrspace(1) %ou
; GCN-HSA-LABEL: global_zextload_v32i16_to_v32i64:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: s_add_u32 s4, s2, 16
; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0
@@ -8078,6 +8192,9 @@ define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(ptr addrspace(1) %ou
; GCN-HSA-LABEL: global_sextload_v32i16_to_v32i64:
; GCN-HSA: ; %bb.0:
; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCN-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3
diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i32.ll b/llvm/test/CodeGen/AMDGPU/load-global-i32.ll
index 033a66abcedb9..e8c862a3cb93c 100644
--- a/llvm/test/CodeGen/AMDGPU/load-global-i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-global-i32.ll
@@ -27,6 +27,9 @@ define amdgpu_kernel void @global_load_i32(ptr addrspace(1) %out, ptr addrspace(
; GCNX3-HSA-LABEL: global_load_i32:
; GCNX3-HSA: ; %bb.0: ; %entry
; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCNX3-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -106,6 +109,9 @@ define amdgpu_kernel void @global_load_v2i32(ptr addrspace(1) %out, ptr addrspac
; GCNX3-HSA-LABEL: global_load_v2i32:
; GCNX3-HSA: ; %bb.0: ; %entry
; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCNX3-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -186,6 +192,9 @@ define amdgpu_kernel void @global_load_v3i32(ptr addrspace(1) %out, ptr addrspac
; GCNX3-HSA-LABEL: global_load_v3i32:
; GCNX3-HSA: ; %bb.0: ; %entry
; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCNX3-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -270,6 +279,9 @@ define amdgpu_kernel void @global_load_v4i32(ptr addrspace(1) %out, ptr addrspac
; GCNX3-HSA-LABEL: global_load_v4i32:
; GCNX3-HSA: ; %bb.0: ; %entry
; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCNX3-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -352,6 +364,9 @@ define amdgpu_kernel void @global_load_v8i32(ptr addrspace(1) %out, ptr addrspac
; GCNX3-HSA-LABEL: global_load_v8i32:
; GCNX3-HSA: ; %bb.0: ; %entry
; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCNX3-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, s3
; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, s2
@@ -458,6 +473,9 @@ define amdgpu_kernel void @global_load_v9i32(ptr addrspace(1) %out, ptr addrspac
; GCNX3-HSA-LABEL: global_load_v9i32:
; GCNX3-HSA: ; %bb.0: ; %entry
; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCNX3-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 32
; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
@@ -589,6 +607,9 @@ define amdgpu_kernel void @global_load_v10i32(ptr addrspace(1) %out, ptr addrspa
; GCNX3-HSA-LABEL: global_load_v10i32:
; GCNX3-HSA: ; %bb.0: ; %entry
; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCNX3-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 32
; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
@@ -719,6 +740,9 @@ define amdgpu_kernel void @global_load_v11i32(ptr addrspace(1) %out, ptr addrspa
; GCNX3-HSA-LABEL: global_load_v11i32:
; GCNX3-HSA: ; %bb.0: ; %entry
; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCNX3-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 32
; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
@@ -854,6 +878,9 @@ define amdgpu_kernel void @global_load_v12i32(ptr addrspace(1) %out, ptr addrspa
; GCNX3-HSA-LABEL: global_load_v12i32:
; GCNX3-HSA: ; %bb.0: ; %entry
; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCNX3-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 32
; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
@@ -987,6 +1014,9 @@ define amdgpu_kernel void @global_load_v16i32(ptr addrspace(1) %out, ptr addrspa
; GCNX3-HSA-LABEL: global_load_v16i32:
; GCNX3-HSA: ; %bb.0: ; %entry
; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCNX3-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 16
; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0
@@ -1134,6 +1164,9 @@ define amdgpu_kernel void @global_zextload_i32_to_i64(ptr addrspace(1) %out, ptr
; GCNX3-HSA-LABEL: global_zextload_i32_to_i64:
; GCNX3-HSA: ; %bb.0:
; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCNX3-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -1217,6 +1250,9 @@ define amdgpu_kernel void @global_sextload_i32_to_i64(ptr addrspace(1) %out, ptr
; GCNX3-HSA-LABEL: global_sextload_i32_to_i64:
; GCNX3-HSA: ; %bb.0:
; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCNX3-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -1301,6 +1337,9 @@ define amdgpu_kernel void @global_zextload_v1i32_to_v1i64(ptr addrspace(1) %out,
; GCNX3-HSA-LABEL: global_zextload_v1i32_to_v1i64:
; GCNX3-HSA: ; %bb.0:
; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCNX3-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -1384,6 +1423,9 @@ define amdgpu_kernel void @global_sextload_v1i32_to_v1i64(ptr addrspace(1) %out,
; GCNX3-HSA-LABEL: global_sextload_v1i32_to_v1i64:
; GCNX3-HSA: ; %bb.0:
; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCNX3-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -1471,6 +1513,9 @@ define amdgpu_kernel void @global_zextload_v2i32_to_v2i64(ptr addrspace(1) %out,
; GCNX3-HSA-LABEL: global_zextload_v2i32_to_v2i64:
; GCNX3-HSA: ; %bb.0:
; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCNX3-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -1569,6 +1614,9 @@ define amdgpu_kernel void @global_sextload_v2i32_to_v2i64(ptr addrspace(1) %out,
; GCNX3-HSA-LABEL: global_sextload_v2i32_to_v2i64:
; GCNX3-HSA: ; %bb.0:
; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCNX3-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -1674,8 +1722,10 @@ define amdgpu_kernel void @global_zextload_v4i32_to_v4i64(ptr addrspace(1) %out,
; GCNX3-HSA-LABEL: global_zextload_v4i32_to_v4i64:
; GCNX3-HSA: ; %bb.0:
; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCNX3-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCNX3-HSA-NEXT: v_mov_b32_e32 v5, 0
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v7, v5
; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -1683,6 +1733,7 @@ define amdgpu_kernel void @global_zextload_v4i32_to_v4i64(ptr addrspace(1) %out,
; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 16
; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, s3
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v7, v5
; GCNX3-HSA-NEXT: v_mov_b32_e32 v8, s2
; GCNX3-HSA-NEXT: s_waitcnt vmcnt(0)
; GCNX3-HSA-NEXT: v_mov_b32_e32 v4, v2
@@ -1800,6 +1851,9 @@ define amdgpu_kernel void @global_sextload_v4i32_to_v4i64(ptr addrspace(1) %out,
; GCNX3-HSA-LABEL: global_sextload_v4i32_to_v4i64:
; GCNX3-HSA: ; %bb.0:
; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCNX3-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -1941,8 +1995,10 @@ define amdgpu_kernel void @global_zextload_v8i32_to_v8i64(ptr addrspace(1) %out,
; GCNX3-HSA-LABEL: global_zextload_v8i32_to_v8i64:
; GCNX3-HSA: ; %bb.0:
; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCNX3-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCNX3-HSA-NEXT: v_mov_b32_e32 v9, 0
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v11, v9
; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -1957,6 +2013,7 @@ define amdgpu_kernel void @global_zextload_v8i32_to_v8i64(ptr addrspace(1) %out,
; GCNX3-HSA-NEXT: v_mov_b32_e32 v15, s3
; GCNX3-HSA-NEXT: v_mov_b32_e32 v14, s2
; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 48
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v11, v9
; GCNX3-HSA-NEXT: v_mov_b32_e32 v13, s1
; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCNX3-HSA-NEXT: v_mov_b32_e32 v12, s0
@@ -2134,6 +2191,9 @@ define amdgpu_kernel void @global_sextload_v8i32_to_v8i64(ptr addrspace(1) %out,
; GCNX3-HSA-LABEL: global_sextload_v8i32_to_v8i64:
; GCNX3-HSA: ; %bb.0:
; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCNX3-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -2370,6 +2430,9 @@ define amdgpu_kernel void @global_sextload_v16i32_to_v16i64(ptr addrspace(1) %ou
; GCNX3-HSA-LABEL: global_sextload_v16i32_to_v16i64:
; GCNX3-HSA: ; %bb.0:
; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCNX3-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -2731,8 +2794,10 @@ define amdgpu_kernel void @global_zextload_v16i32_to_v16i64(ptr addrspace(1) %ou
; GCNX3-HSA-LABEL: global_zextload_v16i32_to_v16i64:
; GCNX3-HSA: ; %bb.0:
; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCNX3-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCNX3-HSA-NEXT: v_mov_b32_e32 v17, 0
-; GCNX3-HSA-NEXT: v_mov_b32_e32 v19, v17
; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 16
; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0
@@ -2766,6 +2831,7 @@ define amdgpu_kernel void @global_zextload_v16i32_to_v16i64(ptr addrspace(1) %ou
; GCNX3-HSA-NEXT: v_mov_b32_e32 v21, s1
; GCNX3-HSA-NEXT: v_mov_b32_e32 v26, s2
; GCNX3-HSA-NEXT: s_add_u32 s2, s0, 64
+; GCNX3-HSA-NEXT: v_mov_b32_e32 v19, v17
; GCNX3-HSA-NEXT: v_mov_b32_e32 v20, s0
; GCNX3-HSA-NEXT: s_addc_u32 s3, s1, 0
; GCNX3-HSA-NEXT: s_waitcnt vmcnt(3)
@@ -3122,6 +3188,9 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou
; GCNX3-HSA-LABEL: global_sextload_v32i32_to_v32i64:
; GCNX3-HSA: ; %bb.0:
; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCNX3-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCNX3-HSA-NEXT: v_mov_b32_e32 v0, s2
; GCNX3-HSA-NEXT: v_mov_b32_e32 v1, s3
@@ -3589,12 +3658,12 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou
;
; GCN-GFX900-HSA-LABEL: global_sextload_v32i32_to_v32i64:
; GCN-GFX900-HSA: ; %bb.0:
-; GCN-GFX900-HSA-NEXT: s_mov_b64 s[18:19], s[2:3]
-; GCN-GFX900-HSA-NEXT: s_mov_b64 s[16:17], s[0:1]
+; GCN-GFX900-HSA-NEXT: s_mov_b64 s[22:23], s[2:3]
+; GCN-GFX900-HSA-NEXT: s_mov_b64 s[20:21], s[0:1]
; GCN-GFX900-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v8, 0
-; GCN-GFX900-HSA-NEXT: s_add_u32 s16, s16, s15
-; GCN-GFX900-HSA-NEXT: s_addc_u32 s17, s17, 0
+; GCN-GFX900-HSA-NEXT: s_add_u32 s20, s20, s17
+; GCN-GFX900-HSA-NEXT: s_addc_u32 s21, s21, 0
; GCN-GFX900-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCN-GFX900-HSA-NEXT: global_load_dwordx4 v[0:3], v8, s[2:3] offset:96
; GCN-GFX900-HSA-NEXT: global_load_dwordx4 v[4:7], v8, s[2:3] offset:112
@@ -3620,11 +3689,11 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou
; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v0
; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v4, v0
; GCN-GFX900-HSA-NEXT: v_mov_b32_e32 v6, v1
-; GCN-GFX900-HSA-NEXT: buffer_store_dword v25, off, s[16:19], 0 ; 4-byte Folded Spill
+; GCN-GFX900-HSA-NEXT: buffer_store_dword v25, off, s[20:23], 0 ; 4-byte Folded Spill
; GCN-GFX900-HSA-NEXT: s_nop 0
-; GCN-GFX900-HSA-NEXT: buffer_store_dword v26, off, s[16:19], 0 offset:4 ; 4-byte Folded Spill
-; GCN-GFX900-HSA-NEXT: buffer_store_dword v27, off, s[16:19], 0 offset:8 ; 4-byte Folded Spill
-; GCN-GFX900-HSA-NEXT: buffer_store_dword v28, off, s[16:19], 0 offset:12 ; 4-byte Folded Spill
+; GCN-GFX900-HSA-NEXT: buffer_store_dword v26, off, s[20:23], 0 offset:4 ; 4-byte Folded Spill
+; GCN-GFX900-HSA-NEXT: buffer_store_dword v27, off, s[20:23], 0 offset:8 ; 4-byte Folded Spill
+; GCN-GFX900-HSA-NEXT: buffer_store_dword v28, off, s[20:23], 0 offset:12 ; 4-byte Folded Spill
; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(7)
; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v28, 31, v12
; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v26, 31, v11
@@ -3667,11 +3736,11 @@ define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(ptr addrspace(1) %ou
; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v8, v[33:36], s[0:1] offset:224
; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v8, v[29:32], s[0:1] offset:240
; GCN-GFX900-HSA-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:192
-; GCN-GFX900-HSA-NEXT: buffer_load_dword v32, off, s[16:19], 0 ; 4-byte Folded Reload
+; GCN-GFX900-HSA-NEXT: buffer_load_dword v32, off, s[20:23], 0 ; 4-byte Folded Reload
; GCN-GFX900-HSA-NEXT: s_nop 0
-; GCN-GFX900-HSA-NEXT: buffer_load_dword v33, off, s[16:19], 0 offset:4 ; 4-byte Folded Reload
-; GCN-GFX900-HSA-NEXT: buffer_load_dword v34, off, s[16:19], 0 offset:8 ; 4-byte Folded Reload
-; GCN-GFX900-HSA-NEXT: buffer_load_dword v35, off, s[16:19], 0 offset:12 ; 4-byte Folded Reload
+; GCN-GFX900-HSA-NEXT: buffer_load_dword v33, off, s[20:23], 0 offset:4 ; 4-byte Folded Reload
+; GCN-GFX900-HSA-NEXT: buffer_load_dword v34, off, s[20:23], 0 offset:8 ; 4-byte Folded Reload
+; GCN-GFX900-HSA-NEXT: buffer_load_dword v35, off, s[20:23], 0 offset:12 ; 4-byte Folded Reload
; GCN-GFX900-HSA-NEXT: s_waitcnt vmcnt(8)
; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v60, 31, v52
; GCN-GFX900-HSA-NEXT: v_ashrrev_i32_e32 v58, 31, v51
@@ -3913,6 +3982,9 @@ define amdgpu_kernel void @global_zextload_v32i32_to_v32i64(ptr addrspace(1) %ou
; GCNX3-HSA-LABEL: global_zextload_v32i32_to_v32i64:
; GCNX3-HSA: ; %bb.0:
; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCNX3-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 16
; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0
@@ -4437,6 +4509,9 @@ define amdgpu_kernel void @global_load_v32i32(ptr addrspace(1) %out, ptr addrspa
; GCNX3-HSA-LABEL: global_load_v32i32:
; GCNX3-HSA: ; %bb.0:
; GCNX3-HSA-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCNX3-HSA-NEXT: s_add_i32 s12, s12, s17
+; GCNX3-HSA-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GCNX3-HSA-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCNX3-HSA-NEXT: s_waitcnt lgkmcnt(0)
; GCNX3-HSA-NEXT: s_add_u32 s4, s2, 16
; GCNX3-HSA-NEXT: s_addc_u32 s5, s3, 0
diff --git a/llvm/test/CodeGen/AMDGPU/load-select-ptr.ll b/llvm/test/CodeGen/AMDGPU/load-select-ptr.ll
index 4dfc773d615e4..1a6fa3c518ca7 100644
--- a/llvm/test/CodeGen/AMDGPU/load-select-ptr.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-select-ptr.ll
@@ -13,7 +13,8 @@
; GCN: s_cselect_b32
; GCN-NOT: load_dword
-; GCN: flat_load_dwordx2
+; GCN: flat_load_dword
+; GCN: flat_load_dword
; GCN-NOT: load_dword
; GCN: flat_store_dwordx2
diff --git a/llvm/test/CodeGen/AMDGPU/mad24-get-global-id.ll b/llvm/test/CodeGen/AMDGPU/mad24-get-global-id.ll
index 245a2775d9f2f..07b5e1610cfc0 100644
--- a/llvm/test/CodeGen/AMDGPU/mad24-get-global-id.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad24-get-global-id.ll
@@ -9,7 +9,7 @@ declare ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() #0
; GCN-LABEL: {{^}}get_global_id_0:
; GCN: s_and_b32 [[WGSIZEX:s[0-9]+]], {{s[0-9]+}}, 0xffff
-; GCN: s_mul_i32 [[MUL:s[0-9]+]], s12, [[WGSIZEX]]
+; GCN: s_mul_i32 [[MUL:s[0-9]+]], s14, [[WGSIZEX]]
; GCN: v_add_i32_e32 v{{[0-9]+}}, vcc, [[MUL]], v0
define amdgpu_kernel void @get_global_id_0(ptr addrspace(1) %out) #1 {
%dispatch.ptr = call ptr addrspace(4) @llvm.amdgcn.dispatch.ptr()
diff --git a/llvm/test/CodeGen/AMDGPU/match-perm-extract-vector-elt-bug.ll b/llvm/test/CodeGen/AMDGPU/match-perm-extract-vector-elt-bug.ll
index 401724443567a..bdf1668c35673 100644
--- a/llvm/test/CodeGen/AMDGPU/match-perm-extract-vector-elt-bug.ll
+++ b/llvm/test/CodeGen/AMDGPU/match-perm-extract-vector-elt-bug.ll
@@ -11,8 +11,8 @@ define amdgpu_kernel void @test(ptr addrspace(1) %src, ptr addrspace(1) %dst) {
; GFX9-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_and_b32 s4, s4, 0xffff
-; GFX9-NEXT: s_mul_i32 s12, s12, s4
-; GFX9-NEXT: s_add_i32 s5, s5, s12
+; GFX9-NEXT: s_mul_i32 s14, s14, s4
+; GFX9-NEXT: s_add_i32 s5, s5, s14
; GFX9-NEXT: v_add_u32_e32 v0, s5, v0
; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; GFX9-NEXT: v_lshlrev_b64 v[4:5], 4, v[0:1]
@@ -39,8 +39,8 @@ define amdgpu_kernel void @test(ptr addrspace(1) %src, ptr addrspace(1) %dst) {
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_and_b32 s4, s4, 0xffff
-; GFX10-NEXT: s_mul_i32 s12, s12, s4
-; GFX10-NEXT: v_add3_u32 v0, s5, s12, v0
+; GFX10-NEXT: s_mul_i32 s14, s14, s4
+; GFX10-NEXT: v_add3_u32 v0, s5, s14, v0
; GFX10-NEXT: v_ashrrev_i32_e32 v1, 31, v0
; GFX10-NEXT: v_lshlrev_b64 v[4:5], 4, v[0:1]
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, s0, v4
diff --git a/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll b/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll
index 127656f7aa626..3b3cb8f28ebfd 100644
--- a/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll
+++ b/llvm/test/CodeGen/AMDGPU/memcpy-libcall.ll
@@ -9,6 +9,8 @@ define amdgpu_kernel void @memcpy_p0_p0_minsize(ptr %dest, ptr readonly %src) #0
; CHECK-LABEL: memcpy_p0_p0_minsize:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CHECK-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v12, s3
; CHECK-NEXT: v_mov_b32_e32 v11, s2
@@ -94,12 +96,12 @@ entry:
define amdgpu_kernel void @memcpy_p5_p4_minsize(ptr addrspace(5) %local, ptr addrspace(4) %0) #0 {
; CHECK-LABEL: memcpy_p5_p4_minsize:
; CHECK: ; %bb.0: ; %entry
-; CHECK-NEXT: s_mov_b64 s[18:19], s[2:3]
-; CHECK-NEXT: s_mov_b64 s[16:17], s[0:1]
+; CHECK-NEXT: s_mov_b64 s[22:23], s[2:3]
+; CHECK-NEXT: s_mov_b64 s[20:21], s[0:1]
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x8
; CHECK-NEXT: s_load_dword s2, s[8:9], 0x0
; CHECK-NEXT: v_mov_b32_e32 v24, 0
-; CHECK-NEXT: s_add_u32 s16, s16, s15
+; CHECK-NEXT: s_add_u32 s20, s20, s17
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: global_load_dwordx4 v[0:3], v24, s[0:1] offset:112
; CHECK-NEXT: global_load_dwordx4 v[4:7], v24, s[0:1] offset:96
@@ -107,50 +109,50 @@ define amdgpu_kernel void @memcpy_p5_p4_minsize(ptr addrspace(5) %local, ptr add
; CHECK-NEXT: global_load_dwordx4 v[12:15], v24, s[0:1] offset:64
; CHECK-NEXT: global_load_dwordx4 v[16:19], v24, s[0:1] offset:48
; CHECK-NEXT: global_load_dwordx4 v[20:23], v24, s[0:1] offset:32
-; CHECK-NEXT: s_addc_u32 s17, s17, 0
+; CHECK-NEXT: s_addc_u32 s21, s21, 0
; CHECK-NEXT: v_mov_b32_e32 v25, s2
; CHECK-NEXT: s_waitcnt vmcnt(5)
-; CHECK-NEXT: buffer_store_dword v3, v25, s[16:19], 0 offen offset:124
-; CHECK-NEXT: buffer_store_dword v2, v25, s[16:19], 0 offen offset:120
-; CHECK-NEXT: buffer_store_dword v1, v25, s[16:19], 0 offen offset:116
-; CHECK-NEXT: buffer_store_dword v0, v25, s[16:19], 0 offen offset:112
+; CHECK-NEXT: buffer_store_dword v3, v25, s[20:23], 0 offen offset:124
+; CHECK-NEXT: buffer_store_dword v2, v25, s[20:23], 0 offen offset:120
+; CHECK-NEXT: buffer_store_dword v1, v25, s[20:23], 0 offen offset:116
+; CHECK-NEXT: buffer_store_dword v0, v25, s[20:23], 0 offen offset:112
; CHECK-NEXT: global_load_dwordx4 v[0:3], v24, s[0:1] offset:16
; CHECK-NEXT: s_waitcnt vmcnt(9)
-; CHECK-NEXT: buffer_store_dword v7, v25, s[16:19], 0 offen offset:108
-; CHECK-NEXT: buffer_store_dword v6, v25, s[16:19], 0 offen offset:104
-; CHECK-NEXT: buffer_store_dword v5, v25, s[16:19], 0 offen offset:100
-; CHECK-NEXT: buffer_store_dword v4, v25, s[16:19], 0 offen offset:96
+; CHECK-NEXT: buffer_store_dword v7, v25, s[20:23], 0 offen offset:108
+; CHECK-NEXT: buffer_store_dword v6, v25, s[20:23], 0 offen offset:104
+; CHECK-NEXT: buffer_store_dword v5, v25, s[20:23], 0 offen offset:100
+; CHECK-NEXT: buffer_store_dword v4, v25, s[20:23], 0 offen offset:96
; CHECK-NEXT: global_load_dwordx4 v[4:7], v24, s[0:1]
; CHECK-NEXT: s_waitcnt vmcnt(13)
-; CHECK-NEXT: buffer_store_dword v11, v25, s[16:19], 0 offen offset:92
-; CHECK-NEXT: buffer_store_dword v10, v25, s[16:19], 0 offen offset:88
-; CHECK-NEXT: buffer_store_dword v9, v25, s[16:19], 0 offen offset:84
-; CHECK-NEXT: buffer_store_dword v8, v25, s[16:19], 0 offen offset:80
+; CHECK-NEXT: buffer_store_dword v11, v25, s[20:23], 0 offen offset:92
+; CHECK-NEXT: buffer_store_dword v10, v25, s[20:23], 0 offen offset:88
+; CHECK-NEXT: buffer_store_dword v9, v25, s[20:23], 0 offen offset:84
+; CHECK-NEXT: buffer_store_dword v8, v25, s[20:23], 0 offen offset:80
; CHECK-NEXT: s_waitcnt vmcnt(16)
-; CHECK-NEXT: buffer_store_dword v15, v25, s[16:19], 0 offen offset:76
-; CHECK-NEXT: buffer_store_dword v14, v25, s[16:19], 0 offen offset:72
-; CHECK-NEXT: buffer_store_dword v13, v25, s[16:19], 0 offen offset:68
-; CHECK-NEXT: buffer_store_dword v12, v25, s[16:19], 0 offen offset:64
+; CHECK-NEXT: buffer_store_dword v15, v25, s[20:23], 0 offen offset:76
+; CHECK-NEXT: buffer_store_dword v14, v25, s[20:23], 0 offen offset:72
+; CHECK-NEXT: buffer_store_dword v13, v25, s[20:23], 0 offen offset:68
+; CHECK-NEXT: buffer_store_dword v12, v25, s[20:23], 0 offen offset:64
; CHECK-NEXT: s_waitcnt vmcnt(19)
-; CHECK-NEXT: buffer_store_dword v19, v25, s[16:19], 0 offen offset:60
-; CHECK-NEXT: buffer_store_dword v18, v25, s[16:19], 0 offen offset:56
-; CHECK-NEXT: buffer_store_dword v17, v25, s[16:19], 0 offen offset:52
-; CHECK-NEXT: buffer_store_dword v16, v25, s[16:19], 0 offen offset:48
+; CHECK-NEXT: buffer_store_dword v19, v25, s[20:23], 0 offen offset:60
+; CHECK-NEXT: buffer_store_dword v18, v25, s[20:23], 0 offen offset:56
+; CHECK-NEXT: buffer_store_dword v17, v25, s[20:23], 0 offen offset:52
+; CHECK-NEXT: buffer_store_dword v16, v25, s[20:23], 0 offen offset:48
; CHECK-NEXT: s_waitcnt vmcnt(22)
-; CHECK-NEXT: buffer_store_dword v23, v25, s[16:19], 0 offen offset:44
-; CHECK-NEXT: buffer_store_dword v22, v25, s[16:19], 0 offen offset:40
-; CHECK-NEXT: buffer_store_dword v21, v25, s[16:19], 0 offen offset:36
-; CHECK-NEXT: buffer_store_dword v20, v25, s[16:19], 0 offen offset:32
+; CHECK-NEXT: buffer_store_dword v23, v25, s[20:23], 0 offen offset:44
+; CHECK-NEXT: buffer_store_dword v22, v25, s[20:23], 0 offen offset:40
+; CHECK-NEXT: buffer_store_dword v21, v25, s[20:23], 0 offen offset:36
+; CHECK-NEXT: buffer_store_dword v20, v25, s[20:23], 0 offen offset:32
; CHECK-NEXT: s_waitcnt vmcnt(21)
-; CHECK-NEXT: buffer_store_dword v3, v25, s[16:19], 0 offen offset:28
-; CHECK-NEXT: buffer_store_dword v2, v25, s[16:19], 0 offen offset:24
-; CHECK-NEXT: buffer_store_dword v1, v25, s[16:19], 0 offen offset:20
-; CHECK-NEXT: buffer_store_dword v0, v25, s[16:19], 0 offen offset:16
+; CHECK-NEXT: buffer_store_dword v3, v25, s[20:23], 0 offen offset:28
+; CHECK-NEXT: buffer_store_dword v2, v25, s[20:23], 0 offen offset:24
+; CHECK-NEXT: buffer_store_dword v1, v25, s[20:23], 0 offen offset:20
+; CHECK-NEXT: buffer_store_dword v0, v25, s[20:23], 0 offen offset:16
; CHECK-NEXT: s_waitcnt vmcnt(20)
-; CHECK-NEXT: buffer_store_dword v7, v25, s[16:19], 0 offen offset:12
-; CHECK-NEXT: buffer_store_dword v6, v25, s[16:19], 0 offen offset:8
-; CHECK-NEXT: buffer_store_dword v5, v25, s[16:19], 0 offen offset:4
-; CHECK-NEXT: buffer_store_dword v4, v25, s[16:19], 0 offen
+; CHECK-NEXT: buffer_store_dword v7, v25, s[20:23], 0 offen offset:12
+; CHECK-NEXT: buffer_store_dword v6, v25, s[20:23], 0 offen offset:8
+; CHECK-NEXT: buffer_store_dword v5, v25, s[20:23], 0 offen offset:4
+; CHECK-NEXT: buffer_store_dword v4, v25, s[20:23], 0 offen
; CHECK-NEXT: s_endpgm
entry:
tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) %local, ptr addrspace(4) %0, i64 128, i1 false)
@@ -160,55 +162,57 @@ entry:
define amdgpu_kernel void @memcpy_p0_p5_minsize(ptr %generic, ptr addrspace(5) %src) #0 {
; CHECK-LABEL: memcpy_p0_p5_minsize:
; CHECK: ; %bb.0: ; %entry
-; CHECK-NEXT: s_mov_b64 s[18:19], s[2:3]
-; CHECK-NEXT: s_mov_b64 s[16:17], s[0:1]
+; CHECK-NEXT: s_mov_b64 s[22:23], s[2:3]
+; CHECK-NEXT: s_mov_b64 s[20:21], s[0:1]
; CHECK-NEXT: s_load_dword s0, s[8:9], 0x8
-; CHECK-NEXT: s_add_u32 s16, s16, s15
-; CHECK-NEXT: s_addc_u32 s17, s17, 0
+; CHECK-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
+; CHECK-NEXT: s_add_u32 s20, s20, s17
+; CHECK-NEXT: s_addc_u32 s21, s21, 0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v26, s0
-; CHECK-NEXT: buffer_load_dword v3, v26, s[16:19], 0 offen offset:124
-; CHECK-NEXT: buffer_load_dword v2, v26, s[16:19], 0 offen offset:120
-; CHECK-NEXT: buffer_load_dword v1, v26, s[16:19], 0 offen offset:116
-; CHECK-NEXT: buffer_load_dword v0, v26, s[16:19], 0 offen offset:112
-; CHECK-NEXT: buffer_load_dword v7, v26, s[16:19], 0 offen offset:108
-; CHECK-NEXT: buffer_load_dword v6, v26, s[16:19], 0 offen offset:104
-; CHECK-NEXT: buffer_load_dword v5, v26, s[16:19], 0 offen offset:100
-; CHECK-NEXT: buffer_load_dword v4, v26, s[16:19], 0 offen offset:96
+; CHECK-NEXT: buffer_load_dword v3, v26, s[20:23], 0 offen offset:124
+; CHECK-NEXT: buffer_load_dword v2, v26, s[20:23], 0 offen offset:120
+; CHECK-NEXT: buffer_load_dword v1, v26, s[20:23], 0 offen offset:116
+; CHECK-NEXT: buffer_load_dword v0, v26, s[20:23], 0 offen offset:112
+; CHECK-NEXT: buffer_load_dword v7, v26, s[20:23], 0 offen offset:108
+; CHECK-NEXT: buffer_load_dword v6, v26, s[20:23], 0 offen offset:104
+; CHECK-NEXT: buffer_load_dword v5, v26, s[20:23], 0 offen offset:100
+; CHECK-NEXT: buffer_load_dword v4, v26, s[20:23], 0 offen offset:96
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; CHECK-NEXT: buffer_load_dword v8, v26, s[16:19], 0 offen offset:16
-; CHECK-NEXT: buffer_load_dword v9, v26, s[16:19], 0 offen offset:20
-; CHECK-NEXT: buffer_load_dword v10, v26, s[16:19], 0 offen offset:24
-; CHECK-NEXT: buffer_load_dword v11, v26, s[16:19], 0 offen offset:28
-; CHECK-NEXT: buffer_load_dword v12, v26, s[16:19], 0 offen offset:32
-; CHECK-NEXT: buffer_load_dword v13, v26, s[16:19], 0 offen offset:36
-; CHECK-NEXT: buffer_load_dword v14, v26, s[16:19], 0 offen offset:40
-; CHECK-NEXT: buffer_load_dword v15, v26, s[16:19], 0 offen offset:44
-; CHECK-NEXT: buffer_load_dword v16, v26, s[16:19], 0 offen offset:48
-; CHECK-NEXT: buffer_load_dword v17, v26, s[16:19], 0 offen offset:52
-; CHECK-NEXT: buffer_load_dword v18, v26, s[16:19], 0 offen offset:56
-; CHECK-NEXT: buffer_load_dword v19, v26, s[16:19], 0 offen offset:60
-; CHECK-NEXT: buffer_load_dword v23, v26, s[16:19], 0 offen offset:92
-; CHECK-NEXT: buffer_load_dword v22, v26, s[16:19], 0 offen offset:88
-; CHECK-NEXT: buffer_load_dword v21, v26, s[16:19], 0 offen offset:84
-; CHECK-NEXT: buffer_load_dword v20, v26, s[16:19], 0 offen offset:80
+; CHECK-NEXT: buffer_load_dword v8, v26, s[20:23], 0 offen offset:16
+; CHECK-NEXT: buffer_load_dword v9, v26, s[20:23], 0 offen offset:20
+; CHECK-NEXT: buffer_load_dword v10, v26, s[20:23], 0 offen offset:24
+; CHECK-NEXT: buffer_load_dword v11, v26, s[20:23], 0 offen offset:28
+; CHECK-NEXT: buffer_load_dword v12, v26, s[20:23], 0 offen offset:32
+; CHECK-NEXT: buffer_load_dword v13, v26, s[20:23], 0 offen offset:36
+; CHECK-NEXT: buffer_load_dword v14, v26, s[20:23], 0 offen offset:40
+; CHECK-NEXT: buffer_load_dword v15, v26, s[20:23], 0 offen offset:44
+; CHECK-NEXT: buffer_load_dword v16, v26, s[20:23], 0 offen offset:48
+; CHECK-NEXT: buffer_load_dword v17, v26, s[20:23], 0 offen offset:52
+; CHECK-NEXT: buffer_load_dword v18, v26, s[20:23], 0 offen offset:56
+; CHECK-NEXT: buffer_load_dword v19, v26, s[20:23], 0 offen offset:60
+; CHECK-NEXT: buffer_load_dword v23, v26, s[20:23], 0 offen offset:92
+; CHECK-NEXT: buffer_load_dword v22, v26, s[20:23], 0 offen offset:88
+; CHECK-NEXT: buffer_load_dword v21, v26, s[20:23], 0 offen offset:84
+; CHECK-NEXT: buffer_load_dword v20, v26, s[20:23], 0 offen offset:80
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v25, s1
; CHECK-NEXT: v_mov_b32_e32 v24, s0
; CHECK-NEXT: s_waitcnt vmcnt(20)
; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[0:3] offset:112
-; CHECK-NEXT: buffer_load_dword v3, v26, s[16:19], 0 offen offset:76
+; CHECK-NEXT: buffer_load_dword v3, v26, s[20:23], 0 offen offset:76
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: buffer_load_dword v2, v26, s[16:19], 0 offen offset:72
-; CHECK-NEXT: buffer_load_dword v1, v26, s[16:19], 0 offen offset:68
-; CHECK-NEXT: buffer_load_dword v0, v26, s[16:19], 0 offen offset:64
+; CHECK-NEXT: buffer_load_dword v2, v26, s[20:23], 0 offen offset:72
+; CHECK-NEXT: buffer_load_dword v1, v26, s[20:23], 0 offen offset:68
+; CHECK-NEXT: buffer_load_dword v0, v26, s[20:23], 0 offen offset:64
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[4:7] offset:96
-; CHECK-NEXT: buffer_load_dword v4, v26, s[16:19], 0 offen
+; CHECK-NEXT: buffer_load_dword v4, v26, s[20:23], 0 offen
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: buffer_load_dword v5, v26, s[16:19], 0 offen offset:4
-; CHECK-NEXT: buffer_load_dword v6, v26, s[16:19], 0 offen offset:8
-; CHECK-NEXT: buffer_load_dword v7, v26, s[16:19], 0 offen offset:12
+; CHECK-NEXT: buffer_load_dword v5, v26, s[20:23], 0 offen offset:4
+; CHECK-NEXT: buffer_load_dword v6, v26, s[20:23], 0 offen offset:8
+; CHECK-NEXT: buffer_load_dword v7, v26, s[20:23], 0 offen offset:12
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[20:23] offset:80
; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[0:3] offset:64
@@ -268,6 +272,8 @@ define amdgpu_kernel void @memcpy_p0_p3_minsize(ptr %generic) #0 {
; CHECK-NEXT: ds_read2_b64 v[4:7], v16 offset0:2 offset1:3
; CHECK-NEXT: ds_read2_b64 v[8:11], v16 offset0:4 offset1:5
; CHECK-NEXT: ds_read2_b64 v[12:15], v16 offset0:6 offset1:7
+; CHECK-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v21, s1
; CHECK-NEXT: v_mov_b32_e32 v20, s0
@@ -294,6 +300,8 @@ define amdgpu_kernel void @memcpy_p0_p0_optsize(ptr %dest, ptr %src) #1 {
; CHECK-LABEL: memcpy_p0_p0_optsize:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CHECK-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v12, s3
; CHECK-NEXT: v_mov_b32_e32 v11, s2
@@ -379,12 +387,12 @@ entry:
define amdgpu_kernel void @memcpy_p5_p4_optsize(ptr addrspace(5) %local, ptr addrspace(4) %0) #1 {
; CHECK-LABEL: memcpy_p5_p4_optsize:
; CHECK: ; %bb.0: ; %entry
-; CHECK-NEXT: s_mov_b64 s[18:19], s[2:3]
-; CHECK-NEXT: s_mov_b64 s[16:17], s[0:1]
+; CHECK-NEXT: s_mov_b64 s[22:23], s[2:3]
+; CHECK-NEXT: s_mov_b64 s[20:21], s[0:1]
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x8
; CHECK-NEXT: s_load_dword s2, s[8:9], 0x0
; CHECK-NEXT: v_mov_b32_e32 v24, 0
-; CHECK-NEXT: s_add_u32 s16, s16, s15
+; CHECK-NEXT: s_add_u32 s20, s20, s17
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: global_load_dwordx4 v[0:3], v24, s[0:1] offset:112
; CHECK-NEXT: global_load_dwordx4 v[4:7], v24, s[0:1] offset:96
@@ -392,50 +400,50 @@ define amdgpu_kernel void @memcpy_p5_p4_optsize(ptr addrspace(5) %local, ptr add
; CHECK-NEXT: global_load_dwordx4 v[12:15], v24, s[0:1] offset:64
; CHECK-NEXT: global_load_dwordx4 v[16:19], v24, s[0:1] offset:48
; CHECK-NEXT: global_load_dwordx4 v[20:23], v24, s[0:1] offset:32
-; CHECK-NEXT: s_addc_u32 s17, s17, 0
+; CHECK-NEXT: s_addc_u32 s21, s21, 0
; CHECK-NEXT: v_mov_b32_e32 v25, s2
; CHECK-NEXT: s_waitcnt vmcnt(5)
-; CHECK-NEXT: buffer_store_dword v3, v25, s[16:19], 0 offen offset:124
-; CHECK-NEXT: buffer_store_dword v2, v25, s[16:19], 0 offen offset:120
-; CHECK-NEXT: buffer_store_dword v1, v25, s[16:19], 0 offen offset:116
-; CHECK-NEXT: buffer_store_dword v0, v25, s[16:19], 0 offen offset:112
+; CHECK-NEXT: buffer_store_dword v3, v25, s[20:23], 0 offen offset:124
+; CHECK-NEXT: buffer_store_dword v2, v25, s[20:23], 0 offen offset:120
+; CHECK-NEXT: buffer_store_dword v1, v25, s[20:23], 0 offen offset:116
+; CHECK-NEXT: buffer_store_dword v0, v25, s[20:23], 0 offen offset:112
; CHECK-NEXT: global_load_dwordx4 v[0:3], v24, s[0:1] offset:16
; CHECK-NEXT: s_waitcnt vmcnt(9)
-; CHECK-NEXT: buffer_store_dword v7, v25, s[16:19], 0 offen offset:108
-; CHECK-NEXT: buffer_store_dword v6, v25, s[16:19], 0 offen offset:104
-; CHECK-NEXT: buffer_store_dword v5, v25, s[16:19], 0 offen offset:100
-; CHECK-NEXT: buffer_store_dword v4, v25, s[16:19], 0 offen offset:96
+; CHECK-NEXT: buffer_store_dword v7, v25, s[20:23], 0 offen offset:108
+; CHECK-NEXT: buffer_store_dword v6, v25, s[20:23], 0 offen offset:104
+; CHECK-NEXT: buffer_store_dword v5, v25, s[20:23], 0 offen offset:100
+; CHECK-NEXT: buffer_store_dword v4, v25, s[20:23], 0 offen offset:96
; CHECK-NEXT: global_load_dwordx4 v[4:7], v24, s[0:1]
; CHECK-NEXT: s_waitcnt vmcnt(13)
-; CHECK-NEXT: buffer_store_dword v11, v25, s[16:19], 0 offen offset:92
-; CHECK-NEXT: buffer_store_dword v10, v25, s[16:19], 0 offen offset:88
-; CHECK-NEXT: buffer_store_dword v9, v25, s[16:19], 0 offen offset:84
-; CHECK-NEXT: buffer_store_dword v8, v25, s[16:19], 0 offen offset:80
+; CHECK-NEXT: buffer_store_dword v11, v25, s[20:23], 0 offen offset:92
+; CHECK-NEXT: buffer_store_dword v10, v25, s[20:23], 0 offen offset:88
+; CHECK-NEXT: buffer_store_dword v9, v25, s[20:23], 0 offen offset:84
+; CHECK-NEXT: buffer_store_dword v8, v25, s[20:23], 0 offen offset:80
; CHECK-NEXT: s_waitcnt vmcnt(16)
-; CHECK-NEXT: buffer_store_dword v15, v25, s[16:19], 0 offen offset:76
-; CHECK-NEXT: buffer_store_dword v14, v25, s[16:19], 0 offen offset:72
-; CHECK-NEXT: buffer_store_dword v13, v25, s[16:19], 0 offen offset:68
-; CHECK-NEXT: buffer_store_dword v12, v25, s[16:19], 0 offen offset:64
+; CHECK-NEXT: buffer_store_dword v15, v25, s[20:23], 0 offen offset:76
+; CHECK-NEXT: buffer_store_dword v14, v25, s[20:23], 0 offen offset:72
+; CHECK-NEXT: buffer_store_dword v13, v25, s[20:23], 0 offen offset:68
+; CHECK-NEXT: buffer_store_dword v12, v25, s[20:23], 0 offen offset:64
; CHECK-NEXT: s_waitcnt vmcnt(19)
-; CHECK-NEXT: buffer_store_dword v19, v25, s[16:19], 0 offen offset:60
-; CHECK-NEXT: buffer_store_dword v18, v25, s[16:19], 0 offen offset:56
-; CHECK-NEXT: buffer_store_dword v17, v25, s[16:19], 0 offen offset:52
-; CHECK-NEXT: buffer_store_dword v16, v25, s[16:19], 0 offen offset:48
+; CHECK-NEXT: buffer_store_dword v19, v25, s[20:23], 0 offen offset:60
+; CHECK-NEXT: buffer_store_dword v18, v25, s[20:23], 0 offen offset:56
+; CHECK-NEXT: buffer_store_dword v17, v25, s[20:23], 0 offen offset:52
+; CHECK-NEXT: buffer_store_dword v16, v25, s[20:23], 0 offen offset:48
; CHECK-NEXT: s_waitcnt vmcnt(22)
-; CHECK-NEXT: buffer_store_dword v23, v25, s[16:19], 0 offen offset:44
-; CHECK-NEXT: buffer_store_dword v22, v25, s[16:19], 0 offen offset:40
-; CHECK-NEXT: buffer_store_dword v21, v25, s[16:19], 0 offen offset:36
-; CHECK-NEXT: buffer_store_dword v20, v25, s[16:19], 0 offen offset:32
+; CHECK-NEXT: buffer_store_dword v23, v25, s[20:23], 0 offen offset:44
+; CHECK-NEXT: buffer_store_dword v22, v25, s[20:23], 0 offen offset:40
+; CHECK-NEXT: buffer_store_dword v21, v25, s[20:23], 0 offen offset:36
+; CHECK-NEXT: buffer_store_dword v20, v25, s[20:23], 0 offen offset:32
; CHECK-NEXT: s_waitcnt vmcnt(21)
-; CHECK-NEXT: buffer_store_dword v3, v25, s[16:19], 0 offen offset:28
-; CHECK-NEXT: buffer_store_dword v2, v25, s[16:19], 0 offen offset:24
-; CHECK-NEXT: buffer_store_dword v1, v25, s[16:19], 0 offen offset:20
-; CHECK-NEXT: buffer_store_dword v0, v25, s[16:19], 0 offen offset:16
+; CHECK-NEXT: buffer_store_dword v3, v25, s[20:23], 0 offen offset:28
+; CHECK-NEXT: buffer_store_dword v2, v25, s[20:23], 0 offen offset:24
+; CHECK-NEXT: buffer_store_dword v1, v25, s[20:23], 0 offen offset:20
+; CHECK-NEXT: buffer_store_dword v0, v25, s[20:23], 0 offen offset:16
; CHECK-NEXT: s_waitcnt vmcnt(20)
-; CHECK-NEXT: buffer_store_dword v7, v25, s[16:19], 0 offen offset:12
-; CHECK-NEXT: buffer_store_dword v6, v25, s[16:19], 0 offen offset:8
-; CHECK-NEXT: buffer_store_dword v5, v25, s[16:19], 0 offen offset:4
-; CHECK-NEXT: buffer_store_dword v4, v25, s[16:19], 0 offen
+; CHECK-NEXT: buffer_store_dword v7, v25, s[20:23], 0 offen offset:12
+; CHECK-NEXT: buffer_store_dword v6, v25, s[20:23], 0 offen offset:8
+; CHECK-NEXT: buffer_store_dword v5, v25, s[20:23], 0 offen offset:4
+; CHECK-NEXT: buffer_store_dword v4, v25, s[20:23], 0 offen
; CHECK-NEXT: s_endpgm
entry:
tail call void @llvm.memcpy.p5.p4.i64(ptr addrspace(5) %local, ptr addrspace(4) %0, i64 128, i1 false)
@@ -445,55 +453,57 @@ entry:
define amdgpu_kernel void @memcpy_p0_p5_optsize(ptr %generic, ptr addrspace(5) %src) #1 {
; CHECK-LABEL: memcpy_p0_p5_optsize:
; CHECK: ; %bb.0: ; %entry
-; CHECK-NEXT: s_mov_b64 s[18:19], s[2:3]
-; CHECK-NEXT: s_mov_b64 s[16:17], s[0:1]
+; CHECK-NEXT: s_mov_b64 s[22:23], s[2:3]
+; CHECK-NEXT: s_mov_b64 s[20:21], s[0:1]
; CHECK-NEXT: s_load_dword s0, s[8:9], 0x8
-; CHECK-NEXT: s_add_u32 s16, s16, s15
-; CHECK-NEXT: s_addc_u32 s17, s17, 0
+; CHECK-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
+; CHECK-NEXT: s_add_u32 s20, s20, s17
+; CHECK-NEXT: s_addc_u32 s21, s21, 0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v26, s0
-; CHECK-NEXT: buffer_load_dword v3, v26, s[16:19], 0 offen offset:124
-; CHECK-NEXT: buffer_load_dword v2, v26, s[16:19], 0 offen offset:120
-; CHECK-NEXT: buffer_load_dword v1, v26, s[16:19], 0 offen offset:116
-; CHECK-NEXT: buffer_load_dword v0, v26, s[16:19], 0 offen offset:112
-; CHECK-NEXT: buffer_load_dword v7, v26, s[16:19], 0 offen offset:108
-; CHECK-NEXT: buffer_load_dword v6, v26, s[16:19], 0 offen offset:104
-; CHECK-NEXT: buffer_load_dword v5, v26, s[16:19], 0 offen offset:100
-; CHECK-NEXT: buffer_load_dword v4, v26, s[16:19], 0 offen offset:96
+; CHECK-NEXT: buffer_load_dword v3, v26, s[20:23], 0 offen offset:124
+; CHECK-NEXT: buffer_load_dword v2, v26, s[20:23], 0 offen offset:120
+; CHECK-NEXT: buffer_load_dword v1, v26, s[20:23], 0 offen offset:116
+; CHECK-NEXT: buffer_load_dword v0, v26, s[20:23], 0 offen offset:112
+; CHECK-NEXT: buffer_load_dword v7, v26, s[20:23], 0 offen offset:108
+; CHECK-NEXT: buffer_load_dword v6, v26, s[20:23], 0 offen offset:104
+; CHECK-NEXT: buffer_load_dword v5, v26, s[20:23], 0 offen offset:100
+; CHECK-NEXT: buffer_load_dword v4, v26, s[20:23], 0 offen offset:96
; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; CHECK-NEXT: buffer_load_dword v8, v26, s[16:19], 0 offen offset:16
-; CHECK-NEXT: buffer_load_dword v9, v26, s[16:19], 0 offen offset:20
-; CHECK-NEXT: buffer_load_dword v10, v26, s[16:19], 0 offen offset:24
-; CHECK-NEXT: buffer_load_dword v11, v26, s[16:19], 0 offen offset:28
-; CHECK-NEXT: buffer_load_dword v12, v26, s[16:19], 0 offen offset:32
-; CHECK-NEXT: buffer_load_dword v13, v26, s[16:19], 0 offen offset:36
-; CHECK-NEXT: buffer_load_dword v14, v26, s[16:19], 0 offen offset:40
-; CHECK-NEXT: buffer_load_dword v15, v26, s[16:19], 0 offen offset:44
-; CHECK-NEXT: buffer_load_dword v16, v26, s[16:19], 0 offen offset:48
-; CHECK-NEXT: buffer_load_dword v17, v26, s[16:19], 0 offen offset:52
-; CHECK-NEXT: buffer_load_dword v18, v26, s[16:19], 0 offen offset:56
-; CHECK-NEXT: buffer_load_dword v19, v26, s[16:19], 0 offen offset:60
-; CHECK-NEXT: buffer_load_dword v23, v26, s[16:19], 0 offen offset:92
-; CHECK-NEXT: buffer_load_dword v22, v26, s[16:19], 0 offen offset:88
-; CHECK-NEXT: buffer_load_dword v21, v26, s[16:19], 0 offen offset:84
-; CHECK-NEXT: buffer_load_dword v20, v26, s[16:19], 0 offen offset:80
+; CHECK-NEXT: buffer_load_dword v8, v26, s[20:23], 0 offen offset:16
+; CHECK-NEXT: buffer_load_dword v9, v26, s[20:23], 0 offen offset:20
+; CHECK-NEXT: buffer_load_dword v10, v26, s[20:23], 0 offen offset:24
+; CHECK-NEXT: buffer_load_dword v11, v26, s[20:23], 0 offen offset:28
+; CHECK-NEXT: buffer_load_dword v12, v26, s[20:23], 0 offen offset:32
+; CHECK-NEXT: buffer_load_dword v13, v26, s[20:23], 0 offen offset:36
+; CHECK-NEXT: buffer_load_dword v14, v26, s[20:23], 0 offen offset:40
+; CHECK-NEXT: buffer_load_dword v15, v26, s[20:23], 0 offen offset:44
+; CHECK-NEXT: buffer_load_dword v16, v26, s[20:23], 0 offen offset:48
+; CHECK-NEXT: buffer_load_dword v17, v26, s[20:23], 0 offen offset:52
+; CHECK-NEXT: buffer_load_dword v18, v26, s[20:23], 0 offen offset:56
+; CHECK-NEXT: buffer_load_dword v19, v26, s[20:23], 0 offen offset:60
+; CHECK-NEXT: buffer_load_dword v23, v26, s[20:23], 0 offen offset:92
+; CHECK-NEXT: buffer_load_dword v22, v26, s[20:23], 0 offen offset:88
+; CHECK-NEXT: buffer_load_dword v21, v26, s[20:23], 0 offen offset:84
+; CHECK-NEXT: buffer_load_dword v20, v26, s[20:23], 0 offen offset:80
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v25, s1
; CHECK-NEXT: v_mov_b32_e32 v24, s0
; CHECK-NEXT: s_waitcnt vmcnt(20)
; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[0:3] offset:112
-; CHECK-NEXT: buffer_load_dword v3, v26, s[16:19], 0 offen offset:76
+; CHECK-NEXT: buffer_load_dword v3, v26, s[20:23], 0 offen offset:76
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: buffer_load_dword v2, v26, s[16:19], 0 offen offset:72
-; CHECK-NEXT: buffer_load_dword v1, v26, s[16:19], 0 offen offset:68
-; CHECK-NEXT: buffer_load_dword v0, v26, s[16:19], 0 offen offset:64
+; CHECK-NEXT: buffer_load_dword v2, v26, s[20:23], 0 offen offset:72
+; CHECK-NEXT: buffer_load_dword v1, v26, s[20:23], 0 offen offset:68
+; CHECK-NEXT: buffer_load_dword v0, v26, s[20:23], 0 offen offset:64
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[4:7] offset:96
-; CHECK-NEXT: buffer_load_dword v4, v26, s[16:19], 0 offen
+; CHECK-NEXT: buffer_load_dword v4, v26, s[20:23], 0 offen
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: buffer_load_dword v5, v26, s[16:19], 0 offen offset:4
-; CHECK-NEXT: buffer_load_dword v6, v26, s[16:19], 0 offen offset:8
-; CHECK-NEXT: buffer_load_dword v7, v26, s[16:19], 0 offen offset:12
+; CHECK-NEXT: buffer_load_dword v5, v26, s[20:23], 0 offen offset:4
+; CHECK-NEXT: buffer_load_dword v6, v26, s[20:23], 0 offen offset:8
+; CHECK-NEXT: buffer_load_dword v7, v26, s[20:23], 0 offen offset:12
; CHECK-NEXT: s_nop 0
; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[20:23] offset:80
; CHECK-NEXT: flat_store_dwordx4 v[24:25], v[0:3] offset:64
@@ -553,6 +563,8 @@ define amdgpu_kernel void @memcpy_p0_p3_optsize(ptr %generic) #1 {
; CHECK-NEXT: ds_read2_b64 v[4:7], v16 offset0:2 offset1:3
; CHECK-NEXT: ds_read2_b64 v[8:11], v16 offset0:4 offset1:5
; CHECK-NEXT: ds_read2_b64 v[12:15], v16 offset0:6 offset1:7
+; CHECK-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v21, s1
; CHECK-NEXT: v_mov_b32_e32 v20, s0
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll
index 5af37809443e0..07ad8cb0c4a3d 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll
@@ -15,6 +15,9 @@
define amdgpu_kernel void @flat_agent_unordered_load(
; GFX7-LABEL: flat_agent_unordered_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -29,6 +32,10 @@ define amdgpu_kernel void @flat_agent_unordered_load(
;
; GFX10-WGP-LABEL: flat_agent_unordered_load:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -43,6 +50,10 @@ define amdgpu_kernel void @flat_agent_unordered_load(
;
; GFX10-CU-LABEL: flat_agent_unordered_load:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -71,6 +82,8 @@ define amdgpu_kernel void @flat_agent_unordered_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_unordered_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -83,6 +96,8 @@ define amdgpu_kernel void @flat_agent_unordered_load(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_unordered_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -182,6 +197,9 @@ entry:
define amdgpu_kernel void @flat_agent_monotonic_load(
; GFX7-LABEL: flat_agent_monotonic_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -196,6 +214,10 @@ define amdgpu_kernel void @flat_agent_monotonic_load(
;
; GFX10-WGP-LABEL: flat_agent_monotonic_load:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -210,6 +232,10 @@ define amdgpu_kernel void @flat_agent_monotonic_load(
;
; GFX10-CU-LABEL: flat_agent_monotonic_load:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -238,6 +264,8 @@ define amdgpu_kernel void @flat_agent_monotonic_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -250,6 +278,8 @@ define amdgpu_kernel void @flat_agent_monotonic_load(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -349,6 +379,9 @@ entry:
define amdgpu_kernel void @flat_agent_acquire_load(
; GFX7-LABEL: flat_agent_acquire_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -364,6 +397,10 @@ define amdgpu_kernel void @flat_agent_acquire_load(
;
; GFX10-WGP-LABEL: flat_agent_acquire_load:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -380,6 +417,10 @@ define amdgpu_kernel void @flat_agent_acquire_load(
;
; GFX10-CU-LABEL: flat_agent_acquire_load:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -410,6 +451,8 @@ define amdgpu_kernel void @flat_agent_acquire_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -423,6 +466,8 @@ define amdgpu_kernel void @flat_agent_acquire_load(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -531,6 +576,9 @@ entry:
define amdgpu_kernel void @flat_agent_seq_cst_load(
; GFX7-LABEL: flat_agent_seq_cst_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -547,6 +595,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_load(
;
; GFX10-WGP-LABEL: flat_agent_seq_cst_load:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -565,6 +617,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_load(
;
; GFX10-CU-LABEL: flat_agent_seq_cst_load:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -598,6 +654,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -612,6 +670,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_load(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -739,6 +799,9 @@ entry:
define amdgpu_kernel void @flat_agent_unordered_store(
; GFX7-LABEL: flat_agent_unordered_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -750,6 +813,10 @@ define amdgpu_kernel void @flat_agent_unordered_store(
;
; GFX10-WGP-LABEL: flat_agent_unordered_store:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -761,6 +828,10 @@ define amdgpu_kernel void @flat_agent_unordered_store(
;
; GFX10-CU-LABEL: flat_agent_unordered_store:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -783,6 +854,8 @@ define amdgpu_kernel void @flat_agent_unordered_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_unordered_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -793,6 +866,8 @@ define amdgpu_kernel void @flat_agent_unordered_store(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_unordered_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -873,6 +948,9 @@ entry:
define amdgpu_kernel void @flat_agent_monotonic_store(
; GFX7-LABEL: flat_agent_monotonic_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -884,6 +962,10 @@ define amdgpu_kernel void @flat_agent_monotonic_store(
;
; GFX10-WGP-LABEL: flat_agent_monotonic_store:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -895,6 +977,10 @@ define amdgpu_kernel void @flat_agent_monotonic_store(
;
; GFX10-CU-LABEL: flat_agent_monotonic_store:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -917,6 +1003,8 @@ define amdgpu_kernel void @flat_agent_monotonic_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -927,6 +1015,8 @@ define amdgpu_kernel void @flat_agent_monotonic_store(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1007,6 +1097,9 @@ entry:
define amdgpu_kernel void @flat_agent_release_store(
; GFX7-LABEL: flat_agent_release_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1019,6 +1112,10 @@ define amdgpu_kernel void @flat_agent_release_store(
;
; GFX10-WGP-LABEL: flat_agent_release_store:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -1032,6 +1129,10 @@ define amdgpu_kernel void @flat_agent_release_store(
;
; GFX10-CU-LABEL: flat_agent_release_store:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -1057,6 +1158,8 @@ define amdgpu_kernel void @flat_agent_release_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_release_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1068,6 +1171,8 @@ define amdgpu_kernel void @flat_agent_release_store(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_release_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1165,6 +1270,9 @@ entry:
define amdgpu_kernel void @flat_agent_seq_cst_store(
; GFX7-LABEL: flat_agent_seq_cst_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1177,6 +1285,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_store(
;
; GFX10-WGP-LABEL: flat_agent_seq_cst_store:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -1190,6 +1302,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_store(
;
; GFX10-CU-LABEL: flat_agent_seq_cst_store:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -1215,6 +1331,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1226,6 +1344,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_store(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1323,6 +1443,9 @@ entry:
define amdgpu_kernel void @flat_agent_monotonic_atomicrmw(
; GFX7-LABEL: flat_agent_monotonic_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1334,6 +1457,10 @@ define amdgpu_kernel void @flat_agent_monotonic_atomicrmw(
;
; GFX10-WGP-LABEL: flat_agent_monotonic_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -1345,6 +1472,10 @@ define amdgpu_kernel void @flat_agent_monotonic_atomicrmw(
;
; GFX10-CU-LABEL: flat_agent_monotonic_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -1367,6 +1498,8 @@ define amdgpu_kernel void @flat_agent_monotonic_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1377,6 +1510,8 @@ define amdgpu_kernel void @flat_agent_monotonic_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1457,6 +1592,9 @@ entry:
define amdgpu_kernel void @flat_agent_acquire_atomicrmw(
; GFX7-LABEL: flat_agent_acquire_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1470,6 +1608,10 @@ define amdgpu_kernel void @flat_agent_acquire_atomicrmw(
;
; GFX10-WGP-LABEL: flat_agent_acquire_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -1485,6 +1627,10 @@ define amdgpu_kernel void @flat_agent_acquire_atomicrmw(
;
; GFX10-CU-LABEL: flat_agent_acquire_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -1512,6 +1658,8 @@ define amdgpu_kernel void @flat_agent_acquire_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1524,6 +1672,8 @@ define amdgpu_kernel void @flat_agent_acquire_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1622,6 +1772,9 @@ entry:
define amdgpu_kernel void @flat_agent_release_atomicrmw(
; GFX7-LABEL: flat_agent_release_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1634,6 +1787,10 @@ define amdgpu_kernel void @flat_agent_release_atomicrmw(
;
; GFX10-WGP-LABEL: flat_agent_release_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -1647,6 +1804,10 @@ define amdgpu_kernel void @flat_agent_release_atomicrmw(
;
; GFX10-CU-LABEL: flat_agent_release_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -1672,6 +1833,8 @@ define amdgpu_kernel void @flat_agent_release_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_release_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1683,6 +1846,8 @@ define amdgpu_kernel void @flat_agent_release_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_release_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1780,6 +1945,9 @@ entry:
define amdgpu_kernel void @flat_agent_acq_rel_atomicrmw(
; GFX7-LABEL: flat_agent_acq_rel_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1794,6 +1962,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_atomicrmw(
;
; GFX10-WGP-LABEL: flat_agent_acq_rel_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -1811,6 +1983,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_atomicrmw(
;
; GFX10-CU-LABEL: flat_agent_acq_rel_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -1841,6 +2017,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acq_rel_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1854,6 +2032,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_acq_rel_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1969,6 +2149,9 @@ entry:
define amdgpu_kernel void @flat_agent_seq_cst_atomicrmw(
; GFX7-LABEL: flat_agent_seq_cst_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1983,6 +2166,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_atomicrmw(
;
; GFX10-WGP-LABEL: flat_agent_seq_cst_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -2000,6 +2187,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_atomicrmw(
;
; GFX10-CU-LABEL: flat_agent_seq_cst_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -2030,6 +2221,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -2043,6 +2236,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -2158,6 +2353,9 @@ entry:
define amdgpu_kernel void @flat_agent_acquire_ret_atomicrmw(
; GFX7-LABEL: flat_agent_acquire_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -2174,6 +2372,10 @@ define amdgpu_kernel void @flat_agent_acquire_ret_atomicrmw(
;
; GFX10-WGP-LABEL: flat_agent_acquire_ret_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -2191,6 +2393,10 @@ define amdgpu_kernel void @flat_agent_acquire_ret_atomicrmw(
;
; GFX10-CU-LABEL: flat_agent_acquire_ret_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -2223,6 +2429,8 @@ define amdgpu_kernel void @flat_agent_acquire_ret_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_ret_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -2237,6 +2445,8 @@ define amdgpu_kernel void @flat_agent_acquire_ret_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_ret_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -2352,6 +2562,9 @@ entry:
define amdgpu_kernel void @flat_agent_acq_rel_ret_atomicrmw(
; GFX7-LABEL: flat_agent_acq_rel_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -2369,6 +2582,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_ret_atomicrmw(
;
; GFX10-WGP-LABEL: flat_agent_acq_rel_ret_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -2388,6 +2605,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_ret_atomicrmw(
;
; GFX10-CU-LABEL: flat_agent_acq_rel_ret_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -2423,6 +2644,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_ret_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acq_rel_ret_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -2438,6 +2661,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_ret_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_acq_rel_ret_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -2574,6 +2799,9 @@ entry:
define amdgpu_kernel void @flat_agent_seq_cst_ret_atomicrmw(
; GFX7-LABEL: flat_agent_seq_cst_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -2591,6 +2819,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_ret_atomicrmw(
;
; GFX10-WGP-LABEL: flat_agent_seq_cst_ret_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -2610,6 +2842,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_ret_atomicrmw(
;
; GFX10-CU-LABEL: flat_agent_seq_cst_ret_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -2645,6 +2881,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_ret_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_ret_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -2660,6 +2898,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_ret_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_ret_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -2796,6 +3036,9 @@ entry:
define amdgpu_kernel void @flat_agent_monotonic_monotonic_cmpxchg(
; GFX7-LABEL: flat_agent_monotonic_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -2821,6 +3064,10 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_monotonic_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -2846,6 +3093,10 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_monotonic_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -2896,6 +3147,8 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -2910,6 +3163,8 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3019,6 +3274,9 @@ entry:
define amdgpu_kernel void @flat_agent_acquire_monotonic_cmpxchg(
; GFX7-LABEL: flat_agent_acquire_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3046,6 +3304,10 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_acquire_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3075,6 +3337,10 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_acquire_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3130,6 +3396,8 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3146,6 +3414,8 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3273,6 +3543,9 @@ entry:
define amdgpu_kernel void @flat_agent_release_monotonic_cmpxchg(
; GFX7-LABEL: flat_agent_release_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3299,6 +3572,10 @@ define amdgpu_kernel void @flat_agent_release_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_release_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3326,6 +3603,10 @@ define amdgpu_kernel void @flat_agent_release_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_release_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3379,6 +3660,8 @@ define amdgpu_kernel void @flat_agent_release_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_release_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3394,6 +3677,8 @@ define amdgpu_kernel void @flat_agent_release_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_release_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3520,6 +3805,9 @@ entry:
define amdgpu_kernel void @flat_agent_acq_rel_monotonic_cmpxchg(
; GFX7-LABEL: flat_agent_acq_rel_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3548,6 +3836,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_acq_rel_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3579,6 +3871,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_acq_rel_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3637,6 +3933,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acq_rel_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3654,6 +3952,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_acq_rel_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3798,6 +4098,9 @@ entry:
define amdgpu_kernel void @flat_agent_seq_cst_monotonic_cmpxchg(
; GFX7-LABEL: flat_agent_seq_cst_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3826,6 +4129,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_seq_cst_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3857,6 +4164,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_seq_cst_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3915,6 +4226,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3932,6 +4245,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4076,6 +4391,9 @@ entry:
define amdgpu_kernel void @flat_agent_monotonic_acquire_cmpxchg(
; GFX7-LABEL: flat_agent_monotonic_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4103,6 +4421,10 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_monotonic_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4132,6 +4454,10 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_monotonic_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4187,6 +4513,8 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4203,6 +4531,8 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4330,6 +4660,9 @@ entry:
define amdgpu_kernel void @flat_agent_acquire_acquire_cmpxchg(
; GFX7-LABEL: flat_agent_acquire_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4357,6 +4690,10 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_acquire_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4386,6 +4723,10 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_acquire_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4441,6 +4782,8 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4457,6 +4800,8 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4584,6 +4929,9 @@ entry:
define amdgpu_kernel void @flat_agent_release_acquire_cmpxchg(
; GFX7-LABEL: flat_agent_release_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4612,6 +4960,10 @@ define amdgpu_kernel void @flat_agent_release_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_release_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4643,6 +4995,10 @@ define amdgpu_kernel void @flat_agent_release_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_release_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4701,6 +5057,8 @@ define amdgpu_kernel void @flat_agent_release_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_release_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4718,6 +5076,8 @@ define amdgpu_kernel void @flat_agent_release_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_release_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4862,6 +5222,9 @@ entry:
define amdgpu_kernel void @flat_agent_acq_rel_acquire_cmpxchg(
; GFX7-LABEL: flat_agent_acq_rel_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4890,6 +5253,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_acq_rel_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4921,6 +5288,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_acq_rel_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4979,6 +5350,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acq_rel_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4996,6 +5369,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_acq_rel_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -5140,6 +5515,9 @@ entry:
define amdgpu_kernel void @flat_agent_seq_cst_acquire_cmpxchg(
; GFX7-LABEL: flat_agent_seq_cst_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -5168,6 +5546,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_seq_cst_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -5199,6 +5581,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_seq_cst_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -5257,6 +5643,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -5274,6 +5662,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -5418,6 +5808,9 @@ entry:
define amdgpu_kernel void @flat_agent_monotonic_seq_cst_cmpxchg(
; GFX7-LABEL: flat_agent_monotonic_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -5446,6 +5839,10 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_monotonic_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -5477,6 +5874,10 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_monotonic_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -5535,6 +5936,8 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -5552,6 +5955,8 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -5696,6 +6101,9 @@ entry:
define amdgpu_kernel void @flat_agent_acquire_seq_cst_cmpxchg(
; GFX7-LABEL: flat_agent_acquire_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -5724,6 +6132,10 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_acquire_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -5755,6 +6167,10 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_acquire_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -5813,6 +6229,8 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -5830,6 +6248,8 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -5974,6 +6394,9 @@ entry:
define amdgpu_kernel void @flat_agent_release_seq_cst_cmpxchg(
; GFX7-LABEL: flat_agent_release_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -6002,6 +6425,10 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_release_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -6033,6 +6460,10 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_release_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -6091,6 +6522,8 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_release_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -6108,6 +6541,8 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_release_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -6252,6 +6687,9 @@ entry:
define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_cmpxchg(
; GFX7-LABEL: flat_agent_acq_rel_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -6280,6 +6718,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_acq_rel_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -6311,6 +6753,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_acq_rel_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -6369,6 +6815,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acq_rel_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -6386,6 +6834,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_acq_rel_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -6530,6 +6980,9 @@ entry:
define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_cmpxchg(
; GFX7-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -6558,6 +7011,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -6589,6 +7046,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -6647,6 +7108,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -6664,6 +7127,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -6808,6 +7273,9 @@ entry:
define amdgpu_kernel void @flat_agent_monotonic_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_agent_monotonic_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -6837,6 +7305,10 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_monotonic_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -6866,6 +7338,10 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_monotonic_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -6924,6 +7400,8 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -6941,6 +7419,8 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -7077,6 +7557,9 @@ entry:
define amdgpu_kernel void @flat_agent_acquire_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -7107,6 +7590,10 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -7138,6 +7625,10 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -7198,6 +7689,8 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -7216,6 +7709,8 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -7361,6 +7856,9 @@ entry:
define amdgpu_kernel void @flat_agent_release_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_agent_release_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -7391,6 +7889,10 @@ define amdgpu_kernel void @flat_agent_release_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_release_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -7422,6 +7924,10 @@ define amdgpu_kernel void @flat_agent_release_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_release_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -7483,6 +7989,8 @@ define amdgpu_kernel void @flat_agent_release_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_release_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -7501,6 +8009,8 @@ define amdgpu_kernel void @flat_agent_release_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_release_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -7654,6 +8164,9 @@ entry:
define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -7685,6 +8198,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -7718,6 +8235,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -7781,6 +8302,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -7800,6 +8323,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -7966,6 +8491,9 @@ entry:
define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -7997,6 +8525,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -8030,6 +8562,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -8093,6 +8629,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -8112,6 +8650,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -8278,6 +8818,9 @@ entry:
define amdgpu_kernel void @flat_agent_monotonic_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_agent_monotonic_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -8308,6 +8851,10 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_monotonic_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -8339,6 +8886,10 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_monotonic_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -8399,6 +8950,8 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -8417,6 +8970,8 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -8566,6 +9121,9 @@ entry:
define amdgpu_kernel void @flat_agent_acquire_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_agent_acquire_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -8596,6 +9154,10 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_acquire_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -8627,6 +9189,10 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_acquire_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -8687,6 +9253,8 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -8705,6 +9273,8 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -8850,6 +9420,9 @@ entry:
define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_agent_release_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -8881,6 +9454,10 @@ define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_release_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -8914,6 +9491,10 @@ define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_release_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -8977,6 +9558,8 @@ define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_release_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -8996,6 +9579,8 @@ define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_release_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -9162,6 +9747,9 @@ entry:
define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -9193,6 +9781,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -9226,6 +9818,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -9289,6 +9885,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -9308,6 +9906,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -9474,6 +10074,9 @@ entry:
define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -9505,6 +10108,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -9538,6 +10145,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -9601,6 +10212,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -9620,6 +10233,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -9786,6 +10401,9 @@ entry:
define amdgpu_kernel void @flat_agent_monotonic_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_agent_monotonic_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -9817,6 +10435,10 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_monotonic_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -9850,6 +10472,10 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_monotonic_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -9913,6 +10539,8 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_monotonic_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -9932,6 +10560,8 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_monotonic_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -10098,6 +10728,9 @@ entry:
define amdgpu_kernel void @flat_agent_acquire_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_agent_acquire_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -10129,6 +10762,10 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_acquire_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -10162,6 +10799,10 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_acquire_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -10225,6 +10866,8 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acquire_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -10244,6 +10887,8 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_acquire_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -10406,6 +11051,9 @@ entry:
define amdgpu_kernel void @flat_agent_release_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_agent_release_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -10437,6 +11085,10 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_release_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -10470,6 +11122,10 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_release_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -10533,6 +11189,8 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_release_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -10552,6 +11210,8 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_release_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -10718,6 +11378,9 @@ entry:
define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_agent_acq_rel_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -10749,6 +11412,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_acq_rel_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -10782,6 +11449,10 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_acq_rel_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -10845,6 +11516,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_acq_rel_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -10864,6 +11537,8 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_acq_rel_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -11030,6 +11705,9 @@ entry:
define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -11061,6 +11739,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -11094,6 +11776,10 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -11157,6 +11843,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -11176,6 +11864,8 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -11342,6 +12032,9 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_unordered_load(
; GFX7-LABEL: flat_agent_one_as_unordered_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11356,6 +12049,10 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_load(
;
; GFX10-WGP-LABEL: flat_agent_one_as_unordered_load:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -11370,6 +12067,10 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_load(
;
; GFX10-CU-LABEL: flat_agent_one_as_unordered_load:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -11398,6 +12099,8 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_unordered_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11410,6 +12113,8 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_load(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_unordered_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11509,6 +12214,9 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_monotonic_load(
; GFX7-LABEL: flat_agent_one_as_monotonic_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11523,6 +12231,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_load(
;
; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_load:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -11537,6 +12249,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_load(
;
; GFX10-CU-LABEL: flat_agent_one_as_monotonic_load:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -11565,6 +12281,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11577,6 +12295,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_load(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11676,6 +12396,9 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_acquire_load(
; GFX7-LABEL: flat_agent_one_as_acquire_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11692,6 +12415,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_load(
;
; GFX10-WGP-LABEL: flat_agent_one_as_acquire_load:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -11709,6 +12436,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_load(
;
; GFX10-CU-LABEL: flat_agent_one_as_acquire_load:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -11741,6 +12472,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11755,6 +12488,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_load(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11868,6 +12603,9 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_seq_cst_load(
; GFX7-LABEL: flat_agent_one_as_seq_cst_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11885,6 +12623,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_load(
;
; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_load:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -11904,6 +12646,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_load(
;
; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_load:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -11939,6 +12685,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11954,6 +12702,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_load(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12086,6 +12836,9 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_unordered_store(
; GFX7-LABEL: flat_agent_one_as_unordered_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12097,6 +12850,10 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_store(
;
; GFX10-WGP-LABEL: flat_agent_one_as_unordered_store:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -12108,6 +12865,10 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_store(
;
; GFX10-CU-LABEL: flat_agent_one_as_unordered_store:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -12130,6 +12891,8 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_unordered_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12140,6 +12903,8 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_store(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_unordered_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12220,6 +12985,9 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_monotonic_store(
; GFX7-LABEL: flat_agent_one_as_monotonic_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12231,6 +12999,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_store(
;
; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_store:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -12242,6 +13014,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_store(
;
; GFX10-CU-LABEL: flat_agent_one_as_monotonic_store:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -12264,6 +13040,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12274,6 +13052,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_store(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12354,6 +13134,9 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_release_store(
; GFX7-LABEL: flat_agent_one_as_release_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12366,6 +13149,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_store(
;
; GFX10-WGP-LABEL: flat_agent_one_as_release_store:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -12379,6 +13166,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_store(
;
; GFX10-CU-LABEL: flat_agent_one_as_release_store:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -12404,6 +13195,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_release_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12415,6 +13208,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_store(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_release_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12512,6 +13307,9 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_seq_cst_store(
; GFX7-LABEL: flat_agent_one_as_seq_cst_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12524,6 +13322,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_store(
;
; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_store:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -12537,6 +13339,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_store(
;
; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_store:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -12562,6 +13368,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12573,6 +13381,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_store(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12670,6 +13480,9 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_monotonic_atomicrmw(
; GFX7-LABEL: flat_agent_one_as_monotonic_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12681,6 +13494,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_atomicrmw(
;
; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -12692,6 +13509,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_atomicrmw(
;
; GFX10-CU-LABEL: flat_agent_one_as_monotonic_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -12714,6 +13535,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12724,6 +13547,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12804,6 +13629,9 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_acquire_atomicrmw(
; GFX7-LABEL: flat_agent_one_as_acquire_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12817,6 +13645,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_atomicrmw(
;
; GFX10-WGP-LABEL: flat_agent_one_as_acquire_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -12831,6 +13663,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_atomicrmw(
;
; GFX10-CU-LABEL: flat_agent_one_as_acquire_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -12857,6 +13693,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12869,6 +13707,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12965,6 +13805,9 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_release_atomicrmw(
; GFX7-LABEL: flat_agent_one_as_release_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12977,6 +13820,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_atomicrmw(
;
; GFX10-WGP-LABEL: flat_agent_one_as_release_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -12990,6 +13837,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_atomicrmw(
;
; GFX10-CU-LABEL: flat_agent_one_as_release_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -13015,6 +13866,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_release_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -13026,6 +13879,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_release_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -13123,6 +13978,9 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_acq_rel_atomicrmw(
; GFX7-LABEL: flat_agent_one_as_acq_rel_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -13137,6 +13995,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_atomicrmw(
;
; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -13153,6 +14015,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_atomicrmw(
;
; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -13182,6 +14048,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -13195,6 +14063,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -13308,6 +14178,9 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_seq_cst_atomicrmw(
; GFX7-LABEL: flat_agent_one_as_seq_cst_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -13322,6 +14195,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_atomicrmw(
;
; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -13338,6 +14215,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_atomicrmw(
;
; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -13367,6 +14248,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -13380,6 +14263,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -13493,6 +14378,9 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_acquire_ret_atomicrmw(
; GFX7-LABEL: flat_agent_one_as_acquire_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -13510,6 +14398,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_ret_atomicrmw(
;
; GFX10-WGP-LABEL: flat_agent_one_as_acquire_ret_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -13528,6 +14420,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_ret_atomicrmw(
;
; GFX10-CU-LABEL: flat_agent_one_as_acquire_ret_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -13562,6 +14458,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_ret_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_ret_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -13577,6 +14475,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_ret_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_ret_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -13697,6 +14597,9 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_acq_rel_ret_atomicrmw(
; GFX7-LABEL: flat_agent_one_as_acq_rel_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -13715,6 +14618,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_ret_atomicrmw(
;
; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_ret_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -13735,6 +14642,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_ret_atomicrmw(
;
; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_ret_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -13772,6 +14683,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_ret_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_ret_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -13788,6 +14701,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_ret_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_ret_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -13929,6 +14844,9 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_seq_cst_ret_atomicrmw(
; GFX7-LABEL: flat_agent_one_as_seq_cst_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -13947,6 +14865,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_ret_atomicrmw(
;
; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_ret_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -13967,6 +14889,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_ret_atomicrmw(
;
; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_ret_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -14004,6 +14930,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_ret_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_ret_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -14020,6 +14948,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_ret_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_ret_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -14161,6 +15091,9 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_cmpxchg(
; GFX7-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -14186,6 +15119,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -14211,6 +15148,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -14261,6 +15202,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -14275,6 +15218,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -14384,6 +15329,9 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_cmpxchg(
; GFX7-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -14411,6 +15359,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -14439,6 +15391,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -14493,6 +15449,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -14509,6 +15467,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -14634,6 +15594,9 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_release_monotonic_cmpxchg(
; GFX7-LABEL: flat_agent_one_as_release_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -14660,6 +15623,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_one_as_release_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -14687,6 +15654,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_one_as_release_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -14740,6 +15711,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_release_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -14755,6 +15728,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_release_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -14881,6 +15856,9 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg(
; GFX7-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -14909,6 +15887,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -14939,6 +15921,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -14996,6 +15982,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15013,6 +16001,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15155,6 +16145,9 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg(
; GFX7-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -15183,6 +16176,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -15213,6 +16210,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -15270,6 +16271,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15287,6 +16290,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15429,6 +16434,9 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_cmpxchg(
; GFX7-LABEL: flat_agent_one_as_monotonic_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -15456,6 +16464,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -15484,6 +16496,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_one_as_monotonic_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -15538,6 +16554,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15554,6 +16572,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15679,6 +16699,9 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_cmpxchg(
; GFX7-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -15706,6 +16729,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -15734,6 +16761,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -15788,6 +16819,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15804,6 +16837,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15929,6 +16964,9 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg(
; GFX7-LABEL: flat_agent_one_as_release_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -15957,6 +16995,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_one_as_release_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -15987,6 +17029,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_one_as_release_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -16044,6 +17090,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_release_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -16061,6 +17109,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_release_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -16203,6 +17253,9 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg(
; GFX7-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -16231,6 +17284,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -16261,6 +17318,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -16318,6 +17379,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -16335,6 +17398,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -16477,6 +17542,9 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg(
; GFX7-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -16505,6 +17573,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -16535,6 +17607,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -16592,6 +17668,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -16609,6 +17687,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -16751,6 +17831,9 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_cmpxchg(
; GFX7-LABEL: flat_agent_one_as_monotonic_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -16779,6 +17862,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -16809,6 +17896,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_one_as_monotonic_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -16866,6 +17957,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -16883,6 +17976,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -17025,6 +18120,9 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_cmpxchg(
; GFX7-LABEL: flat_agent_one_as_acquire_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -17053,6 +18151,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_one_as_acquire_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -17083,6 +18185,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_one_as_acquire_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -17140,6 +18246,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -17157,6 +18265,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -17299,6 +18409,9 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_cmpxchg(
; GFX7-LABEL: flat_agent_one_as_release_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -17327,6 +18440,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_one_as_release_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -17357,6 +18474,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_one_as_release_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -17414,6 +18535,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_release_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -17431,6 +18554,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_release_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -17573,6 +18698,9 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg(
; GFX7-LABEL: flat_agent_one_as_acq_rel_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -17601,6 +18729,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -17631,6 +18763,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -17688,6 +18824,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -17705,6 +18843,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -17847,6 +18987,9 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg(
; GFX7-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -17875,6 +19018,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -17905,6 +19052,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -17962,6 +19113,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -17979,6 +19132,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -18121,6 +19276,9 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_agent_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -18150,6 +19308,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -18179,6 +19341,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -18237,6 +19403,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -18254,6 +19422,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -18390,6 +19560,9 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -18421,6 +19594,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -18453,6 +19630,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -18515,6 +19696,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -18534,6 +19717,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -18684,6 +19869,9 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_release_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_agent_one_as_release_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -18714,6 +19902,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_one_as_release_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -18745,6 +19937,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_one_as_release_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -18806,6 +20002,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_release_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -18824,6 +20022,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_release_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -18977,6 +20177,9 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -19009,6 +20212,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -19043,6 +20250,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -19108,6 +20319,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -19128,6 +20341,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -19299,6 +20514,9 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -19331,6 +20549,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -19365,6 +20587,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -19430,6 +20656,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -19450,6 +20678,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -19621,6 +20851,9 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_agent_one_as_monotonic_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -19652,6 +20885,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -19684,6 +20921,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_one_as_monotonic_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -19746,6 +20987,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -19765,6 +21008,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -19919,6 +21164,9 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -19950,6 +21198,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -19982,6 +21234,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -20044,6 +21300,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -20063,6 +21321,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -20213,6 +21473,9 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_release_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -20245,6 +21508,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -20279,6 +21546,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -20344,6 +21615,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -20364,6 +21637,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -20535,6 +21810,9 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -20567,6 +21845,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -20601,6 +21883,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -20666,6 +21952,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -20686,6 +21974,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -20857,6 +22147,9 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -20889,6 +22182,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -20923,6 +22220,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -20988,6 +22289,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -21008,6 +22311,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -21179,6 +22484,9 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -21211,6 +22519,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -21245,6 +22557,10 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -21310,6 +22626,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -21330,6 +22648,8 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -21501,6 +22821,9 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_agent_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -21533,6 +22856,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -21567,6 +22894,10 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -21632,6 +22963,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -21652,6 +22985,8 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -21819,6 +23154,9 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_agent_one_as_release_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -21851,6 +23189,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_one_as_release_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -21885,6 +23227,10 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_one_as_release_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -21950,6 +23296,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_release_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -21970,6 +23318,8 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_release_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -22141,6 +23491,9 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -22173,6 +23526,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -22207,6 +23564,10 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -22272,6 +23633,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -22292,6 +23655,8 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -22463,6 +23828,9 @@ entry:
define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -22495,6 +23863,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -22529,6 +23901,10 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -22594,6 +23970,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -22614,6 +23992,8 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll
index 30c0a322d7ddc..3c24c36ec547d 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll
@@ -15,6 +15,9 @@
define amdgpu_kernel void @flat_nontemporal_load_0(
; GFX7-LABEL: flat_nontemporal_load_0:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -29,6 +32,10 @@ define amdgpu_kernel void @flat_nontemporal_load_0(
;
; GFX10-WGP-LABEL: flat_nontemporal_load_0:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -43,6 +50,10 @@ define amdgpu_kernel void @flat_nontemporal_load_0(
;
; GFX10-CU-LABEL: flat_nontemporal_load_0:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -71,6 +82,8 @@ define amdgpu_kernel void @flat_nontemporal_load_0(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_nontemporal_load_0:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -83,6 +96,8 @@ define amdgpu_kernel void @flat_nontemporal_load_0(
;
; GFX90A-TGSPLIT-LABEL: flat_nontemporal_load_0:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -182,6 +197,9 @@ entry:
define amdgpu_kernel void @flat_nontemporal_load_1(
; GFX7-LABEL: flat_nontemporal_load_1:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -211,6 +229,10 @@ define amdgpu_kernel void @flat_nontemporal_load_1(
;
; GFX10-WGP-LABEL: flat_nontemporal_load_1:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_nop 0
@@ -240,6 +262,10 @@ define amdgpu_kernel void @flat_nontemporal_load_1(
;
; GFX10-CU-LABEL: flat_nontemporal_load_1:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_nop 0
@@ -298,6 +324,8 @@ define amdgpu_kernel void @flat_nontemporal_load_1(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_nontemporal_load_1:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_nop 0
@@ -329,6 +357,8 @@ define amdgpu_kernel void @flat_nontemporal_load_1(
;
; GFX90A-TGSPLIT-LABEL: flat_nontemporal_load_1:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX90A-TGSPLIT-NEXT: s_nop 0
@@ -537,6 +567,9 @@ entry:
define amdgpu_kernel void @flat_nontemporal_store_0(
; GFX7-LABEL: flat_nontemporal_store_0:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -551,6 +584,10 @@ define amdgpu_kernel void @flat_nontemporal_store_0(
;
; GFX10-WGP-LABEL: flat_nontemporal_store_0:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -565,6 +602,10 @@ define amdgpu_kernel void @flat_nontemporal_store_0(
;
; GFX10-CU-LABEL: flat_nontemporal_store_0:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -593,6 +634,8 @@ define amdgpu_kernel void @flat_nontemporal_store_0(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_nontemporal_store_0:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -605,6 +648,8 @@ define amdgpu_kernel void @flat_nontemporal_store_0(
;
; GFX90A-TGSPLIT-LABEL: flat_nontemporal_store_0:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -704,6 +749,9 @@ entry:
define amdgpu_kernel void @flat_nontemporal_store_1(
; GFX7-LABEL: flat_nontemporal_store_1:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -732,6 +780,10 @@ define amdgpu_kernel void @flat_nontemporal_store_1(
;
; GFX10-WGP-LABEL: flat_nontemporal_store_1:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -759,6 +811,10 @@ define amdgpu_kernel void @flat_nontemporal_store_1(
;
; GFX10-CU-LABEL: flat_nontemporal_store_1:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -814,6 +870,8 @@ define amdgpu_kernel void @flat_nontemporal_store_1(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_nontemporal_store_1:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -843,6 +901,8 @@ define amdgpu_kernel void @flat_nontemporal_store_1(
;
; GFX90A-TGSPLIT-LABEL: flat_nontemporal_store_1:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1047,6 +1107,9 @@ entry:
define amdgpu_kernel void @flat_nontemporal_volatile_load(
; GFX7-LABEL: flat_nontemporal_volatile_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1062,6 +1125,10 @@ define amdgpu_kernel void @flat_nontemporal_volatile_load(
;
; GFX10-WGP-LABEL: flat_nontemporal_volatile_load:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -1077,6 +1144,10 @@ define amdgpu_kernel void @flat_nontemporal_volatile_load(
;
; GFX10-CU-LABEL: flat_nontemporal_volatile_load:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -1107,6 +1178,8 @@ define amdgpu_kernel void @flat_nontemporal_volatile_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_nontemporal_volatile_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1120,6 +1193,8 @@ define amdgpu_kernel void @flat_nontemporal_volatile_load(
;
; GFX90A-TGSPLIT-LABEL: flat_nontemporal_volatile_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll
index b80dfaea01653..b88a10ab24a98 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll
@@ -15,6 +15,9 @@
define amdgpu_kernel void @flat_singlethread_unordered_load(
; GFX7-LABEL: flat_singlethread_unordered_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -29,6 +32,10 @@ define amdgpu_kernel void @flat_singlethread_unordered_load(
;
; GFX10-WGP-LABEL: flat_singlethread_unordered_load:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -43,6 +50,10 @@ define amdgpu_kernel void @flat_singlethread_unordered_load(
;
; GFX10-CU-LABEL: flat_singlethread_unordered_load:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -71,6 +82,8 @@ define amdgpu_kernel void @flat_singlethread_unordered_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_unordered_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -83,6 +96,8 @@ define amdgpu_kernel void @flat_singlethread_unordered_load(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_unordered_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -182,6 +197,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_monotonic_load(
; GFX7-LABEL: flat_singlethread_monotonic_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -196,6 +214,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_load(
;
; GFX10-WGP-LABEL: flat_singlethread_monotonic_load:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -210,6 +232,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_load(
;
; GFX10-CU-LABEL: flat_singlethread_monotonic_load:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -238,6 +264,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -250,6 +278,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_load(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -349,6 +379,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_acquire_load(
; GFX7-LABEL: flat_singlethread_acquire_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -363,6 +396,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_load(
;
; GFX10-WGP-LABEL: flat_singlethread_acquire_load:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -377,6 +414,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_load(
;
; GFX10-CU-LABEL: flat_singlethread_acquire_load:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -405,6 +446,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -417,6 +460,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_load(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -516,6 +561,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_seq_cst_load(
; GFX7-LABEL: flat_singlethread_seq_cst_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -530,6 +578,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_load(
;
; GFX10-WGP-LABEL: flat_singlethread_seq_cst_load:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -544,6 +596,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_load(
;
; GFX10-CU-LABEL: flat_singlethread_seq_cst_load:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -572,6 +628,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -584,6 +642,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_load(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -683,6 +743,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_unordered_store(
; GFX7-LABEL: flat_singlethread_unordered_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -694,6 +757,10 @@ define amdgpu_kernel void @flat_singlethread_unordered_store(
;
; GFX10-WGP-LABEL: flat_singlethread_unordered_store:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -705,6 +772,10 @@ define amdgpu_kernel void @flat_singlethread_unordered_store(
;
; GFX10-CU-LABEL: flat_singlethread_unordered_store:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -727,6 +798,8 @@ define amdgpu_kernel void @flat_singlethread_unordered_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_unordered_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -737,6 +810,8 @@ define amdgpu_kernel void @flat_singlethread_unordered_store(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_unordered_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -817,6 +892,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_monotonic_store(
; GFX7-LABEL: flat_singlethread_monotonic_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -828,6 +906,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_store(
;
; GFX10-WGP-LABEL: flat_singlethread_monotonic_store:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -839,6 +921,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_store(
;
; GFX10-CU-LABEL: flat_singlethread_monotonic_store:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -861,6 +947,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -871,6 +959,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_store(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -951,6 +1041,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_release_store(
; GFX7-LABEL: flat_singlethread_release_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -962,6 +1055,10 @@ define amdgpu_kernel void @flat_singlethread_release_store(
;
; GFX10-WGP-LABEL: flat_singlethread_release_store:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -973,6 +1070,10 @@ define amdgpu_kernel void @flat_singlethread_release_store(
;
; GFX10-CU-LABEL: flat_singlethread_release_store:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -995,6 +1096,8 @@ define amdgpu_kernel void @flat_singlethread_release_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1005,6 +1108,8 @@ define amdgpu_kernel void @flat_singlethread_release_store(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1085,6 +1190,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_seq_cst_store(
; GFX7-LABEL: flat_singlethread_seq_cst_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1096,6 +1204,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_store(
;
; GFX10-WGP-LABEL: flat_singlethread_seq_cst_store:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -1107,6 +1219,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_store(
;
; GFX10-CU-LABEL: flat_singlethread_seq_cst_store:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -1129,6 +1245,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1139,6 +1257,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_store(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1219,6 +1339,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_monotonic_atomicrmw(
; GFX7-LABEL: flat_singlethread_monotonic_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1230,6 +1353,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_atomicrmw(
;
; GFX10-WGP-LABEL: flat_singlethread_monotonic_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -1241,6 +1368,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_atomicrmw(
;
; GFX10-CU-LABEL: flat_singlethread_monotonic_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -1263,6 +1394,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1273,6 +1406,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1353,6 +1488,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_acquire_atomicrmw(
; GFX7-LABEL: flat_singlethread_acquire_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1364,6 +1502,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_atomicrmw(
;
; GFX10-WGP-LABEL: flat_singlethread_acquire_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -1375,6 +1517,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_atomicrmw(
;
; GFX10-CU-LABEL: flat_singlethread_acquire_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -1397,6 +1543,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1407,6 +1555,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1487,6 +1637,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_release_atomicrmw(
; GFX7-LABEL: flat_singlethread_release_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1498,6 +1651,10 @@ define amdgpu_kernel void @flat_singlethread_release_atomicrmw(
;
; GFX10-WGP-LABEL: flat_singlethread_release_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -1509,6 +1666,10 @@ define amdgpu_kernel void @flat_singlethread_release_atomicrmw(
;
; GFX10-CU-LABEL: flat_singlethread_release_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -1531,6 +1692,8 @@ define amdgpu_kernel void @flat_singlethread_release_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1541,6 +1704,8 @@ define amdgpu_kernel void @flat_singlethread_release_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1621,6 +1786,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_acq_rel_atomicrmw(
; GFX7-LABEL: flat_singlethread_acq_rel_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1632,6 +1800,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_atomicrmw(
;
; GFX10-WGP-LABEL: flat_singlethread_acq_rel_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -1643,6 +1815,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_atomicrmw(
;
; GFX10-CU-LABEL: flat_singlethread_acq_rel_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -1665,6 +1841,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1675,6 +1853,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1755,6 +1935,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_seq_cst_atomicrmw(
; GFX7-LABEL: flat_singlethread_seq_cst_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1766,6 +1949,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_atomicrmw(
;
; GFX10-WGP-LABEL: flat_singlethread_seq_cst_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -1777,6 +1964,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_atomicrmw(
;
; GFX10-CU-LABEL: flat_singlethread_seq_cst_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -1799,6 +1990,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1809,6 +2002,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1889,6 +2084,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_acquire_ret_atomicrmw(
; GFX7-LABEL: flat_singlethread_acquire_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1904,6 +2102,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_ret_atomicrmw(
;
; GFX10-WGP-LABEL: flat_singlethread_acquire_ret_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -1919,6 +2121,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_ret_atomicrmw(
;
; GFX10-CU-LABEL: flat_singlethread_acquire_ret_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -1949,6 +2155,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_ret_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_ret_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1962,6 +2170,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_ret_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_ret_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -2068,6 +2278,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_acq_rel_ret_atomicrmw(
; GFX7-LABEL: flat_singlethread_acq_rel_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -2083,6 +2296,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_ret_atomicrmw(
;
; GFX10-WGP-LABEL: flat_singlethread_acq_rel_ret_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -2098,6 +2315,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_ret_atomicrmw(
;
; GFX10-CU-LABEL: flat_singlethread_acq_rel_ret_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -2128,6 +2349,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_ret_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_ret_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -2141,6 +2364,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_ret_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_ret_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -2247,6 +2472,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_seq_cst_ret_atomicrmw(
; GFX7-LABEL: flat_singlethread_seq_cst_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -2262,6 +2490,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_ret_atomicrmw(
;
; GFX10-WGP-LABEL: flat_singlethread_seq_cst_ret_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -2277,6 +2509,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_ret_atomicrmw(
;
; GFX10-CU-LABEL: flat_singlethread_seq_cst_ret_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -2307,6 +2543,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_ret_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_ret_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -2320,6 +2558,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_ret_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_ret_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -2426,6 +2666,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_cmpxchg(
; GFX7-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -2451,6 +2694,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -2476,6 +2723,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -2526,6 +2777,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -2540,6 +2793,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -2649,6 +2904,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_acquire_monotonic_cmpxchg(
; GFX7-LABEL: flat_singlethread_acquire_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -2674,6 +2932,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_acquire_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -2699,6 +2961,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_acquire_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -2749,6 +3015,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -2763,6 +3031,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -2872,6 +3142,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_release_monotonic_cmpxchg(
; GFX7-LABEL: flat_singlethread_release_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -2897,6 +3170,10 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_release_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -2922,6 +3199,10 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_release_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -2972,6 +3253,8 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -2986,6 +3269,8 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3095,6 +3380,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_cmpxchg(
; GFX7-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3120,6 +3408,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3145,6 +3437,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3195,6 +3491,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3209,6 +3507,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3318,6 +3618,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_cmpxchg(
; GFX7-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3343,6 +3646,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3368,6 +3675,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3418,6 +3729,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3432,6 +3745,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3541,6 +3856,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_monotonic_acquire_cmpxchg(
; GFX7-LABEL: flat_singlethread_monotonic_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3566,6 +3884,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_monotonic_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3591,6 +3913,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_monotonic_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3641,6 +3967,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3655,6 +3983,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3764,6 +4094,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_acquire_acquire_cmpxchg(
; GFX7-LABEL: flat_singlethread_acquire_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3789,6 +4122,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_acquire_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3814,6 +4151,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_acquire_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3864,6 +4205,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3878,6 +4221,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3987,6 +4332,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_release_acquire_cmpxchg(
; GFX7-LABEL: flat_singlethread_release_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4012,6 +4360,10 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_release_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4037,6 +4389,10 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_release_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4087,6 +4443,8 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4101,6 +4459,8 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4210,6 +4570,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_cmpxchg(
; GFX7-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4235,6 +4598,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4260,6 +4627,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4310,6 +4681,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4324,6 +4697,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4433,6 +4808,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_cmpxchg(
; GFX7-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4458,6 +4836,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4483,6 +4865,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4533,6 +4919,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4547,6 +4935,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4656,6 +5046,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_cmpxchg(
; GFX7-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4681,6 +5074,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4706,6 +5103,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4756,6 +5157,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4770,6 +5173,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4879,6 +5284,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_cmpxchg(
; GFX7-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4904,6 +5312,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4929,6 +5341,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4979,6 +5395,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4993,6 +5411,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -5102,6 +5522,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_release_seq_cst_cmpxchg(
; GFX7-LABEL: flat_singlethread_release_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -5127,6 +5550,10 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_release_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -5152,6 +5579,10 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_release_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -5202,6 +5633,8 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -5216,6 +5649,8 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -5325,6 +5760,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_cmpxchg(
; GFX7-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -5350,6 +5788,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -5375,6 +5817,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -5425,6 +5871,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -5439,6 +5887,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -5548,6 +5998,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_cmpxchg(
; GFX7-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -5573,6 +6026,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -5598,6 +6055,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -5648,6 +6109,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -5662,6 +6125,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -5771,6 +6236,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -5800,6 +6268,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -5829,6 +6301,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -5887,6 +6363,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -5904,6 +6382,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -6040,6 +6520,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_acquire_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -6069,6 +6552,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -6098,6 +6585,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -6156,6 +6647,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -6173,6 +6666,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -6309,6 +6804,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_release_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -6338,6 +6836,10 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -6367,6 +6869,10 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -6425,6 +6931,8 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -6442,6 +6950,8 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -6578,6 +7088,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -6607,6 +7120,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -6636,6 +7153,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -6694,6 +7215,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -6711,6 +7234,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -6847,6 +7372,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -6876,6 +7404,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -6905,6 +7437,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -6963,6 +7499,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -6980,6 +7518,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -7116,6 +7656,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_monotonic_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -7145,6 +7688,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -7174,6 +7721,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -7232,6 +7783,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -7249,6 +7802,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -7385,6 +7940,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_acquire_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -7414,6 +7972,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -7443,6 +8005,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -7501,6 +8067,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -7518,6 +8086,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -7654,6 +8224,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_release_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_release_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -7683,6 +8256,10 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_release_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -7712,6 +8289,10 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_release_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -7770,6 +8351,8 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -7787,6 +8370,8 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -7923,6 +8508,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -7952,6 +8540,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -7981,6 +8573,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -8039,6 +8635,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -8056,6 +8654,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -8192,6 +8792,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -8221,6 +8824,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -8250,6 +8857,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -8308,6 +8919,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -8325,6 +8938,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -8461,6 +9076,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -8490,6 +9108,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -8519,6 +9141,10 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -8577,6 +9203,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -8594,6 +9222,8 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -8730,6 +9360,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -8759,6 +9392,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -8788,6 +9425,10 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -8846,6 +9487,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -8863,6 +9506,8 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -8999,6 +9644,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_release_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -9028,6 +9676,10 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -9057,6 +9709,10 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -9115,6 +9771,8 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -9132,6 +9790,8 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -9268,6 +9928,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -9297,6 +9960,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -9326,6 +9993,10 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -9384,6 +10055,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -9401,6 +10074,8 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -9537,6 +10212,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -9566,6 +10244,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -9595,6 +10277,10 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -9653,6 +10339,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -9670,6 +10358,8 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -9806,6 +10496,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_unordered_load(
; GFX7-LABEL: flat_singlethread_one_as_unordered_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -9820,6 +10513,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_load(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_unordered_load:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -9834,6 +10531,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_load(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_unordered_load:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -9862,6 +10563,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_unordered_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -9874,6 +10577,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_load(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_unordered_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -9973,6 +10678,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_monotonic_load(
; GFX7-LABEL: flat_singlethread_one_as_monotonic_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -9987,6 +10695,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_load(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_load:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -10001,6 +10713,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_load(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_load:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -10029,6 +10745,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -10041,6 +10759,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_load(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -10140,6 +10860,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_acquire_load(
; GFX7-LABEL: flat_singlethread_one_as_acquire_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -10154,6 +10877,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_load(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_load:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -10168,6 +10895,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_load(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_load:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -10196,6 +10927,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -10208,6 +10941,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_load(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -10307,6 +11042,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_load(
; GFX7-LABEL: flat_singlethread_one_as_seq_cst_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -10321,6 +11059,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_load(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_load:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -10335,6 +11077,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_load(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_load:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -10363,6 +11109,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -10375,6 +11123,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_load(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -10474,6 +11224,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_unordered_store(
; GFX7-LABEL: flat_singlethread_one_as_unordered_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -10485,6 +11238,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_store(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_unordered_store:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -10496,6 +11253,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_store(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_unordered_store:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -10518,6 +11279,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_unordered_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -10528,6 +11291,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_store(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_unordered_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -10608,6 +11373,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_monotonic_store(
; GFX7-LABEL: flat_singlethread_one_as_monotonic_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -10619,6 +11387,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_store(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_store:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -10630,6 +11402,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_store(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_store:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -10652,6 +11428,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -10662,6 +11440,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_store(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -10742,6 +11522,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_release_store(
; GFX7-LABEL: flat_singlethread_one_as_release_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -10753,6 +11536,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_store(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_release_store:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -10764,6 +11551,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_store(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_release_store:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -10786,6 +11577,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -10796,6 +11589,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_store(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -10876,6 +11671,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_store(
; GFX7-LABEL: flat_singlethread_one_as_seq_cst_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -10887,6 +11685,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_store(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_store:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -10898,6 +11700,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_store(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_store:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -10920,6 +11726,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -10930,6 +11738,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_store(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11010,6 +11820,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_monotonic_atomicrmw(
; GFX7-LABEL: flat_singlethread_one_as_monotonic_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11021,6 +11834,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_atomicrmw(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -11032,6 +11849,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_atomicrmw(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -11054,6 +11875,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11064,6 +11887,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11144,6 +11969,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_acquire_atomicrmw(
; GFX7-LABEL: flat_singlethread_one_as_acquire_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11155,6 +11983,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_atomicrmw(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -11166,6 +11998,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_atomicrmw(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -11188,6 +12024,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11198,6 +12036,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11278,6 +12118,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_release_atomicrmw(
; GFX7-LABEL: flat_singlethread_one_as_release_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11289,6 +12132,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_atomicrmw(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_release_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -11300,6 +12147,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_atomicrmw(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_release_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -11322,6 +12173,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11332,6 +12185,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11412,6 +12267,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_atomicrmw(
; GFX7-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11423,6 +12281,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_atomicrmw(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -11434,6 +12296,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_atomicrmw(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -11456,6 +12322,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11466,6 +12334,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11546,6 +12416,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_atomicrmw(
; GFX7-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11557,6 +12430,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_atomicrmw(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -11568,6 +12445,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_atomicrmw(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -11590,6 +12471,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11600,6 +12483,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11680,6 +12565,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_acquire_ret_atomicrmw(
; GFX7-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11695,6 +12583,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_ret_atomicrmw(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -11710,6 +12602,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_ret_atomicrmw(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -11740,6 +12636,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_ret_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11753,6 +12651,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_ret_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11859,6 +12759,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_ret_atomicrmw(
; GFX7-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11874,6 +12777,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_ret_atomicrmw(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -11889,6 +12796,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_ret_atomicrmw(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -11919,6 +12830,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_ret_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11932,6 +12845,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_ret_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12038,6 +12953,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_ret_atomicrmw(
; GFX7-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12053,6 +12971,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_ret_atomicrmw(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -12068,6 +12990,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_ret_atomicrmw(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -12098,6 +13024,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_ret_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12111,6 +13039,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_ret_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12217,6 +13147,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -12242,6 +13175,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -12267,6 +13204,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -12317,6 +13258,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -12331,6 +13274,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -12440,6 +13385,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -12465,6 +13413,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -12490,6 +13442,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -12540,6 +13496,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -12554,6 +13512,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -12663,6 +13623,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -12688,6 +13651,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -12713,6 +13680,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -12763,6 +13734,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -12777,6 +13750,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -12886,6 +13861,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -12911,6 +13889,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -12936,6 +13918,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -12986,6 +13972,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -13000,6 +13988,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -13109,6 +14099,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -13134,6 +14127,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -13159,6 +14156,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -13209,6 +14210,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -13223,6 +14226,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -13332,6 +14337,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -13357,6 +14365,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -13382,6 +14394,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -13432,6 +14448,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -13446,6 +14464,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -13555,6 +14575,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -13580,6 +14603,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -13605,6 +14632,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -13655,6 +14686,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -13669,6 +14702,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -13778,6 +14813,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -13803,6 +14841,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -13828,6 +14870,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -13878,6 +14924,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -13892,6 +14940,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -14001,6 +15051,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -14026,6 +15079,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -14051,6 +15108,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -14101,6 +15162,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -14115,6 +15178,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -14224,6 +15289,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -14249,6 +15317,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -14274,6 +15346,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -14324,6 +15400,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -14338,6 +15416,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -14447,6 +15527,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -14472,6 +15555,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -14497,6 +15584,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -14547,6 +15638,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -14561,6 +15654,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -14670,6 +15765,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -14695,6 +15793,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -14720,6 +15822,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -14770,6 +15876,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -14784,6 +15892,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -14893,6 +16003,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -14918,6 +16031,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -14943,6 +16060,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -14993,6 +16114,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15007,6 +16130,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15116,6 +16241,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -15141,6 +16269,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -15166,6 +16298,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -15216,6 +16352,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15230,6 +16368,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15339,6 +16479,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -15364,6 +16507,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -15389,6 +16536,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -15439,6 +16590,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15453,6 +16606,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15562,6 +16717,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -15591,6 +16749,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_ret_cmpx
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -15620,6 +16782,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_ret_cmpx
;
; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -15678,6 +16844,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_ret_cmpx
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15695,6 +16863,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_ret_cmpx
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15831,6 +17001,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -15860,6 +17033,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_ret_cmpxch
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -15889,6 +17066,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_ret_cmpxch
;
; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -15947,6 +17128,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_ret_cmpxch
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15964,6 +17147,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_ret_cmpxch
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -16100,6 +17285,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -16129,6 +17317,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_ret_cmpxch
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -16158,6 +17350,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_ret_cmpxch
;
; GFX10-CU-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -16216,6 +17412,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_ret_cmpxch
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -16233,6 +17431,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_ret_cmpxch
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -16369,6 +17569,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -16398,6 +17601,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxch
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -16427,6 +17634,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxch
;
; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -16485,6 +17696,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxch
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -16502,6 +17715,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxch
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -16638,6 +17853,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -16667,6 +17885,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxch
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -16696,6 +17918,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxch
;
; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -16754,6 +17980,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxch
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -16771,6 +17999,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxch
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -16907,6 +18137,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -16936,6 +18169,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_ret_cmpxch
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -16965,6 +18202,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_ret_cmpxch
;
; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -17023,6 +18264,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_ret_cmpxch
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -17040,6 +18283,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_ret_cmpxch
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -17176,6 +18421,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -17205,6 +18453,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -17234,6 +18486,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -17292,6 +18548,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -17309,6 +18567,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -17445,6 +18705,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -17474,6 +18737,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -17503,6 +18770,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -17561,6 +18832,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -17578,6 +18851,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -17714,6 +18989,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -17743,6 +19021,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -17772,6 +19054,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -17830,6 +19116,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -17847,6 +19135,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -17983,6 +19273,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -18012,6 +19305,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -18041,6 +19338,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -18099,6 +19400,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -18116,6 +19419,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -18252,6 +19557,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -18281,6 +19589,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxch
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -18310,6 +19622,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxch
;
; GFX10-CU-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -18368,6 +19684,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxch
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -18385,6 +19703,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxch
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -18521,6 +19841,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -18550,6 +19873,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -18579,6 +19906,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -18637,6 +19968,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -18654,6 +19987,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -18790,6 +20125,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -18819,6 +20157,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -18848,6 +20190,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -18906,6 +20252,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -18923,6 +20271,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -19059,6 +20409,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -19088,6 +20441,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -19117,6 +20474,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -19175,6 +20536,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -19192,6 +20555,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -19328,6 +20693,9 @@ entry:
define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -19357,6 +20725,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -19386,6 +20758,10 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -19444,6 +20820,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -19461,6 +20839,8 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll
index 1ec942ea5f47b..919fc3e8f4e4f 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll
@@ -15,6 +15,9 @@
define amdgpu_kernel void @flat_system_unordered_load(
; GFX7-LABEL: flat_system_unordered_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -29,6 +32,10 @@ define amdgpu_kernel void @flat_system_unordered_load(
;
; GFX10-WGP-LABEL: flat_system_unordered_load:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -43,6 +50,10 @@ define amdgpu_kernel void @flat_system_unordered_load(
;
; GFX10-CU-LABEL: flat_system_unordered_load:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -71,6 +82,8 @@ define amdgpu_kernel void @flat_system_unordered_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_unordered_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -83,6 +96,8 @@ define amdgpu_kernel void @flat_system_unordered_load(
;
; GFX90A-TGSPLIT-LABEL: flat_system_unordered_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -182,6 +197,9 @@ entry:
define amdgpu_kernel void @flat_system_monotonic_load(
; GFX7-LABEL: flat_system_monotonic_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -196,6 +214,10 @@ define amdgpu_kernel void @flat_system_monotonic_load(
;
; GFX10-WGP-LABEL: flat_system_monotonic_load:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -210,6 +232,10 @@ define amdgpu_kernel void @flat_system_monotonic_load(
;
; GFX10-CU-LABEL: flat_system_monotonic_load:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -238,6 +264,8 @@ define amdgpu_kernel void @flat_system_monotonic_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -250,6 +278,8 @@ define amdgpu_kernel void @flat_system_monotonic_load(
;
; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -349,6 +379,9 @@ entry:
define amdgpu_kernel void @flat_system_acquire_load(
; GFX7-LABEL: flat_system_acquire_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -364,6 +397,10 @@ define amdgpu_kernel void @flat_system_acquire_load(
;
; GFX10-WGP-LABEL: flat_system_acquire_load:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -380,6 +417,10 @@ define amdgpu_kernel void @flat_system_acquire_load(
;
; GFX10-CU-LABEL: flat_system_acquire_load:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -410,6 +451,8 @@ define amdgpu_kernel void @flat_system_acquire_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -424,6 +467,8 @@ define amdgpu_kernel void @flat_system_acquire_load(
;
; GFX90A-TGSPLIT-LABEL: flat_system_acquire_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -533,6 +578,9 @@ entry:
define amdgpu_kernel void @flat_system_seq_cst_load(
; GFX7-LABEL: flat_system_seq_cst_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -549,6 +597,10 @@ define amdgpu_kernel void @flat_system_seq_cst_load(
;
; GFX10-WGP-LABEL: flat_system_seq_cst_load:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -567,6 +619,10 @@ define amdgpu_kernel void @flat_system_seq_cst_load(
;
; GFX10-CU-LABEL: flat_system_seq_cst_load:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -600,6 +656,8 @@ define amdgpu_kernel void @flat_system_seq_cst_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -615,6 +673,8 @@ define amdgpu_kernel void @flat_system_seq_cst_load(
;
; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -743,6 +803,9 @@ entry:
define amdgpu_kernel void @flat_system_unordered_store(
; GFX7-LABEL: flat_system_unordered_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -754,6 +817,10 @@ define amdgpu_kernel void @flat_system_unordered_store(
;
; GFX10-WGP-LABEL: flat_system_unordered_store:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -765,6 +832,10 @@ define amdgpu_kernel void @flat_system_unordered_store(
;
; GFX10-CU-LABEL: flat_system_unordered_store:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -787,6 +858,8 @@ define amdgpu_kernel void @flat_system_unordered_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_unordered_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -797,6 +870,8 @@ define amdgpu_kernel void @flat_system_unordered_store(
;
; GFX90A-TGSPLIT-LABEL: flat_system_unordered_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -877,6 +952,9 @@ entry:
define amdgpu_kernel void @flat_system_monotonic_store(
; GFX7-LABEL: flat_system_monotonic_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -888,6 +966,10 @@ define amdgpu_kernel void @flat_system_monotonic_store(
;
; GFX10-WGP-LABEL: flat_system_monotonic_store:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -899,6 +981,10 @@ define amdgpu_kernel void @flat_system_monotonic_store(
;
; GFX10-CU-LABEL: flat_system_monotonic_store:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -921,6 +1007,8 @@ define amdgpu_kernel void @flat_system_monotonic_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -931,6 +1019,8 @@ define amdgpu_kernel void @flat_system_monotonic_store(
;
; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1011,6 +1101,9 @@ entry:
define amdgpu_kernel void @flat_system_release_store(
; GFX7-LABEL: flat_system_release_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1023,6 +1116,10 @@ define amdgpu_kernel void @flat_system_release_store(
;
; GFX10-WGP-LABEL: flat_system_release_store:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -1036,6 +1133,10 @@ define amdgpu_kernel void @flat_system_release_store(
;
; GFX10-CU-LABEL: flat_system_release_store:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -1061,6 +1162,8 @@ define amdgpu_kernel void @flat_system_release_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1073,6 +1176,8 @@ define amdgpu_kernel void @flat_system_release_store(
;
; GFX90A-TGSPLIT-LABEL: flat_system_release_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1173,6 +1278,9 @@ entry:
define amdgpu_kernel void @flat_system_seq_cst_store(
; GFX7-LABEL: flat_system_seq_cst_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1185,6 +1293,10 @@ define amdgpu_kernel void @flat_system_seq_cst_store(
;
; GFX10-WGP-LABEL: flat_system_seq_cst_store:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -1198,6 +1310,10 @@ define amdgpu_kernel void @flat_system_seq_cst_store(
;
; GFX10-CU-LABEL: flat_system_seq_cst_store:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -1223,6 +1339,8 @@ define amdgpu_kernel void @flat_system_seq_cst_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1235,6 +1353,8 @@ define amdgpu_kernel void @flat_system_seq_cst_store(
;
; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1335,6 +1455,9 @@ entry:
define amdgpu_kernel void @flat_system_monotonic_atomicrmw(
; GFX7-LABEL: flat_system_monotonic_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1346,6 +1469,10 @@ define amdgpu_kernel void @flat_system_monotonic_atomicrmw(
;
; GFX10-WGP-LABEL: flat_system_monotonic_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -1357,6 +1484,10 @@ define amdgpu_kernel void @flat_system_monotonic_atomicrmw(
;
; GFX10-CU-LABEL: flat_system_monotonic_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -1379,6 +1510,8 @@ define amdgpu_kernel void @flat_system_monotonic_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1389,6 +1522,8 @@ define amdgpu_kernel void @flat_system_monotonic_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1469,6 +1604,9 @@ entry:
define amdgpu_kernel void @flat_system_acquire_atomicrmw(
; GFX7-LABEL: flat_system_acquire_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1482,6 +1620,10 @@ define amdgpu_kernel void @flat_system_acquire_atomicrmw(
;
; GFX10-WGP-LABEL: flat_system_acquire_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -1497,6 +1639,10 @@ define amdgpu_kernel void @flat_system_acquire_atomicrmw(
;
; GFX10-CU-LABEL: flat_system_acquire_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -1524,6 +1670,8 @@ define amdgpu_kernel void @flat_system_acquire_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1537,6 +1685,8 @@ define amdgpu_kernel void @flat_system_acquire_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_system_acquire_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1636,6 +1786,9 @@ entry:
define amdgpu_kernel void @flat_system_release_atomicrmw(
; GFX7-LABEL: flat_system_release_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1648,6 +1801,10 @@ define amdgpu_kernel void @flat_system_release_atomicrmw(
;
; GFX10-WGP-LABEL: flat_system_release_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -1661,6 +1818,10 @@ define amdgpu_kernel void @flat_system_release_atomicrmw(
;
; GFX10-CU-LABEL: flat_system_release_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -1686,6 +1847,8 @@ define amdgpu_kernel void @flat_system_release_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1698,6 +1861,8 @@ define amdgpu_kernel void @flat_system_release_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_system_release_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1798,6 +1963,9 @@ entry:
define amdgpu_kernel void @flat_system_acq_rel_atomicrmw(
; GFX7-LABEL: flat_system_acq_rel_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1812,6 +1980,10 @@ define amdgpu_kernel void @flat_system_acq_rel_atomicrmw(
;
; GFX10-WGP-LABEL: flat_system_acq_rel_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -1829,6 +2001,10 @@ define amdgpu_kernel void @flat_system_acq_rel_atomicrmw(
;
; GFX10-CU-LABEL: flat_system_acq_rel_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -1859,6 +2035,8 @@ define amdgpu_kernel void @flat_system_acq_rel_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1874,6 +2052,8 @@ define amdgpu_kernel void @flat_system_acq_rel_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1993,6 +2173,9 @@ entry:
define amdgpu_kernel void @flat_system_seq_cst_atomicrmw(
; GFX7-LABEL: flat_system_seq_cst_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -2007,6 +2190,10 @@ define amdgpu_kernel void @flat_system_seq_cst_atomicrmw(
;
; GFX10-WGP-LABEL: flat_system_seq_cst_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -2024,6 +2211,10 @@ define amdgpu_kernel void @flat_system_seq_cst_atomicrmw(
;
; GFX10-CU-LABEL: flat_system_seq_cst_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -2054,6 +2245,8 @@ define amdgpu_kernel void @flat_system_seq_cst_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -2069,6 +2262,8 @@ define amdgpu_kernel void @flat_system_seq_cst_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -2188,6 +2383,9 @@ entry:
define amdgpu_kernel void @flat_system_acquire_ret_atomicrmw(
; GFX7-LABEL: flat_system_acquire_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -2204,6 +2402,10 @@ define amdgpu_kernel void @flat_system_acquire_ret_atomicrmw(
;
; GFX10-WGP-LABEL: flat_system_acquire_ret_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -2221,6 +2423,10 @@ define amdgpu_kernel void @flat_system_acquire_ret_atomicrmw(
;
; GFX10-CU-LABEL: flat_system_acquire_ret_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -2253,6 +2459,8 @@ define amdgpu_kernel void @flat_system_acquire_ret_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_ret_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -2268,6 +2476,8 @@ define amdgpu_kernel void @flat_system_acquire_ret_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_system_acquire_ret_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -2384,6 +2594,9 @@ entry:
define amdgpu_kernel void @flat_system_acq_rel_ret_atomicrmw(
; GFX7-LABEL: flat_system_acq_rel_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -2401,6 +2614,10 @@ define amdgpu_kernel void @flat_system_acq_rel_ret_atomicrmw(
;
; GFX10-WGP-LABEL: flat_system_acq_rel_ret_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -2420,6 +2637,10 @@ define amdgpu_kernel void @flat_system_acq_rel_ret_atomicrmw(
;
; GFX10-CU-LABEL: flat_system_acq_rel_ret_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -2455,6 +2676,8 @@ define amdgpu_kernel void @flat_system_acq_rel_ret_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_ret_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -2472,6 +2695,8 @@ define amdgpu_kernel void @flat_system_acq_rel_ret_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_ret_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -2612,6 +2837,9 @@ entry:
define amdgpu_kernel void @flat_system_seq_cst_ret_atomicrmw(
; GFX7-LABEL: flat_system_seq_cst_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -2629,6 +2857,10 @@ define amdgpu_kernel void @flat_system_seq_cst_ret_atomicrmw(
;
; GFX10-WGP-LABEL: flat_system_seq_cst_ret_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -2648,6 +2880,10 @@ define amdgpu_kernel void @flat_system_seq_cst_ret_atomicrmw(
;
; GFX10-CU-LABEL: flat_system_seq_cst_ret_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -2683,6 +2919,8 @@ define amdgpu_kernel void @flat_system_seq_cst_ret_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_ret_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -2700,6 +2938,8 @@ define amdgpu_kernel void @flat_system_seq_cst_ret_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_ret_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -2840,6 +3080,9 @@ entry:
define amdgpu_kernel void @flat_system_monotonic_monotonic_cmpxchg(
; GFX7-LABEL: flat_system_monotonic_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -2865,6 +3108,10 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_monotonic_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -2890,6 +3137,10 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_monotonic_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -2940,6 +3191,8 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -2954,6 +3207,8 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3063,6 +3318,9 @@ entry:
define amdgpu_kernel void @flat_system_acquire_monotonic_cmpxchg(
; GFX7-LABEL: flat_system_acquire_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3090,6 +3348,10 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_acquire_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3119,6 +3381,10 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_acquire_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3174,6 +3440,8 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3191,6 +3459,8 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_acquire_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3319,6 +3589,9 @@ entry:
define amdgpu_kernel void @flat_system_release_monotonic_cmpxchg(
; GFX7-LABEL: flat_system_release_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3345,6 +3618,10 @@ define amdgpu_kernel void @flat_system_release_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_release_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3372,6 +3649,10 @@ define amdgpu_kernel void @flat_system_release_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_release_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3425,6 +3706,8 @@ define amdgpu_kernel void @flat_system_release_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3441,6 +3724,8 @@ define amdgpu_kernel void @flat_system_release_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_release_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3570,6 +3855,9 @@ entry:
define amdgpu_kernel void @flat_system_acq_rel_monotonic_cmpxchg(
; GFX7-LABEL: flat_system_acq_rel_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3598,6 +3886,10 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_acq_rel_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3629,6 +3921,10 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_acq_rel_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3687,6 +3983,8 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3706,6 +4004,8 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3854,6 +4154,9 @@ entry:
define amdgpu_kernel void @flat_system_seq_cst_monotonic_cmpxchg(
; GFX7-LABEL: flat_system_seq_cst_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3882,6 +4185,10 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_seq_cst_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3913,6 +4220,10 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_seq_cst_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3971,6 +4282,8 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3990,6 +4303,8 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4138,6 +4453,9 @@ entry:
define amdgpu_kernel void @flat_system_monotonic_acquire_cmpxchg(
; GFX7-LABEL: flat_system_monotonic_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4165,6 +4483,10 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_monotonic_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4194,6 +4516,10 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_monotonic_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4249,6 +4575,8 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4266,6 +4594,8 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4394,6 +4724,9 @@ entry:
define amdgpu_kernel void @flat_system_acquire_acquire_cmpxchg(
; GFX7-LABEL: flat_system_acquire_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4421,6 +4754,10 @@ define amdgpu_kernel void @flat_system_acquire_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_acquire_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4450,6 +4787,10 @@ define amdgpu_kernel void @flat_system_acquire_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_acquire_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4505,6 +4846,8 @@ define amdgpu_kernel void @flat_system_acquire_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4522,6 +4865,8 @@ define amdgpu_kernel void @flat_system_acquire_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_acquire_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4650,6 +4995,9 @@ entry:
define amdgpu_kernel void @flat_system_release_acquire_cmpxchg(
; GFX7-LABEL: flat_system_release_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4678,6 +5026,10 @@ define amdgpu_kernel void @flat_system_release_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_release_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4709,6 +5061,10 @@ define amdgpu_kernel void @flat_system_release_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_release_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4767,6 +5123,8 @@ define amdgpu_kernel void @flat_system_release_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4786,6 +5144,8 @@ define amdgpu_kernel void @flat_system_release_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_release_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4934,6 +5294,9 @@ entry:
define amdgpu_kernel void @flat_system_acq_rel_acquire_cmpxchg(
; GFX7-LABEL: flat_system_acq_rel_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4962,6 +5325,10 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_acq_rel_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4993,6 +5360,10 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_acq_rel_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -5051,6 +5422,8 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -5070,6 +5443,8 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -5218,6 +5593,9 @@ entry:
define amdgpu_kernel void @flat_system_seq_cst_acquire_cmpxchg(
; GFX7-LABEL: flat_system_seq_cst_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -5246,6 +5624,10 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_seq_cst_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -5277,6 +5659,10 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_seq_cst_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -5335,6 +5721,8 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -5354,6 +5742,8 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -5502,6 +5892,9 @@ entry:
define amdgpu_kernel void @flat_system_monotonic_seq_cst_cmpxchg(
; GFX7-LABEL: flat_system_monotonic_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -5530,6 +5923,10 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_monotonic_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -5561,6 +5958,10 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_monotonic_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -5619,6 +6020,8 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -5638,6 +6041,8 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -5786,6 +6191,9 @@ entry:
define amdgpu_kernel void @flat_system_acquire_seq_cst_cmpxchg(
; GFX7-LABEL: flat_system_acquire_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -5814,6 +6222,10 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_acquire_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -5845,6 +6257,10 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_acquire_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -5903,6 +6319,8 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -5922,6 +6340,8 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_acquire_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -6070,6 +6490,9 @@ entry:
define amdgpu_kernel void @flat_system_release_seq_cst_cmpxchg(
; GFX7-LABEL: flat_system_release_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -6098,6 +6521,10 @@ define amdgpu_kernel void @flat_system_release_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_release_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -6129,6 +6556,10 @@ define amdgpu_kernel void @flat_system_release_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_release_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -6187,6 +6618,8 @@ define amdgpu_kernel void @flat_system_release_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -6206,6 +6639,8 @@ define amdgpu_kernel void @flat_system_release_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_release_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -6354,6 +6789,9 @@ entry:
define amdgpu_kernel void @flat_system_acq_rel_seq_cst_cmpxchg(
; GFX7-LABEL: flat_system_acq_rel_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -6382,6 +6820,10 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_acq_rel_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -6413,6 +6855,10 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_acq_rel_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -6471,6 +6917,8 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -6490,6 +6938,8 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -6638,6 +7088,9 @@ entry:
define amdgpu_kernel void @flat_system_seq_cst_seq_cst_cmpxchg(
; GFX7-LABEL: flat_system_seq_cst_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -6666,6 +7119,10 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_seq_cst_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -6697,6 +7154,10 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_seq_cst_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -6755,6 +7216,8 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -6774,6 +7237,8 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -6922,6 +7387,9 @@ entry:
define amdgpu_kernel void @flat_system_monotonic_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -6951,6 +7419,10 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -6980,6 +7452,10 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -7038,6 +7514,8 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -7055,6 +7533,8 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -7191,6 +7671,9 @@ entry:
define amdgpu_kernel void @flat_system_acquire_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_system_acquire_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -7221,6 +7704,10 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_acquire_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -7252,6 +7739,10 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_acquire_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -7312,6 +7803,8 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -7331,6 +7824,8 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_acquire_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -7477,6 +7972,9 @@ entry:
define amdgpu_kernel void @flat_system_release_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_system_release_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -7507,6 +8005,10 @@ define amdgpu_kernel void @flat_system_release_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_release_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -7538,6 +8040,10 @@ define amdgpu_kernel void @flat_system_release_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_release_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -7599,6 +8105,8 @@ define amdgpu_kernel void @flat_system_release_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -7618,6 +8126,8 @@ define amdgpu_kernel void @flat_system_release_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_release_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -7774,6 +8284,9 @@ entry:
define amdgpu_kernel void @flat_system_acq_rel_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -7805,6 +8318,10 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -7838,6 +8355,10 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -7901,6 +8422,8 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -7922,6 +8445,8 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -8092,6 +8617,9 @@ entry:
define amdgpu_kernel void @flat_system_seq_cst_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -8123,6 +8651,10 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -8156,6 +8688,10 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -8219,6 +8755,8 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -8240,6 +8778,8 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -8410,6 +8950,9 @@ entry:
define amdgpu_kernel void @flat_system_monotonic_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_system_monotonic_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -8440,6 +8983,10 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_monotonic_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -8471,6 +9018,10 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_monotonic_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -8531,6 +9082,8 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -8550,6 +9103,8 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -8700,6 +9255,9 @@ entry:
define amdgpu_kernel void @flat_system_acquire_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_system_acquire_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -8730,6 +9288,10 @@ define amdgpu_kernel void @flat_system_acquire_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_acquire_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -8761,6 +9323,10 @@ define amdgpu_kernel void @flat_system_acquire_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_acquire_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -8821,6 +9387,8 @@ define amdgpu_kernel void @flat_system_acquire_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -8840,6 +9408,8 @@ define amdgpu_kernel void @flat_system_acquire_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_acquire_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -8986,6 +9556,9 @@ entry:
define amdgpu_kernel void @flat_system_release_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_system_release_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -9017,6 +9590,10 @@ define amdgpu_kernel void @flat_system_release_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_release_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -9050,6 +9627,10 @@ define amdgpu_kernel void @flat_system_release_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_release_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -9113,6 +9694,8 @@ define amdgpu_kernel void @flat_system_release_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -9134,6 +9717,8 @@ define amdgpu_kernel void @flat_system_release_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_release_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -9304,6 +9889,9 @@ entry:
define amdgpu_kernel void @flat_system_acq_rel_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -9335,6 +9923,10 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -9368,6 +9960,10 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -9431,6 +10027,8 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -9452,6 +10050,8 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -9622,6 +10222,9 @@ entry:
define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -9653,6 +10256,10 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -9686,6 +10293,10 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -9749,6 +10360,8 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -9770,6 +10383,8 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -9940,6 +10555,9 @@ entry:
define amdgpu_kernel void @flat_system_monotonic_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -9971,6 +10589,10 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -10004,6 +10626,10 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -10067,6 +10693,8 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -10088,6 +10716,8 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -10258,6 +10888,9 @@ entry:
define amdgpu_kernel void @flat_system_acquire_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -10289,6 +10922,10 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -10322,6 +10959,10 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -10385,6 +11026,8 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -10406,6 +11049,8 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -10572,6 +11217,9 @@ entry:
define amdgpu_kernel void @flat_system_release_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_system_release_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -10603,6 +11251,10 @@ define amdgpu_kernel void @flat_system_release_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_release_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -10636,6 +11288,10 @@ define amdgpu_kernel void @flat_system_release_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_release_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -10699,6 +11355,8 @@ define amdgpu_kernel void @flat_system_release_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_release_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -10720,6 +11378,8 @@ define amdgpu_kernel void @flat_system_release_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_release_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -10890,6 +11550,9 @@ entry:
define amdgpu_kernel void @flat_system_acq_rel_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -10921,6 +11584,10 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -10954,6 +11621,10 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -11017,6 +11688,8 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -11038,6 +11711,8 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -11208,6 +11883,9 @@ entry:
define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -11239,6 +11917,10 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -11272,6 +11954,10 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -11335,6 +12021,8 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -11356,6 +12044,8 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -11526,6 +12216,9 @@ entry:
define amdgpu_kernel void @flat_system_one_as_unordered_load(
; GFX7-LABEL: flat_system_one_as_unordered_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11540,6 +12233,10 @@ define amdgpu_kernel void @flat_system_one_as_unordered_load(
;
; GFX10-WGP-LABEL: flat_system_one_as_unordered_load:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -11554,6 +12251,10 @@ define amdgpu_kernel void @flat_system_one_as_unordered_load(
;
; GFX10-CU-LABEL: flat_system_one_as_unordered_load:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -11582,6 +12283,8 @@ define amdgpu_kernel void @flat_system_one_as_unordered_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_unordered_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11594,6 +12297,8 @@ define amdgpu_kernel void @flat_system_one_as_unordered_load(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_unordered_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11693,6 +12398,9 @@ entry:
define amdgpu_kernel void @flat_system_one_as_monotonic_load(
; GFX7-LABEL: flat_system_one_as_monotonic_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11707,6 +12415,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_load(
;
; GFX10-WGP-LABEL: flat_system_one_as_monotonic_load:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -11721,6 +12433,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_load(
;
; GFX10-CU-LABEL: flat_system_one_as_monotonic_load:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -11749,6 +12465,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11761,6 +12479,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_load(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11860,6 +12580,9 @@ entry:
define amdgpu_kernel void @flat_system_one_as_acquire_load(
; GFX7-LABEL: flat_system_one_as_acquire_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11876,6 +12599,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_load(
;
; GFX10-WGP-LABEL: flat_system_one_as_acquire_load:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -11893,6 +12620,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_load(
;
; GFX10-CU-LABEL: flat_system_one_as_acquire_load:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -11925,6 +12656,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11940,6 +12673,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_load(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12054,6 +12789,9 @@ entry:
define amdgpu_kernel void @flat_system_one_as_seq_cst_load(
; GFX7-LABEL: flat_system_one_as_seq_cst_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12071,6 +12809,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_load(
;
; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_load:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -12090,6 +12832,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_load(
;
; GFX10-CU-LABEL: flat_system_one_as_seq_cst_load:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -12125,6 +12871,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12141,6 +12889,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_load(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12274,6 +13024,9 @@ entry:
define amdgpu_kernel void @flat_system_one_as_unordered_store(
; GFX7-LABEL: flat_system_one_as_unordered_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12285,6 +13038,10 @@ define amdgpu_kernel void @flat_system_one_as_unordered_store(
;
; GFX10-WGP-LABEL: flat_system_one_as_unordered_store:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -12296,6 +13053,10 @@ define amdgpu_kernel void @flat_system_one_as_unordered_store(
;
; GFX10-CU-LABEL: flat_system_one_as_unordered_store:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -12318,6 +13079,8 @@ define amdgpu_kernel void @flat_system_one_as_unordered_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_unordered_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12328,6 +13091,8 @@ define amdgpu_kernel void @flat_system_one_as_unordered_store(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_unordered_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12408,6 +13173,9 @@ entry:
define amdgpu_kernel void @flat_system_one_as_monotonic_store(
; GFX7-LABEL: flat_system_one_as_monotonic_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12419,6 +13187,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_store(
;
; GFX10-WGP-LABEL: flat_system_one_as_monotonic_store:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -12430,6 +13202,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_store(
;
; GFX10-CU-LABEL: flat_system_one_as_monotonic_store:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -12452,6 +13228,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12462,6 +13240,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_store(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12542,6 +13322,9 @@ entry:
define amdgpu_kernel void @flat_system_one_as_release_store(
; GFX7-LABEL: flat_system_one_as_release_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12554,6 +13337,10 @@ define amdgpu_kernel void @flat_system_one_as_release_store(
;
; GFX10-WGP-LABEL: flat_system_one_as_release_store:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -12567,6 +13354,10 @@ define amdgpu_kernel void @flat_system_one_as_release_store(
;
; GFX10-CU-LABEL: flat_system_one_as_release_store:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -12592,6 +13383,8 @@ define amdgpu_kernel void @flat_system_one_as_release_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12604,6 +13397,8 @@ define amdgpu_kernel void @flat_system_one_as_release_store(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12704,6 +13499,9 @@ entry:
define amdgpu_kernel void @flat_system_one_as_seq_cst_store(
; GFX7-LABEL: flat_system_one_as_seq_cst_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12716,6 +13514,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_store(
;
; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_store:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -12729,6 +13531,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_store(
;
; GFX10-CU-LABEL: flat_system_one_as_seq_cst_store:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -12754,6 +13560,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12766,6 +13574,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_store(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12866,6 +13676,9 @@ entry:
define amdgpu_kernel void @flat_system_one_as_monotonic_atomicrmw(
; GFX7-LABEL: flat_system_one_as_monotonic_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12877,6 +13690,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_atomicrmw(
;
; GFX10-WGP-LABEL: flat_system_one_as_monotonic_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -12888,6 +13705,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_atomicrmw(
;
; GFX10-CU-LABEL: flat_system_one_as_monotonic_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -12910,6 +13731,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12920,6 +13743,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -13000,6 +13825,9 @@ entry:
define amdgpu_kernel void @flat_system_one_as_acquire_atomicrmw(
; GFX7-LABEL: flat_system_one_as_acquire_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -13013,6 +13841,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_atomicrmw(
;
; GFX10-WGP-LABEL: flat_system_one_as_acquire_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -13027,6 +13859,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_atomicrmw(
;
; GFX10-CU-LABEL: flat_system_one_as_acquire_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -13053,6 +13889,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -13066,6 +13904,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -13163,6 +14003,9 @@ entry:
define amdgpu_kernel void @flat_system_one_as_release_atomicrmw(
; GFX7-LABEL: flat_system_one_as_release_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -13175,6 +14018,10 @@ define amdgpu_kernel void @flat_system_one_as_release_atomicrmw(
;
; GFX10-WGP-LABEL: flat_system_one_as_release_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -13188,6 +14035,10 @@ define amdgpu_kernel void @flat_system_one_as_release_atomicrmw(
;
; GFX10-CU-LABEL: flat_system_one_as_release_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -13213,6 +14064,8 @@ define amdgpu_kernel void @flat_system_one_as_release_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -13225,6 +14078,8 @@ define amdgpu_kernel void @flat_system_one_as_release_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -13325,6 +14180,9 @@ entry:
define amdgpu_kernel void @flat_system_one_as_acq_rel_atomicrmw(
; GFX7-LABEL: flat_system_one_as_acq_rel_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -13339,6 +14197,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_atomicrmw(
;
; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -13355,6 +14217,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_atomicrmw(
;
; GFX10-CU-LABEL: flat_system_one_as_acq_rel_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -13384,6 +14250,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -13399,6 +14267,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -13516,6 +14386,9 @@ entry:
define amdgpu_kernel void @flat_system_one_as_seq_cst_atomicrmw(
; GFX7-LABEL: flat_system_one_as_seq_cst_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -13530,6 +14403,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_atomicrmw(
;
; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -13546,6 +14423,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_atomicrmw(
;
; GFX10-CU-LABEL: flat_system_one_as_seq_cst_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -13575,6 +14456,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -13590,6 +14473,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -13707,6 +14592,9 @@ entry:
define amdgpu_kernel void @flat_system_one_as_acquire_ret_atomicrmw(
; GFX7-LABEL: flat_system_one_as_acquire_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -13724,6 +14612,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_ret_atomicrmw(
;
; GFX10-WGP-LABEL: flat_system_one_as_acquire_ret_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -13742,6 +14634,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_ret_atomicrmw(
;
; GFX10-CU-LABEL: flat_system_one_as_acquire_ret_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -13776,6 +14672,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_ret_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_ret_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -13792,6 +14690,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_ret_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_ret_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -13913,6 +14813,9 @@ entry:
define amdgpu_kernel void @flat_system_one_as_acq_rel_ret_atomicrmw(
; GFX7-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -13931,6 +14834,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_ret_atomicrmw(
;
; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -13951,6 +14858,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_ret_atomicrmw(
;
; GFX10-CU-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -13988,6 +14899,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_ret_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -14006,6 +14919,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_ret_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -14151,6 +15066,9 @@ entry:
define amdgpu_kernel void @flat_system_one_as_seq_cst_ret_atomicrmw(
; GFX7-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -14169,6 +15087,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_ret_atomicrmw(
;
; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -14189,6 +15111,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_ret_atomicrmw(
;
; GFX10-CU-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -14226,6 +15152,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_ret_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -14244,6 +15172,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_ret_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -14389,6 +15319,9 @@ entry:
define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_cmpxchg(
; GFX7-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -14414,6 +15347,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -14439,6 +15376,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -14489,6 +15430,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -14503,6 +15446,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -14612,6 +15557,9 @@ entry:
define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_cmpxchg(
; GFX7-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -14639,6 +15587,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -14667,6 +15619,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -14721,6 +15677,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -14738,6 +15696,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -14864,6 +15824,9 @@ entry:
define amdgpu_kernel void @flat_system_one_as_release_monotonic_cmpxchg(
; GFX7-LABEL: flat_system_one_as_release_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -14890,6 +15853,10 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_one_as_release_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -14917,6 +15884,10 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_one_as_release_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -14970,6 +15941,8 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -14986,6 +15959,8 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15115,6 +16090,9 @@ entry:
define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg(
; GFX7-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -15143,6 +16121,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -15173,6 +16155,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -15230,6 +16216,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15249,6 +16237,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15395,6 +16385,9 @@ entry:
define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg(
; GFX7-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -15423,6 +16416,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -15453,6 +16450,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -15510,6 +16511,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15529,6 +16532,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15675,6 +16680,9 @@ entry:
define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_cmpxchg(
; GFX7-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -15702,6 +16710,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -15730,6 +16742,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -15784,6 +16800,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15801,6 +16819,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15927,6 +16947,9 @@ entry:
define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg(
; GFX7-LABEL: flat_system_one_as_acquire_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -15954,6 +16977,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_one_as_acquire_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -15982,6 +17009,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_one_as_acquire_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -16036,6 +17067,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -16053,6 +17086,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -16179,6 +17214,9 @@ entry:
define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg(
; GFX7-LABEL: flat_system_one_as_release_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -16207,6 +17245,10 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_one_as_release_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -16237,6 +17279,10 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_one_as_release_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -16294,6 +17340,8 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -16313,6 +17361,8 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -16459,6 +17509,9 @@ entry:
define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg(
; GFX7-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -16487,6 +17540,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -16517,6 +17574,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -16574,6 +17635,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -16593,6 +17656,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -16739,6 +17804,9 @@ entry:
define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg(
; GFX7-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -16767,6 +17835,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -16797,6 +17869,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -16854,6 +17930,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -16873,6 +17951,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -17019,6 +18099,9 @@ entry:
define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_cmpxchg(
; GFX7-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -17047,6 +18130,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -17077,6 +18164,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -17134,6 +18225,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -17153,6 +18246,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -17299,6 +18394,9 @@ entry:
define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_cmpxchg(
; GFX7-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -17327,6 +18425,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -17357,6 +18459,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -17414,6 +18520,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -17433,6 +18541,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -17579,6 +18689,9 @@ entry:
define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg(
; GFX7-LABEL: flat_system_one_as_release_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -17607,6 +18720,10 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_one_as_release_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -17637,6 +18754,10 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_one_as_release_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -17694,6 +18815,8 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -17713,6 +18836,8 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -17859,6 +18984,9 @@ entry:
define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg(
; GFX7-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -17887,6 +19015,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -17917,6 +19049,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -17974,6 +19110,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -17993,6 +19131,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -18139,6 +19279,9 @@ entry:
define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg(
; GFX7-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -18167,6 +19310,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -18197,6 +19344,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -18254,6 +19405,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -18273,6 +19426,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -18419,6 +19574,9 @@ entry:
define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -18448,6 +19606,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -18477,6 +19639,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -18535,6 +19701,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -18552,6 +19720,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -18688,6 +19858,9 @@ entry:
define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -18719,6 +19892,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -18751,6 +19928,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -18813,6 +19994,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -18833,6 +20016,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -18984,6 +20169,9 @@ entry:
define amdgpu_kernel void @flat_system_one_as_release_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -19014,6 +20202,10 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -19045,6 +20237,10 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -19106,6 +20302,8 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -19125,6 +20323,8 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -19281,6 +20481,9 @@ entry:
define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -19313,6 +20516,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -19347,6 +20554,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -19412,6 +20623,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -19434,6 +20647,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -19609,6 +20824,9 @@ entry:
define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -19641,6 +20859,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -19675,6 +20897,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -19740,6 +20966,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -19762,6 +20990,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -19937,6 +21167,9 @@ entry:
define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -19968,6 +21201,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -20000,6 +21237,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -20062,6 +21303,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -20082,6 +21325,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -20237,6 +21482,9 @@ entry:
define amdgpu_kernel void @flat_system_one_as_acquire_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -20268,6 +21516,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -20300,6 +21552,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -20362,6 +21618,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -20382,6 +21640,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -20533,6 +21793,9 @@ entry:
define amdgpu_kernel void @flat_system_one_as_release_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -20565,6 +21828,10 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -20599,6 +21866,10 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -20664,6 +21935,8 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -20686,6 +21959,8 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -20861,6 +22136,9 @@ entry:
define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -20893,6 +22171,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -20927,6 +22209,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -20992,6 +22278,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -21014,6 +22302,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -21189,6 +22479,9 @@ entry:
define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -21221,6 +22514,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -21255,6 +22552,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -21320,6 +22621,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -21342,6 +22645,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -21517,6 +22822,9 @@ entry:
define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -21549,6 +22857,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -21583,6 +22895,10 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -21648,6 +22964,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -21670,6 +22988,8 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -21845,6 +23165,9 @@ entry:
define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -21877,6 +23200,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -21911,6 +23238,10 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -21976,6 +23307,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -21998,6 +23331,8 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -22169,6 +23504,9 @@ entry:
define amdgpu_kernel void @flat_system_one_as_release_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -22201,6 +23539,10 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -22235,6 +23577,10 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -22300,6 +23646,8 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -22322,6 +23670,8 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -22497,6 +23847,9 @@ entry:
define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -22529,6 +23882,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -22563,6 +23920,10 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -22628,6 +23989,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -22650,6 +24013,8 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -22825,6 +24190,9 @@ entry:
define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -22857,6 +24225,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -22891,6 +24263,10 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -22956,6 +24332,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -22978,6 +24356,8 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll
index e1f82a70b4c0a..a88e0e217fdb4 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll
@@ -11,6 +11,9 @@
define amdgpu_kernel void @flat_nontemporal_load_0(
; GFX7-LABEL: flat_nontemporal_load_0:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -26,6 +29,10 @@ define amdgpu_kernel void @flat_nontemporal_load_0(
;
; GFX10-WGP-LABEL: flat_nontemporal_load_0:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -41,6 +48,10 @@ define amdgpu_kernel void @flat_nontemporal_load_0(
;
; GFX10-CU-LABEL: flat_nontemporal_load_0:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -142,6 +153,9 @@ entry:
define amdgpu_kernel void @flat_nontemporal_load_1(
; GFX7-LABEL: flat_nontemporal_load_1:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -172,6 +186,10 @@ define amdgpu_kernel void @flat_nontemporal_load_1(
;
; GFX10-WGP-LABEL: flat_nontemporal_load_1:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_nop 0
@@ -202,6 +220,10 @@ define amdgpu_kernel void @flat_nontemporal_load_1(
;
; GFX10-CU-LABEL: flat_nontemporal_load_1:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_nop 0
@@ -405,6 +427,9 @@ entry:
define amdgpu_kernel void @flat_nontemporal_store_0(
; GFX7-LABEL: flat_nontemporal_store_0:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -420,6 +445,10 @@ define amdgpu_kernel void @flat_nontemporal_store_0(
;
; GFX10-WGP-LABEL: flat_nontemporal_store_0:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -435,6 +464,10 @@ define amdgpu_kernel void @flat_nontemporal_store_0(
;
; GFX10-CU-LABEL: flat_nontemporal_store_0:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -540,6 +573,9 @@ entry:
define amdgpu_kernel void @flat_nontemporal_store_1(
; GFX7-LABEL: flat_nontemporal_store_1:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -569,6 +605,10 @@ define amdgpu_kernel void @flat_nontemporal_store_1(
;
; GFX10-WGP-LABEL: flat_nontemporal_store_1:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -597,6 +637,10 @@ define amdgpu_kernel void @flat_nontemporal_store_1(
;
; GFX10-CU-LABEL: flat_nontemporal_store_1:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -799,6 +843,9 @@ entry:
define amdgpu_kernel void @flat_volatile_workgroup_acquire_load(
; GFX7-LABEL: flat_volatile_workgroup_acquire_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -814,6 +861,10 @@ define amdgpu_kernel void @flat_volatile_workgroup_acquire_load(
;
; GFX10-WGP-LABEL: flat_volatile_workgroup_acquire_load:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -829,6 +880,10 @@ define amdgpu_kernel void @flat_volatile_workgroup_acquire_load(
;
; GFX10-CU-LABEL: flat_volatile_workgroup_acquire_load:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -926,6 +981,9 @@ entry:
define amdgpu_kernel void @flat_volatile_workgroup_release_store(
; GFX7-LABEL: flat_volatile_workgroup_release_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -938,6 +996,10 @@ define amdgpu_kernel void @flat_volatile_workgroup_release_store(
;
; GFX10-WGP-LABEL: flat_volatile_workgroup_release_store:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -951,6 +1013,10 @@ define amdgpu_kernel void @flat_volatile_workgroup_release_store(
;
; GFX10-CU-LABEL: flat_volatile_workgroup_release_store:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll
index 588f06f1be054..7c637a20ab47b 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll
@@ -15,6 +15,9 @@
define amdgpu_kernel void @flat_wavefront_unordered_load(
; GFX7-LABEL: flat_wavefront_unordered_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -29,6 +32,10 @@ define amdgpu_kernel void @flat_wavefront_unordered_load(
;
; GFX10-WGP-LABEL: flat_wavefront_unordered_load:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -43,6 +50,10 @@ define amdgpu_kernel void @flat_wavefront_unordered_load(
;
; GFX10-CU-LABEL: flat_wavefront_unordered_load:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -71,6 +82,8 @@ define amdgpu_kernel void @flat_wavefront_unordered_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_unordered_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -83,6 +96,8 @@ define amdgpu_kernel void @flat_wavefront_unordered_load(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_unordered_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -182,6 +197,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_monotonic_load(
; GFX7-LABEL: flat_wavefront_monotonic_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -196,6 +214,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_load(
;
; GFX10-WGP-LABEL: flat_wavefront_monotonic_load:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -210,6 +232,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_load(
;
; GFX10-CU-LABEL: flat_wavefront_monotonic_load:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -238,6 +264,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -250,6 +278,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_load(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -349,6 +379,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_acquire_load(
; GFX7-LABEL: flat_wavefront_acquire_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -363,6 +396,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_load(
;
; GFX10-WGP-LABEL: flat_wavefront_acquire_load:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -377,6 +414,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_load(
;
; GFX10-CU-LABEL: flat_wavefront_acquire_load:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -405,6 +446,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -417,6 +460,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_load(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -516,6 +561,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_seq_cst_load(
; GFX7-LABEL: flat_wavefront_seq_cst_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -530,6 +578,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_load(
;
; GFX10-WGP-LABEL: flat_wavefront_seq_cst_load:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -544,6 +596,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_load(
;
; GFX10-CU-LABEL: flat_wavefront_seq_cst_load:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -572,6 +628,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -584,6 +642,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_load(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -683,6 +743,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_unordered_store(
; GFX7-LABEL: flat_wavefront_unordered_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -694,6 +757,10 @@ define amdgpu_kernel void @flat_wavefront_unordered_store(
;
; GFX10-WGP-LABEL: flat_wavefront_unordered_store:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -705,6 +772,10 @@ define amdgpu_kernel void @flat_wavefront_unordered_store(
;
; GFX10-CU-LABEL: flat_wavefront_unordered_store:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -727,6 +798,8 @@ define amdgpu_kernel void @flat_wavefront_unordered_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_unordered_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -737,6 +810,8 @@ define amdgpu_kernel void @flat_wavefront_unordered_store(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_unordered_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -817,6 +892,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_monotonic_store(
; GFX7-LABEL: flat_wavefront_monotonic_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -828,6 +906,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_store(
;
; GFX10-WGP-LABEL: flat_wavefront_monotonic_store:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -839,6 +921,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_store(
;
; GFX10-CU-LABEL: flat_wavefront_monotonic_store:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -861,6 +947,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -871,6 +959,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_store(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -951,6 +1041,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_release_store(
; GFX7-LABEL: flat_wavefront_release_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -962,6 +1055,10 @@ define amdgpu_kernel void @flat_wavefront_release_store(
;
; GFX10-WGP-LABEL: flat_wavefront_release_store:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -973,6 +1070,10 @@ define amdgpu_kernel void @flat_wavefront_release_store(
;
; GFX10-CU-LABEL: flat_wavefront_release_store:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -995,6 +1096,8 @@ define amdgpu_kernel void @flat_wavefront_release_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1005,6 +1108,8 @@ define amdgpu_kernel void @flat_wavefront_release_store(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1085,6 +1190,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_seq_cst_store(
; GFX7-LABEL: flat_wavefront_seq_cst_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1096,6 +1204,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_store(
;
; GFX10-WGP-LABEL: flat_wavefront_seq_cst_store:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -1107,6 +1219,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_store(
;
; GFX10-CU-LABEL: flat_wavefront_seq_cst_store:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -1129,6 +1245,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1139,6 +1257,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_store(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1219,6 +1339,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_monotonic_atomicrmw(
; GFX7-LABEL: flat_wavefront_monotonic_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1230,6 +1353,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_atomicrmw(
;
; GFX10-WGP-LABEL: flat_wavefront_monotonic_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -1241,6 +1368,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_atomicrmw(
;
; GFX10-CU-LABEL: flat_wavefront_monotonic_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -1263,6 +1394,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1273,6 +1406,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1353,6 +1488,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_acquire_atomicrmw(
; GFX7-LABEL: flat_wavefront_acquire_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1364,6 +1502,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_atomicrmw(
;
; GFX10-WGP-LABEL: flat_wavefront_acquire_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -1375,6 +1517,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_atomicrmw(
;
; GFX10-CU-LABEL: flat_wavefront_acquire_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -1397,6 +1543,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1407,6 +1555,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1487,6 +1637,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_release_atomicrmw(
; GFX7-LABEL: flat_wavefront_release_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1498,6 +1651,10 @@ define amdgpu_kernel void @flat_wavefront_release_atomicrmw(
;
; GFX10-WGP-LABEL: flat_wavefront_release_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -1509,6 +1666,10 @@ define amdgpu_kernel void @flat_wavefront_release_atomicrmw(
;
; GFX10-CU-LABEL: flat_wavefront_release_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -1531,6 +1692,8 @@ define amdgpu_kernel void @flat_wavefront_release_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1541,6 +1704,8 @@ define amdgpu_kernel void @flat_wavefront_release_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1621,6 +1786,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_acq_rel_atomicrmw(
; GFX7-LABEL: flat_wavefront_acq_rel_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1632,6 +1800,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_atomicrmw(
;
; GFX10-WGP-LABEL: flat_wavefront_acq_rel_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -1643,6 +1815,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_atomicrmw(
;
; GFX10-CU-LABEL: flat_wavefront_acq_rel_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -1665,6 +1841,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1675,6 +1853,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1755,6 +1935,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_seq_cst_atomicrmw(
; GFX7-LABEL: flat_wavefront_seq_cst_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1766,6 +1949,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_atomicrmw(
;
; GFX10-WGP-LABEL: flat_wavefront_seq_cst_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -1777,6 +1964,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_atomicrmw(
;
; GFX10-CU-LABEL: flat_wavefront_seq_cst_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -1799,6 +1990,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1809,6 +2002,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1889,6 +2084,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_acquire_ret_atomicrmw(
; GFX7-LABEL: flat_wavefront_acquire_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1904,6 +2102,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_ret_atomicrmw(
;
; GFX10-WGP-LABEL: flat_wavefront_acquire_ret_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -1919,6 +2121,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_ret_atomicrmw(
;
; GFX10-CU-LABEL: flat_wavefront_acquire_ret_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -1949,6 +2155,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_ret_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_ret_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1962,6 +2170,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_ret_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_ret_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -2068,6 +2278,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_acq_rel_ret_atomicrmw(
; GFX7-LABEL: flat_wavefront_acq_rel_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -2083,6 +2296,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_ret_atomicrmw(
;
; GFX10-WGP-LABEL: flat_wavefront_acq_rel_ret_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -2098,6 +2315,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_ret_atomicrmw(
;
; GFX10-CU-LABEL: flat_wavefront_acq_rel_ret_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -2128,6 +2349,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_ret_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_ret_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -2141,6 +2364,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_ret_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_ret_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -2247,6 +2472,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_seq_cst_ret_atomicrmw(
; GFX7-LABEL: flat_wavefront_seq_cst_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -2262,6 +2490,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_ret_atomicrmw(
;
; GFX10-WGP-LABEL: flat_wavefront_seq_cst_ret_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -2277,6 +2509,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_ret_atomicrmw(
;
; GFX10-CU-LABEL: flat_wavefront_seq_cst_ret_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -2307,6 +2543,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_ret_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_ret_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -2320,6 +2558,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_ret_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_ret_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -2426,6 +2666,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_cmpxchg(
; GFX7-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -2451,6 +2694,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -2476,6 +2723,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -2526,6 +2777,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -2540,6 +2793,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -2649,6 +2904,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_acquire_monotonic_cmpxchg(
; GFX7-LABEL: flat_wavefront_acquire_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -2674,6 +2932,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_acquire_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -2699,6 +2961,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_acquire_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -2749,6 +3015,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -2763,6 +3031,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -2872,6 +3142,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_release_monotonic_cmpxchg(
; GFX7-LABEL: flat_wavefront_release_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -2897,6 +3170,10 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_release_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -2922,6 +3199,10 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_release_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -2972,6 +3253,8 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -2986,6 +3269,8 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3095,6 +3380,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_cmpxchg(
; GFX7-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3120,6 +3408,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3145,6 +3437,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3195,6 +3491,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3209,6 +3507,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3318,6 +3618,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_cmpxchg(
; GFX7-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3343,6 +3646,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3368,6 +3675,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3418,6 +3729,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3432,6 +3745,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3541,6 +3856,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_monotonic_acquire_cmpxchg(
; GFX7-LABEL: flat_wavefront_monotonic_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3566,6 +3884,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_monotonic_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3591,6 +3913,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_monotonic_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3641,6 +3967,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3655,6 +3983,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3764,6 +4094,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_acquire_acquire_cmpxchg(
; GFX7-LABEL: flat_wavefront_acquire_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3789,6 +4122,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_acquire_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3814,6 +4151,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_acquire_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3864,6 +4205,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3878,6 +4221,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3987,6 +4332,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_release_acquire_cmpxchg(
; GFX7-LABEL: flat_wavefront_release_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4012,6 +4360,10 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_release_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4037,6 +4389,10 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_release_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4087,6 +4443,8 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4101,6 +4459,8 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4210,6 +4570,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_cmpxchg(
; GFX7-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4235,6 +4598,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4260,6 +4627,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4310,6 +4681,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4324,6 +4697,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4433,6 +4808,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_cmpxchg(
; GFX7-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4458,6 +4836,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4483,6 +4865,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4533,6 +4919,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4547,6 +4935,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4656,6 +5046,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_cmpxchg(
; GFX7-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4681,6 +5074,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4706,6 +5103,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4756,6 +5157,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4770,6 +5173,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4879,6 +5284,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_cmpxchg(
; GFX7-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4904,6 +5312,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4929,6 +5341,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4979,6 +5395,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4993,6 +5411,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -5102,6 +5522,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_release_seq_cst_cmpxchg(
; GFX7-LABEL: flat_wavefront_release_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -5127,6 +5550,10 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_release_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -5152,6 +5579,10 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_release_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -5202,6 +5633,8 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -5216,6 +5649,8 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -5325,6 +5760,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_cmpxchg(
; GFX7-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -5350,6 +5788,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -5375,6 +5817,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -5425,6 +5871,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -5439,6 +5887,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -5548,6 +5998,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_cmpxchg(
; GFX7-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -5573,6 +6026,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -5598,6 +6055,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -5648,6 +6109,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -5662,6 +6125,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -5771,6 +6236,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -5800,6 +6268,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -5829,6 +6301,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -5887,6 +6363,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -5904,6 +6382,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -6040,6 +6520,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_acquire_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -6069,6 +6552,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -6098,6 +6585,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -6156,6 +6647,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -6173,6 +6666,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -6309,6 +6804,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_release_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -6338,6 +6836,10 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -6367,6 +6869,10 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -6425,6 +6931,8 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -6442,6 +6950,8 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -6578,6 +7088,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -6607,6 +7120,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -6636,6 +7153,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -6694,6 +7215,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -6711,6 +7234,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -6847,6 +7372,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -6876,6 +7404,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -6905,6 +7437,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -6963,6 +7499,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -6980,6 +7518,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -7116,6 +7656,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_monotonic_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -7145,6 +7688,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -7174,6 +7721,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -7232,6 +7783,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -7249,6 +7802,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -7385,6 +7940,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_acquire_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -7414,6 +7972,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -7443,6 +8005,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -7501,6 +8067,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -7518,6 +8086,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -7654,6 +8224,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_release_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_wavefront_release_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -7683,6 +8256,10 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_release_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -7712,6 +8289,10 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_release_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -7770,6 +8351,8 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -7787,6 +8370,8 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -7923,6 +8508,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -7952,6 +8540,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -7981,6 +8573,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -8039,6 +8635,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -8056,6 +8654,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -8192,6 +8792,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -8221,6 +8824,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -8250,6 +8857,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -8308,6 +8919,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -8325,6 +8938,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -8461,6 +9076,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -8490,6 +9108,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -8519,6 +9141,10 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -8577,6 +9203,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -8594,6 +9222,8 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -8730,6 +9360,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -8759,6 +9392,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -8788,6 +9425,10 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -8846,6 +9487,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -8863,6 +9506,8 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -8999,6 +9644,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_release_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -9028,6 +9676,10 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -9057,6 +9709,10 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -9115,6 +9771,8 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -9132,6 +9790,8 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -9268,6 +9928,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -9297,6 +9960,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -9326,6 +9993,10 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -9384,6 +10055,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -9401,6 +10074,8 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -9537,6 +10212,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -9566,6 +10244,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -9595,6 +10277,10 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -9653,6 +10339,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -9670,6 +10358,8 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -9806,6 +10496,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_unordered_load(
; GFX7-LABEL: flat_wavefront_one_as_unordered_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -9820,6 +10513,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_load(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_unordered_load:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -9834,6 +10531,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_load(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_unordered_load:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -9862,6 +10563,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_unordered_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -9874,6 +10577,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_load(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_unordered_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -9973,6 +10678,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_monotonic_load(
; GFX7-LABEL: flat_wavefront_one_as_monotonic_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -9987,6 +10695,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_load(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_load:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -10001,6 +10713,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_load(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_load:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -10029,6 +10745,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -10041,6 +10759,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_load(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -10140,6 +10860,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_acquire_load(
; GFX7-LABEL: flat_wavefront_one_as_acquire_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -10154,6 +10877,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_load(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_load:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -10168,6 +10895,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_load(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_load:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -10196,6 +10927,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -10208,6 +10941,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_load(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -10307,6 +11042,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_load(
; GFX7-LABEL: flat_wavefront_one_as_seq_cst_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -10321,6 +11059,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_load(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_load:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -10335,6 +11077,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_load(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_load:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -10363,6 +11109,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -10375,6 +11123,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_load(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -10474,6 +11224,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_unordered_store(
; GFX7-LABEL: flat_wavefront_one_as_unordered_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -10485,6 +11238,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_store(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_unordered_store:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -10496,6 +11253,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_store(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_unordered_store:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -10518,6 +11279,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_unordered_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -10528,6 +11291,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_store(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_unordered_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -10608,6 +11373,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_monotonic_store(
; GFX7-LABEL: flat_wavefront_one_as_monotonic_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -10619,6 +11387,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_store(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_store:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -10630,6 +11402,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_store(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_store:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -10652,6 +11428,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -10662,6 +11440,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_store(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -10742,6 +11522,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_release_store(
; GFX7-LABEL: flat_wavefront_one_as_release_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -10753,6 +11536,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_store(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_release_store:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -10764,6 +11551,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_store(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_release_store:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -10786,6 +11577,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -10796,6 +11589,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_store(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -10876,6 +11671,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_store(
; GFX7-LABEL: flat_wavefront_one_as_seq_cst_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -10887,6 +11685,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_store(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_store:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -10898,6 +11700,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_store(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_store:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -10920,6 +11726,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -10930,6 +11738,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_store(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11010,6 +11820,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_monotonic_atomicrmw(
; GFX7-LABEL: flat_wavefront_one_as_monotonic_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11021,6 +11834,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_atomicrmw(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -11032,6 +11849,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_atomicrmw(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -11054,6 +11875,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11064,6 +11887,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11144,6 +11969,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_acquire_atomicrmw(
; GFX7-LABEL: flat_wavefront_one_as_acquire_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11155,6 +11983,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_atomicrmw(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -11166,6 +11998,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_atomicrmw(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -11188,6 +12024,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11198,6 +12036,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11278,6 +12118,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_release_atomicrmw(
; GFX7-LABEL: flat_wavefront_one_as_release_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11289,6 +12132,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_atomicrmw(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_release_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -11300,6 +12147,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_atomicrmw(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_release_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -11322,6 +12173,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11332,6 +12185,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11412,6 +12267,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_atomicrmw(
; GFX7-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11423,6 +12281,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_atomicrmw(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -11434,6 +12296,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_atomicrmw(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -11456,6 +12322,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11466,6 +12334,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11546,6 +12416,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_atomicrmw(
; GFX7-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11557,6 +12430,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_atomicrmw(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -11568,6 +12445,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_atomicrmw(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -11590,6 +12471,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11600,6 +12483,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11680,6 +12565,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_acquire_ret_atomicrmw(
; GFX7-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11695,6 +12583,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_ret_atomicrmw(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -11710,6 +12602,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_ret_atomicrmw(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -11740,6 +12636,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_ret_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11753,6 +12651,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_ret_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11859,6 +12759,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_ret_atomicrmw(
; GFX7-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11874,6 +12777,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_ret_atomicrmw(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -11889,6 +12796,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_ret_atomicrmw(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -11919,6 +12830,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_ret_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11932,6 +12845,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_ret_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12038,6 +12953,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_ret_atomicrmw(
; GFX7-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12053,6 +12971,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_ret_atomicrmw(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -12068,6 +12990,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_ret_atomicrmw(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -12098,6 +13024,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_ret_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12111,6 +13039,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_ret_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12217,6 +13147,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_cmpxchg(
; GFX7-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -12242,6 +13175,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -12267,6 +13204,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -12317,6 +13258,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -12331,6 +13274,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -12440,6 +13385,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_cmpxchg(
; GFX7-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -12465,6 +13413,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -12490,6 +13442,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -12540,6 +13496,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -12554,6 +13512,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -12663,6 +13623,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_release_monotonic_cmpxchg(
; GFX7-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -12688,6 +13651,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -12713,6 +13680,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -12763,6 +13734,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -12777,6 +13750,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -12886,6 +13861,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_cmpxchg(
; GFX7-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -12911,6 +13889,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -12936,6 +13918,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -12986,6 +13972,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -13000,6 +13988,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -13109,6 +14099,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_cmpxchg(
; GFX7-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -13134,6 +14127,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -13159,6 +14156,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -13209,6 +14210,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -13223,6 +14226,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -13332,6 +14337,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_cmpxchg(
; GFX7-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -13357,6 +14365,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -13382,6 +14394,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -13432,6 +14448,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -13446,6 +14464,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -13555,6 +14575,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_cmpxchg(
; GFX7-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -13580,6 +14603,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -13605,6 +14632,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -13655,6 +14686,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -13669,6 +14702,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -13778,6 +14813,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_cmpxchg(
; GFX7-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -13803,6 +14841,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -13828,6 +14870,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -13878,6 +14924,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -13892,6 +14940,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -14001,6 +15051,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_cmpxchg(
; GFX7-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -14026,6 +15079,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -14051,6 +15108,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -14101,6 +15162,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -14115,6 +15178,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -14224,6 +15289,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_cmpxchg(
; GFX7-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -14249,6 +15317,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -14274,6 +15346,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -14324,6 +15400,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -14338,6 +15416,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -14447,6 +15527,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_cmpxchg(
; GFX7-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -14472,6 +15555,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -14497,6 +15584,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -14547,6 +15638,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -14561,6 +15654,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -14670,6 +15765,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_cmpxchg(
; GFX7-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -14695,6 +15793,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -14720,6 +15822,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -14770,6 +15876,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -14784,6 +15892,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -14893,6 +16003,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_cmpxchg(
; GFX7-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -14918,6 +16031,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -14943,6 +16060,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -14993,6 +16114,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15007,6 +16130,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15116,6 +16241,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg(
; GFX7-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -15141,6 +16269,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -15166,6 +16298,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -15216,6 +16352,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15230,6 +16368,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15339,6 +16479,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg(
; GFX7-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -15364,6 +16507,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -15389,6 +16536,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -15439,6 +16590,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15453,6 +16606,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15562,6 +16717,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -15591,6 +16749,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -15620,6 +16782,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg
;
; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -15678,6 +16844,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15695,6 +16863,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15831,6 +17001,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -15860,6 +17033,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -15889,6 +17066,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -15947,6 +17128,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15964,6 +17147,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -16100,6 +17285,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -16129,6 +17317,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -16158,6 +17350,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -16216,6 +17412,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -16233,6 +17431,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -16369,6 +17569,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -16398,6 +17601,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -16427,6 +17634,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -16485,6 +17696,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -16502,6 +17715,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -16638,6 +17853,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -16667,6 +17885,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -16696,6 +17918,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -16754,6 +17980,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -16771,6 +17999,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -16907,6 +18137,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -16936,6 +18169,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -16965,6 +18202,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -17023,6 +18264,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -17040,6 +18283,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -17176,6 +18421,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -17205,6 +18453,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -17234,6 +18486,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -17292,6 +18548,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -17309,6 +18567,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -17445,6 +18705,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -17474,6 +18737,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -17503,6 +18770,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -17561,6 +18832,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -17578,6 +18851,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -17714,6 +18989,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -17743,6 +19021,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -17772,6 +19054,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -17830,6 +19116,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -17847,6 +19135,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -17983,6 +19273,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -18012,6 +19305,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -18041,6 +19338,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -18099,6 +19400,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -18116,6 +19419,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -18252,6 +19557,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -18281,6 +19589,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -18310,6 +19622,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -18368,6 +19684,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -18385,6 +19703,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -18521,6 +19841,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -18550,6 +19873,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -18579,6 +19906,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -18637,6 +19968,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -18654,6 +19987,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -18790,6 +20125,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -18819,6 +20157,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -18848,6 +20190,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -18906,6 +20252,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -18923,6 +20271,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -19059,6 +20409,9 @@ entry:
define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -19088,6 +20441,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -19117,6 +20474,10 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -19175,6 +20536,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -19192,6 +20555,8 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll
index ee7d79a8a8cbb..0fd4aa4a7a93f 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll
@@ -15,6 +15,9 @@
define amdgpu_kernel void @flat_workgroup_unordered_load(
; GFX7-LABEL: flat_workgroup_unordered_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -29,6 +32,10 @@ define amdgpu_kernel void @flat_workgroup_unordered_load(
;
; GFX10-WGP-LABEL: flat_workgroup_unordered_load:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -43,6 +50,10 @@ define amdgpu_kernel void @flat_workgroup_unordered_load(
;
; GFX10-CU-LABEL: flat_workgroup_unordered_load:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -71,6 +82,8 @@ define amdgpu_kernel void @flat_workgroup_unordered_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_unordered_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -83,6 +96,8 @@ define amdgpu_kernel void @flat_workgroup_unordered_load(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_unordered_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -182,6 +197,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_monotonic_load(
; GFX7-LABEL: flat_workgroup_monotonic_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -196,6 +214,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_load(
;
; GFX10-WGP-LABEL: flat_workgroup_monotonic_load:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -210,6 +232,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_load(
;
; GFX10-CU-LABEL: flat_workgroup_monotonic_load:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -238,6 +264,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -250,6 +278,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_load(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -349,6 +379,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_acquire_load(
; GFX7-LABEL: flat_workgroup_acquire_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -364,6 +397,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_load(
;
; GFX10-WGP-LABEL: flat_workgroup_acquire_load:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -379,6 +416,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_load(
;
; GFX10-CU-LABEL: flat_workgroup_acquire_load:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -409,6 +450,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -422,6 +465,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_load(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -528,6 +573,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_seq_cst_load(
; GFX7-LABEL: flat_workgroup_seq_cst_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -544,6 +592,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_load(
;
; GFX10-WGP-LABEL: flat_workgroup_seq_cst_load:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -561,6 +613,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_load(
;
; GFX10-CU-LABEL: flat_workgroup_seq_cst_load:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -593,6 +649,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -607,6 +665,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_load(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -726,6 +786,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_unordered_store(
; GFX7-LABEL: flat_workgroup_unordered_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -737,6 +800,10 @@ define amdgpu_kernel void @flat_workgroup_unordered_store(
;
; GFX10-WGP-LABEL: flat_workgroup_unordered_store:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -748,6 +815,10 @@ define amdgpu_kernel void @flat_workgroup_unordered_store(
;
; GFX10-CU-LABEL: flat_workgroup_unordered_store:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -770,6 +841,8 @@ define amdgpu_kernel void @flat_workgroup_unordered_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_unordered_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -780,6 +853,8 @@ define amdgpu_kernel void @flat_workgroup_unordered_store(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_unordered_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -860,6 +935,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_monotonic_store(
; GFX7-LABEL: flat_workgroup_monotonic_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -871,6 +949,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_store(
;
; GFX10-WGP-LABEL: flat_workgroup_monotonic_store:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -882,6 +964,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_store(
;
; GFX10-CU-LABEL: flat_workgroup_monotonic_store:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -904,6 +990,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -914,6 +1002,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_store(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -994,6 +1084,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_release_store(
; GFX7-LABEL: flat_workgroup_release_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1006,6 +1099,10 @@ define amdgpu_kernel void @flat_workgroup_release_store(
;
; GFX10-WGP-LABEL: flat_workgroup_release_store:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -1019,6 +1116,10 @@ define amdgpu_kernel void @flat_workgroup_release_store(
;
; GFX10-CU-LABEL: flat_workgroup_release_store:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -1043,6 +1144,8 @@ define amdgpu_kernel void @flat_workgroup_release_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_release_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1054,6 +1157,8 @@ define amdgpu_kernel void @flat_workgroup_release_store(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_release_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1145,6 +1250,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_seq_cst_store(
; GFX7-LABEL: flat_workgroup_seq_cst_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1157,6 +1265,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_store(
;
; GFX10-WGP-LABEL: flat_workgroup_seq_cst_store:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -1170,6 +1282,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_store(
;
; GFX10-CU-LABEL: flat_workgroup_seq_cst_store:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -1194,6 +1310,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1205,6 +1323,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_store(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1296,6 +1416,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_monotonic_atomicrmw(
; GFX7-LABEL: flat_workgroup_monotonic_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1307,6 +1430,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_atomicrmw(
;
; GFX10-WGP-LABEL: flat_workgroup_monotonic_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -1318,6 +1445,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_atomicrmw(
;
; GFX10-CU-LABEL: flat_workgroup_monotonic_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -1340,6 +1471,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1350,6 +1483,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1430,6 +1565,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_acquire_atomicrmw(
; GFX7-LABEL: flat_workgroup_acquire_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1442,6 +1580,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_atomicrmw(
;
; GFX10-WGP-LABEL: flat_workgroup_acquire_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -1456,6 +1598,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_atomicrmw(
;
; GFX10-CU-LABEL: flat_workgroup_acquire_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -1480,6 +1626,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1491,6 +1639,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1583,6 +1733,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_release_atomicrmw(
; GFX7-LABEL: flat_workgroup_release_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1595,6 +1748,10 @@ define amdgpu_kernel void @flat_workgroup_release_atomicrmw(
;
; GFX10-WGP-LABEL: flat_workgroup_release_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -1608,6 +1765,10 @@ define amdgpu_kernel void @flat_workgroup_release_atomicrmw(
;
; GFX10-CU-LABEL: flat_workgroup_release_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -1632,6 +1793,8 @@ define amdgpu_kernel void @flat_workgroup_release_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_release_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1643,6 +1806,8 @@ define amdgpu_kernel void @flat_workgroup_release_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_release_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1734,6 +1899,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw(
; GFX7-LABEL: flat_workgroup_acq_rel_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1747,6 +1915,10 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw(
;
; GFX10-WGP-LABEL: flat_workgroup_acq_rel_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -1763,6 +1935,10 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw(
;
; GFX10-CU-LABEL: flat_workgroup_acq_rel_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -1789,6 +1965,8 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1801,6 +1979,8 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1904,6 +2084,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw(
; GFX7-LABEL: flat_workgroup_seq_cst_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1917,6 +2100,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw(
;
; GFX10-WGP-LABEL: flat_workgroup_seq_cst_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -1933,6 +2120,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw(
;
; GFX10-CU-LABEL: flat_workgroup_seq_cst_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -1959,6 +2150,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -1971,6 +2164,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -2074,6 +2269,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_acquire_ret_atomicrmw(
; GFX7-LABEL: flat_workgroup_acquire_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -2090,6 +2288,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_ret_atomicrmw(
;
; GFX10-WGP-LABEL: flat_workgroup_acquire_ret_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -2106,6 +2308,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_ret_atomicrmw(
;
; GFX10-CU-LABEL: flat_workgroup_acquire_ret_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -2138,6 +2344,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_ret_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_ret_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -2152,6 +2360,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_ret_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_ret_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -2265,6 +2475,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_acq_rel_ret_atomicrmw(
; GFX7-LABEL: flat_workgroup_acq_rel_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -2282,6 +2495,10 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_ret_atomicrmw(
;
; GFX10-WGP-LABEL: flat_workgroup_acq_rel_ret_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -2300,6 +2517,10 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_ret_atomicrmw(
;
; GFX10-CU-LABEL: flat_workgroup_acq_rel_ret_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -2334,6 +2555,8 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_ret_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_ret_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -2349,6 +2572,8 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_ret_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_ret_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -2475,6 +2700,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_seq_cst_ret_atomicrmw(
; GFX7-LABEL: flat_workgroup_seq_cst_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -2492,6 +2720,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_ret_atomicrmw(
;
; GFX10-WGP-LABEL: flat_workgroup_seq_cst_ret_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -2510,6 +2742,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_ret_atomicrmw(
;
; GFX10-CU-LABEL: flat_workgroup_seq_cst_ret_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -2544,6 +2780,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_ret_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_ret_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -2559,6 +2797,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_ret_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_ret_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -2685,6 +2925,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_cmpxchg(
; GFX7-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -2710,6 +2953,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -2735,6 +2982,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -2785,6 +3036,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -2799,6 +3052,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -2908,6 +3163,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_acquire_monotonic_cmpxchg(
; GFX7-LABEL: flat_workgroup_acquire_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -2934,6 +3192,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_acquire_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -2962,6 +3224,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_acquire_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3014,6 +3280,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3029,6 +3297,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3150,6 +3420,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg(
; GFX7-LABEL: flat_workgroup_release_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3176,6 +3449,10 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_release_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3203,6 +3480,10 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_release_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3255,6 +3536,8 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_release_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3270,6 +3553,8 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_release_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3390,6 +3675,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg(
; GFX7-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3417,6 +3705,10 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3447,6 +3739,10 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3501,6 +3797,8 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3517,6 +3815,8 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3649,6 +3949,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg(
; GFX7-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3676,6 +3979,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3706,6 +4013,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3760,6 +4071,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3776,6 +4089,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -3908,6 +4223,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_monotonic_acquire_cmpxchg(
; GFX7-LABEL: flat_workgroup_monotonic_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3934,6 +4252,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_monotonic_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -3962,6 +4284,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_monotonic_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4014,6 +4340,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4029,6 +4357,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4150,6 +4480,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_acquire_acquire_cmpxchg(
; GFX7-LABEL: flat_workgroup_acquire_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4176,6 +4509,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_acquire_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4204,6 +4541,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_acquire_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4256,6 +4597,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4271,6 +4614,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4392,6 +4737,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg(
; GFX7-LABEL: flat_workgroup_release_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4419,6 +4767,10 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_release_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4449,6 +4801,10 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_release_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4503,6 +4859,8 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_release_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4519,6 +4877,8 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_release_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4651,6 +5011,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg(
; GFX7-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4678,6 +5041,10 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4708,6 +5075,10 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4762,6 +5133,8 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4778,6 +5151,8 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -4910,6 +5285,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg(
; GFX7-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4937,6 +5315,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -4967,6 +5349,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -5021,6 +5407,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -5037,6 +5425,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -5169,6 +5559,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg(
; GFX7-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -5196,6 +5589,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -5226,6 +5623,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -5280,6 +5681,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -5296,6 +5699,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -5428,6 +5833,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -5457,6 +5865,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -5486,6 +5898,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -5544,6 +5960,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -5561,6 +5979,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -5697,6 +6117,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_acquire_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -5727,6 +6150,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -5757,6 +6184,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -5817,6 +6248,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -5835,6 +6268,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -5978,6 +6413,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -6008,6 +6446,10 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -6039,6 +6481,10 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -6099,6 +6545,8 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -6117,6 +6565,8 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -6264,6 +6714,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -6295,6 +6748,10 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -6327,6 +6784,10 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -6389,6 +6850,8 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -6408,6 +6871,8 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -6564,6 +7029,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -6595,6 +7063,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -6627,6 +7099,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -6689,6 +7165,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -6708,6 +7186,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -6864,6 +7344,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_monotonic_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -6894,6 +7377,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -6924,6 +7411,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -6984,6 +7475,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -7002,6 +7495,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -7147,6 +7642,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_acquire_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -7177,6 +7675,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -7207,6 +7709,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -7267,6 +7773,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -7285,6 +7793,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -7428,6 +7938,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_workgroup_release_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -7459,6 +7972,10 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_release_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -7491,6 +8008,10 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_release_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -7553,6 +8074,8 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_release_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -7572,6 +8095,8 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_release_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -7728,6 +8253,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -7759,6 +8287,10 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -7791,6 +8323,10 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -7853,6 +8389,8 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -7872,6 +8410,8 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -8028,6 +8568,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -8059,6 +8602,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -8091,6 +8638,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -8153,6 +8704,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -8172,6 +8725,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -8328,6 +8883,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -8359,6 +8917,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -8391,6 +8953,10 @@ define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -8453,6 +9019,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -8472,6 +9040,8 @@ define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -8628,6 +9198,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -8659,6 +9232,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -8691,6 +9268,10 @@ define amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -8753,6 +9334,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -8772,6 +9355,8 @@ define amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -8926,6 +9511,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -8957,6 +9545,10 @@ define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -8989,6 +9581,10 @@ define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -9051,6 +9647,8 @@ define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -9070,6 +9668,8 @@ define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -9226,6 +9826,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -9257,6 +9860,10 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -9289,6 +9896,10 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -9351,6 +9962,8 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -9370,6 +9983,8 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -9526,6 +10141,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -9557,6 +10175,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -9589,6 +10211,10 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -9651,6 +10277,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -9670,6 +10298,8 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -9826,6 +10456,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_unordered_load(
; GFX7-LABEL: flat_workgroup_one_as_unordered_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -9840,6 +10473,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_load(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_unordered_load:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -9854,6 +10491,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_load(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_unordered_load:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -9882,6 +10523,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_unordered_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -9894,6 +10537,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_load(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_unordered_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -9993,6 +10638,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_monotonic_load(
; GFX7-LABEL: flat_workgroup_one_as_monotonic_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -10007,6 +10655,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_load(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_load:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -10021,6 +10673,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_load(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_load:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -10049,6 +10705,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -10061,6 +10719,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_load(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -10160,6 +10820,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_acquire_load(
; GFX7-LABEL: flat_workgroup_one_as_acquire_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -10174,6 +10837,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_load(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_load:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -10190,6 +10857,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_load(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_load:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -10218,6 +10889,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -10230,6 +10903,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_load(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -10335,6 +11010,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load(
; GFX7-LABEL: flat_workgroup_one_as_seq_cst_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -10349,6 +11027,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_load:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -10367,6 +11049,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_load:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -10395,6 +11081,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -10407,6 +11095,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -10522,6 +11212,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_unordered_store(
; GFX7-LABEL: flat_workgroup_one_as_unordered_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -10533,6 +11226,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_store(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_unordered_store:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -10544,6 +11241,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_store(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_unordered_store:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -10566,6 +11267,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_unordered_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -10576,6 +11279,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_store(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_unordered_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -10656,6 +11361,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_monotonic_store(
; GFX7-LABEL: flat_workgroup_one_as_monotonic_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -10667,6 +11375,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_store(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_store:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -10678,6 +11390,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_store(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_store:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -10700,6 +11416,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -10710,6 +11428,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_store(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -10790,6 +11510,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_release_store(
; GFX7-LABEL: flat_workgroup_one_as_release_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -10801,6 +11524,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_store(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_release_store:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -10814,6 +11541,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_store(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_release_store:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -10836,6 +11567,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -10846,6 +11579,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_store(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -10934,6 +11669,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store(
; GFX7-LABEL: flat_workgroup_one_as_seq_cst_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -10945,6 +11683,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_store:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -10958,6 +11700,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_store:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -10980,6 +11726,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_store:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -10990,6 +11738,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_store:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11078,6 +11828,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_monotonic_atomicrmw(
; GFX7-LABEL: flat_workgroup_one_as_monotonic_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11089,6 +11842,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_atomicrmw(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -11100,6 +11857,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_atomicrmw(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -11122,6 +11883,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11132,6 +11895,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11212,6 +11977,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_acquire_atomicrmw(
; GFX7-LABEL: flat_workgroup_one_as_acquire_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11223,6 +11991,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_atomicrmw(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -11236,6 +12008,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_atomicrmw(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -11258,6 +12034,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11268,6 +12046,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11356,6 +12136,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw(
; GFX7-LABEL: flat_workgroup_one_as_release_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11367,6 +12150,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_release_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -11380,6 +12167,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_release_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -11402,6 +12193,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11412,6 +12205,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11500,6 +12295,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw(
; GFX7-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11511,6 +12309,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -11526,6 +12328,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -11548,6 +12354,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11558,6 +12366,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11654,6 +12464,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw(
; GFX7-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11665,6 +12478,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -11680,6 +12497,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -11702,6 +12523,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11712,6 +12535,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11808,6 +12633,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_acquire_ret_atomicrmw(
; GFX7-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11823,6 +12651,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_ret_atomicrmw(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -11840,6 +12672,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_ret_atomicrmw(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -11870,6 +12706,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_ret_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11883,6 +12721,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_ret_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -11995,6 +12835,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw(
; GFX7-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12010,6 +12853,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -12029,6 +12876,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -12059,6 +12910,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12072,6 +12925,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12194,6 +13049,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw(
; GFX7-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12209,6 +13067,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0)
@@ -12228,6 +13090,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0)
@@ -12258,6 +13124,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12271,6 +13139,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0)
@@ -12393,6 +13263,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_monotonic_monotonic_cmpxchg(
; GFX7-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -12418,6 +13291,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -12443,6 +13320,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -12493,6 +13374,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -12507,6 +13390,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -12616,6 +13501,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_cmpxchg(
; GFX7-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -12641,6 +13529,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -12668,6 +13560,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -12718,6 +13614,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -12732,6 +13630,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -12849,6 +13749,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg(
; GFX7-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -12874,6 +13777,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -12901,6 +13808,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -12951,6 +13862,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -12965,6 +13878,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -13082,6 +13997,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg(
; GFX7-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -13107,6 +14025,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -13136,6 +14058,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -13186,6 +14112,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -13200,6 +14128,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -13325,6 +14255,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg(
; GFX7-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -13350,6 +14283,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -13379,6 +14316,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -13429,6 +14370,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -13443,6 +14386,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -13568,6 +14513,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_cmpxchg(
; GFX7-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -13593,6 +14541,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -13620,6 +14572,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -13670,6 +14626,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -13684,6 +14642,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -13801,6 +14761,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg(
; GFX7-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -13826,6 +14789,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -13853,6 +14820,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -13903,6 +14874,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -13917,6 +14890,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -14034,6 +15009,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg(
; GFX7-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -14059,6 +15037,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -14088,6 +15070,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -14138,6 +15124,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -14152,6 +15140,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -14277,6 +15267,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg(
; GFX7-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -14302,6 +15295,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -14331,6 +15328,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -14381,6 +15382,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -14395,6 +15398,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -14520,6 +15525,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg(
; GFX7-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -14545,6 +15553,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -14574,6 +15586,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -14624,6 +15640,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -14638,6 +15656,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -14763,6 +15783,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg(
; GFX7-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -14788,6 +15811,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -14817,6 +15844,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -14867,6 +15898,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -14881,6 +15914,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15006,6 +16041,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg(
; GFX7-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -15031,6 +16069,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -15060,6 +16102,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -15110,6 +16156,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15124,6 +16172,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15249,6 +16299,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg(
; GFX7-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -15274,6 +16327,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -15303,6 +16360,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -15353,6 +16414,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15367,6 +16430,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15492,6 +16557,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg(
; GFX7-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -15517,6 +16585,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -15546,6 +16618,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -15596,6 +16672,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15610,6 +16688,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15735,6 +16815,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg(
; GFX7-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -15760,6 +16843,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-WGP-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -15789,6 +16876,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX10-CU-NEXT: s_load_dword s7, s[4:5], 0x8
@@ -15839,6 +16930,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15853,6 +16946,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -15978,6 +17073,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg(
; GFX7-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -16007,6 +17105,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -16036,6 +17138,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -16094,6 +17200,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -16111,6 +17219,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -16247,6 +17357,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -16276,6 +17389,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -16307,6 +17424,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -16365,6 +17486,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -16382,6 +17505,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -16524,6 +17649,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -16553,6 +17681,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -16584,6 +17716,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -16642,6 +17778,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -16659,6 +17797,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -16803,6 +17943,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -16832,6 +17975,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -16865,6 +18012,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -16923,6 +18074,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -16940,6 +18093,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -17092,6 +18247,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg(
; GFX7-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -17121,6 +18279,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -17154,6 +18316,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -17212,6 +18378,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -17229,6 +18397,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -17381,6 +18551,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -17410,6 +18583,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -17441,6 +18618,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -17499,6 +18680,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -17516,6 +18699,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -17660,6 +18845,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -17689,6 +18877,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -17720,6 +18912,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -17778,6 +18974,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -17795,6 +18993,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -17937,6 +19137,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -17966,6 +19169,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -17999,6 +19206,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -18057,6 +19268,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -18074,6 +19287,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -18226,6 +19441,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -18255,6 +19473,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -18288,6 +19510,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -18346,6 +19572,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -18363,6 +19591,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -18515,6 +19745,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg(
; GFX7-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -18544,6 +19777,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -18577,6 +19814,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -18635,6 +19876,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -18652,6 +19895,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -18804,6 +20049,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -18833,6 +20081,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -18866,6 +20118,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -18924,6 +20180,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -18941,6 +20199,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -19093,6 +20353,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -19122,6 +20385,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -19155,6 +20422,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -19213,6 +20484,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -19230,6 +20503,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -19380,6 +20655,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -19409,6 +20687,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -19442,6 +20724,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -19500,6 +20786,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -19517,6 +20805,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -19669,6 +20959,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -19698,6 +20991,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -19731,6 +21028,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -19789,6 +21090,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -19806,6 +21109,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -19958,6 +21263,9 @@ entry:
define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg(
; GFX7-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -19987,6 +21295,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX10-WGP-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX10-WGP: ; %bb.0: ; %entry
+; GFX10-WGP-NEXT: s_add_u32 s12, s12, s17
+; GFX10-WGP-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-WGP-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-WGP-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-WGP-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -20020,6 +21332,10 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX10-CU-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX10-CU: ; %bb.0: ; %entry
+; GFX10-CU-NEXT: s_add_u32 s12, s12, s17
+; GFX10-CU-NEXT: s_addc_u32 s13, s13, 0
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12
+; GFX10-CU-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13
; GFX10-CU-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX10-CU-NEXT: s_load_dword s9, s[6:7], 0x8
@@ -20078,6 +21394,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX90A-NOTTGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
@@ -20095,6 +21413,8 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX90A-TGSPLIT-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
+; GFX90A-TGSPLIT-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-TGSPLIT-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x8
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0xc
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll
index b9487f8e14c2b..8b600c835a160 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll
@@ -41,6 +41,9 @@ define amdgpu_kernel void @global_agent_unordered_load(
;
; GFX7-LABEL: global_agent_unordered_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -222,6 +225,9 @@ define amdgpu_kernel void @global_agent_monotonic_load(
;
; GFX7-LABEL: global_agent_monotonic_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -404,6 +410,9 @@ define amdgpu_kernel void @global_agent_acquire_load(
;
; GFX7-LABEL: global_agent_acquire_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -602,6 +611,9 @@ define amdgpu_kernel void @global_agent_seq_cst_load(
;
; GFX7-LABEL: global_agent_seq_cst_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -809,6 +821,9 @@ define amdgpu_kernel void @global_agent_unordered_store(
;
; GFX7-LABEL: global_agent_unordered_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -962,6 +977,9 @@ define amdgpu_kernel void @global_agent_monotonic_store(
;
; GFX7-LABEL: global_agent_monotonic_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1116,6 +1134,9 @@ define amdgpu_kernel void @global_agent_release_store(
;
; GFX7-LABEL: global_agent_release_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1294,6 +1315,9 @@ define amdgpu_kernel void @global_agent_seq_cst_store(
;
; GFX7-LABEL: global_agent_seq_cst_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1470,6 +1494,9 @@ define amdgpu_kernel void @global_agent_monotonic_atomicrmw(
;
; GFX7-LABEL: global_agent_monotonic_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1623,6 +1650,9 @@ define amdgpu_kernel void @global_agent_acquire_atomicrmw(
;
; GFX7-LABEL: global_agent_acquire_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1802,6 +1832,9 @@ define amdgpu_kernel void @global_agent_release_atomicrmw(
;
; GFX7-LABEL: global_agent_release_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1980,6 +2013,9 @@ define amdgpu_kernel void @global_agent_acq_rel_atomicrmw(
;
; GFX7-LABEL: global_agent_acq_rel_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -2185,6 +2221,9 @@ define amdgpu_kernel void @global_agent_seq_cst_atomicrmw(
;
; GFX7-LABEL: global_agent_seq_cst_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -2390,6 +2429,9 @@ define amdgpu_kernel void @global_agent_acquire_ret_atomicrmw(
;
; GFX7-LABEL: global_agent_acquire_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -2587,6 +2629,9 @@ define amdgpu_kernel void @global_agent_acq_rel_ret_atomicrmw(
;
; GFX7-LABEL: global_agent_acq_rel_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -2812,6 +2857,9 @@ define amdgpu_kernel void @global_agent_seq_cst_ret_atomicrmw(
;
; GFX7-LABEL: global_agent_seq_cst_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -3038,6 +3086,9 @@ define amdgpu_kernel void @global_agent_monotonic_monotonic_cmpxchg(
;
; GFX7-LABEL: global_agent_monotonic_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3256,6 +3307,9 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_cmpxchg(
;
; GFX7-LABEL: global_agent_acquire_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3500,6 +3554,9 @@ define amdgpu_kernel void @global_agent_release_monotonic_cmpxchg(
;
; GFX7-LABEL: global_agent_release_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3743,6 +3800,9 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_cmpxchg(
;
; GFX7-LABEL: global_agent_acq_rel_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4013,6 +4073,9 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_cmpxchg(
;
; GFX7-LABEL: global_agent_seq_cst_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4282,6 +4345,9 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_cmpxchg(
;
; GFX7-LABEL: global_agent_monotonic_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4527,6 +4593,9 @@ define amdgpu_kernel void @global_agent_acquire_acquire_cmpxchg(
;
; GFX7-LABEL: global_agent_acquire_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4773,6 +4842,9 @@ define amdgpu_kernel void @global_agent_release_acquire_cmpxchg(
;
; GFX7-LABEL: global_agent_release_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -5043,6 +5115,9 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_cmpxchg(
;
; GFX7-LABEL: global_agent_acq_rel_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -5313,6 +5388,9 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_cmpxchg(
;
; GFX7-LABEL: global_agent_seq_cst_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -5583,6 +5661,9 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_agent_monotonic_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -5853,6 +5934,9 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_agent_acquire_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -6123,6 +6207,9 @@ define amdgpu_kernel void @global_agent_release_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_agent_release_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -6393,6 +6480,9 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_agent_acq_rel_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -6663,6 +6753,9 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_agent_seq_cst_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -6933,6 +7026,9 @@ define amdgpu_kernel void @global_agent_monotonic_monotonic_ret_cmpxchg(
;
; GFX7-LABEL: global_agent_monotonic_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -7182,6 +7278,9 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_ret_cmpxchg(
;
; GFX7-LABEL: global_agent_acquire_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -7447,6 +7546,9 @@ define amdgpu_kernel void @global_agent_release_monotonic_ret_cmpxchg(
;
; GFX7-LABEL: global_agent_release_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -7721,6 +7823,9 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_ret_cmpxchg(
;
; GFX7-LABEL: global_agent_acq_rel_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -8015,6 +8120,9 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_ret_cmpxchg(
;
; GFX7-LABEL: global_agent_seq_cst_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -8308,6 +8416,9 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_agent_monotonic_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -8577,6 +8688,9 @@ define amdgpu_kernel void @global_agent_acquire_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_agent_acquire_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -8843,6 +8957,9 @@ define amdgpu_kernel void @global_agent_release_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_agent_release_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -9137,6 +9254,9 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_agent_acq_rel_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -9431,6 +9551,9 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_agent_seq_cst_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -9725,6 +9848,9 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_agent_monotonic_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -10019,6 +10145,9 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_agent_acquire_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -10309,6 +10438,9 @@ define amdgpu_kernel void @global_agent_release_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_agent_release_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -10603,6 +10735,9 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_agent_acq_rel_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -10897,6 +11032,9 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_agent_seq_cst_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -11189,6 +11327,9 @@ define amdgpu_kernel void @global_agent_one_as_unordered_load(
;
; GFX7-LABEL: global_agent_one_as_unordered_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11370,6 +11511,9 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_load(
;
; GFX7-LABEL: global_agent_one_as_monotonic_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11552,6 +11696,9 @@ define amdgpu_kernel void @global_agent_one_as_acquire_load(
;
; GFX7-LABEL: global_agent_one_as_acquire_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11750,6 +11897,9 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_load(
;
; GFX7-LABEL: global_agent_one_as_seq_cst_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11957,6 +12107,9 @@ define amdgpu_kernel void @global_agent_one_as_unordered_store(
;
; GFX7-LABEL: global_agent_one_as_unordered_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12110,6 +12263,9 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_store(
;
; GFX7-LABEL: global_agent_one_as_monotonic_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12264,6 +12420,9 @@ define amdgpu_kernel void @global_agent_one_as_release_store(
;
; GFX7-LABEL: global_agent_one_as_release_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12442,6 +12601,9 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_store(
;
; GFX7-LABEL: global_agent_one_as_seq_cst_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12618,6 +12780,9 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_atomicrmw(
;
; GFX7-LABEL: global_agent_one_as_monotonic_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12771,6 +12936,9 @@ define amdgpu_kernel void @global_agent_one_as_acquire_atomicrmw(
;
; GFX7-LABEL: global_agent_one_as_acquire_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12950,6 +13118,9 @@ define amdgpu_kernel void @global_agent_one_as_release_atomicrmw(
;
; GFX7-LABEL: global_agent_one_as_release_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -13128,6 +13299,9 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_atomicrmw(
;
; GFX7-LABEL: global_agent_one_as_acq_rel_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -13333,6 +13507,9 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_atomicrmw(
;
; GFX7-LABEL: global_agent_one_as_seq_cst_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -13538,6 +13715,9 @@ define amdgpu_kernel void @global_agent_one_as_acquire_ret_atomicrmw(
;
; GFX7-LABEL: global_agent_one_as_acquire_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -13735,6 +13915,9 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_ret_atomicrmw(
;
; GFX7-LABEL: global_agent_one_as_acq_rel_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -13960,6 +14143,9 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_ret_atomicrmw(
;
; GFX7-LABEL: global_agent_one_as_seq_cst_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -14186,6 +14372,9 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_cmpxchg(
;
; GFX7-LABEL: global_agent_one_as_monotonic_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -14404,6 +14593,9 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_cmpxchg(
;
; GFX7-LABEL: global_agent_one_as_acquire_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -14648,6 +14840,9 @@ define amdgpu_kernel void @global_agent_one_as_release_monotonic_cmpxchg(
;
; GFX7-LABEL: global_agent_one_as_release_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -14891,6 +15086,9 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_cmpxchg(
;
; GFX7-LABEL: global_agent_one_as_acq_rel_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -15161,6 +15359,9 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_cmpxchg(
;
; GFX7-LABEL: global_agent_one_as_seq_cst_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -15430,6 +15631,9 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_cmpxchg(
;
; GFX7-LABEL: global_agent_one_as_monotonic_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -15675,6 +15879,9 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_cmpxchg(
;
; GFX7-LABEL: global_agent_one_as_acquire_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -15921,6 +16128,9 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_cmpxchg(
;
; GFX7-LABEL: global_agent_one_as_release_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -16191,6 +16401,9 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_cmpxchg(
;
; GFX7-LABEL: global_agent_one_as_acq_rel_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -16461,6 +16674,9 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_cmpxchg(
;
; GFX7-LABEL: global_agent_one_as_seq_cst_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -16731,6 +16947,9 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_agent_one_as_monotonic_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -17001,6 +17220,9 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_agent_one_as_acquire_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -17271,6 +17493,9 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_agent_one_as_release_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -17541,6 +17766,9 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_agent_one_as_acq_rel_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -17811,6 +18039,9 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_agent_one_as_seq_cst_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -18081,6 +18312,9 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_ret_cmpxchg(
;
; GFX7-LABEL: global_agent_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -18330,6 +18564,9 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_ret_cmpxchg(
;
; GFX7-LABEL: global_agent_one_as_acquire_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -18596,6 +18833,9 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_ret_cmpxchg(
;
; GFX7-LABEL: global_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -18890,6 +19130,9 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_ret_cmpxchg(
;
; GFX7-LABEL: global_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -19183,6 +19426,9 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_agent_one_as_monotonic_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -19452,6 +19698,9 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_agent_one_as_acquire_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -19718,6 +19967,9 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_agent_one_as_release_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -20012,6 +20264,9 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_agent_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -20306,6 +20561,9 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_agent_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -20600,6 +20858,9 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -20894,6 +21155,9 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_agent_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -21184,6 +21448,9 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_agent_one_as_release_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -21478,6 +21745,9 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -21772,6 +22042,9 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll
index a6bd1b678f95e..16e55058e4fc8 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll
@@ -36,6 +36,9 @@ define amdgpu_kernel void @global_nontemporal_load_0(
;
; GFX7-LABEL: global_nontemporal_load_0:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -227,6 +230,9 @@ define amdgpu_kernel void @global_nontemporal_load_1(
;
; GFX7-LABEL: global_nontemporal_load_1:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -474,6 +480,9 @@ define amdgpu_kernel void @global_nontemporal_store_0(
;
; GFX7-LABEL: global_nontemporal_store_0:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -658,6 +667,9 @@ define amdgpu_kernel void @global_nontemporal_store_1(
;
; GFX7-LABEL: global_nontemporal_store_1:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -891,6 +903,9 @@ define amdgpu_kernel void @global_nontemporal_volatile_load(
;
; GFX7-LABEL: global_nontemporal_volatile_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll
index a5de6a92db1af..8042d38716107 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll
@@ -41,6 +41,9 @@ define amdgpu_kernel void @global_singlethread_unordered_load(
;
; GFX7-LABEL: global_singlethread_unordered_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -222,6 +225,9 @@ define amdgpu_kernel void @global_singlethread_monotonic_load(
;
; GFX7-LABEL: global_singlethread_monotonic_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -403,6 +409,9 @@ define amdgpu_kernel void @global_singlethread_acquire_load(
;
; GFX7-LABEL: global_singlethread_acquire_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -584,6 +593,9 @@ define amdgpu_kernel void @global_singlethread_seq_cst_load(
;
; GFX7-LABEL: global_singlethread_seq_cst_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -758,6 +770,9 @@ define amdgpu_kernel void @global_singlethread_unordered_store(
;
; GFX7-LABEL: global_singlethread_unordered_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -911,6 +926,9 @@ define amdgpu_kernel void @global_singlethread_monotonic_store(
;
; GFX7-LABEL: global_singlethread_monotonic_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1064,6 +1082,9 @@ define amdgpu_kernel void @global_singlethread_release_store(
;
; GFX7-LABEL: global_singlethread_release_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1217,6 +1238,9 @@ define amdgpu_kernel void @global_singlethread_seq_cst_store(
;
; GFX7-LABEL: global_singlethread_seq_cst_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1369,6 +1393,9 @@ define amdgpu_kernel void @global_singlethread_monotonic_atomicrmw(
;
; GFX7-LABEL: global_singlethread_monotonic_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1520,6 +1547,9 @@ define amdgpu_kernel void @global_singlethread_acquire_atomicrmw(
;
; GFX7-LABEL: global_singlethread_acquire_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1671,6 +1701,9 @@ define amdgpu_kernel void @global_singlethread_release_atomicrmw(
;
; GFX7-LABEL: global_singlethread_release_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1822,6 +1855,9 @@ define amdgpu_kernel void @global_singlethread_acq_rel_atomicrmw(
;
; GFX7-LABEL: global_singlethread_acq_rel_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1973,6 +2009,9 @@ define amdgpu_kernel void @global_singlethread_seq_cst_atomicrmw(
;
; GFX7-LABEL: global_singlethread_seq_cst_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -2126,6 +2165,9 @@ define amdgpu_kernel void @global_singlethread_acquire_ret_atomicrmw(
;
; GFX7-LABEL: global_singlethread_acquire_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -2306,6 +2348,9 @@ define amdgpu_kernel void @global_singlethread_acq_rel_ret_atomicrmw(
;
; GFX7-LABEL: global_singlethread_acq_rel_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -2486,6 +2531,9 @@ define amdgpu_kernel void @global_singlethread_seq_cst_ret_atomicrmw(
;
; GFX7-LABEL: global_singlethread_seq_cst_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -2669,6 +2717,9 @@ define amdgpu_kernel void @global_singlethread_monotonic_monotonic_cmpxchg(
;
; GFX7-LABEL: global_singlethread_monotonic_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -2885,6 +2936,9 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_cmpxchg(
;
; GFX7-LABEL: global_singlethread_acquire_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3101,6 +3155,9 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_cmpxchg(
;
; GFX7-LABEL: global_singlethread_release_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3317,6 +3374,9 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_cmpxchg(
;
; GFX7-LABEL: global_singlethread_acq_rel_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3533,6 +3593,9 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_cmpxchg(
;
; GFX7-LABEL: global_singlethread_seq_cst_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3749,6 +3812,9 @@ define amdgpu_kernel void @global_singlethread_monotonic_acquire_cmpxchg(
;
; GFX7-LABEL: global_singlethread_monotonic_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3965,6 +4031,9 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_cmpxchg(
;
; GFX7-LABEL: global_singlethread_acquire_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4181,6 +4250,9 @@ define amdgpu_kernel void @global_singlethread_release_acquire_cmpxchg(
;
; GFX7-LABEL: global_singlethread_release_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4397,6 +4469,9 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_cmpxchg(
;
; GFX7-LABEL: global_singlethread_acq_rel_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4613,6 +4688,9 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_cmpxchg(
;
; GFX7-LABEL: global_singlethread_seq_cst_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4829,6 +4907,9 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_singlethread_monotonic_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -5045,6 +5126,9 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_singlethread_acquire_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -5261,6 +5345,9 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_singlethread_release_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -5477,6 +5564,9 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_singlethread_acq_rel_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -5693,6 +5783,9 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_singlethread_seq_cst_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -5912,6 +6005,9 @@ define amdgpu_kernel void @global_singlethread_monotonic_monotonic_ret_cmpxchg(
;
; GFX7-LABEL: global_singlethread_monotonic_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -6160,6 +6256,9 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_ret_cmpxchg(
;
; GFX7-LABEL: global_singlethread_acquire_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -6408,6 +6507,9 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_ret_cmpxchg(
;
; GFX7-LABEL: global_singlethread_release_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -6656,6 +6758,9 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_ret_cmpxchg(
;
; GFX7-LABEL: global_singlethread_acq_rel_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -6904,6 +7009,9 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_ret_cmpxchg(
;
; GFX7-LABEL: global_singlethread_seq_cst_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -7152,6 +7260,9 @@ define amdgpu_kernel void @global_singlethread_monotonic_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_singlethread_monotonic_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -7400,6 +7511,9 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_singlethread_acquire_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -7648,6 +7762,9 @@ define amdgpu_kernel void @global_singlethread_release_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_singlethread_release_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -7896,6 +8013,9 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_singlethread_acq_rel_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -8144,6 +8264,9 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_singlethread_seq_cst_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -8392,6 +8515,9 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_singlethread_monotonic_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -8640,6 +8766,9 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_singlethread_acquire_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -8888,6 +9017,9 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_singlethread_release_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -9136,6 +9268,9 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_singlethread_acq_rel_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -9384,6 +9519,9 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_singlethread_seq_cst_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -9632,6 +9770,9 @@ define amdgpu_kernel void @global_singlethread_one_as_unordered_load(
;
; GFX7-LABEL: global_singlethread_one_as_unordered_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -9813,6 +9954,9 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_load(
;
; GFX7-LABEL: global_singlethread_one_as_monotonic_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -9994,6 +10138,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_load(
;
; GFX7-LABEL: global_singlethread_one_as_acquire_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -10175,6 +10322,9 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_load(
;
; GFX7-LABEL: global_singlethread_one_as_seq_cst_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -10349,6 +10499,9 @@ define amdgpu_kernel void @global_singlethread_one_as_unordered_store(
;
; GFX7-LABEL: global_singlethread_one_as_unordered_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -10502,6 +10655,9 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_store(
;
; GFX7-LABEL: global_singlethread_one_as_monotonic_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -10655,6 +10811,9 @@ define amdgpu_kernel void @global_singlethread_one_as_release_store(
;
; GFX7-LABEL: global_singlethread_one_as_release_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -10808,6 +10967,9 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_store(
;
; GFX7-LABEL: global_singlethread_one_as_seq_cst_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -10960,6 +11122,9 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_atomicrmw(
;
; GFX7-LABEL: global_singlethread_one_as_monotonic_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11111,6 +11276,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_atomicrmw(
;
; GFX7-LABEL: global_singlethread_one_as_acquire_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11262,6 +11430,9 @@ define amdgpu_kernel void @global_singlethread_one_as_release_atomicrmw(
;
; GFX7-LABEL: global_singlethread_one_as_release_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11413,6 +11584,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_atomicrmw(
;
; GFX7-LABEL: global_singlethread_one_as_acq_rel_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11564,6 +11738,9 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_atomicrmw(
;
; GFX7-LABEL: global_singlethread_one_as_seq_cst_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11717,6 +11894,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_ret_atomicrmw(
;
; GFX7-LABEL: global_singlethread_one_as_acquire_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11897,6 +12077,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_ret_atomicrmw(
;
; GFX7-LABEL: global_singlethread_one_as_acq_rel_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12077,6 +12260,9 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_ret_atomicrmw(
;
; GFX7-LABEL: global_singlethread_one_as_seq_cst_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12260,6 +12446,9 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_monotonic_cmpxch
;
; GFX7-LABEL: global_singlethread_one_as_monotonic_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -12476,6 +12665,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_cmpxchg(
;
; GFX7-LABEL: global_singlethread_one_as_acquire_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -12692,6 +12884,9 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_cmpxchg(
;
; GFX7-LABEL: global_singlethread_one_as_release_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -12908,6 +13103,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_cmpxchg(
;
; GFX7-LABEL: global_singlethread_one_as_acq_rel_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -13124,6 +13322,9 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_cmpxchg(
;
; GFX7-LABEL: global_singlethread_one_as_seq_cst_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -13340,6 +13541,9 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_cmpxchg(
;
; GFX7-LABEL: global_singlethread_one_as_monotonic_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -13556,6 +13760,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_cmpxchg(
;
; GFX7-LABEL: global_singlethread_one_as_acquire_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -13772,6 +13979,9 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_cmpxchg(
;
; GFX7-LABEL: global_singlethread_one_as_release_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -13988,6 +14198,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_cmpxchg(
;
; GFX7-LABEL: global_singlethread_one_as_acq_rel_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -14204,6 +14417,9 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_cmpxchg(
;
; GFX7-LABEL: global_singlethread_one_as_seq_cst_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -14420,6 +14636,9 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_singlethread_one_as_monotonic_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -14636,6 +14855,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_singlethread_one_as_acquire_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -14852,6 +15074,9 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_singlethread_one_as_release_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -15068,6 +15293,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -15284,6 +15512,9 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -15503,6 +15734,9 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_monotonic_ret_cm
;
; GFX7-LABEL: global_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -15751,6 +15985,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_ret_cmpx
;
; GFX7-LABEL: global_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -15999,6 +16236,9 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_ret_cmpx
;
; GFX7-LABEL: global_singlethread_one_as_release_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -16247,6 +16487,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_ret_cmpx
;
; GFX7-LABEL: global_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -16495,6 +16738,9 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_ret_cmpx
;
; GFX7-LABEL: global_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -16743,6 +16989,9 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_ret_cmpx
;
; GFX7-LABEL: global_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -16991,6 +17240,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_ret_cmpxch
;
; GFX7-LABEL: global_singlethread_one_as_acquire_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -17239,6 +17491,9 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_ret_cmpxch
;
; GFX7-LABEL: global_singlethread_one_as_release_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -17487,6 +17742,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_ret_cmpxch
;
; GFX7-LABEL: global_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -17735,6 +17993,9 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_ret_cmpxch
;
; GFX7-LABEL: global_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -17983,6 +18244,9 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_ret_cmpx
;
; GFX7-LABEL: global_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -18231,6 +18495,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_ret_cmpxch
;
; GFX7-LABEL: global_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -18479,6 +18746,9 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_ret_cmpxch
;
; GFX7-LABEL: global_singlethread_one_as_release_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -18727,6 +18997,9 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxch
;
; GFX7-LABEL: global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -18975,6 +19248,9 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxch
;
; GFX7-LABEL: global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll
index 69404247ccd6e..9c11781da56f2 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll
@@ -41,6 +41,9 @@ define amdgpu_kernel void @global_system_unordered_load(
;
; GFX7-LABEL: global_system_unordered_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -222,6 +225,9 @@ define amdgpu_kernel void @global_system_monotonic_load(
;
; GFX7-LABEL: global_system_monotonic_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -404,6 +410,9 @@ define amdgpu_kernel void @global_system_acquire_load(
;
; GFX7-LABEL: global_system_acquire_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -604,6 +613,9 @@ define amdgpu_kernel void @global_system_seq_cst_load(
;
; GFX7-LABEL: global_system_seq_cst_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -813,6 +825,9 @@ define amdgpu_kernel void @global_system_unordered_store(
;
; GFX7-LABEL: global_system_unordered_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -966,6 +981,9 @@ define amdgpu_kernel void @global_system_monotonic_store(
;
; GFX7-LABEL: global_system_monotonic_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1120,6 +1138,9 @@ define amdgpu_kernel void @global_system_release_store(
;
; GFX7-LABEL: global_system_release_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1302,6 +1323,9 @@ define amdgpu_kernel void @global_system_seq_cst_store(
;
; GFX7-LABEL: global_system_seq_cst_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1482,6 +1506,9 @@ define amdgpu_kernel void @global_system_monotonic_atomicrmw(
;
; GFX7-LABEL: global_system_monotonic_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1635,6 +1662,9 @@ define amdgpu_kernel void @global_system_acquire_atomicrmw(
;
; GFX7-LABEL: global_system_acquire_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1816,6 +1846,9 @@ define amdgpu_kernel void @global_system_release_atomicrmw(
;
; GFX7-LABEL: global_system_release_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1998,6 +2031,9 @@ define amdgpu_kernel void @global_system_acq_rel_atomicrmw(
;
; GFX7-LABEL: global_system_acq_rel_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -2209,6 +2245,9 @@ define amdgpu_kernel void @global_system_seq_cst_atomicrmw(
;
; GFX7-LABEL: global_system_seq_cst_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -2420,6 +2459,9 @@ define amdgpu_kernel void @global_system_acquire_ret_atomicrmw(
;
; GFX7-LABEL: global_system_acquire_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -2619,6 +2661,9 @@ define amdgpu_kernel void @global_system_acq_rel_ret_atomicrmw(
;
; GFX7-LABEL: global_system_acq_rel_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -2850,6 +2895,9 @@ define amdgpu_kernel void @global_system_seq_cst_ret_atomicrmw(
;
; GFX7-LABEL: global_system_seq_cst_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -3082,6 +3130,9 @@ define amdgpu_kernel void @global_system_monotonic_monotonic_cmpxchg(
;
; GFX7-LABEL: global_system_monotonic_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3300,6 +3351,9 @@ define amdgpu_kernel void @global_system_acquire_monotonic_cmpxchg(
;
; GFX7-LABEL: global_system_acquire_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3546,6 +3600,9 @@ define amdgpu_kernel void @global_system_release_monotonic_cmpxchg(
;
; GFX7-LABEL: global_system_release_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3793,6 +3850,9 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_cmpxchg(
;
; GFX7-LABEL: global_system_acq_rel_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4069,6 +4129,9 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_cmpxchg(
;
; GFX7-LABEL: global_system_seq_cst_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4344,6 +4407,9 @@ define amdgpu_kernel void @global_system_monotonic_acquire_cmpxchg(
;
; GFX7-LABEL: global_system_monotonic_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4591,6 +4657,9 @@ define amdgpu_kernel void @global_system_acquire_acquire_cmpxchg(
;
; GFX7-LABEL: global_system_acquire_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4839,6 +4908,9 @@ define amdgpu_kernel void @global_system_release_acquire_cmpxchg(
;
; GFX7-LABEL: global_system_release_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -5115,6 +5187,9 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_cmpxchg(
;
; GFX7-LABEL: global_system_acq_rel_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -5391,6 +5466,9 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_cmpxchg(
;
; GFX7-LABEL: global_system_seq_cst_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -5667,6 +5745,9 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_system_seq_cst_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -5943,6 +6024,9 @@ define amdgpu_kernel void @global_system_monotonic_monotonic_ret_cmpxchg(
;
; GFX7-LABEL: global_system_monotonic_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -6192,6 +6276,9 @@ define amdgpu_kernel void @global_system_acquire_monotonic_ret_cmpxchg(
;
; GFX7-LABEL: global_system_acquire_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -6460,6 +6547,9 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_ret_cmpxchg(
;
; GFX7-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -6760,6 +6850,9 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_ret_cmpxchg(
;
; GFX7-LABEL: global_system_seq_cst_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -7059,6 +7152,9 @@ define amdgpu_kernel void @global_system_monotonic_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_system_monotonic_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -7330,6 +7426,9 @@ define amdgpu_kernel void @global_system_acquire_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_system_acquire_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -7598,6 +7697,9 @@ define amdgpu_kernel void @global_system_release_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_system_release_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -7898,6 +8000,9 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_system_acq_rel_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -8198,6 +8303,9 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_system_seq_cst_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -8498,6 +8606,9 @@ define amdgpu_kernel void @global_system_monotonic_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_system_monotonic_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -8798,6 +8909,9 @@ define amdgpu_kernel void @global_system_acquire_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_system_acquire_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -9094,6 +9208,9 @@ define amdgpu_kernel void @global_system_relese_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_system_relese_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -9394,6 +9511,9 @@ define amdgpu_kernel void @global_system_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_system_acq_rel_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -9694,6 +9814,9 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_system_seq_cst_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -9992,6 +10115,9 @@ define amdgpu_kernel void @global_system_one_as_unordered_load(
;
; GFX7-LABEL: global_system_one_as_unordered_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -10173,6 +10299,9 @@ define amdgpu_kernel void @global_system_one_as_monotonic_load(
;
; GFX7-LABEL: global_system_one_as_monotonic_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -10355,6 +10484,9 @@ define amdgpu_kernel void @global_system_one_as_acquire_load(
;
; GFX7-LABEL: global_system_one_as_acquire_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -10555,6 +10687,9 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_load(
;
; GFX7-LABEL: global_system_one_as_seq_cst_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -10764,6 +10899,9 @@ define amdgpu_kernel void @global_system_one_as_unordered_store(
;
; GFX7-LABEL: global_system_one_as_unordered_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -10917,6 +11055,9 @@ define amdgpu_kernel void @global_system_one_as_monotonic_store(
;
; GFX7-LABEL: global_system_one_as_monotonic_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11071,6 +11212,9 @@ define amdgpu_kernel void @global_system_one_as_release_store(
;
; GFX7-LABEL: global_system_one_as_release_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11253,6 +11397,9 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_store(
;
; GFX7-LABEL: global_system_one_as_seq_cst_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11433,6 +11580,9 @@ define amdgpu_kernel void @global_system_one_as_monotonic_atomicrmw(
;
; GFX7-LABEL: global_system_one_as_monotonic_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11586,6 +11736,9 @@ define amdgpu_kernel void @global_system_one_as_acquire_atomicrmw(
;
; GFX7-LABEL: global_system_one_as_acquire_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11767,6 +11920,9 @@ define amdgpu_kernel void @global_system_one_as_release_atomicrmw(
;
; GFX7-LABEL: global_system_one_as_release_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11949,6 +12105,9 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_atomicrmw(
;
; GFX7-LABEL: global_system_one_as_acq_rel_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12160,6 +12319,9 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_atomicrmw(
;
; GFX7-LABEL: global_system_one_as_seq_cst_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12371,6 +12533,9 @@ define amdgpu_kernel void @global_system_one_as_acquire_ret_atomicrmw(
;
; GFX7-LABEL: global_system_one_as_acquire_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12570,6 +12735,9 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_ret_atomicrmw(
;
; GFX7-LABEL: global_system_one_as_acq_rel_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12801,6 +12969,9 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_ret_atomicrmw(
;
; GFX7-LABEL: global_system_one_as_seq_cst_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -13033,6 +13204,9 @@ define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_cmpxchg(
;
; GFX7-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -13251,6 +13425,9 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_cmpxchg(
;
; GFX7-LABEL: global_system_one_as_acquire_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -13497,6 +13674,9 @@ define amdgpu_kernel void @global_system_one_as_release_monotonic_cmpxchg(
;
; GFX7-LABEL: global_system_one_as_release_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -13744,6 +13924,9 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_cmpxchg(
;
; GFX7-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -14020,6 +14203,9 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_cmpxchg(
;
; GFX7-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -14295,6 +14481,9 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_cmpxchg(
;
; GFX7-LABEL: global_system_one_as_monotonic_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -14542,6 +14731,9 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_cmpxchg(
;
; GFX7-LABEL: global_system_one_as_acquire_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -14790,6 +14982,9 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_cmpxchg(
;
; GFX7-LABEL: global_system_one_as_release_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -15066,6 +15261,9 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_cmpxchg(
;
; GFX7-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -15342,6 +15540,9 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_cmpxchg(
;
; GFX7-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -15618,6 +15819,9 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_system_one_as_monotonic_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -15894,6 +16098,9 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_system_one_as_acquire_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -16170,6 +16377,9 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_system_one_as_release_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -16446,6 +16656,9 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_system_one_as_acq_rel_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -16722,6 +16935,9 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -16998,6 +17214,9 @@ define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_ret_cmpxchg(
;
; GFX7-LABEL: global_system_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -17247,6 +17466,9 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_ret_cmpxchg(
;
; GFX7-LABEL: global_system_one_as_acquire_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -17514,6 +17736,9 @@ define amdgpu_kernel void @global_system_one_as_release_monotonic_ret_cmpxchg(
;
; GFX7-LABEL: global_system_one_as_release_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -17792,6 +18017,9 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_ret_cmpxchg(
;
; GFX7-LABEL: global_system_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -18092,6 +18320,9 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_ret_cmpxchg(
;
; GFX7-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -18391,6 +18622,9 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_system_one_as_monotonic_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -18662,6 +18896,9 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -18930,6 +19167,9 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_system_one_as_release_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -19230,6 +19470,9 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -19530,6 +19773,9 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -19830,6 +20076,9 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_system_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -20130,6 +20379,9 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_system_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -20426,6 +20678,9 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_system_one_as_release_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -20726,6 +20981,9 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -21026,6 +21284,9 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll
index 7dfd5e60c24f8..8a5c5dda9f79c 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll
@@ -37,6 +37,9 @@ define amdgpu_kernel void @global_volatile_load_0(
;
; GFX7-LABEL: global_volatile_load_0:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -184,6 +187,9 @@ define amdgpu_kernel void @global_volatile_load_1(
;
; GFX7-LABEL: global_volatile_load_1:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x2
@@ -372,6 +378,9 @@ define amdgpu_kernel void @global_volatile_store_0(
;
; GFX7-LABEL: global_volatile_store_0:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -527,6 +536,9 @@ define amdgpu_kernel void @global_volatile_store_1(
;
; GFX7-LABEL: global_volatile_store_1:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -718,6 +730,9 @@ define amdgpu_kernel void @global_volatile_workgroup_acquire_load(
;
; GFX7-LABEL: global_volatile_workgroup_acquire_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -852,6 +867,9 @@ define amdgpu_kernel void @global_volatile_workgroup_release_store(
;
; GFX7-LABEL: global_volatile_workgroup_release_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll
index 4b6c99282dc13..151ba07a0b531 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll
@@ -41,6 +41,9 @@ define amdgpu_kernel void @global_wavefront_unordered_load(
;
; GFX7-LABEL: global_wavefront_unordered_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -222,6 +225,9 @@ define amdgpu_kernel void @global_wavefront_monotonic_load(
;
; GFX7-LABEL: global_wavefront_monotonic_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -403,6 +409,9 @@ define amdgpu_kernel void @global_wavefront_acquire_load(
;
; GFX7-LABEL: global_wavefront_acquire_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -584,6 +593,9 @@ define amdgpu_kernel void @global_wavefront_seq_cst_load(
;
; GFX7-LABEL: global_wavefront_seq_cst_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -758,6 +770,9 @@ define amdgpu_kernel void @global_wavefront_unordered_store(
;
; GFX7-LABEL: global_wavefront_unordered_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -911,6 +926,9 @@ define amdgpu_kernel void @global_wavefront_monotonic_store(
;
; GFX7-LABEL: global_wavefront_monotonic_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1064,6 +1082,9 @@ define amdgpu_kernel void @global_wavefront_release_store(
;
; GFX7-LABEL: global_wavefront_release_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1217,6 +1238,9 @@ define amdgpu_kernel void @global_wavefront_seq_cst_store(
;
; GFX7-LABEL: global_wavefront_seq_cst_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1369,6 +1393,9 @@ define amdgpu_kernel void @global_wavefront_monotonic_atomicrmw(
;
; GFX7-LABEL: global_wavefront_monotonic_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1520,6 +1547,9 @@ define amdgpu_kernel void @global_wavefront_acquire_atomicrmw(
;
; GFX7-LABEL: global_wavefront_acquire_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1671,6 +1701,9 @@ define amdgpu_kernel void @global_wavefront_release_atomicrmw(
;
; GFX7-LABEL: global_wavefront_release_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1822,6 +1855,9 @@ define amdgpu_kernel void @global_wavefront_acq_rel_atomicrmw(
;
; GFX7-LABEL: global_wavefront_acq_rel_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1973,6 +2009,9 @@ define amdgpu_kernel void @global_wavefront_seq_cst_atomicrmw(
;
; GFX7-LABEL: global_wavefront_seq_cst_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -2126,6 +2165,9 @@ define amdgpu_kernel void @global_wavefront_acquire_ret_atomicrmw(
;
; GFX7-LABEL: global_wavefront_acquire_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -2306,6 +2348,9 @@ define amdgpu_kernel void @global_wavefront_acq_rel_ret_atomicrmw(
;
; GFX7-LABEL: global_wavefront_acq_rel_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -2486,6 +2531,9 @@ define amdgpu_kernel void @global_wavefront_seq_cst_ret_atomicrmw(
;
; GFX7-LABEL: global_wavefront_seq_cst_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -2669,6 +2717,9 @@ define amdgpu_kernel void @global_wavefront_monotonic_monotonic_cmpxchg(
;
; GFX7-LABEL: global_wavefront_monotonic_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -2885,6 +2936,9 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_cmpxchg(
;
; GFX7-LABEL: global_wavefront_acquire_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3101,6 +3155,9 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_cmpxchg(
;
; GFX7-LABEL: global_wavefront_release_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3317,6 +3374,9 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_cmpxchg(
;
; GFX7-LABEL: global_wavefront_acq_rel_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3533,6 +3593,9 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_cmpxchg(
;
; GFX7-LABEL: global_wavefront_seq_cst_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3749,6 +3812,9 @@ define amdgpu_kernel void @global_wavefront_monotonic_acquire_cmpxchg(
;
; GFX7-LABEL: global_wavefront_monotonic_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3965,6 +4031,9 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_cmpxchg(
;
; GFX7-LABEL: global_wavefront_acquire_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4181,6 +4250,9 @@ define amdgpu_kernel void @global_wavefront_release_acquire_cmpxchg(
;
; GFX7-LABEL: global_wavefront_release_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4397,6 +4469,9 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_cmpxchg(
;
; GFX7-LABEL: global_wavefront_acq_rel_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4613,6 +4688,9 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_cmpxchg(
;
; GFX7-LABEL: global_wavefront_seq_cst_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4829,6 +4907,9 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_wavefront_monotonic_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -5045,6 +5126,9 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_wavefront_acquire_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -5261,6 +5345,9 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_wavefront_release_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -5477,6 +5564,9 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_wavefront_acq_rel_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -5693,6 +5783,9 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -5912,6 +6005,9 @@ define amdgpu_kernel void @global_wavefront_monotonic_monotonic_ret_cmpxchg(
;
; GFX7-LABEL: global_wavefront_monotonic_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -6160,6 +6256,9 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_ret_cmpxchg(
;
; GFX7-LABEL: global_wavefront_acquire_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -6408,6 +6507,9 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_ret_cmpxchg(
;
; GFX7-LABEL: global_wavefront_release_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -6656,6 +6758,9 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_ret_cmpxchg(
;
; GFX7-LABEL: global_wavefront_acq_rel_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -6904,6 +7009,9 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_ret_cmpxchg(
;
; GFX7-LABEL: global_wavefront_seq_cst_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -7152,6 +7260,9 @@ define amdgpu_kernel void @global_wavefront_monotonic_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_wavefront_monotonic_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -7400,6 +7511,9 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_wavefront_acquire_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -7648,6 +7762,9 @@ define amdgpu_kernel void @global_wavefront_release_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_wavefront_release_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -7896,6 +8013,9 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_wavefront_acq_rel_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -8144,6 +8264,9 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_wavefront_seq_cst_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -8392,6 +8515,9 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_wavefront_monotonic_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -8640,6 +8766,9 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_wavefront_acquire_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -8888,6 +9017,9 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_wavefront_release_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -9136,6 +9268,9 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_wavefront_acq_rel_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -9384,6 +9519,9 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_wavefront_seq_cst_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -9632,6 +9770,9 @@ define amdgpu_kernel void @global_wavefront_one_as_unordered_load(
;
; GFX7-LABEL: global_wavefront_one_as_unordered_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -9813,6 +9954,9 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_load(
;
; GFX7-LABEL: global_wavefront_one_as_monotonic_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -9994,6 +10138,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_load(
;
; GFX7-LABEL: global_wavefront_one_as_acquire_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -10175,6 +10322,9 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_load(
;
; GFX7-LABEL: global_wavefront_one_as_seq_cst_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -10349,6 +10499,9 @@ define amdgpu_kernel void @global_wavefront_one_as_unordered_store(
;
; GFX7-LABEL: global_wavefront_one_as_unordered_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -10502,6 +10655,9 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_store(
;
; GFX7-LABEL: global_wavefront_one_as_monotonic_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -10655,6 +10811,9 @@ define amdgpu_kernel void @global_wavefront_one_as_release_store(
;
; GFX7-LABEL: global_wavefront_one_as_release_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -10808,6 +10967,9 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_store(
;
; GFX7-LABEL: global_wavefront_one_as_seq_cst_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -10960,6 +11122,9 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_atomicrmw(
;
; GFX7-LABEL: global_wavefront_one_as_monotonic_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11111,6 +11276,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_atomicrmw(
;
; GFX7-LABEL: global_wavefront_one_as_acquire_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11262,6 +11430,9 @@ define amdgpu_kernel void @global_wavefront_one_as_release_atomicrmw(
;
; GFX7-LABEL: global_wavefront_one_as_release_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11413,6 +11584,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_atomicrmw(
;
; GFX7-LABEL: global_wavefront_one_as_acq_rel_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11564,6 +11738,9 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_atomicrmw(
;
; GFX7-LABEL: global_wavefront_one_as_seq_cst_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11717,6 +11894,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_ret_atomicrmw(
;
; GFX7-LABEL: global_wavefront_one_as_acquire_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11897,6 +12077,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_ret_atomicrmw(
;
; GFX7-LABEL: global_wavefront_one_as_acq_rel_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12077,6 +12260,9 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_ret_atomicrmw(
;
; GFX7-LABEL: global_wavefront_one_as_seq_cst_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12260,6 +12446,9 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_monotonic_cmpxchg(
;
; GFX7-LABEL: global_wavefront_one_as_monotonic_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -12476,6 +12665,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_cmpxchg(
;
; GFX7-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -12692,6 +12884,9 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_cmpxchg(
;
; GFX7-LABEL: global_wavefront_one_as_release_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -12908,6 +13103,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_cmpxchg(
;
; GFX7-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -13124,6 +13322,9 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_cmpxchg(
;
; GFX7-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -13340,6 +13541,9 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_cmpxchg(
;
; GFX7-LABEL: global_wavefront_one_as_monotonic_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -13556,6 +13760,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_cmpxchg(
;
; GFX7-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -13772,6 +13979,9 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_cmpxchg(
;
; GFX7-LABEL: global_wavefront_one_as_release_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -13988,6 +14198,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_cmpxchg(
;
; GFX7-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -14204,6 +14417,9 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_cmpxchg(
;
; GFX7-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -14420,6 +14636,9 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_wavefront_one_as_monotonic_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -14636,6 +14855,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_wavefront_one_as_acquire_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -14852,6 +15074,9 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_wavefront_one_as_release_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -15068,6 +15293,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -15284,6 +15512,9 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -15503,6 +15734,9 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_monotonic_ret_cmpxc
;
; GFX7-LABEL: global_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -15751,6 +15985,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_ret_cmpxchg
;
; GFX7-LABEL: global_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -15999,6 +16236,9 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_ret_cmpxchg
;
; GFX7-LABEL: global_wavefront_one_as_release_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -16247,6 +16487,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg
;
; GFX7-LABEL: global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -16495,6 +16738,9 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg
;
; GFX7-LABEL: global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -16743,6 +16989,9 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_ret_cmpxchg
;
; GFX7-LABEL: global_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -16991,6 +17240,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_wavefront_one_as_acquire_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -17239,6 +17491,9 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_wavefront_one_as_release_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -17487,6 +17742,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -17735,6 +17993,9 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -17983,6 +18244,9 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg
;
; GFX7-LABEL: global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -18231,6 +18495,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -18479,6 +18746,9 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_wavefront_one_as_release_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -18727,6 +18997,9 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -18975,6 +19248,9 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll
index 46d65187cb1b2..69b0c7f93ab0e 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll
@@ -41,6 +41,9 @@ define amdgpu_kernel void @global_workgroup_unordered_load(
;
; GFX7-LABEL: global_workgroup_unordered_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -222,6 +225,9 @@ define amdgpu_kernel void @global_workgroup_monotonic_load(
;
; GFX7-LABEL: global_workgroup_monotonic_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -403,6 +409,9 @@ define amdgpu_kernel void @global_workgroup_acquire_load(
;
; GFX7-LABEL: global_workgroup_acquire_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -590,6 +599,9 @@ define amdgpu_kernel void @global_workgroup_seq_cst_load(
;
; GFX7-LABEL: global_workgroup_seq_cst_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -780,6 +792,9 @@ define amdgpu_kernel void @global_workgroup_unordered_store(
;
; GFX7-LABEL: global_workgroup_unordered_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -933,6 +948,9 @@ define amdgpu_kernel void @global_workgroup_monotonic_store(
;
; GFX7-LABEL: global_workgroup_monotonic_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1087,6 +1105,9 @@ define amdgpu_kernel void @global_workgroup_release_store(
;
; GFX7-LABEL: global_workgroup_release_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1258,6 +1279,9 @@ define amdgpu_kernel void @global_workgroup_seq_cst_store(
;
; GFX7-LABEL: global_workgroup_seq_cst_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1427,6 +1451,9 @@ define amdgpu_kernel void @global_workgroup_monotonic_atomicrmw(
;
; GFX7-LABEL: global_workgroup_monotonic_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1578,6 +1605,9 @@ define amdgpu_kernel void @global_workgroup_acquire_atomicrmw(
;
; GFX7-LABEL: global_workgroup_acquire_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1740,6 +1770,9 @@ define amdgpu_kernel void @global_workgroup_release_atomicrmw(
;
; GFX7-LABEL: global_workgroup_release_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -1909,6 +1942,9 @@ define amdgpu_kernel void @global_workgroup_acq_rel_atomicrmw(
;
; GFX7-LABEL: global_workgroup_acq_rel_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -2088,6 +2124,9 @@ define amdgpu_kernel void @global_workgroup_seq_cst_atomicrmw(
;
; GFX7-LABEL: global_workgroup_seq_cst_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -2268,6 +2307,9 @@ define amdgpu_kernel void @global_workgroup_acquire_ret_atomicrmw(
;
; GFX7-LABEL: global_workgroup_acquire_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -2454,6 +2496,9 @@ define amdgpu_kernel void @global_workgroup_acq_rel_ret_atomicrmw(
;
; GFX7-LABEL: global_workgroup_acq_rel_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -2659,6 +2704,9 @@ define amdgpu_kernel void @global_workgroup_seq_cst_ret_atomicrmw(
;
; GFX7-LABEL: global_workgroup_seq_cst_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -2866,6 +2914,9 @@ define amdgpu_kernel void @global_workgroup_monotonic_monotonic_cmpxchg(
;
; GFX7-LABEL: global_workgroup_monotonic_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3082,6 +3133,9 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_cmpxchg(
;
; GFX7-LABEL: global_workgroup_acquire_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3309,6 +3363,9 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_cmpxchg(
;
; GFX7-LABEL: global_workgroup_release_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3543,6 +3600,9 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_cmpxchg(
;
; GFX7-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -3787,6 +3847,9 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_cmpxchg(
;
; GFX7-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4030,6 +4093,9 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_cmpxchg(
;
; GFX7-LABEL: global_workgroup_monotonic_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4256,6 +4322,9 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_cmpxchg(
;
; GFX7-LABEL: global_workgroup_acquire_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4483,6 +4552,9 @@ define amdgpu_kernel void @global_workgroup_release_acquire_cmpxchg(
;
; GFX7-LABEL: global_workgroup_release_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4727,6 +4799,9 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_cmpxchg(
;
; GFX7-LABEL: global_workgroup_acq_rel_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -4971,6 +5046,9 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_cmpxchg(
;
; GFX7-LABEL: global_workgroup_seq_cst_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -5215,6 +5293,9 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_workgroup_monotonic_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -5459,6 +5540,9 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_workgroup_acquire_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -5703,6 +5787,9 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_workgroup_release_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -5947,6 +6034,9 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_workgroup_acq_rel_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -6191,6 +6281,9 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -6437,6 +6530,9 @@ define amdgpu_kernel void @global_workgroup_monotonic_monotonic_ret_cmpxchg(
;
; GFX7-LABEL: global_workgroup_monotonic_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -6685,6 +6781,9 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_ret_cmpxchg(
;
; GFX7-LABEL: global_workgroup_acquire_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -6939,6 +7038,9 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_ret_cmpxchg(
;
; GFX7-LABEL: global_workgroup_release_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -7205,6 +7307,9 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_ret_cmpxchg(
;
; GFX7-LABEL: global_workgroup_acq_rel_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -7478,6 +7583,9 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_ret_cmpxchg(
;
; GFX7-LABEL: global_workgroup_seq_cst_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -7750,6 +7858,9 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_workgroup_monotonic_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -8005,6 +8116,9 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_workgroup_acquire_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -8259,6 +8373,9 @@ define amdgpu_kernel void @global_workgroup_release_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_workgroup_release_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -8532,6 +8649,9 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_workgroup_acq_rel_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -8805,6 +8925,9 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_workgroup_seq_cst_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -9078,6 +9201,9 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_workgroup_monotonic_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -9351,6 +9477,9 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_workgroup_acquire_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -9622,6 +9751,9 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_workgroup_release_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -9895,6 +10027,9 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_workgroup_acq_rel_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -10168,6 +10303,9 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_workgroup_seq_cst_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -10440,6 +10578,9 @@ define amdgpu_kernel void @global_workgroup_one_as_unordered_load(
;
; GFX7-LABEL: global_workgroup_one_as_unordered_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -10621,6 +10762,9 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_load(
;
; GFX7-LABEL: global_workgroup_one_as_monotonic_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -10802,6 +10946,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_load(
;
; GFX7-LABEL: global_workgroup_one_as_acquire_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -10988,6 +11135,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_load(
;
; GFX7-LABEL: global_workgroup_one_as_seq_cst_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11175,6 +11325,9 @@ define amdgpu_kernel void @global_workgroup_one_as_unordered_store(
;
; GFX7-LABEL: global_workgroup_one_as_unordered_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11328,6 +11481,9 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_store(
;
; GFX7-LABEL: global_workgroup_one_as_monotonic_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11481,6 +11637,9 @@ define amdgpu_kernel void @global_workgroup_one_as_release_store(
;
; GFX7-LABEL: global_workgroup_one_as_release_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11644,6 +11803,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_store(
;
; GFX7-LABEL: global_workgroup_one_as_seq_cst_store:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11806,6 +11968,9 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_atomicrmw(
;
; GFX7-LABEL: global_workgroup_one_as_monotonic_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -11957,6 +12122,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_atomicrmw(
;
; GFX7-LABEL: global_workgroup_one_as_acquire_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12118,6 +12286,9 @@ define amdgpu_kernel void @global_workgroup_one_as_release_atomicrmw(
;
; GFX7-LABEL: global_workgroup_one_as_release_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12279,6 +12450,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_atomicrmw(
;
; GFX7-LABEL: global_workgroup_one_as_acq_rel_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12450,6 +12624,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_atomicrmw(
;
; GFX7-LABEL: global_workgroup_one_as_seq_cst_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12623,6 +12800,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_ret_atomicrmw(
;
; GFX7-LABEL: global_workgroup_one_as_acquire_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -12808,6 +12988,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_ret_atomicrmw(
;
; GFX7-LABEL: global_workgroup_one_as_acq_rel_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -13005,6 +13188,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_ret_atomicrmw(
;
; GFX7-LABEL: global_workgroup_one_as_seq_cst_ret_atomicrmw:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x2
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
@@ -13205,6 +13391,9 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_cmpxchg(
;
; GFX7-LABEL: global_workgroup_one_as_monotonic_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -13421,6 +13610,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_cmpxchg(
;
; GFX7-LABEL: global_workgroup_one_as_acquire_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -13647,6 +13839,9 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_cmpxchg(
;
; GFX7-LABEL: global_workgroup_one_as_release_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -13873,6 +14068,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_cmpxchg(
;
; GFX7-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -14109,6 +14307,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_cmpxchg(
;
; GFX7-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -14345,6 +14546,9 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_cmpxchg(
;
; GFX7-LABEL: global_workgroup_one_as_monotonic_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -14571,6 +14775,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_cmpxchg(
;
; GFX7-LABEL: global_workgroup_one_as_acquire_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -14797,6 +15004,9 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_cmpxchg(
;
; GFX7-LABEL: global_workgroup_one_as_release_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -15033,6 +15243,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_cmpxchg(
;
; GFX7-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -15269,6 +15482,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_cmpxchg(
;
; GFX7-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -15505,6 +15721,9 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_workgroup_one_as_monotonic_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -15741,6 +15960,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_workgroup_one_as_acquire_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -15977,6 +16199,9 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_workgroup_one_as_release_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -16213,6 +16438,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -16449,6 +16677,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_cmpxchg(
;
; GFX7-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[4:5], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX7-NEXT: s_load_dword s7, s[4:5], 0x2
@@ -16688,6 +16919,9 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_ret_cmpxc
;
; GFX7-LABEL: global_workgroup_one_as_monotonic_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -16936,6 +17170,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_ret_cmpxchg
;
; GFX7-LABEL: global_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -17189,6 +17426,9 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_ret_cmpxchg
;
; GFX7-LABEL: global_workgroup_one_as_release_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -17447,6 +17687,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg
;
; GFX7-LABEL: global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -17712,6 +17955,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg
;
; GFX7-LABEL: global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -17977,6 +18223,9 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_ret_cmpxchg
;
; GFX7-LABEL: global_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -18232,6 +18481,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_workgroup_one_as_acquire_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -18485,6 +18737,9 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_workgroup_one_as_release_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -18750,6 +19005,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -19015,6 +19273,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg(
;
; GFX7-LABEL: global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -19280,6 +19541,9 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg
;
; GFX7-LABEL: global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -19545,6 +19809,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -19808,6 +20075,9 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_workgroup_one_as_release_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -20073,6 +20343,9 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
@@ -20338,6 +20611,9 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg(
;
; GFX7-LABEL: global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_mov_b64 s[6:7], s[8:9]
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0
; GFX7-NEXT: s_load_dword s9, s[6:7], 0x2
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll
index 04b0f00fe77b5..78209ee34cad4 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll
@@ -38,6 +38,9 @@ define amdgpu_kernel void @local_nontemporal_load_0(
;
; GFX7-LABEL: local_nontemporal_load_0:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_mov_b32 m0, -1
@@ -224,6 +227,9 @@ define amdgpu_kernel void @local_nontemporal_load_1(
;
; GFX7-LABEL: local_nontemporal_load_1:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_mov_b32 s7, 2
@@ -830,6 +836,9 @@ define amdgpu_kernel void @local_nontemporal_volatile_load(
;
; GFX7-LABEL: local_nontemporal_volatile_load:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_mov_b32 m0, -1
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll
index 9e5f5fcffca9f..bc2508411ed6b 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll
@@ -34,6 +34,9 @@ define amdgpu_kernel void @local_volatile_load_0(
;
; GFX7-LABEL: local_volatile_load_0:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_mov_b32 m0, -1
@@ -172,6 +175,9 @@ define amdgpu_kernel void @local_volatile_load_1(
;
; GFX7-LABEL: local_volatile_load_1:
; GFX7: ; %bb.0: ; %entry
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
; GFX7-NEXT: s_mov_b32 s7, 2
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll
index fceee413f3f97..2aa4f021c259c 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll
@@ -38,7 +38,10 @@ define amdgpu_kernel void @private_nontemporal_load_0(
;
; GFX7-LABEL: private_nontemporal_load_0:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_add_u32 s0, s0, s15
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT: s_add_u32 s0, s0, s17
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
@@ -53,7 +56,7 @@ define amdgpu_kernel void @private_nontemporal_load_0(
;
; GFX10-WGP-LABEL: private_nontemporal_load_0:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s0, s0, s15
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
@@ -67,7 +70,7 @@ define amdgpu_kernel void @private_nontemporal_load_0(
;
; GFX10-CU-LABEL: private_nontemporal_load_0:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s0, s0, s15
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
@@ -107,7 +110,7 @@ define amdgpu_kernel void @private_nontemporal_load_0(
;
; GFX90A-NOTTGSPLIT-LABEL: private_nontemporal_load_0:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s15
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
@@ -121,7 +124,7 @@ define amdgpu_kernel void @private_nontemporal_load_0(
;
; GFX90A-TGSPLIT-LABEL: private_nontemporal_load_0:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s15
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
@@ -232,7 +235,10 @@ define amdgpu_kernel void @private_nontemporal_load_1(
;
; GFX7-LABEL: private_nontemporal_load_1:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_add_u32 s0, s0, s15
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT: s_add_u32 s0, s0, s17
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
@@ -249,7 +255,7 @@ define amdgpu_kernel void @private_nontemporal_load_1(
;
; GFX10-WGP-LABEL: private_nontemporal_load_1:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s0, s0, s15
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x0
@@ -265,7 +271,7 @@ define amdgpu_kernel void @private_nontemporal_load_1(
;
; GFX10-CU-LABEL: private_nontemporal_load_1:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s0, s0, s15
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, v0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x0
@@ -309,7 +315,7 @@ define amdgpu_kernel void @private_nontemporal_load_1(
;
; GFX90A-NOTTGSPLIT-LABEL: private_nontemporal_load_1:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s15
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x0
@@ -328,7 +334,7 @@ define amdgpu_kernel void @private_nontemporal_load_1(
;
; GFX90A-TGSPLIT-LABEL: private_nontemporal_load_1:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s15
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, v0
; GFX90A-TGSPLIT-NEXT: s_load_dword s7, s[8:9], 0x0
@@ -470,7 +476,7 @@ define amdgpu_kernel void @private_nontemporal_store_0(
;
; GFX7-LABEL: private_nontemporal_store_0:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_add_u32 s0, s0, s15
+; GFX7-NEXT: s_add_u32 s0, s0, s17
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
@@ -484,7 +490,7 @@ define amdgpu_kernel void @private_nontemporal_store_0(
;
; GFX10-WGP-LABEL: private_nontemporal_store_0:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s0, s0, s15
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
@@ -498,7 +504,7 @@ define amdgpu_kernel void @private_nontemporal_store_0(
;
; GFX10-CU-LABEL: private_nontemporal_store_0:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s0, s0, s15
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
@@ -530,7 +536,7 @@ define amdgpu_kernel void @private_nontemporal_store_0(
;
; GFX90A-NOTTGSPLIT-LABEL: private_nontemporal_store_0:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s15
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
@@ -544,7 +550,7 @@ define amdgpu_kernel void @private_nontemporal_store_0(
;
; GFX90A-TGSPLIT-LABEL: private_nontemporal_store_0:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s15
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s4, s[8:9], 0x8
@@ -647,7 +653,7 @@ define amdgpu_kernel void @private_nontemporal_store_1(
;
; GFX7-LABEL: private_nontemporal_store_1:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_add_u32 s0, s0, s15
+; GFX7-NEXT: s_add_u32 s0, s0, s17
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2
@@ -663,7 +669,7 @@ define amdgpu_kernel void @private_nontemporal_store_1(
;
; GFX10-WGP-LABEL: private_nontemporal_store_1:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s0, s0, s15
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
@@ -678,7 +684,7 @@ define amdgpu_kernel void @private_nontemporal_store_1(
;
; GFX10-CU-LABEL: private_nontemporal_store_1:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s0, s0, s15
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
@@ -713,7 +719,7 @@ define amdgpu_kernel void @private_nontemporal_store_1(
;
; GFX90A-NOTTGSPLIT-LABEL: private_nontemporal_store_1:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s15
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
@@ -731,7 +737,7 @@ define amdgpu_kernel void @private_nontemporal_store_1(
;
; GFX90A-TGSPLIT-LABEL: private_nontemporal_store_1:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s15
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x8
@@ -874,7 +880,10 @@ define amdgpu_kernel void @private_nontemporal_volatile_load(
;
; GFX7-LABEL: private_nontemporal_volatile_load:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_add_u32 s0, s0, s15
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT: s_add_u32 s0, s0, s17
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
@@ -889,7 +898,7 @@ define amdgpu_kernel void @private_nontemporal_volatile_load(
;
; GFX10-WGP-LABEL: private_nontemporal_volatile_load:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s0, s0, s15
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
@@ -903,7 +912,7 @@ define amdgpu_kernel void @private_nontemporal_volatile_load(
;
; GFX10-CU-LABEL: private_nontemporal_volatile_load:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s0, s0, s15
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
@@ -943,7 +952,7 @@ define amdgpu_kernel void @private_nontemporal_volatile_load(
;
; GFX90A-NOTTGSPLIT-LABEL: private_nontemporal_volatile_load:
; GFX90A-NOTTGSPLIT: ; %bb.0: ; %entry
-; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s15
+; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s0, s0, s17
; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s1, s1, 0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0
; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
@@ -957,7 +966,7 @@ define amdgpu_kernel void @private_nontemporal_volatile_load(
;
; GFX90A-TGSPLIT-LABEL: private_nontemporal_volatile_load:
; GFX90A-TGSPLIT: ; %bb.0: ; %entry
-; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s15
+; GFX90A-TGSPLIT-NEXT: s_add_u32 s0, s0, s17
; GFX90A-TGSPLIT-NEXT: s_addc_u32 s1, s1, 0
; GFX90A-TGSPLIT-NEXT: s_load_dword s6, s[8:9], 0x0
; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll
index f8fb7986938f2..df4193969f8a0 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll
@@ -38,7 +38,10 @@ define amdgpu_kernel void @private_volatile_load_0(
;
; GFX7-LABEL: private_volatile_load_0:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_add_u32 s0, s0, s15
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT: s_add_u32 s0, s0, s17
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
@@ -53,7 +56,7 @@ define amdgpu_kernel void @private_volatile_load_0(
;
; GFX10-WGP-LABEL: private_volatile_load_0:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s0, s0, s15
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
@@ -67,7 +70,7 @@ define amdgpu_kernel void @private_volatile_load_0(
;
; GFX10-CU-LABEL: private_volatile_load_0:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s0, s0, s15
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x8
@@ -190,7 +193,10 @@ define amdgpu_kernel void @private_volatile_load_1(
;
; GFX7-LABEL: private_volatile_load_1:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_add_u32 s0, s0, s15
+; GFX7-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX7-NEXT: s_add_i32 s12, s12, s17
+; GFX7-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GFX7-NEXT: s_add_u32 s0, s0, s17
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: s_load_dword s6, s[8:9], 0x0
; GFX7-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x2
@@ -207,7 +213,7 @@ define amdgpu_kernel void @private_volatile_load_1(
;
; GFX10-WGP-LABEL: private_volatile_load_1:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s0, s0, s15
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
; GFX10-WGP-NEXT: v_mov_b32_e32 v1, v0
; GFX10-WGP-NEXT: s_load_dword s7, s[8:9], 0x0
@@ -223,7 +229,7 @@ define amdgpu_kernel void @private_volatile_load_1(
;
; GFX10-CU-LABEL: private_volatile_load_1:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s0, s0, s15
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
; GFX10-CU-NEXT: v_mov_b32_e32 v1, v0
; GFX10-CU-NEXT: s_load_dword s7, s[8:9], 0x0
@@ -365,7 +371,7 @@ define amdgpu_kernel void @private_volatile_store_0(
;
; GFX7-LABEL: private_volatile_store_0:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_add_u32 s0, s0, s15
+; GFX7-NEXT: s_add_u32 s0, s0, s17
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s4, s[8:9], 0x2
@@ -380,7 +386,7 @@ define amdgpu_kernel void @private_volatile_store_0(
;
; GFX10-WGP-LABEL: private_volatile_store_0:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s0, s0, s15
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s4, s[8:9], 0x8
@@ -395,7 +401,7 @@ define amdgpu_kernel void @private_volatile_store_0(
;
; GFX10-CU-LABEL: private_volatile_store_0:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s0, s0, s15
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s4, s[8:9], 0x8
@@ -515,7 +521,7 @@ define amdgpu_kernel void @private_volatile_store_1(
;
; GFX7-LABEL: private_volatile_store_1:
; GFX7: ; %bb.0: ; %entry
-; GFX7-NEXT: s_add_u32 s0, s0, s15
+; GFX7-NEXT: s_add_u32 s0, s0, s17
; GFX7-NEXT: s_addc_u32 s1, s1, 0
; GFX7-NEXT: s_load_dwordx2 s[6:7], s[8:9], 0x0
; GFX7-NEXT: s_load_dword s5, s[8:9], 0x2
@@ -532,7 +538,7 @@ define amdgpu_kernel void @private_volatile_store_1(
;
; GFX10-WGP-LABEL: private_volatile_store_1:
; GFX10-WGP: ; %bb.0: ; %entry
-; GFX10-WGP-NEXT: s_add_u32 s0, s0, s15
+; GFX10-WGP-NEXT: s_add_u32 s0, s0, s17
; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0
; GFX10-WGP-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-WGP-NEXT: s_load_dword s6, s[8:9], 0x8
@@ -548,7 +554,7 @@ define amdgpu_kernel void @private_volatile_store_1(
;
; GFX10-CU-LABEL: private_volatile_store_1:
; GFX10-CU: ; %bb.0: ; %entry
-; GFX10-CU-NEXT: s_add_u32 s0, s0, s15
+; GFX10-CU-NEXT: s_add_u32 s0, s0, s17
; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0
; GFX10-CU-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
; GFX10-CU-NEXT: s_load_dword s6, s[8:9], 0x8
diff --git a/llvm/test/CodeGen/AMDGPU/min.ll b/llvm/test/CodeGen/AMDGPU/min.ll
index a6db7d331cef3..e43694ad2c0a7 100644
--- a/llvm/test/CodeGen/AMDGPU/min.ll
+++ b/llvm/test/CodeGen/AMDGPU/min.ll
@@ -34,10 +34,13 @@ define amdgpu_kernel void @v_test_imin_sle_i32(ptr addrspace(1) %out, ptr addrsp
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4
; CI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v4
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v3, s5
; CI-NEXT: v_add_i32_e32 v2, vcc, s4, v4
; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
@@ -56,10 +59,13 @@ define amdgpu_kernel void @v_test_imin_sle_i32(ptr addrspace(1) %out, ptr addrsp
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
@@ -144,6 +150,9 @@ define amdgpu_kernel void @s_test_imin_sle_i32(ptr addrspace(1) %out, i32 %a, i3
; CI-LABEL: s_test_imin_sle_i32:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_min_i32 s2, s2, s3
; CI-NEXT: v_mov_b32_e32 v0, s0
@@ -155,6 +164,9 @@ define amdgpu_kernel void @s_test_imin_sle_i32(ptr addrspace(1) %out, i32 %a, i3
; VI-LABEL: s_test_imin_sle_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_min_i32 s2, s2, s3
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -214,6 +226,9 @@ define amdgpu_kernel void @s_test_imin_sle_v1i32(ptr addrspace(1) %out, <1 x i32
; CI-LABEL: s_test_imin_sle_v1i32:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_min_i32 s2, s2, s3
; CI-NEXT: v_mov_b32_e32 v0, s0
@@ -225,6 +240,9 @@ define amdgpu_kernel void @s_test_imin_sle_v1i32(ptr addrspace(1) %out, <1 x i32
; VI-LABEL: s_test_imin_sle_v1i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_min_i32 s2, s2, s3
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -288,6 +306,9 @@ define amdgpu_kernel void @s_test_imin_sle_v4i32(ptr addrspace(1) %out, <4 x i32
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x4
; CI-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_min_i32 s3, s3, s7
; CI-NEXT: s_min_i32 s2, s2, s6
@@ -306,6 +327,9 @@ define amdgpu_kernel void @s_test_imin_sle_v4i32(ptr addrspace(1) %out, <4 x i32
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x10
; VI-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_min_i32 s3, s3, s7
; VI-NEXT: s_min_i32 s2, s2, s6
@@ -414,11 +438,14 @@ define amdgpu_kernel void @s_test_imin_sle_i8(ptr addrspace(1) %out, [8 x i32],
; CI-NEXT: s_load_dword s2, s[8:9], 0xa
; CI-NEXT: s_load_dword s3, s[8:9], 0x13
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_sext_i32_i8 s2, s2
; CI-NEXT: s_sext_i32_i8 s3, s3
; CI-NEXT: s_min_i32 s2, s2, s3
; CI-NEXT: v_mov_b32_e32 v0, s0
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: v_mov_b32_e32 v2, s2
; CI-NEXT: flat_store_byte v[0:1], v2
@@ -429,11 +456,14 @@ define amdgpu_kernel void @s_test_imin_sle_i8(ptr addrspace(1) %out, [8 x i32],
; VI-NEXT: s_load_dword s2, s[8:9], 0x28
; VI-NEXT: s_load_dword s3, s[8:9], 0x4c
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_sext_i32_i8 s2, s2
; VI-NEXT: s_sext_i32_i8 s3, s3
; VI-NEXT: s_min_i32 s2, s2, s3
; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: flat_store_byte v[0:1], v2
@@ -549,6 +579,8 @@ define amdgpu_kernel void @s_test_imin_sle_v4i8(ptr addrspace(1) %out, [8 x i32]
; CI-NEXT: s_load_dword s2, s[8:9], 0xa
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; CI-NEXT: s_load_dword s3, s[8:9], 0x13
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_ashr_i32 s4, s2, 24
; CI-NEXT: s_sext_i32_i8 s5, s2
@@ -572,6 +604,7 @@ define amdgpu_kernel void @s_test_imin_sle_v4i8(ptr addrspace(1) %out, [8 x i32]
; CI-NEXT: s_and_b32 s3, s3, 0xffff
; CI-NEXT: s_or_b32 s2, s3, s2
; CI-NEXT: v_mov_b32_e32 v0, s0
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: v_mov_b32_e32 v2, s2
; CI-NEXT: flat_store_dword v[0:1], v2
@@ -582,6 +615,8 @@ define amdgpu_kernel void @s_test_imin_sle_v4i8(ptr addrspace(1) %out, [8 x i32]
; VI-NEXT: s_load_dword s2, s[8:9], 0x28
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT: s_load_dword s3, s[8:9], 0x4c
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_ashr_i32 s4, s2, 24
; VI-NEXT: s_bfe_i32 s5, s2, 0x80010
@@ -605,6 +640,7 @@ define amdgpu_kernel void @s_test_imin_sle_v4i8(ptr addrspace(1) %out, [8 x i32]
; VI-NEXT: s_and_b32 s2, s2, 0xffff
; VI-NEXT: s_or_b32 s2, s2, s4
; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: flat_store_dword v[0:1], v2
@@ -757,6 +793,9 @@ define amdgpu_kernel void @s_test_imin_sle_v2i16(ptr addrspace(1) %out, <2 x i16
; CI-LABEL: s_test_imin_sle_v2i16:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_ashr_i32 s4, s2, 16
; CI-NEXT: s_sext_i32_i16 s2, s2
@@ -776,6 +815,9 @@ define amdgpu_kernel void @s_test_imin_sle_v2i16(ptr addrspace(1) %out, <2 x i16
; VI-LABEL: s_test_imin_sle_v2i16:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_ashr_i32 s4, s2, 16
; VI-NEXT: s_sext_i32_i16 s2, s2
@@ -857,6 +899,9 @@ define amdgpu_kernel void @s_test_imin_sle_v4i16(ptr addrspace(1) %out, <4 x i16
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x2
; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_ashr_i32 s6, s0, 16
; CI-NEXT: s_ashr_i32 s7, s1, 16
@@ -887,6 +932,9 @@ define amdgpu_kernel void @s_test_imin_sle_v4i16(ptr addrspace(1) %out, <4 x i16
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_ashr_i32 s6, s1, 16
; VI-NEXT: s_sext_i32_i16 s1, s1
@@ -983,10 +1031,13 @@ define amdgpu_kernel void @v_test_imin_slt_i32(ptr addrspace(1) %out, ptr addrsp
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4
; CI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v4
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v3, s5
; CI-NEXT: v_add_i32_e32 v2, vcc, s4, v4
; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
@@ -1005,10 +1056,13 @@ define amdgpu_kernel void @v_test_imin_slt_i32(ptr addrspace(1) %out, ptr addrsp
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
@@ -1122,10 +1176,13 @@ define amdgpu_kernel void @v_test_imin_slt_i16(ptr addrspace(1) %out, ptr addrsp
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4
; CI-NEXT: v_lshlrev_b32_e32 v4, 1, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v4
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v3, s5
; CI-NEXT: v_add_i32_e32 v2, vcc, s4, v4
; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
@@ -1144,10 +1201,13 @@ define amdgpu_kernel void @v_test_imin_slt_i16(ptr addrspace(1) %out, ptr addrsp
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v4, 1, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
@@ -1233,6 +1293,9 @@ define amdgpu_kernel void @s_test_imin_slt_i32(ptr addrspace(1) %out, i32 %a, i3
; CI-LABEL: s_test_imin_slt_i32:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_min_i32 s2, s2, s3
; CI-NEXT: v_mov_b32_e32 v0, s0
@@ -1244,6 +1307,9 @@ define amdgpu_kernel void @s_test_imin_slt_i32(ptr addrspace(1) %out, i32 %a, i3
; VI-LABEL: s_test_imin_slt_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_min_i32 s2, s2, s3
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -1305,6 +1371,9 @@ define amdgpu_kernel void @s_test_imin_slt_v2i32(ptr addrspace(1) %out, <2 x i32
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x2
; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_min_i32 s1, s1, s3
; CI-NEXT: s_min_i32 s0, s0, s2
@@ -1319,6 +1388,9 @@ define amdgpu_kernel void @s_test_imin_slt_v2i32(ptr addrspace(1) %out, <2 x i32
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_min_i32 s1, s1, s3
; VI-NEXT: s_min_i32 s0, s0, s2
@@ -1391,6 +1463,9 @@ define amdgpu_kernel void @s_test_imin_slt_imm_i32(ptr addrspace(1) %out, i32 %a
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s2, s[8:9], 0x2
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_min_i32 s2, s2, 8
; CI-NEXT: v_mov_b32_e32 v0, s0
@@ -1403,6 +1478,9 @@ define amdgpu_kernel void @s_test_imin_slt_imm_i32(ptr addrspace(1) %out, i32 %a
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_min_i32 s2, s2, 8
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -1468,6 +1546,9 @@ define amdgpu_kernel void @s_test_imin_sle_imm_i32(ptr addrspace(1) %out, i32 %a
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s2, s[8:9], 0x2
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_min_i32 s2, s2, 8
; CI-NEXT: v_mov_b32_e32 v0, s0
@@ -1480,6 +1561,9 @@ define amdgpu_kernel void @s_test_imin_sle_imm_i32(ptr addrspace(1) %out, i32 %a
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_min_i32 s2, s2, 8
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -1557,10 +1641,13 @@ define amdgpu_kernel void @v_test_umin_ule_i32(ptr addrspace(1) %out, ptr addrsp
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4
; CI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v4
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v3, s5
; CI-NEXT: v_add_i32_e32 v2, vcc, s4, v4
; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
@@ -1579,10 +1666,13 @@ define amdgpu_kernel void @v_test_umin_ule_i32(ptr addrspace(1) %out, ptr addrsp
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
@@ -1686,12 +1776,15 @@ define amdgpu_kernel void @v_test_umin_ule_v3i32(ptr addrspace(1) %out, ptr addr
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4
; CI-NEXT: v_lshlrev_b32_e32 v6, 4, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v6
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: v_mov_b32_e32 v2, s5
; CI-NEXT: v_add_i32_e32 v3, vcc, s4, v6
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
; CI-NEXT: flat_load_dwordx3 v[0:2], v[0:1]
; CI-NEXT: flat_load_dwordx3 v[3:5], v[3:4]
@@ -1710,12 +1803,15 @@ define amdgpu_kernel void @v_test_umin_ule_v3i32(ptr addrspace(1) %out, ptr addr
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v6, 4, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v6
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v2, s5
; VI-NEXT: v_add_u32_e32 v3, vcc, s4, v6
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_addc_u32_e32 v4, vcc, 0, v2, vcc
; VI-NEXT: flat_load_dwordx3 v[0:2], v[0:1]
; VI-NEXT: flat_load_dwordx3 v[3:5], v[3:4]
@@ -1838,12 +1934,15 @@ define amdgpu_kernel void @v_test_umin_ule_v3i16(ptr addrspace(1) %out, ptr addr
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4
; CI-NEXT: v_lshlrev_b32_e32 v4, 3, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v4
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: v_mov_b32_e32 v3, s5
; CI-NEXT: v_add_i32_e32 v2, vcc, s4, v4
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; CI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; CI-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
@@ -1874,12 +1973,15 @@ define amdgpu_kernel void @v_test_umin_ule_v3i16(ptr addrspace(1) %out, ptr addr
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v4, 3, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1]
; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3]
@@ -1976,6 +2078,9 @@ define amdgpu_kernel void @s_test_umin_ule_i32(ptr addrspace(1) %out, i32 %a, i3
; CI-LABEL: s_test_umin_ule_i32:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_min_u32 s2, s2, s3
; CI-NEXT: v_mov_b32_e32 v0, s0
@@ -1987,6 +2092,9 @@ define amdgpu_kernel void @s_test_umin_ule_i32(ptr addrspace(1) %out, i32 %a, i3
; VI-LABEL: s_test_umin_ule_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_min_u32 s2, s2, s3
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -2059,10 +2167,13 @@ define amdgpu_kernel void @v_test_umin_ult_i32(ptr addrspace(1) %out, ptr addrsp
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4
; CI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v4
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v3, s5
; CI-NEXT: v_add_i32_e32 v2, vcc, s4, v4
; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
@@ -2081,10 +2192,13 @@ define amdgpu_kernel void @v_test_umin_ult_i32(ptr addrspace(1) %out, ptr addrsp
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
@@ -2188,6 +2302,9 @@ define amdgpu_kernel void @v_test_umin_ult_i8(ptr addrspace(1) %out, ptr addrspa
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v2, s3
; CI-NEXT: v_add_i32_e32 v1, vcc, s2, v0
@@ -2209,6 +2326,9 @@ define amdgpu_kernel void @v_test_umin_ult_i8(ptr addrspace(1) %out, ptr addrspa
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v2, s3
; VI-NEXT: v_add_u32_e32 v1, vcc, s2, v0
@@ -2294,6 +2414,9 @@ define amdgpu_kernel void @s_test_umin_ult_i32(ptr addrspace(1) %out, i32 %a, i3
; CI-LABEL: s_test_umin_ult_i32:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_min_u32 s2, s2, s3
; CI-NEXT: v_mov_b32_e32 v0, s0
@@ -2305,6 +2428,9 @@ define amdgpu_kernel void @s_test_umin_ult_i32(ptr addrspace(1) %out, i32 %a, i3
; VI-LABEL: s_test_umin_ult_i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_min_u32 s2, s2, s3
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -2386,6 +2512,9 @@ define amdgpu_kernel void @v_test_umin_ult_i32_multi_use(ptr addrspace(1) %out0,
; CI-LABEL: v_test_umin_ult_i32_multi_use:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_load_dword s4, s[4:5], 0x0
; CI-NEXT: s_load_dword s5, s[6:7], 0x0
@@ -2407,6 +2536,9 @@ define amdgpu_kernel void @v_test_umin_ult_i32_multi_use(ptr addrspace(1) %out0,
; VI-LABEL: v_test_umin_ult_i32_multi_use:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_load_dword s4, s[4:5], 0x0
; VI-NEXT: s_load_dword s5, s[6:7], 0x0
@@ -2534,6 +2666,9 @@ define amdgpu_kernel void @v_test_umin_ult_i16_multi_use(ptr addrspace(1) %out0,
; CI-LABEL: v_test_umin_ult_i16_multi_use:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s4
; CI-NEXT: v_mov_b32_e32 v1, s5
@@ -2556,6 +2691,9 @@ define amdgpu_kernel void @v_test_umin_ult_i16_multi_use(ptr addrspace(1) %out0,
; VI-LABEL: v_test_umin_ult_i16_multi_use:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s4
; VI-NEXT: v_mov_b32_e32 v1, s5
@@ -2646,6 +2784,9 @@ define amdgpu_kernel void @s_test_umin_ult_v1i32(ptr addrspace(1) %out, <1 x i32
; CI-LABEL: s_test_umin_ult_v1i32:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_min_u32 s2, s2, s3
; CI-NEXT: v_mov_b32_e32 v0, s0
@@ -2657,6 +2798,9 @@ define amdgpu_kernel void @s_test_umin_ult_v1i32(ptr addrspace(1) %out, <1 x i32
; VI-LABEL: s_test_umin_ult_v1i32:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_min_u32 s2, s2, s3
; VI-NEXT: v_mov_b32_e32 v0, s0
@@ -2726,6 +2870,9 @@ define amdgpu_kernel void @s_test_umin_ult_v8i32(ptr addrspace(1) %out, <8 x i32
;
; CI-LABEL: s_test_umin_ult_v8i32:
; CI: ; %bb.0:
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_load_dwordx16 s[12:27], s[8:9], 0x8
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; CI-NEXT: s_waitcnt lgkmcnt(0)
@@ -2757,6 +2904,9 @@ define amdgpu_kernel void @s_test_umin_ult_v8i32(ptr addrspace(1) %out, <8 x i32
;
; VI-LABEL: s_test_umin_ult_v8i32:
; VI: ; %bb.0:
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_load_dwordx16 s[12:27], s[8:9], 0x20
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; VI-NEXT: s_waitcnt lgkmcnt(0)
@@ -2921,6 +3071,9 @@ define amdgpu_kernel void @s_test_umin_ult_v8i16(ptr addrspace(1) %out, <8 x i16
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x4
; CI-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_lshr_b32 s10, s0, 16
; CI-NEXT: s_and_b32 s0, s0, 0xffff
@@ -2967,6 +3120,9 @@ define amdgpu_kernel void @s_test_umin_ult_v8i16(ptr addrspace(1) %out, <8 x i16
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x10
; VI-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_lshr_b32 s10, s3, 16
; VI-NEXT: s_and_b32 s3, s3, 0xffff
@@ -3088,11 +3244,14 @@ define amdgpu_kernel void @simplify_demanded_bits_test_umin_ult_i16(ptr addrspac
; CI-NEXT: s_load_dword s2, s[8:9], 0xa
; CI-NEXT: s_load_dword s3, s[8:9], 0x13
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_and_b32 s2, s2, 0xffff
; CI-NEXT: s_and_b32 s3, s3, 0xffff
; CI-NEXT: s_min_u32 s2, s2, s3
; CI-NEXT: v_mov_b32_e32 v0, s0
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: v_mov_b32_e32 v2, s2
; CI-NEXT: flat_store_dword v[0:1], v2
@@ -3103,11 +3262,14 @@ define amdgpu_kernel void @simplify_demanded_bits_test_umin_ult_i16(ptr addrspac
; VI-NEXT: s_load_dword s2, s[8:9], 0x28
; VI-NEXT: s_load_dword s3, s[8:9], 0x4c
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_and_b32 s2, s2, 0xffff
; VI-NEXT: s_and_b32 s3, s3, 0xffff
; VI-NEXT: s_min_u32 s2, s2, s3
; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: flat_store_dword v[0:1], v2
@@ -3195,11 +3357,14 @@ define amdgpu_kernel void @simplify_demanded_bits_test_min_slt_i16(ptr addrspace
; CI-NEXT: s_load_dword s2, s[8:9], 0xa
; CI-NEXT: s_load_dword s3, s[8:9], 0x13
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_sext_i32_i16 s2, s2
; CI-NEXT: s_sext_i32_i16 s3, s3
; CI-NEXT: s_min_i32 s2, s2, s3
; CI-NEXT: v_mov_b32_e32 v0, s0
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: v_mov_b32_e32 v2, s2
; CI-NEXT: flat_store_dword v[0:1], v2
@@ -3210,11 +3375,14 @@ define amdgpu_kernel void @simplify_demanded_bits_test_min_slt_i16(ptr addrspace
; VI-NEXT: s_load_dword s2, s[8:9], 0x28
; VI-NEXT: s_load_dword s3, s[8:9], 0x4c
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_sext_i32_i16 s2, s2
; VI-NEXT: s_sext_i32_i16 s3, s3
; VI-NEXT: s_min_i32 s2, s2, s3
; VI-NEXT: v_mov_b32_e32 v0, s0
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v1, s1
; VI-NEXT: v_mov_b32_e32 v2, s2
; VI-NEXT: flat_store_dword v[0:1], v2
@@ -3309,6 +3477,9 @@ define amdgpu_kernel void @s_test_imin_sle_i16(ptr addrspace(1) %out, i16 %a, i1
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s2, s[8:9], 0x2
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_sext_i32_i16 s3, s2
; CI-NEXT: s_ashr_i32 s2, s2, 16
@@ -3323,6 +3494,9 @@ define amdgpu_kernel void @s_test_imin_sle_i16(ptr addrspace(1) %out, i16 %a, i1
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_sext_i32_i16 s3, s2
; VI-NEXT: s_ashr_i32 s2, s2, 16
@@ -3403,6 +3577,9 @@ define amdgpu_kernel void @test_umin_ult_i64(ptr addrspace(1) %out, i64 %a, i64
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s4
@@ -3421,6 +3598,9 @@ define amdgpu_kernel void @test_umin_ult_i64(ptr addrspace(1) %out, i64 %a, i64
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s4
@@ -3511,6 +3691,9 @@ define amdgpu_kernel void @test_umin_ule_i64(ptr addrspace(1) %out, i64 %a, i64
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s4
@@ -3529,6 +3712,9 @@ define amdgpu_kernel void @test_umin_ule_i64(ptr addrspace(1) %out, i64 %a, i64
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s4
@@ -3619,6 +3805,9 @@ define amdgpu_kernel void @test_imin_slt_i64(ptr addrspace(1) %out, i64 %a, i64
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s4
@@ -3637,6 +3826,9 @@ define amdgpu_kernel void @test_imin_slt_i64(ptr addrspace(1) %out, i64 %a, i64
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s4
@@ -3727,6 +3919,9 @@ define amdgpu_kernel void @test_imin_sle_i64(ptr addrspace(1) %out, i64 %a, i64
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v0, s0
; CI-NEXT: v_mov_b32_e32 v1, s4
@@ -3745,6 +3940,9 @@ define amdgpu_kernel void @test_imin_sle_i64(ptr addrspace(1) %out, i64 %a, i64
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v0, s0
; VI-NEXT: v_mov_b32_e32 v1, s4
@@ -3859,9 +4057,12 @@ define amdgpu_kernel void @v_test_imin_sle_v2i16(ptr addrspace(1) %out, ptr addr
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4
; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: v_mov_b32_e32 v3, s5
; CI-NEXT: flat_load_dword v4, v[0:1]
@@ -3890,10 +4091,13 @@ define amdgpu_kernel void @v_test_imin_sle_v2i16(ptr addrspace(1) %out, ptr addr
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
@@ -4009,9 +4213,12 @@ define amdgpu_kernel void @v_test_imin_ule_v2i16(ptr addrspace(1) %out, ptr addr
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x4
; CI-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v2
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
; CI-NEXT: v_mov_b32_e32 v3, s5
; CI-NEXT: flat_load_dword v4, v[0:1]
@@ -4039,10 +4246,13 @@ define amdgpu_kernel void @v_test_imin_ule_v2i16(ptr addrspace(1) %out, ptr addr
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x10
; VI-NEXT: v_lshlrev_b32_e32 v4, 2, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v4
; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: v_mov_b32_e32 v3, s5
; VI-NEXT: v_add_u32_e32 v2, vcc, s4, v4
; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
diff --git a/llvm/test/CodeGen/AMDGPU/pack.v2f16.ll b/llvm/test/CodeGen/AMDGPU/pack.v2f16.ll
index da6120812ac1d..70ddc460b8d0a 100644
--- a/llvm/test/CodeGen/AMDGPU/pack.v2f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/pack.v2f16.ll
@@ -180,6 +180,9 @@ define amdgpu_kernel void @v_pack_v2f16(ptr addrspace(1) %in0, ptr addrspace(1)
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GFX8-NEXT: s_add_i32 s12, s12, s17
+; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
@@ -260,6 +263,9 @@ define amdgpu_kernel void @v_pack_v2f16_user(ptr addrspace(1) %in0, ptr addrspac
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GFX8-NEXT: s_add_i32 s12, s12, s17
+; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2
@@ -341,6 +347,9 @@ define amdgpu_kernel void @v_pack_v2f16_imm_lo(ptr addrspace(1) %in1) #0 {
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX8-NEXT: s_add_i32 s12, s12, s17
+; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
@@ -403,6 +412,9 @@ define amdgpu_kernel void @v_pack_v2f16_inline_imm_lo(ptr addrspace(1) %in1) #0
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX8-NEXT: s_add_i32 s12, s12, s17
+; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
@@ -465,6 +477,9 @@ define amdgpu_kernel void @v_pack_v2f16_imm_hi(ptr addrspace(1) %in0) #0 {
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX8-NEXT: s_add_i32 s12, s12, s17
+; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
@@ -527,6 +542,9 @@ define amdgpu_kernel void @v_pack_v2f16_inline_f16imm_hi(ptr addrspace(1) %in0)
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX8-NEXT: s_add_i32 s12, s12, s17
+; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
@@ -588,6 +606,9 @@ define amdgpu_kernel void @v_pack_v2f16_inline_imm_hi(ptr addrspace(1) %in0) #0
; GFX8: ; %bb.0:
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX8-NEXT: s_add_i32 s12, s12, s17
+; GFX8-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX8-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v1, s1
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
diff --git a/llvm/test/CodeGen/AMDGPU/pack.v2i16.ll b/llvm/test/CodeGen/AMDGPU/pack.v2i16.ll
index 44128f0e0dcd8..acd890605bd80 100644
--- a/llvm/test/CodeGen/AMDGPU/pack.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/pack.v2i16.ll
@@ -176,6 +176,9 @@ define amdgpu_kernel void @v_pack_v2i16(ptr addrspace(1) %in0, ptr addrspace(1)
; GFX803: ; %bb.0:
; GFX803-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX803-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GFX803-NEXT: s_add_i32 s12, s12, s17
+; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX803-NEXT: s_waitcnt lgkmcnt(0)
; GFX803-NEXT: v_mov_b32_e32 v1, s1
; GFX803-NEXT: v_add_u32_e32 v0, vcc, s0, v2
@@ -254,6 +257,9 @@ define amdgpu_kernel void @v_pack_v2i16_user(ptr addrspace(1) %in0, ptr addrspac
; GFX803: ; %bb.0:
; GFX803-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX803-NEXT: v_lshlrev_b32_e32 v2, 2, v0
+; GFX803-NEXT: s_add_i32 s12, s12, s17
+; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX803-NEXT: s_waitcnt lgkmcnt(0)
; GFX803-NEXT: v_mov_b32_e32 v1, s1
; GFX803-NEXT: v_add_u32_e32 v0, vcc, s0, v2
@@ -333,6 +339,9 @@ define amdgpu_kernel void @v_pack_v2i16_imm_lo(ptr addrspace(1) %in1) #0 {
; GFX803: ; %bb.0:
; GFX803-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX803-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX803-NEXT: s_add_i32 s12, s12, s17
+; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX803-NEXT: s_waitcnt lgkmcnt(0)
; GFX803-NEXT: v_mov_b32_e32 v1, s1
; GFX803-NEXT: v_add_u32_e32 v0, vcc, s0, v0
@@ -393,6 +402,9 @@ define amdgpu_kernel void @v_pack_v2i16_inline_imm_lo(ptr addrspace(1) %in1) #0
; GFX803: ; %bb.0:
; GFX803-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX803-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX803-NEXT: s_add_i32 s12, s12, s17
+; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX803-NEXT: s_waitcnt lgkmcnt(0)
; GFX803-NEXT: v_mov_b32_e32 v1, s1
; GFX803-NEXT: v_add_u32_e32 v0, vcc, s0, v0
@@ -454,6 +466,9 @@ define amdgpu_kernel void @v_pack_v2i16_imm_hi(ptr addrspace(1) %in0) #0 {
; GFX803: ; %bb.0:
; GFX803-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX803-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX803-NEXT: s_add_i32 s12, s12, s17
+; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX803-NEXT: s_waitcnt lgkmcnt(0)
; GFX803-NEXT: v_mov_b32_e32 v1, s1
; GFX803-NEXT: v_add_u32_e32 v0, vcc, s0, v0
@@ -514,6 +529,9 @@ define amdgpu_kernel void @v_pack_v2i16_inline_imm_hi(ptr addrspace(1) %in0) #0
; GFX803: ; %bb.0:
; GFX803-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GFX803-NEXT: v_lshlrev_b32_e32 v0, 2, v0
+; GFX803-NEXT: s_add_i32 s12, s12, s17
+; GFX803-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GFX803-NEXT: s_waitcnt lgkmcnt(0)
; GFX803-NEXT: v_mov_b32_e32 v1, s1
; GFX803-NEXT: v_add_u32_e32 v0, vcc, s0, v0
diff --git a/llvm/test/CodeGen/AMDGPU/pal-simple-indirect-call.ll b/llvm/test/CodeGen/AMDGPU/pal-simple-indirect-call.ll
index 2e9f09ad41813..7c9ecc892478c 100644
--- a/llvm/test/CodeGen/AMDGPU/pal-simple-indirect-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/pal-simple-indirect-call.ll
@@ -1,6 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-globals
; Check that no attributes are added to graphics functions
-; RUN: opt -S -mtriple=amdgcn-amd-amdpal -amdgpu-annotate-kernel-features %s | FileCheck -check-prefixes=AKF_GCN %s
; RUN: opt -S -mtriple=amdgcn-amd-amdpal -passes=amdgpu-attributor %s | FileCheck -check-prefixes=ATTRIBUTOR_GCN %s
; Check that it doesn't crash
@@ -12,12 +11,6 @@ target datalayout = "A5"
define amdgpu_cs void @test_simple_indirect_call() {
-; AKF_GCN-LABEL: define {{[^@]+}}@test_simple_indirect_call() {
-; AKF_GCN-NEXT: [[PC:%.*]] = call i64 @llvm.amdgcn.s.getpc()
-; AKF_GCN-NEXT: [[FUN:%.*]] = inttoptr i64 [[PC]] to ptr
-; AKF_GCN-NEXT: call amdgpu_gfx void [[FUN]]()
-; AKF_GCN-NEXT: ret void
-;
; ATTRIBUTOR_GCN-LABEL: define {{[^@]+}}@test_simple_indirect_call
; ATTRIBUTOR_GCN-SAME: () #[[ATTR0:[0-9]+]] {
; ATTRIBUTOR_GCN-NEXT: [[PC:%.*]] = call i64 @llvm.amdgcn.s.getpc()
@@ -68,7 +61,6 @@ declare i64 @llvm.amdgcn.s.getpc() #0
attributes #0 = { nounwind readnone speculatable willreturn }
;.
-; AKF_GCN: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
;.
; ATTRIBUTOR_GCN: attributes #[[ATTR0]] = { "uniform-work-group-size"="false" }
; ATTRIBUTOR_GCN: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
diff --git a/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll b/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll
index b40d35dbd8ac6..ec2b7751a5514 100644
--- a/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll
+++ b/llvm/test/CodeGen/AMDGPU/partial-regcopy-and-spill-missed-at-regalloc.ll
@@ -10,32 +10,32 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 {
; REGALLOC-GFX908: bb.0 (%ir-block.0):
; REGALLOC-GFX908-NEXT: liveins: $sgpr4_sgpr5
; REGALLOC-GFX908-NEXT: {{ $}}
- ; REGALLOC-GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2162697 /* reguse:AGPR_32 */, undef %5:agpr_32
- ; REGALLOC-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6291466 /* regdef:VReg_128 */, def %6
- ; REGALLOC-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3538954 /* regdef:VReg_64 */, def %7
- ; REGALLOC-GFX908-NEXT: GLOBAL_STORE_DWORDX4 undef %14:vreg_64, %6, 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1)
+ ; REGALLOC-GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2162697 /* reguse:AGPR_32 */, undef %6:agpr_32
+ ; REGALLOC-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6291466 /* regdef:VReg_128 */, def %7
+ ; REGALLOC-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3538954 /* regdef:VReg_64 */, def %8
+ ; REGALLOC-GFX908-NEXT: GLOBAL_STORE_DWORDX4 undef %15:vreg_64, %7, 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1)
; REGALLOC-GFX908-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg.kernarg.offset1, addrspace 4)
; REGALLOC-GFX908-NEXT: [[COPY:%[0-9]+]]:areg_128 = COPY killed renamable $sgpr0_sgpr1_sgpr2_sgpr3
; REGALLOC-GFX908-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
; REGALLOC-GFX908-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2, implicit $exec
; REGALLOC-GFX908-NEXT: [[V_MFMA_I32_4X4X4I8_e64_:%[0-9]+]]:areg_128 = V_MFMA_I32_4X4X4I8_e64 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], [[COPY]], 0, 0, 0, implicit $mode, implicit $exec
- ; REGALLOC-GFX908-NEXT: GLOBAL_STORE_DWORDX2 undef %16:vreg_64, %7, 0, 0, implicit $exec :: (volatile store (s64) into `ptr addrspace(1) undef`, addrspace 1)
+ ; REGALLOC-GFX908-NEXT: GLOBAL_STORE_DWORDX2 undef %17:vreg_64, %8, 0, 0, implicit $exec :: (volatile store (s64) into `ptr addrspace(1) undef`, addrspace 1)
; REGALLOC-GFX908-NEXT: [[COPY3:%[0-9]+]]:vreg_128 = COPY [[V_MFMA_I32_4X4X4I8_e64_]]
- ; REGALLOC-GFX908-NEXT: GLOBAL_STORE_DWORDX4 undef %18:vreg_64, [[COPY3]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1)
+ ; REGALLOC-GFX908-NEXT: GLOBAL_STORE_DWORDX4 undef %19:vreg_64, [[COPY3]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1)
; REGALLOC-GFX908-NEXT: S_ENDPGM 0
;
; PEI-GFX908-LABEL: name: partial_copy
; PEI-GFX908: bb.0 (%ir-block.0):
- ; PEI-GFX908-NEXT: liveins: $agpr4, $sgpr4_sgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr7
+ ; PEI-GFX908-NEXT: liveins: $agpr4, $sgpr4_sgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr9
; PEI-GFX908-NEXT: {{ $}}
- ; PEI-GFX908-NEXT: $sgpr8_sgpr9_sgpr10_sgpr11 = COPY killed $sgpr0_sgpr1_sgpr2_sgpr3
- ; PEI-GFX908-NEXT: $sgpr8 = S_ADD_U32 $sgpr8, $sgpr7, implicit-def $scc, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11
- ; PEI-GFX908-NEXT: $sgpr9 = S_ADDC_U32 $sgpr9, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11
+ ; PEI-GFX908-NEXT: $sgpr12_sgpr13_sgpr14_sgpr15 = COPY killed $sgpr0_sgpr1_sgpr2_sgpr3
+ ; PEI-GFX908-NEXT: $sgpr12 = S_ADD_U32 $sgpr12, $sgpr9, implicit-def $scc, implicit-def $sgpr12_sgpr13_sgpr14_sgpr15
+ ; PEI-GFX908-NEXT: $sgpr13 = S_ADDC_U32 $sgpr13, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr12_sgpr13_sgpr14_sgpr15
; PEI-GFX908-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2162697 /* reguse:AGPR_32 */, undef renamable $agpr0
; PEI-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6291466 /* regdef:VReg_128 */, def renamable $vgpr0_vgpr1_vgpr2_vgpr3
; PEI-GFX908-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = COPY killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec
; PEI-GFX908-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3538954 /* regdef:VReg_64 */, def renamable $vgpr0_vgpr1
- ; PEI-GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr0_vgpr1 :: (store (s32) into %stack.0, addrspace 5)
+ ; PEI-GFX908-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr12_sgpr13_sgpr14_sgpr15, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr0_vgpr1 :: (store (s32) into %stack.0, addrspace 5)
; PEI-GFX908-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit killed $vgpr0_vgpr1
; PEI-GFX908-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = COPY killed renamable $agpr0_agpr1_agpr2_agpr3, implicit $exec
; PEI-GFX908-NEXT: GLOBAL_STORE_DWORDX4 undef renamable $vgpr0_vgpr1, killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1)
@@ -44,7 +44,7 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 {
; PEI-GFX908-NEXT: renamable $vgpr0 = V_MOV_B32_e32 1, implicit $exec
; PEI-GFX908-NEXT: renamable $vgpr1 = V_MOV_B32_e32 2, implicit $exec
; PEI-GFX908-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = V_MFMA_I32_4X4X4I8_e64 killed $vgpr0, killed $vgpr1, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
- ; PEI-GFX908-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1 :: (load (s32) from %stack.0, addrspace 5)
+ ; PEI-GFX908-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr12_sgpr13_sgpr14_sgpr15, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1 :: (load (s32) from %stack.0, addrspace 5)
; PEI-GFX908-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec, implicit $vgpr0_vgpr1
; PEI-GFX908-NEXT: GLOBAL_STORE_DWORDX2 undef renamable $vgpr0_vgpr1, killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (volatile store (s64) into `ptr addrspace(1) undef`, addrspace 1)
; PEI-GFX908-NEXT: renamable $vgpr0_vgpr1_vgpr2_vgpr3 = COPY killed renamable $agpr0_agpr1_agpr2_agpr3, implicit $exec
@@ -55,31 +55,31 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 {
; REGALLOC-GFX90A: bb.0 (%ir-block.0):
; REGALLOC-GFX90A-NEXT: liveins: $sgpr4_sgpr5
; REGALLOC-GFX90A-NEXT: {{ $}}
- ; REGALLOC-GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2162697 /* reguse:AGPR_32 */, undef %5:agpr_32
- ; REGALLOC-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6619146 /* regdef:VReg_128_Align2 */, def %6
- ; REGALLOC-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3866634 /* regdef:VReg_64_Align2 */, def %7
- ; REGALLOC-GFX90A-NEXT: GLOBAL_STORE_DWORDX4 undef %14:vreg_64_align2, %6, 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1)
+ ; REGALLOC-GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2162697 /* reguse:AGPR_32 */, undef %6:agpr_32
+ ; REGALLOC-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6619146 /* regdef:VReg_128_Align2 */, def %7
+ ; REGALLOC-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3866634 /* regdef:VReg_64_Align2 */, def %8
+ ; REGALLOC-GFX90A-NEXT: GLOBAL_STORE_DWORDX4 undef %15:vreg_64_align2, %7, 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1)
; REGALLOC-GFX90A-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg.kernarg.offset1, addrspace 4)
; REGALLOC-GFX90A-NEXT: [[COPY:%[0-9]+]]:areg_128_align2 = COPY killed renamable $sgpr0_sgpr1_sgpr2_sgpr3
; REGALLOC-GFX90A-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
; REGALLOC-GFX90A-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2, implicit $exec
; REGALLOC-GFX90A-NEXT: [[V_MFMA_I32_4X4X4I8_e64_:%[0-9]+]]:areg_128_align2 = V_MFMA_I32_4X4X4I8_e64 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], [[COPY]], 0, 0, 0, implicit $mode, implicit $exec
- ; REGALLOC-GFX90A-NEXT: GLOBAL_STORE_DWORDX2 undef %16:vreg_64_align2, %7, 0, 0, implicit $exec :: (volatile store (s64) into `ptr addrspace(1) undef`, addrspace 1)
- ; REGALLOC-GFX90A-NEXT: GLOBAL_STORE_DWORDX4 undef %18:vreg_64_align2, [[V_MFMA_I32_4X4X4I8_e64_]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1)
+ ; REGALLOC-GFX90A-NEXT: GLOBAL_STORE_DWORDX2 undef %17:vreg_64_align2, %8, 0, 0, implicit $exec :: (volatile store (s64) into `ptr addrspace(1) undef`, addrspace 1)
+ ; REGALLOC-GFX90A-NEXT: GLOBAL_STORE_DWORDX4 undef %19:vreg_64_align2, [[V_MFMA_I32_4X4X4I8_e64_]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1)
; REGALLOC-GFX90A-NEXT: S_ENDPGM 0
;
; PEI-GFX90A-LABEL: name: partial_copy
; PEI-GFX90A: bb.0 (%ir-block.0):
- ; PEI-GFX90A-NEXT: liveins: $agpr4, $sgpr4_sgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr7
+ ; PEI-GFX90A-NEXT: liveins: $agpr4, $sgpr4_sgpr5, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr9
; PEI-GFX90A-NEXT: {{ $}}
- ; PEI-GFX90A-NEXT: $sgpr8_sgpr9_sgpr10_sgpr11 = COPY killed $sgpr0_sgpr1_sgpr2_sgpr3
- ; PEI-GFX90A-NEXT: $sgpr8 = S_ADD_U32 $sgpr8, $sgpr7, implicit-def $scc, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11
- ; PEI-GFX90A-NEXT: $sgpr9 = S_ADDC_U32 $sgpr9, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11
+ ; PEI-GFX90A-NEXT: $sgpr12_sgpr13_sgpr14_sgpr15 = COPY killed $sgpr0_sgpr1_sgpr2_sgpr3
+ ; PEI-GFX90A-NEXT: $sgpr12 = S_ADD_U32 $sgpr12, $sgpr9, implicit-def $scc, implicit-def $sgpr12_sgpr13_sgpr14_sgpr15
+ ; PEI-GFX90A-NEXT: $sgpr13 = S_ADDC_U32 $sgpr13, 0, implicit-def dead $scc, implicit $scc, implicit-def $sgpr12_sgpr13_sgpr14_sgpr15
; PEI-GFX90A-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 2162697 /* reguse:AGPR_32 */, undef renamable $agpr0
; PEI-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 6619146 /* regdef:VReg_128_Align2 */, def renamable $vgpr0_vgpr1_vgpr2_vgpr3
; PEI-GFX90A-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = COPY killed renamable $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec
; PEI-GFX90A-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 3866634 /* regdef:VReg_64_Align2 */, def renamable $vgpr0_vgpr1
- ; PEI-GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr0_vgpr1 :: (store (s32) into %stack.0, addrspace 5)
+ ; PEI-GFX90A-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr12_sgpr13_sgpr14_sgpr15, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr0_vgpr1 :: (store (s32) into %stack.0, addrspace 5)
; PEI-GFX90A-NEXT: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit killed $vgpr0_vgpr1
; PEI-GFX90A-NEXT: GLOBAL_STORE_DWORDX4 undef renamable $vgpr0_vgpr1, killed renamable $agpr0_agpr1_agpr2_agpr3, 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1)
; PEI-GFX90A-NEXT: renamable $sgpr0_sgpr1_sgpr2_sgpr3 = S_LOAD_DWORDX4_IMM killed renamable $sgpr4_sgpr5, 0, 0 :: (dereferenceable invariant load (s128) from %ir.arg.kernarg.offset1, addrspace 4)
@@ -87,7 +87,7 @@ define amdgpu_kernel void @partial_copy(<4 x i32> %arg) #0 {
; PEI-GFX90A-NEXT: renamable $vgpr0 = V_MOV_B32_e32 1, implicit $exec
; PEI-GFX90A-NEXT: renamable $vgpr1 = V_MOV_B32_e32 2, implicit $exec
; PEI-GFX90A-NEXT: renamable $agpr0_agpr1_agpr2_agpr3 = V_MFMA_I32_4X4X4I8_e64 killed $vgpr0, killed $vgpr1, killed $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
- ; PEI-GFX90A-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1 :: (load (s32) from %stack.0, addrspace 5)
+ ; PEI-GFX90A-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr12_sgpr13_sgpr14_sgpr15, 0, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1 :: (load (s32) from %stack.0, addrspace 5)
; PEI-GFX90A-NEXT: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec, implicit $vgpr0_vgpr1
; PEI-GFX90A-NEXT: GLOBAL_STORE_DWORDX2 undef renamable $vgpr0_vgpr1, killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec :: (volatile store (s64) into `ptr addrspace(1) undef`, addrspace 1)
; PEI-GFX90A-NEXT: GLOBAL_STORE_DWORDX4 undef renamable $vgpr0_vgpr1, killed renamable $agpr0_agpr1_agpr2_agpr3, 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1)
diff --git a/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs.ll b/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs.ll
index 56523ea9761cd..2bac756d88ac3 100644
--- a/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs.ll
+++ b/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs.ll
@@ -19,16 +19,16 @@ define amdgpu_kernel void @preload_block_count_x(ptr addrspace(1) inreg %out) #0
;
; GFX90a-LABEL: preload_block_count_x:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dword s8, s[4:5], 0x8
+; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dword s10, s[4:5], 0x8
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB0_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB0_0:
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-NEXT: v_mov_b32_e32 v1, s8
-; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-NEXT: v_mov_b32_e32 v1, s10
+; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%load = load i32, ptr addrspace(4) %imp_arg_ptr
@@ -54,17 +54,16 @@ define amdgpu_kernel void @preload_unused_arg_block_count_x(ptr addrspace(1) inr
;
; GFX90a-LABEL: preload_unused_arg_block_count_x:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8
-; GFX90a-NEXT: s_load_dword s10, s[4:5], 0x10
+; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dword s12, s[4:5], 0x10
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB1_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB1_0:
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-NEXT: v_mov_b32_e32 v1, s10
-; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-NEXT: v_mov_b32_e32 v1, s12
+; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%load = load i32, ptr addrspace(4) %imp_arg_ptr
@@ -90,7 +89,7 @@ define amdgpu_kernel void @no_free_sgprs_block_count_x(ptr addrspace(1) inreg %o
;
; GFX90a-LABEL: no_free_sgprs_block_count_x:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx4 s[12:15], s[8:9], 0x0
+; GFX90a-NEXT: s_load_dwordx2 s[14:15], s[8:9], 0x0
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB2_0
; GFX90a-NEXT: .p2align 8
@@ -100,7 +99,7 @@ define amdgpu_kernel void @no_free_sgprs_block_count_x(ptr addrspace(1) inreg %o
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
-; GFX90a-NEXT: global_store_dword v0, v1, s[12:13]
+; GFX90a-NEXT: global_store_dword v0, v1, s[14:15]
; GFX90a-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%load = load i32, ptr addrspace(4) %imp_arg_ptr
@@ -181,7 +180,7 @@ define amdgpu_kernel void @incorrect_type_i64_block_count_x(ptr addrspace(1) inr
;
; GFX90a-LABEL: incorrect_type_i64_block_count_x:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB5_0
; GFX90a-NEXT: .p2align 8
@@ -191,7 +190,7 @@ define amdgpu_kernel void @incorrect_type_i64_block_count_x(ptr addrspace(1) inr
; GFX90a-NEXT: v_mov_b32_e32 v2, 0
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
-; GFX90a-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
+; GFX90a-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
; GFX90a-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%load = load i64, ptr addrspace(4) %imp_arg_ptr
@@ -217,7 +216,7 @@ define amdgpu_kernel void @incorrect_type_i16_block_count_x(ptr addrspace(1) inr
;
; GFX90a-LABEL: incorrect_type_i16_block_count_x:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB6_0
; GFX90a-NEXT: .p2align 8
@@ -227,7 +226,7 @@ define amdgpu_kernel void @incorrect_type_i16_block_count_x(ptr addrspace(1) inr
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
-; GFX90a-NEXT: global_store_short v0, v1, s[6:7]
+; GFX90a-NEXT: global_store_short v0, v1, s[8:9]
; GFX90a-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%load = load i16, ptr addrspace(4) %imp_arg_ptr
@@ -252,16 +251,15 @@ define amdgpu_kernel void @preload_block_count_y(ptr addrspace(1) inreg %out) #0
;
; GFX90a-LABEL: preload_block_count_y:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8
+; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB7_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB7_0:
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-NEXT: v_mov_b32_e32 v1, s9
-; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-NEXT: v_mov_b32_e32 v1, s11
+; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 4
@@ -289,7 +287,7 @@ define amdgpu_kernel void @random_incorrect_offset(ptr addrspace(1) inreg %out)
;
; GFX90a-LABEL: random_incorrect_offset:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB8_0
; GFX90a-NEXT: .p2align 8
@@ -300,7 +298,7 @@ define amdgpu_kernel void @random_incorrect_offset(ptr addrspace(1) inreg %out)
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
-; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 2
@@ -327,17 +325,16 @@ define amdgpu_kernel void @preload_block_count_z(ptr addrspace(1) inreg %out) #0
;
; GFX90a-LABEL: preload_block_count_z:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8
-; GFX90a-NEXT: s_load_dword s10, s[4:5], 0x10
+; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dword s12, s[4:5], 0x10
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB9_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB9_0:
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-NEXT: v_mov_b32_e32 v1, s10
-; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-NEXT: v_mov_b32_e32 v1, s12
+; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 8
@@ -366,19 +363,18 @@ define amdgpu_kernel void @preload_block_count_x_imparg_align_ptr_i8(ptr addrspa
;
; GFX90a-LABEL: preload_block_count_x_imparg_align_ptr_i8:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8
-; GFX90a-NEXT: s_load_dword s10, s[4:5], 0x10
+; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dword s12, s[4:5], 0x10
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB10_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB10_0:
-; GFX90a-NEXT: s_and_b32 s0, s8, 0xff
-; GFX90a-NEXT: s_add_i32 s0, s10, s0
+; GFX90a-NEXT: s_and_b32 s0, s10, 0xff
+; GFX90a-NEXT: s_add_i32 s0, s12, s0
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
-; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%load = load i32, ptr addrspace(4) %imp_arg_ptr
@@ -408,19 +404,18 @@ define amdgpu_kernel void @preload_block_count_xyz(ptr addrspace(1) inreg %out)
;
; GFX90a-LABEL: preload_block_count_xyz:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8
-; GFX90a-NEXT: s_load_dword s10, s[4:5], 0x10
+; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dword s12, s[4:5], 0x10
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB11_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB11_0:
; GFX90a-NEXT: v_mov_b32_e32 v3, 0
-; GFX90a-NEXT: v_mov_b32_e32 v0, s8
-; GFX90a-NEXT: v_mov_b32_e32 v1, s9
-; GFX90a-NEXT: v_mov_b32_e32 v2, s10
-; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7]
+; GFX90a-NEXT: v_mov_b32_e32 v0, s10
+; GFX90a-NEXT: v_mov_b32_e32 v1, s11
+; GFX90a-NEXT: v_mov_b32_e32 v2, s12
+; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[8:9]
; GFX90a-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%gep_x = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 0
@@ -454,17 +449,17 @@ define amdgpu_kernel void @preload_workgroup_size_x(ptr addrspace(1) inreg %out)
;
; GFX90a-LABEL: preload_workgroup_size_x:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
+; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB12_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB12_0:
-; GFX90a-NEXT: s_and_b32 s0, s11, 0xffff
+; GFX90a-NEXT: s_and_b32 s0, s13, 0xffff
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
-; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 12
@@ -492,17 +487,17 @@ define amdgpu_kernel void @preload_workgroup_size_y(ptr addrspace(1) inreg %out)
;
; GFX90a-LABEL: preload_workgroup_size_y:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
+; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB13_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB13_0:
-; GFX90a-NEXT: s_lshr_b32 s0, s11, 16
+; GFX90a-NEXT: s_lshr_b32 s0, s13, 16
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
-; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 14
@@ -531,18 +526,18 @@ define amdgpu_kernel void @preload_workgroup_size_z(ptr addrspace(1) inreg %out)
;
; GFX90a-LABEL: preload_workgroup_size_z:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
-; GFX90a-NEXT: s_load_dword s12, s[4:5], 0x18
+; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10
+; GFX90a-NEXT: s_load_dword s14, s[4:5], 0x18
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB14_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB14_0:
-; GFX90a-NEXT: s_and_b32 s0, s12, 0xffff
+; GFX90a-NEXT: s_and_b32 s0, s14, 0xffff
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
-; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 16
@@ -575,22 +570,22 @@ define amdgpu_kernel void @preload_workgroup_size_xyz(ptr addrspace(1) inreg %ou
;
; GFX90a-LABEL: preload_workgroup_size_xyz:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
-; GFX90a-NEXT: s_load_dword s12, s[4:5], 0x18
+; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10
+; GFX90a-NEXT: s_load_dword s14, s[4:5], 0x18
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB15_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB15_0:
-; GFX90a-NEXT: s_lshr_b32 s0, s11, 16
-; GFX90a-NEXT: s_and_b32 s1, s11, 0xffff
-; GFX90a-NEXT: s_and_b32 s2, s12, 0xffff
+; GFX90a-NEXT: s_lshr_b32 s0, s13, 16
+; GFX90a-NEXT: s_and_b32 s1, s13, 0xffff
+; GFX90a-NEXT: s_and_b32 s2, s14, 0xffff
; GFX90a-NEXT: v_mov_b32_e32 v3, 0
; GFX90a-NEXT: v_mov_b32_e32 v0, s1
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
; GFX90a-NEXT: v_mov_b32_e32 v2, s2
-; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7]
+; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[8:9]
; GFX90a-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%gep_x = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 12
@@ -628,18 +623,18 @@ define amdgpu_kernel void @preload_remainder_x(ptr addrspace(1) inreg %out) #0 {
;
; GFX90a-LABEL: preload_remainder_x:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
-; GFX90a-NEXT: s_load_dword s12, s[4:5], 0x18
+; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10
+; GFX90a-NEXT: s_load_dword s14, s[4:5], 0x18
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB16_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB16_0:
-; GFX90a-NEXT: s_lshr_b32 s0, s12, 16
+; GFX90a-NEXT: s_lshr_b32 s0, s14, 16
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
-; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 18
@@ -668,18 +663,16 @@ define amdgpu_kernel void @preloadremainder_y(ptr addrspace(1) inreg %out) #0 {
;
; GFX90a-LABEL: preloadremainder_y:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
-; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x18
+; GFX90a-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB17_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB17_0:
-; GFX90a-NEXT: s_and_b32 s0, s13, 0xffff
+; GFX90a-NEXT: s_and_b32 s0, s15, 0xffff
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
-; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 20
@@ -708,18 +701,16 @@ define amdgpu_kernel void @preloadremainder_z(ptr addrspace(1) inreg %out) #0 {
;
; GFX90a-LABEL: preloadremainder_z:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
-; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x18
+; GFX90a-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB18_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB18_0:
-; GFX90a-NEXT: s_lshr_b32 s0, s13, 16
+; GFX90a-NEXT: s_lshr_b32 s0, s15, 16
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
-; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 22
@@ -752,22 +743,20 @@ define amdgpu_kernel void @preloadremainder_xyz(ptr addrspace(1) inreg %out) #0
;
; GFX90a-LABEL: preloadremainder_xyz:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
-; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x18
+; GFX90a-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB19_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB19_0:
-; GFX90a-NEXT: s_lshr_b32 s0, s13, 16
-; GFX90a-NEXT: s_lshr_b32 s1, s12, 16
-; GFX90a-NEXT: s_and_b32 s2, s13, 0xffff
+; GFX90a-NEXT: s_lshr_b32 s0, s15, 16
+; GFX90a-NEXT: s_lshr_b32 s1, s14, 16
+; GFX90a-NEXT: s_and_b32 s2, s15, 0xffff
; GFX90a-NEXT: v_mov_b32_e32 v3, 0
; GFX90a-NEXT: v_mov_b32_e32 v0, s1
; GFX90a-NEXT: v_mov_b32_e32 v1, s2
; GFX90a-NEXT: v_mov_b32_e32 v2, s0
-; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7]
+; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[8:9]
; GFX90a-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%gep_x = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 18
@@ -803,7 +792,7 @@ define amdgpu_kernel void @no_free_sgprs_preloadremainder_z(ptr addrspace(1) inr
;
; GFX90a-LABEL: no_free_sgprs_preloadremainder_z:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[8:9], 0x0
+; GFX90a-NEXT: s_load_dwordx2 s[14:15], s[8:9], 0x0
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB20_0
; GFX90a-NEXT: .p2align 8
@@ -814,7 +803,7 @@ define amdgpu_kernel void @no_free_sgprs_preloadremainder_z(ptr addrspace(1) inr
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_lshr_b32 s0, s0, 16
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
-; GFX90a-NEXT: global_store_dword v0, v1, s[12:13]
+; GFX90a-NEXT: global_store_dword v0, v1, s[14:15]
; GFX90a-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 22
@@ -844,10 +833,7 @@ define amdgpu_kernel void @preload_block_max_user_sgprs(ptr addrspace(1) inreg %
;
; GFX90a-LABEL: preload_block_max_user_sgprs:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
-; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x18
-; GFX90a-NEXT: s_load_dword s14, s[4:5], 0x20
+; GFX90a-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB21_0
; GFX90a-NEXT: .p2align 8
@@ -857,7 +843,7 @@ define amdgpu_kernel void @preload_block_max_user_sgprs(ptr addrspace(1) inreg %
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
-; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%load = load i32, ptr addrspace(4) %imp_arg_ptr
@@ -887,21 +873,23 @@ define amdgpu_kernel void @preload_block_count_z_workgroup_size_z_remainder_z(pt
;
; GFX90a-LABEL: preload_block_count_z_workgroup_size_z_remainder_z:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
-; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x18
+; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10
+; GFX90a-NEXT: s_load_dword s14, s[4:5], 0x18
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB22_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB22_0:
-; GFX90a-NEXT: s_lshr_b32 s0, s13, 16
-; GFX90a-NEXT: s_and_b32 s1, s12, 0xffff
+; GFX90a-NEXT: s_load_dword s0, s[4:5], 0x1c
+; GFX90a-NEXT: s_and_b32 s1, s14, 0xffff
; GFX90a-NEXT: v_mov_b32_e32 v3, 0
-; GFX90a-NEXT: v_mov_b32_e32 v0, s10
+; GFX90a-NEXT: v_mov_b32_e32 v0, s12
; GFX90a-NEXT: v_mov_b32_e32 v1, s1
+; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-NEXT: s_lshr_b32 s0, s0, 16
; GFX90a-NEXT: v_mov_b32_e32 v2, s0
-; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7]
+; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[8:9]
; GFX90a-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%gep0 = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 8
diff --git a/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll b/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll
index 436116f3f72a5..7c6d56bd87d01 100644
--- a/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll
+++ b/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll
@@ -21,17 +21,17 @@ define amdgpu_kernel void @ptr1_i8(ptr addrspace(1) inreg %out, i8 inreg %arg0)
;
; GFX90a-LABEL: ptr1_i8:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dword s8, s[4:5], 0x8
+; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dword s10, s[4:5], 0x8
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB0_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB0_0:
-; GFX90a-NEXT: s_and_b32 s0, s8, 0xff
+; GFX90a-NEXT: s_and_b32 s0, s10, 0xff
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
-; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-NEXT: s_endpgm
%ext = zext i8 %arg0 to i32
store i32 %ext, ptr addrspace(1) %out
@@ -56,17 +56,17 @@ define amdgpu_kernel void @ptr1_i8_zext_arg(ptr addrspace(1) inreg %out, i8 zero
;
; GFX90a-LABEL: ptr1_i8_zext_arg:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dword s8, s[4:5], 0x8
+; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dword s10, s[4:5], 0x8
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB1_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB1_0:
-; GFX90a-NEXT: s_and_b32 s0, s8, 0xff
+; GFX90a-NEXT: s_and_b32 s0, s10, 0xff
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
-; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-NEXT: s_endpgm
%ext = zext i8 %arg0 to i32
store i32 %ext, ptr addrspace(1) %out, align 4
@@ -91,17 +91,17 @@ define amdgpu_kernel void @ptr1_i16_preload_arg(ptr addrspace(1) inreg %out, i16
;
; GFX90a-LABEL: ptr1_i16_preload_arg:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dword s8, s[4:5], 0x8
+; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dword s10, s[4:5], 0x8
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB2_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB2_0:
-; GFX90a-NEXT: s_and_b32 s0, s8, 0xffff
+; GFX90a-NEXT: s_and_b32 s0, s10, 0xffff
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
-; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-NEXT: s_endpgm
%ext = zext i16 %arg0 to i32
store i32 %ext, ptr addrspace(1) %out, align 4
@@ -125,16 +125,16 @@ define amdgpu_kernel void @ptr1_i32_preload_arg(ptr addrspace(1) inreg %out, i32
;
; GFX90a-LABEL: ptr1_i32_preload_arg:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dword s8, s[4:5], 0x8
+; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dword s10, s[4:5], 0x8
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB3_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB3_0:
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-NEXT: v_mov_b32_e32 v1, s8
-; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-NEXT: v_mov_b32_e32 v1, s10
+; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-NEXT: s_endpgm
store i32 %arg0, ptr addrspace(1) %out
ret void
@@ -160,18 +160,17 @@ define amdgpu_kernel void @i32_ptr1_i32_preload_arg(i32 inreg %arg0, ptr addrspa
;
; GFX90a-LABEL: i32_ptr1_i32_preload_arg:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8
-; GFX90a-NEXT: s_load_dword s10, s[4:5], 0x10
+; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dword s12, s[4:5], 0x10
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB4_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB4_0:
-; GFX90a-NEXT: s_add_i32 s0, s6, s10
+; GFX90a-NEXT: s_add_i32 s0, s8, s12
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
-; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
+; GFX90a-NEXT: global_store_dword v0, v1, s[10:11]
; GFX90a-NEXT: s_endpgm
%add = add i32 %arg0, %arg1
store i32 %add, ptr addrspace(1) %out
@@ -198,19 +197,19 @@ define amdgpu_kernel void @ptr1_i16_i16_preload_arg(ptr addrspace(1) inreg %out,
;
; GFX90a-LABEL: ptr1_i16_i16_preload_arg:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dword s8, s[4:5], 0x8
+; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dword s10, s[4:5], 0x8
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB5_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB5_0:
-; GFX90a-NEXT: s_lshr_b32 s0, s8, 16
-; GFX90a-NEXT: s_and_b32 s1, s8, 0xffff
+; GFX90a-NEXT: s_lshr_b32 s0, s10, 16
+; GFX90a-NEXT: s_and_b32 s1, s10, 0xffff
; GFX90a-NEXT: s_add_i32 s0, s1, s0
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
-; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-NEXT: s_endpgm
%ext = zext i16 %arg0 to i32
%ext1 = zext i16 %arg1 to i32
@@ -236,16 +235,16 @@ define amdgpu_kernel void @ptr1_v2i8_preload_arg(ptr addrspace(1) inreg %out, <2
;
; GFX90a-LABEL: ptr1_v2i8_preload_arg:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dword s8, s[4:5], 0x8
+; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dword s10, s[4:5], 0x8
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB6_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB6_0:
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-NEXT: v_mov_b32_e32 v1, s8
-; GFX90a-NEXT: global_store_short v0, v1, s[6:7]
+; GFX90a-NEXT: v_mov_b32_e32 v1, s10
+; GFX90a-NEXT: global_store_short v0, v1, s[8:9]
; GFX90a-NEXT: s_endpgm
store <2 x i8> %in, ptr addrspace(1) %out
ret void
@@ -274,7 +273,7 @@ define amdgpu_kernel void @byref_preload_arg(ptr addrspace(1) inreg %out, ptr ad
;
; GFX90a-LABEL: byref_preload_arg:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB7_0
; GFX90a-NEXT: .p2align 8
@@ -285,9 +284,9 @@ define amdgpu_kernel void @byref_preload_arg(ptr addrspace(1) inreg %out, ptr ad
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
; GFX90a-NEXT: v_mov_b32_e32 v2, s1
-; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-NEXT: s_waitcnt vmcnt(0)
-; GFX90a-NEXT: global_store_dword v0, v2, s[6:7]
+; GFX90a-NEXT: global_store_dword v0, v2, s[8:9]
; GFX90a-NEXT: s_waitcnt vmcnt(0)
; GFX90a-NEXT: s_endpgm
%in = load i32, ptr addrspace(4) %in.byref
@@ -320,7 +319,7 @@ define amdgpu_kernel void @byref_staggered_preload_arg(ptr addrspace(1) inreg %o
;
; GFX90a-LABEL: byref_staggered_preload_arg:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB8_0
; GFX90a-NEXT: .p2align 8
@@ -331,9 +330,9 @@ define amdgpu_kernel void @byref_staggered_preload_arg(ptr addrspace(1) inreg %o
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
; GFX90a-NEXT: v_mov_b32_e32 v2, s1
-; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-NEXT: s_waitcnt vmcnt(0)
-; GFX90a-NEXT: global_store_dword v0, v2, s[6:7]
+; GFX90a-NEXT: global_store_dword v0, v2, s[8:9]
; GFX90a-NEXT: s_waitcnt vmcnt(0)
; GFX90a-NEXT: s_endpgm
%in = load i32, ptr addrspace(4) %in.byref
@@ -370,26 +369,26 @@ define amdgpu_kernel void @v8i32_arg(ptr addrspace(1) nocapture inreg %out, <8 x
;
; GFX90a-LABEL: v8i32_arg:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB9_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB9_0:
-; GFX90a-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20
+; GFX90a-NEXT: s_load_dwordx8 s[12:19], s[4:5], 0x20
; GFX90a-NEXT: v_mov_b32_e32 v4, 0
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-NEXT: v_mov_b32_e32 v0, s16
+; GFX90a-NEXT: v_mov_b32_e32 v1, s17
+; GFX90a-NEXT: v_mov_b32_e32 v2, s18
+; GFX90a-NEXT: v_mov_b32_e32 v3, s19
+; GFX90a-NEXT: global_store_dwordx4 v4, v[0:3], s[8:9] offset:16
+; GFX90a-NEXT: s_nop 0
; GFX90a-NEXT: v_mov_b32_e32 v0, s12
; GFX90a-NEXT: v_mov_b32_e32 v1, s13
; GFX90a-NEXT: v_mov_b32_e32 v2, s14
; GFX90a-NEXT: v_mov_b32_e32 v3, s15
-; GFX90a-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] offset:16
-; GFX90a-NEXT: s_nop 0
-; GFX90a-NEXT: v_mov_b32_e32 v0, s8
-; GFX90a-NEXT: v_mov_b32_e32 v1, s9
-; GFX90a-NEXT: v_mov_b32_e32 v2, s10
-; GFX90a-NEXT: v_mov_b32_e32 v3, s11
-; GFX90a-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
+; GFX90a-NEXT: global_store_dwordx4 v4, v[0:3], s[8:9]
; GFX90a-NEXT: s_endpgm
store <8 x i32> %in, ptr addrspace(1) %out, align 4
ret void
@@ -414,18 +413,17 @@ define amdgpu_kernel void @v3i16_preload_arg(ptr addrspace(1) nocapture inreg %o
;
; GFX90a-LABEL: v3i16_preload_arg:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8
+; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB10_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB10_0:
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-NEXT: v_mov_b32_e32 v1, s9
-; GFX90a-NEXT: global_store_short v0, v1, s[6:7] offset:4
-; GFX90a-NEXT: v_mov_b32_e32 v1, s8
-; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-NEXT: v_mov_b32_e32 v1, s11
+; GFX90a-NEXT: global_store_short v0, v1, s[8:9] offset:4
+; GFX90a-NEXT: v_mov_b32_e32 v1, s10
+; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-NEXT: s_endpgm
store <3 x i16> %in, ptr addrspace(1) %out, align 4
ret void
@@ -451,19 +449,17 @@ define amdgpu_kernel void @v3i32_preload_arg(ptr addrspace(1) nocapture inreg %o
;
; GFX90a-LABEL: v3i32_preload_arg:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
-; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x18
+; GFX90a-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB11_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB11_0:
-; GFX90a-NEXT: v_mov_b32_e32 v0, s10
-; GFX90a-NEXT: v_mov_b32_e32 v1, s11
-; GFX90a-NEXT: v_mov_b32_e32 v2, s12
+; GFX90a-NEXT: v_mov_b32_e32 v0, s12
+; GFX90a-NEXT: v_mov_b32_e32 v1, s13
+; GFX90a-NEXT: v_mov_b32_e32 v2, s14
; GFX90a-NEXT: v_mov_b32_e32 v3, 0
-; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7]
+; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[8:9]
; GFX90a-NEXT: s_endpgm
store <3 x i32> %in, ptr addrspace(1) %out, align 4
ret void
@@ -489,19 +485,17 @@ define amdgpu_kernel void @v3f32_preload_arg(ptr addrspace(1) nocapture inreg %o
;
; GFX90a-LABEL: v3f32_preload_arg:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
-; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x18
+; GFX90a-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB12_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB12_0:
; GFX90a-NEXT: v_mov_b32_e32 v3, 0
-; GFX90a-NEXT: v_mov_b32_e32 v0, s10
-; GFX90a-NEXT: v_mov_b32_e32 v1, s11
-; GFX90a-NEXT: v_mov_b32_e32 v2, s12
-; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7]
+; GFX90a-NEXT: v_mov_b32_e32 v0, s12
+; GFX90a-NEXT: v_mov_b32_e32 v1, s13
+; GFX90a-NEXT: v_mov_b32_e32 v2, s14
+; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[8:9]
; GFX90a-NEXT: s_endpgm
store <3 x float> %in, ptr addrspace(1) %out, align 4
ret void
@@ -533,25 +527,24 @@ define amdgpu_kernel void @v5i8_preload_arg(ptr addrspace(1) nocapture inreg %ou
;
; GFX90a-LABEL: v5i8_preload_arg:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8
+; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB13_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB13_0:
-; GFX90a-NEXT: s_lshr_b32 s1, s8, 24
+; GFX90a-NEXT: s_lshr_b32 s1, s10, 24
; GFX90a-NEXT: s_lshl_b32 s1, s1, 8
-; GFX90a-NEXT: s_bfe_u32 s2, s8, 0x80010
+; GFX90a-NEXT: s_bfe_u32 s2, s10, 0x80010
; GFX90a-NEXT: s_or_b32 s1, s2, s1
-; GFX90a-NEXT: s_and_b32 s0, s8, 0xffff
+; GFX90a-NEXT: s_and_b32 s0, s10, 0xffff
; GFX90a-NEXT: s_lshl_b32 s1, s1, 16
; GFX90a-NEXT: s_or_b32 s0, s0, s1
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-NEXT: v_mov_b32_e32 v1, s9
-; GFX90a-NEXT: global_store_byte v0, v1, s[6:7] offset:4
+; GFX90a-NEXT: v_mov_b32_e32 v1, s11
+; GFX90a-NEXT: global_store_byte v0, v1, s[8:9] offset:4
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
-; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-NEXT: s_endpgm
store <5 x i8> %in, ptr addrspace(1) %out, align 4
ret void
@@ -587,29 +580,29 @@ define amdgpu_kernel void @v5f64_arg(ptr addrspace(1) nocapture inreg %out, <5 x
;
; GFX90a-LABEL: v5f64_arg:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB14_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB14_0:
; GFX90a-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x60
-; GFX90a-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40
+; GFX90a-NEXT: s_load_dwordx8 s[12:19], s[4:5], 0x40
; GFX90a-NEXT: v_mov_b32_e32 v4, 0
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: v_pk_mov_b32 v[2:3], s[0:1], s[0:1] op_sel:[0,1]
+; GFX90a-NEXT: v_mov_b32_e32 v0, s16
+; GFX90a-NEXT: global_store_dwordx2 v4, v[2:3], s[8:9] offset:32
+; GFX90a-NEXT: v_mov_b32_e32 v1, s17
+; GFX90a-NEXT: v_mov_b32_e32 v2, s18
+; GFX90a-NEXT: v_mov_b32_e32 v3, s19
+; GFX90a-NEXT: global_store_dwordx4 v4, v[0:3], s[8:9] offset:16
+; GFX90a-NEXT: s_nop 0
; GFX90a-NEXT: v_mov_b32_e32 v0, s12
-; GFX90a-NEXT: global_store_dwordx2 v4, v[2:3], s[6:7] offset:32
; GFX90a-NEXT: v_mov_b32_e32 v1, s13
; GFX90a-NEXT: v_mov_b32_e32 v2, s14
; GFX90a-NEXT: v_mov_b32_e32 v3, s15
-; GFX90a-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7] offset:16
-; GFX90a-NEXT: s_nop 0
-; GFX90a-NEXT: v_mov_b32_e32 v0, s8
-; GFX90a-NEXT: v_mov_b32_e32 v1, s9
-; GFX90a-NEXT: v_mov_b32_e32 v2, s10
-; GFX90a-NEXT: v_mov_b32_e32 v3, s11
-; GFX90a-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
+; GFX90a-NEXT: global_store_dwordx4 v4, v[0:3], s[8:9]
; GFX90a-NEXT: s_endpgm
store <5 x double> %in, ptr addrspace(1) %out, align 8
ret void
@@ -647,31 +640,30 @@ define amdgpu_kernel void @v8i8_preload_arg(ptr addrspace(1) inreg %out, <8 x i8
;
; GFX90a-LABEL: v8i8_preload_arg:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8
+; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB15_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB15_0:
-; GFX90a-NEXT: s_lshr_b32 s1, s9, 24
+; GFX90a-NEXT: s_lshr_b32 s1, s11, 24
; GFX90a-NEXT: s_lshl_b32 s1, s1, 8
-; GFX90a-NEXT: s_bfe_u32 s2, s9, 0x80010
+; GFX90a-NEXT: s_bfe_u32 s2, s11, 0x80010
; GFX90a-NEXT: s_or_b32 s1, s2, s1
-; GFX90a-NEXT: s_lshr_b32 s2, s8, 24
+; GFX90a-NEXT: s_lshr_b32 s2, s10, 24
; GFX90a-NEXT: s_lshl_b32 s2, s2, 8
-; GFX90a-NEXT: s_bfe_u32 s3, s8, 0x80010
-; GFX90a-NEXT: s_and_b32 s0, s9, 0xffff
+; GFX90a-NEXT: s_bfe_u32 s3, s10, 0x80010
+; GFX90a-NEXT: s_and_b32 s0, s11, 0xffff
; GFX90a-NEXT: s_lshl_b32 s1, s1, 16
; GFX90a-NEXT: s_or_b32 s2, s3, s2
; GFX90a-NEXT: s_or_b32 s0, s0, s1
-; GFX90a-NEXT: s_and_b32 s1, s8, 0xffff
+; GFX90a-NEXT: s_and_b32 s1, s10, 0xffff
; GFX90a-NEXT: s_lshl_b32 s2, s2, 16
; GFX90a-NEXT: s_or_b32 s1, s1, s2
; GFX90a-NEXT: v_mov_b32_e32 v0, s1
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
; GFX90a-NEXT: v_mov_b32_e32 v2, 0
-; GFX90a-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
+; GFX90a-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
; GFX90a-NEXT: s_endpgm
store <8 x i8> %in, ptr addrspace(1) %out
ret void
@@ -694,16 +686,15 @@ define amdgpu_kernel void @i64_kernel_preload_arg(ptr addrspace(1) inreg %out, i
;
; GFX90a-LABEL: i64_kernel_preload_arg:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8
+; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB16_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB16_0:
; GFX90a-NEXT: v_mov_b32_e32 v2, 0
-; GFX90a-NEXT: v_pk_mov_b32 v[0:1], s[8:9], s[8:9] op_sel:[0,1]
-; GFX90a-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
+; GFX90a-NEXT: v_pk_mov_b32 v[0:1], s[10:11], s[10:11] op_sel:[0,1]
+; GFX90a-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
; GFX90a-NEXT: s_endpgm
store i64 %a, ptr addrspace(1) %out, align 8
ret void
@@ -726,16 +717,15 @@ define amdgpu_kernel void @f64_kernel_preload_arg(ptr addrspace(1) inreg %out, d
;
; GFX90a-LABEL: f64_kernel_preload_arg:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8
+; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB17_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB17_0:
; GFX90a-NEXT: v_mov_b32_e32 v2, 0
-; GFX90a-NEXT: v_pk_mov_b32 v[0:1], s[8:9], s[8:9] op_sel:[0,1]
-; GFX90a-NEXT: global_store_dwordx2 v2, v[0:1], s[6:7]
+; GFX90a-NEXT: v_pk_mov_b32 v[0:1], s[10:11], s[10:11] op_sel:[0,1]
+; GFX90a-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9]
; GFX90a-NEXT: s_endpgm
store double %in, ptr addrspace(1) %out
ret void
@@ -758,16 +748,16 @@ define amdgpu_kernel void @half_kernel_preload_arg(ptr addrspace(1) inreg %out,
;
; GFX90a-LABEL: half_kernel_preload_arg:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dword s8, s[4:5], 0x8
+; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dword s10, s[4:5], 0x8
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB18_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB18_0:
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-NEXT: v_mov_b32_e32 v1, s8
-; GFX90a-NEXT: global_store_short v0, v1, s[6:7]
+; GFX90a-NEXT: v_mov_b32_e32 v1, s10
+; GFX90a-NEXT: global_store_short v0, v1, s[8:9]
; GFX90a-NEXT: s_endpgm
store half %in, ptr addrspace(1) %out
ret void
@@ -790,16 +780,16 @@ define amdgpu_kernel void @bfloat_kernel_preload_arg(ptr addrspace(1) inreg %out
;
; GFX90a-LABEL: bfloat_kernel_preload_arg:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dword s8, s[4:5], 0x8
+; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dword s10, s[4:5], 0x8
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB19_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB19_0:
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-NEXT: v_mov_b32_e32 v1, s8
-; GFX90a-NEXT: global_store_short v0, v1, s[6:7]
+; GFX90a-NEXT: v_mov_b32_e32 v1, s10
+; GFX90a-NEXT: global_store_short v0, v1, s[8:9]
; GFX90a-NEXT: s_endpgm
store bfloat %in, ptr addrspace(1) %out
ret void
@@ -822,16 +812,16 @@ define amdgpu_kernel void @v2bfloat_kernel_preload_arg(ptr addrspace(1) inreg %o
;
; GFX90a-LABEL: v2bfloat_kernel_preload_arg:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dword s8, s[4:5], 0x8
+; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dword s10, s[4:5], 0x8
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB20_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB20_0:
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-NEXT: v_mov_b32_e32 v1, s8
-; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-NEXT: v_mov_b32_e32 v1, s10
+; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-NEXT: s_endpgm
store <2 x bfloat> %in, ptr addrspace(1) %out
ret void
@@ -856,18 +846,17 @@ define amdgpu_kernel void @v3bfloat_kernel_preload_arg(ptr addrspace(1) inreg %o
;
; GFX90a-LABEL: v3bfloat_kernel_preload_arg:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8
+; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB21_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB21_0:
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-NEXT: v_mov_b32_e32 v1, s9
-; GFX90a-NEXT: global_store_short v0, v1, s[6:7] offset:4
-; GFX90a-NEXT: v_mov_b32_e32 v1, s8
-; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-NEXT: v_mov_b32_e32 v1, s11
+; GFX90a-NEXT: global_store_short v0, v1, s[8:9] offset:4
+; GFX90a-NEXT: v_mov_b32_e32 v1, s10
+; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-NEXT: s_endpgm
store <3 x bfloat> %in, ptr addrspace(1) %out
ret void
@@ -893,19 +882,17 @@ define amdgpu_kernel void @v6bfloat_kernel_preload_arg(ptr addrspace(1) inreg %o
;
; GFX90a-LABEL: v6bfloat_kernel_preload_arg:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
-; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x18
+; GFX90a-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB22_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB22_0:
-; GFX90a-NEXT: v_mov_b32_e32 v0, s10
-; GFX90a-NEXT: v_mov_b32_e32 v1, s11
-; GFX90a-NEXT: v_mov_b32_e32 v2, s12
+; GFX90a-NEXT: v_mov_b32_e32 v0, s12
+; GFX90a-NEXT: v_mov_b32_e32 v1, s13
+; GFX90a-NEXT: v_mov_b32_e32 v2, s14
; GFX90a-NEXT: v_mov_b32_e32 v3, 0
-; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7]
+; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[8:9]
; GFX90a-NEXT: s_endpgm
store <6 x bfloat> %in, ptr addrspace(1) %out
ret void
@@ -934,24 +921,24 @@ define amdgpu_kernel void @half_v7bfloat_kernel_preload_arg(ptr addrspace(1) inr
;
; GFX90a-LABEL: half_v7bfloat_kernel_preload_arg:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x8
+; GFX90a-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB23_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB23_0:
-; GFX90a-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x20
+; GFX90a-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10
+; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x20
; GFX90a-NEXT: v_mov_b32_e32 v3, 0
-; GFX90a-NEXT: v_mov_b32_e32 v0, s8
-; GFX90a-NEXT: global_store_short v3, v0, s[6:7]
-; GFX90a-NEXT: v_mov_b32_e32 v0, s13
-; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT: global_store_short v3, v0, s[0:1] offset:12
-; GFX90a-NEXT: v_mov_b32_e32 v2, s12
; GFX90a-NEXT: v_mov_b32_e32 v0, s10
-; GFX90a-NEXT: v_mov_b32_e32 v1, s11
-; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1]
+; GFX90a-NEXT: global_store_short v3, v0, s[8:9]
+; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90a-NEXT: v_mov_b32_e32 v0, s3
+; GFX90a-NEXT: global_store_short v3, v0, s[6:7] offset:12
+; GFX90a-NEXT: v_mov_b32_e32 v2, s2
+; GFX90a-NEXT: v_mov_b32_e32 v0, s0
+; GFX90a-NEXT: v_mov_b32_e32 v1, s1
+; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7]
; GFX90a-NEXT: s_endpgm
store half %in, ptr addrspace(1) %out
store <7 x bfloat> %in2, ptr addrspace(1) %out2
@@ -976,17 +963,17 @@ define amdgpu_kernel void @i1_kernel_preload_arg(ptr addrspace(1) inreg %out, i1
;
; GFX90a-LABEL: i1_kernel_preload_arg:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dword s8, s[4:5], 0x8
+; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dword s10, s[4:5], 0x8
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB24_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB24_0:
-; GFX90a-NEXT: s_and_b32 s0, s8, 1
+; GFX90a-NEXT: s_and_b32 s0, s10, 1
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
-; GFX90a-NEXT: global_store_byte v0, v1, s[6:7]
+; GFX90a-NEXT: global_store_byte v0, v1, s[8:9]
; GFX90a-NEXT: s_endpgm
store i1 %in, ptr addrspace(1) %out
ret void
@@ -1013,20 +1000,18 @@ define amdgpu_kernel void @fp128_kernel_preload_arg(ptr addrspace(1) inreg %out,
;
; GFX90a-LABEL: fp128_kernel_preload_arg:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
-; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x18
+; GFX90a-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB25_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB25_0:
; GFX90a-NEXT: v_mov_b32_e32 v4, 0
-; GFX90a-NEXT: v_mov_b32_e32 v0, s10
-; GFX90a-NEXT: v_mov_b32_e32 v1, s11
-; GFX90a-NEXT: v_mov_b32_e32 v2, s12
-; GFX90a-NEXT: v_mov_b32_e32 v3, s13
-; GFX90a-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
+; GFX90a-NEXT: v_mov_b32_e32 v0, s12
+; GFX90a-NEXT: v_mov_b32_e32 v1, s13
+; GFX90a-NEXT: v_mov_b32_e32 v2, s14
+; GFX90a-NEXT: v_mov_b32_e32 v3, s15
+; GFX90a-NEXT: global_store_dwordx4 v4, v[0:3], s[8:9]
; GFX90a-NEXT: s_endpgm
store fp128 %in, ptr addrspace(1) %out
ret void
@@ -1059,26 +1044,25 @@ define amdgpu_kernel void @v7i8_kernel_preload_arg(ptr addrspace(1) inreg %out,
;
; GFX90a-LABEL: v7i8_kernel_preload_arg:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8
+; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB26_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB26_0:
-; GFX90a-NEXT: s_lshr_b32 s1, s8, 24
+; GFX90a-NEXT: s_lshr_b32 s1, s10, 24
; GFX90a-NEXT: s_lshl_b32 s1, s1, 8
-; GFX90a-NEXT: s_bfe_u32 s2, s8, 0x80010
+; GFX90a-NEXT: s_bfe_u32 s2, s10, 0x80010
; GFX90a-NEXT: s_or_b32 s1, s2, s1
-; GFX90a-NEXT: s_and_b32 s0, s8, 0xffff
+; GFX90a-NEXT: s_and_b32 s0, s10, 0xffff
; GFX90a-NEXT: s_lshl_b32 s1, s1, 16
; GFX90a-NEXT: s_or_b32 s0, s0, s1
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-NEXT: v_mov_b32_e32 v1, s9
-; GFX90a-NEXT: global_store_byte_d16_hi v0, v1, s[6:7] offset:6
-; GFX90a-NEXT: global_store_short v0, v1, s[6:7] offset:4
+; GFX90a-NEXT: v_mov_b32_e32 v1, s11
+; GFX90a-NEXT: global_store_byte_d16_hi v0, v1, s[8:9] offset:6
+; GFX90a-NEXT: global_store_short v0, v1, s[8:9] offset:4
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
-; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-NEXT: s_endpgm
store <7 x i8> %in, ptr addrspace(1) %out
ret void
@@ -1106,21 +1090,19 @@ define amdgpu_kernel void @v7half_kernel_preload_arg(ptr addrspace(1) inreg %out
;
; GFX90a-LABEL: v7half_kernel_preload_arg:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
-; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x18
+; GFX90a-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB27_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB27_0:
; GFX90a-NEXT: v_mov_b32_e32 v3, 0
-; GFX90a-NEXT: v_mov_b32_e32 v0, s13
-; GFX90a-NEXT: global_store_short v3, v0, s[6:7] offset:12
-; GFX90a-NEXT: v_mov_b32_e32 v2, s12
-; GFX90a-NEXT: v_mov_b32_e32 v0, s10
-; GFX90a-NEXT: v_mov_b32_e32 v1, s11
-; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[6:7]
+; GFX90a-NEXT: v_mov_b32_e32 v0, s15
+; GFX90a-NEXT: global_store_short v3, v0, s[8:9] offset:12
+; GFX90a-NEXT: v_mov_b32_e32 v2, s14
+; GFX90a-NEXT: v_mov_b32_e32 v0, s12
+; GFX90a-NEXT: v_mov_b32_e32 v1, s13
+; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[8:9]
; GFX90a-NEXT: s_endpgm
store <7 x half> %in, ptr addrspace(1) %out
ret void
@@ -1145,18 +1127,18 @@ define amdgpu_kernel void @i16_i32_kernel_preload_arg(ptr addrspace(1) inreg %ou
;
; GFX90a-LABEL: i16_i32_kernel_preload_arg:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
+; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB28_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB28_0:
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-NEXT: v_mov_b32_e32 v1, s8
-; GFX90a-NEXT: global_store_short v0, v1, s[6:7]
-; GFX90a-NEXT: v_mov_b32_e32 v1, s9
-; GFX90a-NEXT: global_store_dword v0, v1, s[10:11]
+; GFX90a-NEXT: v_mov_b32_e32 v1, s10
+; GFX90a-NEXT: global_store_short v0, v1, s[8:9]
+; GFX90a-NEXT: v_mov_b32_e32 v1, s11
+; GFX90a-NEXT: global_store_dword v0, v1, s[12:13]
; GFX90a-NEXT: s_endpgm
store i16 %in, ptr addrspace(1) %out
store i32 %in2, ptr addrspace(1) %out2
@@ -1184,22 +1166,22 @@ define amdgpu_kernel void @i16_v3i32_kernel_preload_arg(ptr addrspace(1) inreg %
;
; GFX90a-LABEL: i16_v3i32_kernel_preload_arg:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x8
+; GFX90a-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB29_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB29_0:
-; GFX90a-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x20
+; GFX90a-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10
; GFX90a-NEXT: v_mov_b32_e32 v3, 0
-; GFX90a-NEXT: v_mov_b32_e32 v4, s8
-; GFX90a-NEXT: v_mov_b32_e32 v0, s10
-; GFX90a-NEXT: v_mov_b32_e32 v1, s11
-; GFX90a-NEXT: v_mov_b32_e32 v2, s12
-; GFX90a-NEXT: global_store_short v3, v4, s[6:7]
+; GFX90a-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x20
+; GFX90a-NEXT: v_mov_b32_e32 v4, s10
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[0:1]
+; GFX90a-NEXT: v_mov_b32_e32 v0, s0
+; GFX90a-NEXT: v_mov_b32_e32 v1, s1
+; GFX90a-NEXT: v_mov_b32_e32 v2, s2
+; GFX90a-NEXT: global_store_short v3, v4, s[8:9]
+; GFX90a-NEXT: global_store_dwordx3 v3, v[0:2], s[4:5]
; GFX90a-NEXT: s_endpgm
store i16 %in, ptr addrspace(1) %out
store <3 x i32> %in2, ptr addrspace(1) %out2
@@ -1224,17 +1206,17 @@ define amdgpu_kernel void @i16_i16_kernel_preload_arg(ptr addrspace(1) inreg %ou
;
; GFX90a-LABEL: i16_i16_kernel_preload_arg:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
+; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB30_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB30_0:
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-NEXT: v_mov_b32_e32 v1, s8
-; GFX90a-NEXT: global_store_short v0, v1, s[6:7]
-; GFX90a-NEXT: global_store_short_d16_hi v0, v1, s[10:11]
+; GFX90a-NEXT: v_mov_b32_e32 v1, s10
+; GFX90a-NEXT: global_store_short v0, v1, s[8:9]
+; GFX90a-NEXT: global_store_short_d16_hi v0, v1, s[12:13]
; GFX90a-NEXT: s_endpgm
store i16 %in, ptr addrspace(1) %out
store i16 %in2, ptr addrspace(1) %out2
@@ -1264,22 +1246,22 @@ define amdgpu_kernel void @i16_v2i8_kernel_preload_arg(ptr addrspace(1) inreg %o
;
; GFX90a-LABEL: i16_v2i8_kernel_preload_arg:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x8
+; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
+; GFX90a-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x10
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB31_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB31_0:
-; GFX90a-NEXT: s_lshr_b32 s0, s8, 24
+; GFX90a-NEXT: s_lshr_b32 s0, s10, 24
; GFX90a-NEXT: s_lshl_b32 s0, s0, 8
-; GFX90a-NEXT: s_bfe_u32 s1, s8, 0x80010
+; GFX90a-NEXT: s_bfe_u32 s1, s10, 0x80010
; GFX90a-NEXT: s_or_b32 s0, s1, s0
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
-; GFX90a-NEXT: v_mov_b32_e32 v1, s8
-; GFX90a-NEXT: global_store_short v0, v1, s[6:7]
+; GFX90a-NEXT: v_mov_b32_e32 v1, s10
+; GFX90a-NEXT: global_store_short v0, v1, s[8:9]
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
-; GFX90a-NEXT: global_store_short v0, v1, s[10:11]
+; GFX90a-NEXT: global_store_short v0, v1, s[12:13]
; GFX90a-NEXT: s_endpgm
store i16 %in, ptr addrspace(1) %out
store <2 x i8> %in2, ptr addrspace(1) %out2
@@ -1308,7 +1290,7 @@ define amdgpu_kernel void @i32_ptr1_i32_staggered_preload_arg(i32 inreg %arg0, p
;
; GFX90a-LABEL: i32_ptr1_i32_staggered_preload_arg:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dword s6, s[4:5], 0x0
+; GFX90a-NEXT: s_load_dword s8, s[4:5], 0x0
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB32_0
; GFX90a-NEXT: .p2align 8
@@ -1318,7 +1300,7 @@ define amdgpu_kernel void @i32_ptr1_i32_staggered_preload_arg(i32 inreg %arg0, p
; GFX90a-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90a-NEXT: s_add_i32 s2, s6, s2
+; GFX90a-NEXT: s_add_i32 s2, s8, s2
; GFX90a-NEXT: v_mov_b32_e32 v1, s2
; GFX90a-NEXT: global_store_dword v0, v1, s[0:1]
; GFX90a-NEXT: s_endpgm
@@ -1345,17 +1327,16 @@ define amdgpu_kernel void @ptr1_i8_trailing_unused(ptr addrspace(1) inreg %out,
;
; GFX90a-LABEL: ptr1_i8_trailing_unused:
; GFX90a: ; %bb.1:
-; GFX90a-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0
-; GFX90a-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x8
+; GFX90a-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: s_branch .LBB33_0
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB33_0:
-; GFX90a-NEXT: s_and_b32 s0, s8, 0xff
+; GFX90a-NEXT: s_and_b32 s0, s10, 0xff
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
-; GFX90a-NEXT: global_store_dword v0, v1, s[6:7]
+; GFX90a-NEXT: global_store_dword v0, v1, s[8:9]
; GFX90a-NEXT: s_endpgm
%ext = zext i8 %arg0 to i32
store i32 %ext, ptr addrspace(1) %out
diff --git a/llvm/test/CodeGen/AMDGPU/sad.ll b/llvm/test/CodeGen/AMDGPU/sad.ll
index 2b47095c6cf14..2c720399f6c67 100644
--- a/llvm/test/CodeGen/AMDGPU/sad.ll
+++ b/llvm/test/CodeGen/AMDGPU/sad.ll
@@ -6,6 +6,9 @@ define amdgpu_kernel void @v_sad_u32_pat1(ptr addrspace(1) %out, i32 %a, i32 %b,
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x2
; GCN-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GCN-NEXT: s_add_i32 s12, s12, s17
+; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s1
; GCN-NEXT: v_mov_b32_e32 v1, s2
@@ -33,9 +36,12 @@ define amdgpu_kernel void @v_sad_u32_constant_pat1(ptr addrspace(1) %out, i32 %a
; GCN-NEXT: s_load_dword s2, s[8:9], 0x2
; GCN-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
; GCN-NEXT: v_mov_b32_e32 v0, 0x5a
+; GCN-NEXT: s_add_i32 s12, s12, s17
+; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_sad_u32 v2, s2, v0, 20
; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: flat_store_dword v[0:1], v2
; GCN-NEXT: s_endpgm
@@ -57,6 +63,9 @@ define amdgpu_kernel void @v_sad_u32_pat2(ptr addrspace(1) %out, i32 %a, i32 %b,
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x2
; GCN-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GCN-NEXT: s_add_i32 s12, s12, s17
+; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s1
; GCN-NEXT: v_mov_b32_e32 v1, s2
@@ -79,12 +88,14 @@ define amdgpu_kernel void @v_sad_u32_pat2(ptr addrspace(1) %out, i32 %a, i32 %b,
define amdgpu_kernel void @v_sad_u32_multi_use_sub_pat1(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) {
; GCN-LABEL: v_sad_u32_multi_use_sub_pat1:
; GCN: ; %bb.0:
-; GCN-NEXT: s_mov_b64 s[18:19], s[2:3]
-; GCN-NEXT: s_mov_b64 s[16:17], s[0:1]
+; GCN-NEXT: s_mov_b64 s[22:23], s[2:3]
+; GCN-NEXT: s_mov_b64 s[20:21], s[0:1]
; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x2
; GCN-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
-; GCN-NEXT: s_add_u32 s16, s16, s15
-; GCN-NEXT: s_addc_u32 s17, s17, 0
+; GCN-NEXT: s_add_i32 s12, s12, s17
+; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GCN-NEXT: s_add_u32 s20, s20, s17
+; GCN-NEXT: s_addc_u32 s21, s21, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_min_u32 s3, s0, s1
; GCN-NEXT: s_max_u32 s0, s0, s1
@@ -92,8 +103,9 @@ define amdgpu_kernel void @v_sad_u32_multi_use_sub_pat1(ptr addrspace(1) %out, i
; GCN-NEXT: v_mov_b32_e32 v0, s4
; GCN-NEXT: v_mov_b32_e32 v2, s0
; GCN-NEXT: s_add_i32 s0, s0, s2
+; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-NEXT: v_mov_b32_e32 v1, s5
-; GCN-NEXT: buffer_store_dword v2, v0, s[16:19], 0 offen
+; GCN-NEXT: buffer_store_dword v2, v0, s[20:23], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v2, s0
; GCN-NEXT: flat_store_dword v[0:1], v2
@@ -115,19 +127,22 @@ define amdgpu_kernel void @v_sad_u32_multi_use_sub_pat1(ptr addrspace(1) %out, i
define amdgpu_kernel void @v_sad_u32_multi_use_add_pat1(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) {
; GCN-LABEL: v_sad_u32_multi_use_add_pat1:
; GCN: ; %bb.0:
-; GCN-NEXT: s_mov_b64 s[18:19], s[2:3]
-; GCN-NEXT: s_mov_b64 s[16:17], s[0:1]
+; GCN-NEXT: s_mov_b64 s[22:23], s[2:3]
+; GCN-NEXT: s_mov_b64 s[20:21], s[0:1]
; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x2
; GCN-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
-; GCN-NEXT: s_add_u32 s16, s16, s15
-; GCN-NEXT: s_addc_u32 s17, s17, 0
+; GCN-NEXT: s_add_i32 s12, s12, s17
+; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GCN-NEXT: s_add_u32 s20, s20, s17
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v2, s1
; GCN-NEXT: v_mov_b32_e32 v3, s2
; GCN-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-NEXT: s_addc_u32 s21, s21, 0
; GCN-NEXT: v_mov_b32_e32 v1, s5
; GCN-NEXT: v_sad_u32 v2, s0, v2, v3
-; GCN-NEXT: buffer_store_dword v2, v0, s[16:19], 0 offen
+; GCN-NEXT: buffer_store_dword v2, v0, s[20:23], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: flat_store_dword v[0:1], v2
; GCN-NEXT: s_endpgm
@@ -147,21 +162,24 @@ define amdgpu_kernel void @v_sad_u32_multi_use_add_pat1(ptr addrspace(1) %out, i
define amdgpu_kernel void @v_sad_u32_multi_use_max_pat1(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) {
; GCN-LABEL: v_sad_u32_multi_use_max_pat1:
; GCN: ; %bb.0:
-; GCN-NEXT: s_mov_b64 s[18:19], s[2:3]
-; GCN-NEXT: s_mov_b64 s[16:17], s[0:1]
+; GCN-NEXT: s_mov_b64 s[22:23], s[2:3]
+; GCN-NEXT: s_mov_b64 s[20:21], s[0:1]
; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x2
; GCN-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
-; GCN-NEXT: s_add_u32 s16, s16, s15
-; GCN-NEXT: s_addc_u32 s17, s17, 0
+; GCN-NEXT: s_add_i32 s12, s12, s17
+; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GCN-NEXT: s_add_u32 s20, s20, s17
+; GCN-NEXT: s_addc_u32 s21, s21, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_max_u32 s3, s0, s1
; GCN-NEXT: v_mov_b32_e32 v0, s1
; GCN-NEXT: v_mov_b32_e32 v1, s2
; GCN-NEXT: v_mov_b32_e32 v2, s3
; GCN-NEXT: v_sad_u32 v3, s0, v0, v1
-; GCN-NEXT: buffer_store_dword v2, v0, s[16:19], 0 offen
+; GCN-NEXT: buffer_store_dword v2, v0, s[20:23], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-NEXT: v_mov_b32_e32 v1, s5
; GCN-NEXT: flat_store_dword v[0:1], v3
; GCN-NEXT: s_endpgm
@@ -182,21 +200,24 @@ define amdgpu_kernel void @v_sad_u32_multi_use_max_pat1(ptr addrspace(1) %out, i
define amdgpu_kernel void @v_sad_u32_multi_use_min_pat1(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) {
; GCN-LABEL: v_sad_u32_multi_use_min_pat1:
; GCN: ; %bb.0:
-; GCN-NEXT: s_mov_b64 s[18:19], s[2:3]
-; GCN-NEXT: s_mov_b64 s[16:17], s[0:1]
+; GCN-NEXT: s_mov_b64 s[22:23], s[2:3]
+; GCN-NEXT: s_mov_b64 s[20:21], s[0:1]
; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x2
; GCN-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
-; GCN-NEXT: s_add_u32 s16, s16, s15
-; GCN-NEXT: s_addc_u32 s17, s17, 0
+; GCN-NEXT: s_add_i32 s12, s12, s17
+; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GCN-NEXT: s_add_u32 s20, s20, s17
+; GCN-NEXT: s_addc_u32 s21, s21, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_min_u32 s3, s0, s1
; GCN-NEXT: v_mov_b32_e32 v0, s1
; GCN-NEXT: v_mov_b32_e32 v1, s2
; GCN-NEXT: v_mov_b32_e32 v2, s3
; GCN-NEXT: v_sad_u32 v3, s0, v0, v1
-; GCN-NEXT: buffer_store_dword v2, v0, s[16:19], 0 offen
+; GCN-NEXT: buffer_store_dword v2, v0, s[20:23], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-NEXT: v_mov_b32_e32 v1, s5
; GCN-NEXT: flat_store_dword v[0:1], v3
; GCN-NEXT: s_endpgm
@@ -218,21 +239,24 @@ define amdgpu_kernel void @v_sad_u32_multi_use_min_pat1(ptr addrspace(1) %out, i
define amdgpu_kernel void @v_sad_u32_multi_use_sub_pat2(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) {
; GCN-LABEL: v_sad_u32_multi_use_sub_pat2:
; GCN: ; %bb.0:
-; GCN-NEXT: s_mov_b64 s[18:19], s[2:3]
-; GCN-NEXT: s_mov_b64 s[16:17], s[0:1]
+; GCN-NEXT: s_mov_b64 s[22:23], s[2:3]
+; GCN-NEXT: s_mov_b64 s[20:21], s[0:1]
; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x2
; GCN-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
-; GCN-NEXT: s_add_u32 s16, s16, s15
-; GCN-NEXT: s_addc_u32 s17, s17, 0
+; GCN-NEXT: s_add_i32 s12, s12, s17
+; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GCN-NEXT: s_add_u32 s20, s20, s17
+; GCN-NEXT: s_addc_u32 s21, s21, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_sub_i32 s3, s0, s1
; GCN-NEXT: v_mov_b32_e32 v0, s1
; GCN-NEXT: v_mov_b32_e32 v1, s2
; GCN-NEXT: v_mov_b32_e32 v2, s3
; GCN-NEXT: v_sad_u32 v3, s0, v0, v1
-; GCN-NEXT: buffer_store_dword v2, v0, s[16:19], 0 offen
+; GCN-NEXT: buffer_store_dword v2, v0, s[20:23], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s4
+; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-NEXT: v_mov_b32_e32 v1, s5
; GCN-NEXT: flat_store_dword v[0:1], v3
; GCN-NEXT: s_endpgm
@@ -251,12 +275,14 @@ define amdgpu_kernel void @v_sad_u32_multi_use_sub_pat2(ptr addrspace(1) %out, i
define amdgpu_kernel void @v_sad_u32_multi_use_select_pat2(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c) {
; GCN-LABEL: v_sad_u32_multi_use_select_pat2:
; GCN: ; %bb.0:
-; GCN-NEXT: s_mov_b64 s[18:19], s[2:3]
-; GCN-NEXT: s_mov_b64 s[16:17], s[0:1]
+; GCN-NEXT: s_mov_b64 s[22:23], s[2:3]
+; GCN-NEXT: s_mov_b64 s[20:21], s[0:1]
; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x2
; GCN-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
-; GCN-NEXT: s_add_u32 s16, s16, s15
-; GCN-NEXT: s_addc_u32 s17, s17, 0
+; GCN-NEXT: s_add_i32 s12, s12, s17
+; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GCN-NEXT: s_add_u32 s20, s20, s17
+; GCN-NEXT: s_addc_u32 s21, s21, 0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_min_u32 s3, s0, s1
; GCN-NEXT: s_max_u32 s0, s0, s1
@@ -264,8 +290,9 @@ define amdgpu_kernel void @v_sad_u32_multi_use_select_pat2(ptr addrspace(1) %out
; GCN-NEXT: v_mov_b32_e32 v0, s4
; GCN-NEXT: v_mov_b32_e32 v2, s0
; GCN-NEXT: s_add_i32 s0, s0, s2
+; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-NEXT: v_mov_b32_e32 v1, s5
-; GCN-NEXT: buffer_store_dword v2, v0, s[16:19], 0 offen
+; GCN-NEXT: buffer_store_dword v2, v0, s[20:23], 0 offen
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v2, s0
; GCN-NEXT: flat_store_dword v[0:1], v2
@@ -285,6 +312,9 @@ define amdgpu_kernel void @v_sad_u32_multi_use_select_pat2(ptr addrspace(1) %out
define amdgpu_kernel void @v_sad_u32_vector_pat1(ptr addrspace(1) %out, <4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; GCN-LABEL: v_sad_u32_vector_pat1:
; GCN: ; %bb.0:
+; GCN-NEXT: s_add_i32 s12, s12, s17
+; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x4
; GCN-NEXT: s_load_dwordx4 s[12:15], s[8:9], 0xc
; GCN-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0
@@ -321,6 +351,9 @@ define amdgpu_kernel void @v_sad_u32_vector_pat1(ptr addrspace(1) %out, <4 x i32
define amdgpu_kernel void @v_sad_u32_vector_pat2(ptr addrspace(1) %out, <4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
; GCN-LABEL: v_sad_u32_vector_pat2:
; GCN: ; %bb.0:
+; GCN-NEXT: s_add_i32 s12, s12, s17
+; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x4
; GCN-NEXT: s_load_dwordx4 s[12:15], s[8:9], 0xc
; GCN-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0
@@ -358,6 +391,8 @@ define amdgpu_kernel void @v_sad_u32_i16_pat1(ptr addrspace(1) %out, i16 %a, i16
; GCN-NEXT: s_load_dword s4, s[8:9], 0x2
; GCN-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x2
; GCN-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0
+; GCN-NEXT: s_add_i32 s12, s12, s17
+; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_and_b32 s4, s4, 0xffff
; GCN-NEXT: s_lshr_b32 s0, s0, 16
@@ -365,6 +400,7 @@ define amdgpu_kernel void @v_sad_u32_i16_pat1(ptr addrspace(1) %out, i16 %a, i16
; GCN-NEXT: v_mov_b32_e32 v1, s0
; GCN-NEXT: v_sad_u32 v2, s4, v1, v0
; GCN-NEXT: v_mov_b32_e32 v0, s2
+; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-NEXT: v_mov_b32_e32 v1, s3
; GCN-NEXT: flat_store_short v[0:1], v2
; GCN-NEXT: s_endpgm
@@ -384,6 +420,9 @@ define amdgpu_kernel void @v_sad_u32_i16_pat1(ptr addrspace(1) %out, i16 %a, i16
define amdgpu_kernel void @v_sad_u32_i16_pat2(ptr addrspace(1) %out) {
; GCN-LABEL: v_sad_u32_i16_pat2:
; GCN: ; %bb.0:
+; GCN-NEXT: s_add_i32 s12, s12, s17
+; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-NEXT: flat_load_ushort v0, v[0:1] glc
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
@@ -416,6 +455,9 @@ define amdgpu_kernel void @v_sad_u32_i8_pat1(ptr addrspace(1) %out, i8 %a, i8 %b
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dword s2, s[8:9], 0x2
; GCN-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GCN-NEXT: s_add_i32 s12, s12, s17
+; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_and_b32 s3, s2, 0xff
; GCN-NEXT: s_bfe_u32 s4, s2, 0x80008
@@ -443,6 +485,9 @@ define amdgpu_kernel void @v_sad_u32_i8_pat1(ptr addrspace(1) %out, i8 %a, i8 %b
define amdgpu_kernel void @v_sad_u32_i8_pat2(ptr addrspace(1) %out) {
; GCN-LABEL: v_sad_u32_i8_pat2:
; GCN: ; %bb.0:
+; GCN-NEXT: s_add_i32 s12, s12, s17
+; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-NEXT: flat_load_ubyte v0, v[0:1] glc
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
@@ -475,6 +520,9 @@ define amdgpu_kernel void @s_sad_u32_i8_pat2(ptr addrspace(1) %out, i8 zeroext %
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dword s2, s[8:9], 0x2
; GCN-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; GCN-NEXT: s_add_i32 s12, s12, s17
+; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_and_b32 s3, s2, 0xff
; GCN-NEXT: s_bfe_u32 s4, s2, 0x80008
@@ -502,6 +550,9 @@ define amdgpu_kernel void @v_sad_u32_mismatched_operands_pat1(ptr addrspace(1) %
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x2
; GCN-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GCN-NEXT: s_add_i32 s12, s12, s17
+; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_max_u32 s6, s0, s1
; GCN-NEXT: s_cmp_le_u32 s0, s1
@@ -531,6 +582,9 @@ define amdgpu_kernel void @v_sad_u32_mismatched_operands_pat2(ptr addrspace(1) %
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x2
; GCN-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
+; GCN-NEXT: s_add_i32 s12, s12, s17
+; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_sub_i32 s3, s0, s3
; GCN-NEXT: s_sub_i32 s6, s1, s0
diff --git a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.v8i16.ll b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.v8i16.ll
index 884ba3fc34dff..29448ab2d822e 100644
--- a/llvm/test/CodeGen/AMDGPU/scalar_to_vector.v8i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/scalar_to_vector.v8i16.ll
@@ -9,6 +9,8 @@ define amdgpu_kernel void @scalar_to_vector_v8i16(<2 x i32> %in, ptr %out) #0 {
; GFX900: ; %bb.0: ; %entry
; GFX900-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX900-NEXT: v_lshlrev_b32_e32 v4, 4, v0
+; GFX900-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: v_mov_b32_e32 v5, s3
; GFX900-NEXT: v_add_co_u32_e32 v4, vcc, s2, v4
@@ -24,6 +26,8 @@ define amdgpu_kernel void @scalar_to_vector_v8i16(<2 x i32> %in, ptr %out) #0 {
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX906-NEXT: v_lshlrev_b32_e32 v4, 4, v0
+; GFX906-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX906-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
; GFX906-NEXT: v_mov_b32_e32 v5, s3
; GFX906-NEXT: v_add_co_u32_e32 v4, vcc, s2, v4
@@ -39,6 +43,8 @@ define amdgpu_kernel void @scalar_to_vector_v8i16(<2 x i32> %in, ptr %out) #0 {
; GFX908: ; %bb.0: ; %entry
; GFX908-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX908-NEXT: v_lshlrev_b32_e32 v4, 4, v0
+; GFX908-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX908-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v5, s3
; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, s2, v4
@@ -55,6 +61,8 @@ define amdgpu_kernel void @scalar_to_vector_v8i16(<2 x i32> %in, ptr %out) #0 {
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX90A-NEXT: v_and_b32_e32 v4, 0x3ff, v0
; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 4, v4
+; GFX90A-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, s3
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, s2, v4
@@ -88,6 +96,8 @@ define amdgpu_kernel void @scalar_to_vector_v8f16(<2 x float> %in, ptr %out) #0
; GFX900: ; %bb.0: ; %entry
; GFX900-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX900-NEXT: v_lshlrev_b32_e32 v4, 4, v0
+; GFX900-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX900-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX900-NEXT: s_waitcnt lgkmcnt(0)
; GFX900-NEXT: v_mov_b32_e32 v5, s3
; GFX900-NEXT: v_add_co_u32_e32 v4, vcc, s2, v4
@@ -103,6 +113,8 @@ define amdgpu_kernel void @scalar_to_vector_v8f16(<2 x float> %in, ptr %out) #0
; GFX906: ; %bb.0: ; %entry
; GFX906-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX906-NEXT: v_lshlrev_b32_e32 v4, 4, v0
+; GFX906-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX906-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX906-NEXT: s_waitcnt lgkmcnt(0)
; GFX906-NEXT: v_mov_b32_e32 v5, s3
; GFX906-NEXT: v_add_co_u32_e32 v4, vcc, s2, v4
@@ -118,6 +130,8 @@ define amdgpu_kernel void @scalar_to_vector_v8f16(<2 x float> %in, ptr %out) #0
; GFX908: ; %bb.0: ; %entry
; GFX908-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX908-NEXT: v_lshlrev_b32_e32 v4, 4, v0
+; GFX908-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX908-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
; GFX908-NEXT: v_mov_b32_e32 v5, s3
; GFX908-NEXT: v_add_co_u32_e32 v4, vcc, s2, v4
@@ -134,6 +148,8 @@ define amdgpu_kernel void @scalar_to_vector_v8f16(<2 x float> %in, ptr %out) #0
; GFX90A-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; GFX90A-NEXT: v_and_b32_e32 v4, 0x3ff, v0
; GFX90A-NEXT: v_lshlrev_b32_e32 v4, 4, v4
+; GFX90A-NEXT: s_add_u32 flat_scratch_lo, s12, s17
+; GFX90A-NEXT: s_addc_u32 flat_scratch_hi, s13, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v5, s3
; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, s2, v4
diff --git a/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll b/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll
index 0ad10437299f4..90dfd5a21d107 100644
--- a/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll
+++ b/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll
@@ -20,179 +20,183 @@ define amdgpu_kernel void @kernel0(ptr addrspace(1) %out, i32 %in) #1 {
; CHECK-NEXT: ; def s[2:3]
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: ; implicit-def: $vgpr22 : SGPR spill to VGPR lane
-; CHECK-NEXT: ;;#ASMSTART
-; CHECK-NEXT: ; def s[4:7]
-; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: s_load_dword s0, s[8:9], 0x8
; CHECK-NEXT: v_writelane_b32 v22, s2, 0
; CHECK-NEXT: v_writelane_b32 v22, s3, 1
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def s[48:51]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def s[4:11]
+; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: v_writelane_b32 v22, s4, 2
; CHECK-NEXT: v_writelane_b32 v22, s5, 3
; CHECK-NEXT: v_writelane_b32 v22, s6, 4
-; CHECK-NEXT: s_load_dword s0, s[8:9], 0x8
; CHECK-NEXT: v_writelane_b32 v22, s7, 5
-; CHECK-NEXT: ;;#ASMSTART
-; CHECK-NEXT: ; def s[4:11]
-; CHECK-NEXT: ;;#ASMEND
-; CHECK-NEXT: v_writelane_b32 v22, s4, 6
-; CHECK-NEXT: v_writelane_b32 v22, s5, 7
-; CHECK-NEXT: v_writelane_b32 v22, s6, 8
-; CHECK-NEXT: v_writelane_b32 v22, s7, 9
-; CHECK-NEXT: v_writelane_b32 v22, s8, 10
-; CHECK-NEXT: v_writelane_b32 v22, s9, 11
-; CHECK-NEXT: v_writelane_b32 v22, s10, 12
-; CHECK-NEXT: v_writelane_b32 v22, s11, 13
+; CHECK-NEXT: v_writelane_b32 v22, s8, 6
+; CHECK-NEXT: v_writelane_b32 v22, s9, 7
+; CHECK-NEXT: v_writelane_b32 v22, s10, 8
+; CHECK-NEXT: v_writelane_b32 v22, s11, 9
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; def s[4:19]
; CHECK-NEXT: ;;#ASMEND
-; CHECK-NEXT: v_writelane_b32 v22, s4, 14
-; CHECK-NEXT: v_writelane_b32 v22, s5, 15
-; CHECK-NEXT: v_writelane_b32 v22, s6, 16
-; CHECK-NEXT: v_writelane_b32 v22, s7, 17
-; CHECK-NEXT: v_writelane_b32 v22, s8, 18
-; CHECK-NEXT: v_writelane_b32 v22, s9, 19
-; CHECK-NEXT: v_writelane_b32 v22, s10, 20
-; CHECK-NEXT: v_writelane_b32 v22, s11, 21
-; CHECK-NEXT: v_writelane_b32 v22, s12, 22
-; CHECK-NEXT: v_writelane_b32 v22, s13, 23
-; CHECK-NEXT: v_writelane_b32 v22, s14, 24
-; CHECK-NEXT: v_writelane_b32 v22, s15, 25
-; CHECK-NEXT: v_writelane_b32 v22, s16, 26
-; CHECK-NEXT: v_writelane_b32 v22, s17, 27
-; CHECK-NEXT: v_writelane_b32 v22, s18, 28
-; CHECK-NEXT: v_writelane_b32 v22, s19, 29
+; CHECK-NEXT: v_writelane_b32 v22, s4, 10
+; CHECK-NEXT: v_writelane_b32 v22, s5, 11
+; CHECK-NEXT: v_writelane_b32 v22, s6, 12
+; CHECK-NEXT: v_writelane_b32 v22, s7, 13
+; CHECK-NEXT: v_writelane_b32 v22, s8, 14
+; CHECK-NEXT: v_writelane_b32 v22, s9, 15
+; CHECK-NEXT: v_writelane_b32 v22, s10, 16
+; CHECK-NEXT: v_writelane_b32 v22, s11, 17
+; CHECK-NEXT: v_writelane_b32 v22, s12, 18
+; CHECK-NEXT: v_writelane_b32 v22, s13, 19
+; CHECK-NEXT: v_writelane_b32 v22, s14, 20
+; CHECK-NEXT: v_writelane_b32 v22, s15, 21
+; CHECK-NEXT: v_writelane_b32 v22, s16, 22
+; CHECK-NEXT: v_writelane_b32 v22, s17, 23
+; CHECK-NEXT: v_writelane_b32 v22, s18, 24
+; CHECK-NEXT: v_writelane_b32 v22, s19, 25
; CHECK-NEXT: ;;#ASMSTART
-; CHECK-NEXT: ; def s[42:43]
+; CHECK-NEXT: ; def s[38:39]
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: ;;#ASMSTART
-; CHECK-NEXT: ; def s[52:55]
+; CHECK-NEXT: ; def s[44:47]
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; def s[4:11]
; CHECK-NEXT: ;;#ASMEND
-; CHECK-NEXT: v_writelane_b32 v22, s4, 30
-; CHECK-NEXT: v_writelane_b32 v22, s5, 31
-; CHECK-NEXT: v_writelane_b32 v22, s6, 32
-; CHECK-NEXT: v_writelane_b32 v22, s7, 33
-; CHECK-NEXT: v_writelane_b32 v22, s8, 34
-; CHECK-NEXT: v_writelane_b32 v22, s9, 35
-; CHECK-NEXT: v_writelane_b32 v22, s10, 36
-; CHECK-NEXT: v_writelane_b32 v22, s11, 37
+; CHECK-NEXT: v_writelane_b32 v22, s4, 26
+; CHECK-NEXT: v_writelane_b32 v22, s5, 27
+; CHECK-NEXT: v_writelane_b32 v22, s6, 28
+; CHECK-NEXT: v_writelane_b32 v22, s7, 29
+; CHECK-NEXT: v_writelane_b32 v22, s8, 30
+; CHECK-NEXT: v_writelane_b32 v22, s9, 31
+; CHECK-NEXT: v_writelane_b32 v22, s10, 32
+; CHECK-NEXT: v_writelane_b32 v22, s11, 33
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_cmp_lg_u32 s0, 0
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; def s[16:31]
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: ;;#ASMSTART
-; CHECK-NEXT: ; def s[40:41]
+; CHECK-NEXT: ; def s[36:37]
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: ;;#ASMSTART
-; CHECK-NEXT: ; def s[36:39]
+; CHECK-NEXT: ; def s[40:43]
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: ;;#ASMSTART
-; CHECK-NEXT: ; def s[44:51]
+; CHECK-NEXT: ; def s[0:7]
; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: v_writelane_b32 v22, s0, 34
+; CHECK-NEXT: v_writelane_b32 v22, s1, 35
+; CHECK-NEXT: v_writelane_b32 v22, s2, 36
+; CHECK-NEXT: v_writelane_b32 v22, s3, 37
+; CHECK-NEXT: v_writelane_b32 v22, s4, 38
+; CHECK-NEXT: v_writelane_b32 v22, s5, 39
+; CHECK-NEXT: v_writelane_b32 v22, s6, 40
+; CHECK-NEXT: v_writelane_b32 v22, s7, 41
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; def s[0:15]
; CHECK-NEXT: ;;#ASMEND
-; CHECK-NEXT: v_writelane_b32 v22, s0, 38
-; CHECK-NEXT: v_writelane_b32 v22, s1, 39
-; CHECK-NEXT: v_writelane_b32 v22, s2, 40
-; CHECK-NEXT: v_writelane_b32 v22, s3, 41
-; CHECK-NEXT: v_writelane_b32 v22, s4, 42
-; CHECK-NEXT: v_writelane_b32 v22, s5, 43
-; CHECK-NEXT: v_writelane_b32 v22, s6, 44
-; CHECK-NEXT: v_writelane_b32 v22, s7, 45
-; CHECK-NEXT: v_writelane_b32 v22, s8, 46
-; CHECK-NEXT: v_writelane_b32 v22, s9, 47
-; CHECK-NEXT: v_writelane_b32 v22, s10, 48
-; CHECK-NEXT: v_writelane_b32 v22, s11, 49
-; CHECK-NEXT: v_writelane_b32 v22, s12, 50
-; CHECK-NEXT: v_writelane_b32 v22, s13, 51
-; CHECK-NEXT: v_writelane_b32 v22, s14, 52
-; CHECK-NEXT: v_writelane_b32 v22, s15, 53
+; CHECK-NEXT: v_writelane_b32 v22, s0, 42
+; CHECK-NEXT: v_writelane_b32 v22, s1, 43
+; CHECK-NEXT: v_writelane_b32 v22, s2, 44
+; CHECK-NEXT: v_writelane_b32 v22, s3, 45
+; CHECK-NEXT: v_writelane_b32 v22, s4, 46
+; CHECK-NEXT: v_writelane_b32 v22, s5, 47
+; CHECK-NEXT: v_writelane_b32 v22, s6, 48
+; CHECK-NEXT: v_writelane_b32 v22, s7, 49
+; CHECK-NEXT: v_writelane_b32 v22, s8, 50
+; CHECK-NEXT: v_writelane_b32 v22, s9, 51
+; CHECK-NEXT: v_writelane_b32 v22, s10, 52
+; CHECK-NEXT: v_writelane_b32 v22, s11, 53
+; CHECK-NEXT: v_writelane_b32 v22, s12, 54
+; CHECK-NEXT: v_writelane_b32 v22, s13, 55
+; CHECK-NEXT: v_writelane_b32 v22, s14, 56
+; CHECK-NEXT: v_writelane_b32 v22, s15, 57
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; def s[34:35]
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; def s[0:3]
; CHECK-NEXT: ;;#ASMEND
-; CHECK-NEXT: v_writelane_b32 v22, s0, 54
-; CHECK-NEXT: v_writelane_b32 v22, s1, 55
-; CHECK-NEXT: v_writelane_b32 v22, s2, 56
-; CHECK-NEXT: v_writelane_b32 v22, s3, 57
-; CHECK-NEXT: ;;#ASMSTART
-; CHECK-NEXT: ; def s[0:7]
-; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: v_writelane_b32 v22, s0, 58
; CHECK-NEXT: v_writelane_b32 v22, s1, 59
; CHECK-NEXT: v_writelane_b32 v22, s2, 60
-; CHECK-NEXT: ; implicit-def: $vgpr23 : SGPR spill to VGPR lane
; CHECK-NEXT: v_writelane_b32 v22, s3, 61
-; CHECK-NEXT: v_writelane_b32 v22, s4, 62
-; CHECK-NEXT: v_writelane_b32 v23, s6, 0
-; CHECK-NEXT: v_writelane_b32 v22, s5, 63
-; CHECK-NEXT: v_writelane_b32 v23, s7, 1
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def s[0:7]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: ; implicit-def: $vgpr23 : SGPR spill to VGPR lane
+; CHECK-NEXT: v_writelane_b32 v22, s0, 62
+; CHECK-NEXT: v_writelane_b32 v23, s2, 0
+; CHECK-NEXT: v_writelane_b32 v23, s3, 1
+; CHECK-NEXT: v_writelane_b32 v23, s4, 2
+; CHECK-NEXT: v_writelane_b32 v23, s5, 3
+; CHECK-NEXT: v_writelane_b32 v23, s6, 4
+; CHECK-NEXT: v_writelane_b32 v22, s1, 63
+; CHECK-NEXT: v_writelane_b32 v23, s7, 5
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; def s[0:15]
; CHECK-NEXT: ;;#ASMEND
-; CHECK-NEXT: v_writelane_b32 v23, s0, 2
-; CHECK-NEXT: v_writelane_b32 v23, s1, 3
-; CHECK-NEXT: v_writelane_b32 v23, s2, 4
-; CHECK-NEXT: v_writelane_b32 v23, s3, 5
-; CHECK-NEXT: v_writelane_b32 v23, s4, 6
-; CHECK-NEXT: v_writelane_b32 v23, s5, 7
-; CHECK-NEXT: v_writelane_b32 v23, s6, 8
-; CHECK-NEXT: v_writelane_b32 v23, s7, 9
-; CHECK-NEXT: v_writelane_b32 v23, s8, 10
-; CHECK-NEXT: v_writelane_b32 v23, s9, 11
-; CHECK-NEXT: v_writelane_b32 v23, s10, 12
-; CHECK-NEXT: v_writelane_b32 v23, s11, 13
-; CHECK-NEXT: v_writelane_b32 v23, s12, 14
-; CHECK-NEXT: v_writelane_b32 v23, s13, 15
-; CHECK-NEXT: v_writelane_b32 v23, s14, 16
-; CHECK-NEXT: v_writelane_b32 v23, s15, 17
+; CHECK-NEXT: v_writelane_b32 v23, s0, 6
+; CHECK-NEXT: v_writelane_b32 v23, s1, 7
+; CHECK-NEXT: v_writelane_b32 v23, s2, 8
+; CHECK-NEXT: v_writelane_b32 v23, s3, 9
+; CHECK-NEXT: v_writelane_b32 v23, s4, 10
+; CHECK-NEXT: v_writelane_b32 v23, s5, 11
+; CHECK-NEXT: v_writelane_b32 v23, s6, 12
+; CHECK-NEXT: v_writelane_b32 v23, s7, 13
+; CHECK-NEXT: v_writelane_b32 v23, s8, 14
+; CHECK-NEXT: v_writelane_b32 v23, s9, 15
+; CHECK-NEXT: v_writelane_b32 v23, s10, 16
+; CHECK-NEXT: v_writelane_b32 v23, s11, 17
+; CHECK-NEXT: v_writelane_b32 v23, s12, 18
+; CHECK-NEXT: v_writelane_b32 v23, s13, 19
+; CHECK-NEXT: v_writelane_b32 v23, s14, 20
+; CHECK-NEXT: v_writelane_b32 v23, s15, 21
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; def s[0:1]
; CHECK-NEXT: ;;#ASMEND
-; CHECK-NEXT: v_writelane_b32 v23, s0, 18
-; CHECK-NEXT: v_writelane_b32 v23, s1, 19
+; CHECK-NEXT: v_writelane_b32 v23, s0, 22
+; CHECK-NEXT: v_writelane_b32 v23, s1, 23
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; def s[0:3]
; CHECK-NEXT: ;;#ASMEND
-; CHECK-NEXT: v_writelane_b32 v23, s0, 20
-; CHECK-NEXT: v_writelane_b32 v23, s1, 21
-; CHECK-NEXT: v_writelane_b32 v23, s2, 22
-; CHECK-NEXT: v_writelane_b32 v23, s3, 23
-; CHECK-NEXT: ;;#ASMSTART
-; CHECK-NEXT: ; def s[0:7]
-; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: v_writelane_b32 v23, s0, 24
; CHECK-NEXT: v_writelane_b32 v23, s1, 25
; CHECK-NEXT: v_writelane_b32 v23, s2, 26
; CHECK-NEXT: v_writelane_b32 v23, s3, 27
-; CHECK-NEXT: v_writelane_b32 v23, s4, 28
-; CHECK-NEXT: v_writelane_b32 v23, s5, 29
-; CHECK-NEXT: v_writelane_b32 v23, s6, 30
-; CHECK-NEXT: v_writelane_b32 v23, s7, 31
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; def s[0:7]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: v_writelane_b32 v23, s0, 28
+; CHECK-NEXT: v_writelane_b32 v23, s1, 29
+; CHECK-NEXT: v_writelane_b32 v23, s2, 30
+; CHECK-NEXT: v_writelane_b32 v23, s3, 31
+; CHECK-NEXT: v_writelane_b32 v23, s4, 32
+; CHECK-NEXT: v_writelane_b32 v23, s5, 33
+; CHECK-NEXT: v_writelane_b32 v23, s6, 34
+; CHECK-NEXT: v_writelane_b32 v23, s7, 35
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; def s[0:15]
; CHECK-NEXT: ;;#ASMEND
-; CHECK-NEXT: v_writelane_b32 v23, s0, 32
-; CHECK-NEXT: v_writelane_b32 v23, s1, 33
-; CHECK-NEXT: v_writelane_b32 v23, s2, 34
-; CHECK-NEXT: v_writelane_b32 v23, s3, 35
-; CHECK-NEXT: v_writelane_b32 v23, s4, 36
-; CHECK-NEXT: v_writelane_b32 v23, s5, 37
-; CHECK-NEXT: v_writelane_b32 v23, s6, 38
-; CHECK-NEXT: v_writelane_b32 v23, s7, 39
-; CHECK-NEXT: v_writelane_b32 v23, s8, 40
-; CHECK-NEXT: v_writelane_b32 v23, s9, 41
-; CHECK-NEXT: v_writelane_b32 v23, s10, 42
-; CHECK-NEXT: v_writelane_b32 v23, s11, 43
-; CHECK-NEXT: v_writelane_b32 v23, s12, 44
-; CHECK-NEXT: v_writelane_b32 v23, s13, 45
-; CHECK-NEXT: v_writelane_b32 v23, s14, 46
-; CHECK-NEXT: v_writelane_b32 v23, s15, 47
+; CHECK-NEXT: v_writelane_b32 v23, s0, 36
+; CHECK-NEXT: v_writelane_b32 v23, s1, 37
+; CHECK-NEXT: v_writelane_b32 v23, s2, 38
+; CHECK-NEXT: v_writelane_b32 v23, s3, 39
+; CHECK-NEXT: v_writelane_b32 v23, s4, 40
+; CHECK-NEXT: v_writelane_b32 v23, s5, 41
+; CHECK-NEXT: v_writelane_b32 v23, s6, 42
+; CHECK-NEXT: v_writelane_b32 v23, s7, 43
+; CHECK-NEXT: v_writelane_b32 v23, s8, 44
+; CHECK-NEXT: v_writelane_b32 v23, s9, 45
+; CHECK-NEXT: v_writelane_b32 v23, s10, 46
+; CHECK-NEXT: v_writelane_b32 v23, s11, 47
+; CHECK-NEXT: v_writelane_b32 v23, s12, 48
+; CHECK-NEXT: v_writelane_b32 v23, s13, 49
+; CHECK-NEXT: v_writelane_b32 v23, s14, 50
+; CHECK-NEXT: v_writelane_b32 v23, s15, 51
; CHECK-NEXT: s_cbranch_scc0 .LBB0_2
; CHECK-NEXT: ; %bb.1: ; %ret
; CHECK-NEXT: s_endpgm
@@ -206,166 +210,170 @@ define amdgpu_kernel void @kernel0(ptr addrspace(1) %out, i32 %in) #1 {
; CHECK-NEXT: v_readlane_b32 s1, v22, 3
; CHECK-NEXT: v_readlane_b32 s2, v22, 4
; CHECK-NEXT: v_readlane_b32 s3, v22, 5
+; CHECK-NEXT: v_readlane_b32 s4, v22, 6
+; CHECK-NEXT: v_readlane_b32 s5, v22, 7
+; CHECK-NEXT: v_readlane_b32 s6, v22, 8
+; CHECK-NEXT: v_readlane_b32 s7, v22, 9
; CHECK-NEXT: ;;#ASMSTART
-; CHECK-NEXT: ; use s[0:3]
+; CHECK-NEXT: ; use s[48:51]
; CHECK-NEXT: ;;#ASMEND
-; CHECK-NEXT: v_readlane_b32 s0, v22, 6
-; CHECK-NEXT: v_readlane_b32 s1, v22, 7
-; CHECK-NEXT: v_readlane_b32 s2, v22, 8
-; CHECK-NEXT: v_readlane_b32 s3, v22, 9
-; CHECK-NEXT: v_readlane_b32 s4, v22, 10
-; CHECK-NEXT: v_readlane_b32 s5, v22, 11
-; CHECK-NEXT: v_readlane_b32 s6, v22, 12
-; CHECK-NEXT: v_readlane_b32 s7, v22, 13
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use s[0:7]
; CHECK-NEXT: ;;#ASMEND
-; CHECK-NEXT: v_readlane_b32 s0, v22, 14
-; CHECK-NEXT: v_readlane_b32 s1, v22, 15
-; CHECK-NEXT: v_readlane_b32 s2, v22, 16
-; CHECK-NEXT: v_readlane_b32 s3, v22, 17
-; CHECK-NEXT: v_readlane_b32 s4, v22, 18
-; CHECK-NEXT: v_readlane_b32 s5, v22, 19
-; CHECK-NEXT: v_readlane_b32 s6, v22, 20
-; CHECK-NEXT: v_readlane_b32 s7, v22, 21
-; CHECK-NEXT: v_readlane_b32 s8, v22, 22
-; CHECK-NEXT: v_readlane_b32 s9, v22, 23
-; CHECK-NEXT: v_readlane_b32 s10, v22, 24
-; CHECK-NEXT: v_readlane_b32 s11, v22, 25
-; CHECK-NEXT: v_readlane_b32 s12, v22, 26
-; CHECK-NEXT: v_readlane_b32 s13, v22, 27
-; CHECK-NEXT: v_readlane_b32 s14, v22, 28
-; CHECK-NEXT: v_readlane_b32 s15, v22, 29
+; CHECK-NEXT: v_readlane_b32 s0, v22, 10
+; CHECK-NEXT: v_readlane_b32 s1, v22, 11
+; CHECK-NEXT: v_readlane_b32 s2, v22, 12
+; CHECK-NEXT: v_readlane_b32 s3, v22, 13
+; CHECK-NEXT: v_readlane_b32 s4, v22, 14
+; CHECK-NEXT: v_readlane_b32 s5, v22, 15
+; CHECK-NEXT: v_readlane_b32 s6, v22, 16
+; CHECK-NEXT: v_readlane_b32 s7, v22, 17
+; CHECK-NEXT: v_readlane_b32 s8, v22, 18
+; CHECK-NEXT: v_readlane_b32 s9, v22, 19
+; CHECK-NEXT: v_readlane_b32 s10, v22, 20
+; CHECK-NEXT: v_readlane_b32 s11, v22, 21
+; CHECK-NEXT: v_readlane_b32 s12, v22, 22
+; CHECK-NEXT: v_readlane_b32 s13, v22, 23
+; CHECK-NEXT: v_readlane_b32 s14, v22, 24
+; CHECK-NEXT: v_readlane_b32 s15, v22, 25
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use s[0:15]
; CHECK-NEXT: ;;#ASMEND
-; CHECK-NEXT: v_readlane_b32 s0, v22, 30
-; CHECK-NEXT: v_readlane_b32 s1, v22, 31
-; CHECK-NEXT: v_readlane_b32 s2, v22, 32
-; CHECK-NEXT: v_readlane_b32 s3, v22, 33
-; CHECK-NEXT: v_readlane_b32 s4, v22, 34
-; CHECK-NEXT: v_readlane_b32 s5, v22, 35
-; CHECK-NEXT: v_readlane_b32 s6, v22, 36
-; CHECK-NEXT: v_readlane_b32 s7, v22, 37
+; CHECK-NEXT: v_readlane_b32 s0, v22, 26
+; CHECK-NEXT: v_readlane_b32 s1, v22, 27
+; CHECK-NEXT: v_readlane_b32 s2, v22, 28
+; CHECK-NEXT: v_readlane_b32 s3, v22, 29
+; CHECK-NEXT: v_readlane_b32 s4, v22, 30
+; CHECK-NEXT: v_readlane_b32 s5, v22, 31
+; CHECK-NEXT: v_readlane_b32 s6, v22, 32
+; CHECK-NEXT: v_readlane_b32 s7, v22, 33
; CHECK-NEXT: ;;#ASMSTART
-; CHECK-NEXT: ; use s[42:43]
+; CHECK-NEXT: ; use s[38:39]
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: ;;#ASMSTART
-; CHECK-NEXT: ; use s[52:55]
+; CHECK-NEXT: ; use s[44:47]
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use s[0:7]
; CHECK-NEXT: ;;#ASMEND
-; CHECK-NEXT: v_readlane_b32 s0, v22, 38
-; CHECK-NEXT: v_readlane_b32 s1, v22, 39
-; CHECK-NEXT: v_readlane_b32 s2, v22, 40
-; CHECK-NEXT: v_readlane_b32 s3, v22, 41
+; CHECK-NEXT: v_readlane_b32 s0, v22, 34
+; CHECK-NEXT: v_readlane_b32 s1, v22, 35
+; CHECK-NEXT: v_readlane_b32 s2, v22, 36
+; CHECK-NEXT: v_readlane_b32 s3, v22, 37
+; CHECK-NEXT: v_readlane_b32 s4, v22, 38
+; CHECK-NEXT: v_readlane_b32 s5, v22, 39
+; CHECK-NEXT: v_readlane_b32 s6, v22, 40
+; CHECK-NEXT: v_readlane_b32 s7, v22, 41
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use s[16:31]
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: ;;#ASMSTART
-; CHECK-NEXT: ; use s[40:41]
+; CHECK-NEXT: ; use s[36:37]
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: ;;#ASMSTART
-; CHECK-NEXT: ; use s[36:39]
+; CHECK-NEXT: ; use s[40:43]
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: ;;#ASMSTART
-; CHECK-NEXT: ; use s[44:51]
+; CHECK-NEXT: ; use s[0:7]
; CHECK-NEXT: ;;#ASMEND
-; CHECK-NEXT: v_readlane_b32 s4, v22, 42
-; CHECK-NEXT: v_readlane_b32 s5, v22, 43
-; CHECK-NEXT: v_readlane_b32 s6, v22, 44
-; CHECK-NEXT: v_readlane_b32 s7, v22, 45
-; CHECK-NEXT: v_readlane_b32 s8, v22, 46
-; CHECK-NEXT: v_readlane_b32 s9, v22, 47
-; CHECK-NEXT: v_readlane_b32 s10, v22, 48
-; CHECK-NEXT: v_readlane_b32 s11, v22, 49
-; CHECK-NEXT: v_readlane_b32 s12, v22, 50
-; CHECK-NEXT: v_readlane_b32 s13, v22, 51
-; CHECK-NEXT: v_readlane_b32 s14, v22, 52
-; CHECK-NEXT: v_readlane_b32 s15, v22, 53
+; CHECK-NEXT: v_readlane_b32 s0, v22, 42
+; CHECK-NEXT: v_readlane_b32 s1, v22, 43
+; CHECK-NEXT: v_readlane_b32 s2, v22, 44
+; CHECK-NEXT: v_readlane_b32 s3, v22, 45
+; CHECK-NEXT: v_readlane_b32 s4, v22, 46
+; CHECK-NEXT: v_readlane_b32 s5, v22, 47
+; CHECK-NEXT: v_readlane_b32 s6, v22, 48
+; CHECK-NEXT: v_readlane_b32 s7, v22, 49
+; CHECK-NEXT: v_readlane_b32 s8, v22, 50
+; CHECK-NEXT: v_readlane_b32 s9, v22, 51
+; CHECK-NEXT: v_readlane_b32 s10, v22, 52
+; CHECK-NEXT: v_readlane_b32 s11, v22, 53
+; CHECK-NEXT: v_readlane_b32 s12, v22, 54
+; CHECK-NEXT: v_readlane_b32 s13, v22, 55
+; CHECK-NEXT: v_readlane_b32 s14, v22, 56
+; CHECK-NEXT: v_readlane_b32 s15, v22, 57
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use s[0:15]
; CHECK-NEXT: ;;#ASMEND
-; CHECK-NEXT: v_readlane_b32 s0, v22, 54
-; CHECK-NEXT: v_readlane_b32 s1, v22, 55
-; CHECK-NEXT: v_readlane_b32 s2, v22, 56
-; CHECK-NEXT: v_readlane_b32 s3, v22, 57
+; CHECK-NEXT: v_readlane_b32 s0, v22, 58
+; CHECK-NEXT: v_readlane_b32 s1, v22, 59
+; CHECK-NEXT: v_readlane_b32 s2, v22, 60
+; CHECK-NEXT: v_readlane_b32 s3, v22, 61
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use s[34:35]
; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use s[0:3]
; CHECK-NEXT: ;;#ASMEND
-; CHECK-NEXT: v_readlane_b32 s0, v22, 58
-; CHECK-NEXT: v_readlane_b32 s1, v22, 59
-; CHECK-NEXT: v_readlane_b32 s2, v22, 60
-; CHECK-NEXT: v_readlane_b32 s3, v22, 61
-; CHECK-NEXT: v_readlane_b32 s4, v22, 62
-; CHECK-NEXT: v_readlane_b32 s5, v22, 63
-; CHECK-NEXT: v_readlane_b32 s6, v23, 0
-; CHECK-NEXT: v_readlane_b32 s7, v23, 1
+; CHECK-NEXT: v_readlane_b32 s0, v22, 62
+; CHECK-NEXT: v_readlane_b32 s1, v22, 63
+; CHECK-NEXT: v_readlane_b32 s2, v23, 0
+; CHECK-NEXT: v_readlane_b32 s3, v23, 1
+; CHECK-NEXT: v_readlane_b32 s4, v23, 2
+; CHECK-NEXT: v_readlane_b32 s5, v23, 3
+; CHECK-NEXT: v_readlane_b32 s6, v23, 4
+; CHECK-NEXT: v_readlane_b32 s7, v23, 5
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use s[0:7]
; CHECK-NEXT: ;;#ASMEND
-; CHECK-NEXT: v_readlane_b32 s0, v23, 2
-; CHECK-NEXT: v_readlane_b32 s1, v23, 3
-; CHECK-NEXT: v_readlane_b32 s2, v23, 4
-; CHECK-NEXT: v_readlane_b32 s3, v23, 5
-; CHECK-NEXT: v_readlane_b32 s4, v23, 6
-; CHECK-NEXT: v_readlane_b32 s5, v23, 7
-; CHECK-NEXT: v_readlane_b32 s6, v23, 8
-; CHECK-NEXT: v_readlane_b32 s7, v23, 9
-; CHECK-NEXT: v_readlane_b32 s8, v23, 10
-; CHECK-NEXT: v_readlane_b32 s9, v23, 11
-; CHECK-NEXT: v_readlane_b32 s10, v23, 12
-; CHECK-NEXT: v_readlane_b32 s11, v23, 13
-; CHECK-NEXT: v_readlane_b32 s12, v23, 14
-; CHECK-NEXT: v_readlane_b32 s13, v23, 15
-; CHECK-NEXT: v_readlane_b32 s14, v23, 16
-; CHECK-NEXT: v_readlane_b32 s15, v23, 17
+; CHECK-NEXT: v_readlane_b32 s0, v23, 6
+; CHECK-NEXT: v_readlane_b32 s1, v23, 7
+; CHECK-NEXT: v_readlane_b32 s2, v23, 8
+; CHECK-NEXT: v_readlane_b32 s3, v23, 9
+; CHECK-NEXT: v_readlane_b32 s4, v23, 10
+; CHECK-NEXT: v_readlane_b32 s5, v23, 11
+; CHECK-NEXT: v_readlane_b32 s6, v23, 12
+; CHECK-NEXT: v_readlane_b32 s7, v23, 13
+; CHECK-NEXT: v_readlane_b32 s8, v23, 14
+; CHECK-NEXT: v_readlane_b32 s9, v23, 15
+; CHECK-NEXT: v_readlane_b32 s10, v23, 16
+; CHECK-NEXT: v_readlane_b32 s11, v23, 17
+; CHECK-NEXT: v_readlane_b32 s12, v23, 18
+; CHECK-NEXT: v_readlane_b32 s13, v23, 19
+; CHECK-NEXT: v_readlane_b32 s14, v23, 20
+; CHECK-NEXT: v_readlane_b32 s15, v23, 21
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use s[0:15]
; CHECK-NEXT: ;;#ASMEND
-; CHECK-NEXT: v_readlane_b32 s0, v23, 18
-; CHECK-NEXT: v_readlane_b32 s1, v23, 19
+; CHECK-NEXT: v_readlane_b32 s0, v23, 22
+; CHECK-NEXT: v_readlane_b32 s1, v23, 23
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use s[0:1]
; CHECK-NEXT: ;;#ASMEND
-; CHECK-NEXT: v_readlane_b32 s0, v23, 20
-; CHECK-NEXT: v_readlane_b32 s1, v23, 21
-; CHECK-NEXT: v_readlane_b32 s2, v23, 22
-; CHECK-NEXT: v_readlane_b32 s3, v23, 23
-; CHECK-NEXT: ;;#ASMSTART
-; CHECK-NEXT: ; use s[0:3]
-; CHECK-NEXT: ;;#ASMEND
; CHECK-NEXT: v_readlane_b32 s0, v23, 24
; CHECK-NEXT: v_readlane_b32 s1, v23, 25
; CHECK-NEXT: v_readlane_b32 s2, v23, 26
; CHECK-NEXT: v_readlane_b32 s3, v23, 27
-; CHECK-NEXT: v_readlane_b32 s4, v23, 28
-; CHECK-NEXT: v_readlane_b32 s5, v23, 29
-; CHECK-NEXT: v_readlane_b32 s6, v23, 30
-; CHECK-NEXT: v_readlane_b32 s7, v23, 31
+; CHECK-NEXT: ;;#ASMSTART
+; CHECK-NEXT: ; use s[0:3]
+; CHECK-NEXT: ;;#ASMEND
+; CHECK-NEXT: v_readlane_b32 s0, v23, 28
+; CHECK-NEXT: v_readlane_b32 s1, v23, 29
+; CHECK-NEXT: v_readlane_b32 s2, v23, 30
+; CHECK-NEXT: v_readlane_b32 s3, v23, 31
+; CHECK-NEXT: v_readlane_b32 s4, v23, 32
+; CHECK-NEXT: v_readlane_b32 s5, v23, 33
+; CHECK-NEXT: v_readlane_b32 s6, v23, 34
+; CHECK-NEXT: v_readlane_b32 s7, v23, 35
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use s[0:7]
; CHECK-NEXT: ;;#ASMEND
-; CHECK-NEXT: v_readlane_b32 s0, v23, 32
-; CHECK-NEXT: v_readlane_b32 s1, v23, 33
-; CHECK-NEXT: v_readlane_b32 s2, v23, 34
-; CHECK-NEXT: v_readlane_b32 s3, v23, 35
-; CHECK-NEXT: v_readlane_b32 s4, v23, 36
-; CHECK-NEXT: v_readlane_b32 s5, v23, 37
-; CHECK-NEXT: v_readlane_b32 s6, v23, 38
-; CHECK-NEXT: v_readlane_b32 s7, v23, 39
-; CHECK-NEXT: v_readlane_b32 s8, v23, 40
-; CHECK-NEXT: v_readlane_b32 s9, v23, 41
-; CHECK-NEXT: v_readlane_b32 s10, v23, 42
-; CHECK-NEXT: v_readlane_b32 s11, v23, 43
-; CHECK-NEXT: v_readlane_b32 s12, v23, 44
-; CHECK-NEXT: v_readlane_b32 s13, v23, 45
-; CHECK-NEXT: v_readlane_b32 s14, v23, 46
-; CHECK-NEXT: v_readlane_b32 s15, v23, 47
+; CHECK-NEXT: v_readlane_b32 s0, v23, 36
+; CHECK-NEXT: v_readlane_b32 s1, v23, 37
+; CHECK-NEXT: v_readlane_b32 s2, v23, 38
+; CHECK-NEXT: v_readlane_b32 s3, v23, 39
+; CHECK-NEXT: v_readlane_b32 s4, v23, 40
+; CHECK-NEXT: v_readlane_b32 s5, v23, 41
+; CHECK-NEXT: v_readlane_b32 s6, v23, 42
+; CHECK-NEXT: v_readlane_b32 s7, v23, 43
+; CHECK-NEXT: v_readlane_b32 s8, v23, 44
+; CHECK-NEXT: v_readlane_b32 s9, v23, 45
+; CHECK-NEXT: v_readlane_b32 s10, v23, 46
+; CHECK-NEXT: v_readlane_b32 s11, v23, 47
+; CHECK-NEXT: v_readlane_b32 s12, v23, 48
+; CHECK-NEXT: v_readlane_b32 s13, v23, 49
+; CHECK-NEXT: v_readlane_b32 s14, v23, 50
+; CHECK-NEXT: v_readlane_b32 s15, v23, 51
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; use s[0:15]
; CHECK-NEXT: ;;#ASMEND
diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll b/llvm/test/CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll
index 455d22f2aa29c..cdfba3cf0db7f 100644
--- a/llvm/test/CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll
+++ b/llvm/test/CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll
@@ -7,7 +7,7 @@
define amdgpu_kernel void @partial_no_vgprs_last_sgpr_spill(ptr addrspace(1) %out, i32 %in) #1 {
; GCN-LABEL: partial_no_vgprs_last_sgpr_spill:
; GCN: ; %bb.0:
-; GCN-NEXT: s_add_u32 s0, s0, s15
+; GCN-NEXT: s_add_u32 s0, s0, s17
; GCN-NEXT: s_addc_u32 s1, s1, 0
; GCN-NEXT: s_load_dword s4, s[8:9], 0x2
; GCN-NEXT: ;;#ASMSTART
diff --git a/llvm/test/CodeGen/AMDGPU/shift-i128.ll b/llvm/test/CodeGen/AMDGPU/shift-i128.ll
index a423b6f831a9d..65a17ed67481c 100644
--- a/llvm/test/CodeGen/AMDGPU/shift-i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/shift-i128.ll
@@ -182,8 +182,10 @@ define amdgpu_kernel void @s_shl_i128_ss(i128 %lhs, i128 %rhs) {
; GCN-LABEL: s_shl_i128_ss:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x0
+; GCN-NEXT: s_add_i32 s12, s12, s17
+; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-NEXT: v_mov_b32_e32 v4, 0
-; GCN-NEXT: v_mov_b32_e32 v5, 0
+; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_sub_i32 s5, s4, 64
; GCN-NEXT: s_sub_i32 s12, 64, s4
@@ -203,6 +205,7 @@ define amdgpu_kernel void @s_shl_i128_ss(i128 %lhs, i128 %rhs) {
; GCN-NEXT: v_mov_b32_e32 v0, s6
; GCN-NEXT: v_mov_b32_e32 v1, s5
; GCN-NEXT: v_mov_b32_e32 v2, s0
+; GCN-NEXT: v_mov_b32_e32 v5, 0
; GCN-NEXT: v_mov_b32_e32 v3, s1
; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NEXT: s_endpgm
@@ -215,8 +218,10 @@ define amdgpu_kernel void @s_lshr_i128_ss(i128 %lhs, i128 %rhs) {
; GCN-LABEL: s_lshr_i128_ss:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x0
+; GCN-NEXT: s_add_i32 s12, s12, s17
+; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-NEXT: v_mov_b32_e32 v4, 0
-; GCN-NEXT: v_mov_b32_e32 v5, 0
+; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_sub_i32 s5, s4, 64
; GCN-NEXT: s_sub_i32 s12, 64, s4
@@ -236,6 +241,7 @@ define amdgpu_kernel void @s_lshr_i128_ss(i128 %lhs, i128 %rhs) {
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: v_mov_b32_e32 v2, s6
+; GCN-NEXT: v_mov_b32_e32 v5, 0
; GCN-NEXT: v_mov_b32_e32 v3, s5
; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NEXT: s_endpgm
@@ -248,8 +254,10 @@ define amdgpu_kernel void @s_ashr_i128_ss(i128 %lhs, i128 %rhs) {
; GCN-LABEL: s_ashr_i128_ss:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x0
+; GCN-NEXT: s_add_i32 s12, s12, s17
+; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-NEXT: v_mov_b32_e32 v4, 0
-; GCN-NEXT: v_mov_b32_e32 v5, 0
+; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_sub_i32 s5, 64, s4
; GCN-NEXT: s_lshr_b64 s[6:7], s[0:1], s4
@@ -270,6 +278,7 @@ define amdgpu_kernel void @s_ashr_i128_ss(i128 %lhs, i128 %rhs) {
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v1, s1
; GCN-NEXT: v_mov_b32_e32 v2, s2
+; GCN-NEXT: v_mov_b32_e32 v5, 0
; GCN-NEXT: v_mov_b32_e32 v3, s3
; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3]
; GCN-NEXT: s_endpgm
@@ -430,6 +439,9 @@ define <2 x i128> @v_ashr_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
define amdgpu_kernel void @s_shl_v2i128ss(<2 x i128> %lhs, <2 x i128> %rhs) {
; GCN-LABEL: s_shl_v2i128ss:
; GCN: ; %bb.0:
+; GCN-NEXT: s_add_i32 s12, s12, s17
+; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-NEXT: s_load_dwordx16 s[0:15], s[8:9], 0x0
; GCN-NEXT: v_mov_b32_e32 v6, 16
; GCN-NEXT: v_mov_b32_e32 v4, 0
@@ -502,6 +514,9 @@ define amdgpu_kernel void @s_shl_v2i128ss(<2 x i128> %lhs, <2 x i128> %rhs) {
define amdgpu_kernel void @s_lshr_v2i128_ss(<2 x i128> %lhs, <2 x i128> %rhs) {
; GCN-LABEL: s_lshr_v2i128_ss:
; GCN: ; %bb.0:
+; GCN-NEXT: s_add_i32 s12, s12, s17
+; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-NEXT: s_load_dwordx16 s[0:15], s[8:9], 0x0
; GCN-NEXT: v_mov_b32_e32 v6, 16
; GCN-NEXT: v_mov_b32_e32 v4, 0
@@ -574,6 +589,9 @@ define amdgpu_kernel void @s_lshr_v2i128_ss(<2 x i128> %lhs, <2 x i128> %rhs) {
define amdgpu_kernel void @s_ashr_v2i128_ss(<2 x i128> %lhs, <2 x i128> %rhs) {
; GCN-LABEL: s_ashr_v2i128_ss:
; GCN: ; %bb.0:
+; GCN-NEXT: s_add_i32 s12, s12, s17
+; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-NEXT: s_load_dwordx16 s[0:15], s[8:9], 0x0
; GCN-NEXT: v_mov_b32_e32 v6, 16
; GCN-NEXT: v_mov_b32_e32 v4, 0
diff --git a/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll b/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll
index 59d7fe107ee53..b1605f19b2ed5 100644
--- a/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll
@@ -1,5 +1,4 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-globals
-; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -amdgpu-annotate-kernel-features %s | FileCheck -check-prefix=AKF_GCN %s
; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-attributor %s | FileCheck -check-prefix=ATTRIBUTOR_GCN %s
; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
@@ -7,9 +6,6 @@
target datalayout = "A5"
define internal void @indirect() {
-; AKF_GCN-LABEL: define {{[^@]+}}@indirect() {
-; AKF_GCN-NEXT: ret void
-;
; ATTRIBUTOR_GCN-LABEL: define {{[^@]+}}@indirect
; ATTRIBUTOR_GCN-SAME: () #[[ATTR0:[0-9]+]] {
; ATTRIBUTOR_GCN-NEXT: ret void
@@ -22,15 +18,6 @@ define internal void @indirect() {
}
define amdgpu_kernel void @test_simple_indirect_call() {
-; AKF_GCN-LABEL: define {{[^@]+}}@test_simple_indirect_call
-; AKF_GCN-SAME: () #[[ATTR0:[0-9]+]] {
-; AKF_GCN-NEXT: [[FPTR:%.*]] = alloca ptr, align 8, addrspace(5)
-; AKF_GCN-NEXT: [[FPTR_CAST:%.*]] = addrspacecast ptr addrspace(5) [[FPTR]] to ptr
-; AKF_GCN-NEXT: store ptr @indirect, ptr [[FPTR_CAST]], align 8
-; AKF_GCN-NEXT: [[FP:%.*]] = load ptr, ptr [[FPTR_CAST]], align 8
-; AKF_GCN-NEXT: call void [[FP]]()
-; AKF_GCN-NEXT: ret void
-;
; ATTRIBUTOR_GCN-LABEL: define {{[^@]+}}@test_simple_indirect_call
; ATTRIBUTOR_GCN-SAME: () #[[ATTR1:[0-9]+]] {
; ATTRIBUTOR_GCN-NEXT: [[FPTR:%.*]] = alloca ptr, align 8, addrspace(5)
@@ -79,12 +66,10 @@ define amdgpu_kernel void @test_simple_indirect_call() {
!llvm.module.flags = !{!0}
!0 = !{i32 1, !"amdhsa_code_object_version", i32 500}
;.
-; AKF_GCN: attributes #[[ATTR0]] = { "amdgpu-calls" "amdgpu-stack-objects" }
;.
; ATTRIBUTOR_GCN: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-flat-scratch-init" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
; ATTRIBUTOR_GCN: attributes #[[ATTR1]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
;.
-; AKF_GCN: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 500}
;.
; ATTRIBUTOR_GCN: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 500}
;.
diff --git a/llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll b/llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll
index 6b40df0345ebe..46f257eff1f24 100644
--- a/llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll
@@ -9,6 +9,9 @@ define amdgpu_kernel void @sint_to_fp_i32_to_f64(ptr addrspace(1) %out, i32 %in)
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s2, s[8:9], 0x2
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_cvt_f64_i32_e32 v[0:1], s2
; CI-NEXT: v_mov_b32_e32 v3, s1
@@ -20,6 +23,9 @@ define amdgpu_kernel void @sint_to_fp_i32_to_f64(ptr addrspace(1) %out, i32 %in)
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_cvt_f64_i32_e32 v[0:1], s2
; VI-NEXT: v_mov_b32_e32 v3, s1
@@ -38,11 +44,14 @@ define amdgpu_kernel void @sint_to_fp_i1_f64(ptr addrspace(1) %out, i32 %in) {
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s2, s[8:9], 0x2
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; CI-NEXT: v_mov_b32_e32 v0, 0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_cmp_eq_u32 s2, 0
; CI-NEXT: s_cselect_b32 s2, 0xbff00000, 0
; CI-NEXT: v_mov_b32_e32 v3, s1
+; CI-NEXT: v_mov_b32_e32 v0, 0
; CI-NEXT: v_mov_b32_e32 v1, s2
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
@@ -52,11 +61,14 @@ define amdgpu_kernel void @sint_to_fp_i1_f64(ptr addrspace(1) %out, i32 %in) {
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; VI-NEXT: v_mov_b32_e32 v0, 0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_cmp_eq_u32 s2, 0
; VI-NEXT: s_cselect_b32 s2, 0xbff00000, 0
; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: v_mov_b32_e32 v1, s2
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
@@ -72,6 +84,9 @@ define amdgpu_kernel void @sint_to_fp_i1_f64_load(ptr addrspace(1) %out, i1 %in)
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s2, s[8:9], 0x2
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_bitcmp1_b32 s2, 0
; CI-NEXT: s_cselect_b64 s[2:3], -1, 0
@@ -86,6 +101,9 @@ define amdgpu_kernel void @sint_to_fp_i1_f64_load(ptr addrspace(1) %out, i1 %in)
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_bitcmp1_b32 s2, 0
; VI-NEXT: s_cselect_b64 s[2:3], -1, 0
@@ -104,6 +122,9 @@ define amdgpu_kernel void @s_sint_to_fp_i64_to_f64(ptr addrspace(1) %out, i64 %i
; CI-LABEL: s_sint_to_fp_i64_to_f64:
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_cvt_f64_i32_e32 v[0:1], s3
; CI-NEXT: v_cvt_f64_u32_e32 v[2:3], s2
@@ -117,6 +138,9 @@ define amdgpu_kernel void @s_sint_to_fp_i64_to_f64(ptr addrspace(1) %out, i64 %i
; VI-LABEL: s_sint_to_fp_i64_to_f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_cvt_f64_i32_e32 v[0:1], s3
; VI-NEXT: v_cvt_f64_u32_e32 v[2:3], s2
@@ -136,6 +160,9 @@ define amdgpu_kernel void @v_sint_to_fp_i64_to_f64(ptr addrspace(1) %out, ptr ad
; CI: ; %bb.0:
; CI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; CI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_mov_b32_e32 v1, s3
; CI-NEXT: v_add_i32_e32 v0, vcc, s2, v0
@@ -155,6 +182,9 @@ define amdgpu_kernel void @v_sint_to_fp_i64_to_f64(ptr addrspace(1) %out, ptr ad
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
@@ -183,6 +213,9 @@ define amdgpu_kernel void @s_sint_to_fp_i8_to_f64(ptr addrspace(1) %out, i8 %in)
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s2, s[8:9], 0x2
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_sext_i32_i8 s2, s2
; CI-NEXT: v_cvt_f64_i32_e32 v[0:1], s2
@@ -195,6 +228,9 @@ define amdgpu_kernel void @s_sint_to_fp_i8_to_f64(ptr addrspace(1) %out, i8 %in)
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_bfe_i32 s2, s2, 0x80000
; VI-NEXT: s_sext_i32_i16 s2, s2
@@ -232,11 +268,14 @@ define amdgpu_kernel void @s_select_sint_to_fp_i1_vals_f64(ptr addrspace(1) %out
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s2, s[8:9], 0x2
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; CI-NEXT: v_mov_b32_e32 v0, 0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_cmp_eq_u32 s2, 0
; CI-NEXT: s_cselect_b32 s2, 0xbff00000, 0
; CI-NEXT: v_mov_b32_e32 v3, s1
+; CI-NEXT: v_mov_b32_e32 v0, 0
; CI-NEXT: v_mov_b32_e32 v1, s2
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
@@ -246,11 +285,14 @@ define amdgpu_kernel void @s_select_sint_to_fp_i1_vals_f64(ptr addrspace(1) %out
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; VI-NEXT: v_mov_b32_e32 v0, 0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_cmp_eq_u32 s2, 0
; VI-NEXT: s_cselect_b32 s2, 0xbff00000, 0
; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: v_mov_b32_e32 v1, s2
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
@@ -283,11 +325,14 @@ define amdgpu_kernel void @s_select_sint_to_fp_i1_vals_i64(ptr addrspace(1) %out
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s2, s[8:9], 0x2
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; CI-NEXT: v_mov_b32_e32 v0, 0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_cmp_eq_u32 s2, 0
; CI-NEXT: s_cselect_b32 s2, 0xbff00000, 0
; CI-NEXT: v_mov_b32_e32 v3, s1
+; CI-NEXT: v_mov_b32_e32 v0, 0
; CI-NEXT: v_mov_b32_e32 v1, s2
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
@@ -297,11 +342,14 @@ define amdgpu_kernel void @s_select_sint_to_fp_i1_vals_i64(ptr addrspace(1) %out
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; VI-NEXT: v_mov_b32_e32 v0, 0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_cmp_eq_u32 s2, 0
; VI-NEXT: s_cselect_b32 s2, 0xbff00000, 0
; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: v_mov_b32_e32 v1, s2
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
@@ -353,11 +401,14 @@ define amdgpu_kernel void @s_swap_select_sint_to_fp_i1_vals_f64(ptr addrspace(1)
; CI: ; %bb.0:
; CI-NEXT: s_load_dword s2, s[8:9], 0x2
; CI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; CI-NEXT: v_mov_b32_e32 v0, 0
+; CI-NEXT: s_add_i32 s12, s12, s17
+; CI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; CI-NEXT: s_mov_b32 flat_scratch_lo, s13
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_cmp_eq_u32 s2, 0
; CI-NEXT: s_cselect_b32 s2, 0, 0xbff00000
; CI-NEXT: v_mov_b32_e32 v3, s1
+; CI-NEXT: v_mov_b32_e32 v0, 0
; CI-NEXT: v_mov_b32_e32 v1, s2
; CI-NEXT: v_mov_b32_e32 v2, s0
; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
@@ -367,11 +418,14 @@ define amdgpu_kernel void @s_swap_select_sint_to_fp_i1_vals_f64(ptr addrspace(1)
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; VI-NEXT: v_mov_b32_e32 v0, 0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_cmp_eq_u32 s2, 0
; VI-NEXT: s_cselect_b32 s2, 0, 0xbff00000
; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: v_mov_b32_e32 v1, s2
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll b/llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll
index 03e8e28ef54db..c1c3d29ecbdbe 100644
--- a/llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll
+++ b/llvm/test/CodeGen/AMDGPU/spill-vector-superclass.ll
@@ -12,10 +12,10 @@ define amdgpu_kernel void @test_spill_av_class(<4 x i32> %arg) #0 {
; GCN-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec
; GCN-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 2, implicit $exec
; GCN-NEXT: [[V_MFMA_I32_4X4X4I8_e64_:%[0-9]+]]:areg_128 = V_MFMA_I32_4X4X4I8_e64 [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], [[COPY]], 0, 0, 0, implicit $mode, implicit $exec
- ; GCN-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 2228234 /* regdef:VGPR_32 */, def undef %13.sub0
+ ; GCN-NEXT: INLINEASM &"; def $0", 1 /* sideeffect attdialect */, 2228234 /* regdef:VGPR_32 */, def undef %14.sub0
; GCN-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY [[V_MFMA_I32_4X4X4I8_e64_]]
- ; GCN-NEXT: GLOBAL_STORE_DWORDX4 undef %23:vreg_64, [[COPY1]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1)
- ; GCN-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3538953 /* reguse:VReg_64 */, %13
+ ; GCN-NEXT: GLOBAL_STORE_DWORDX4 undef %24:vreg_64, [[COPY1]], 0, 0, implicit $exec :: (volatile store (s128) into `ptr addrspace(1) undef`, addrspace 1)
+ ; GCN-NEXT: INLINEASM &"; use $0", 1 /* sideeffect attdialect */, 3538953 /* reguse:VReg_64 */, %14
; GCN-NEXT: S_ENDPGM 0
%v0 = call i32 asm sideeffect "; def $0", "=v"()
%tmp = insertelement <2 x i32> undef, i32 %v0, i32 0
diff --git a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll
index f791135d45e9a..ef92cf3214e7f 100644
--- a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll
+++ b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll
@@ -50,7 +50,10 @@ define void @local_store_i56(ptr addrspace(3) %ptr, i56 %arg) #0 {
define amdgpu_kernel void @local_store_i55(ptr addrspace(3) %ptr, i55 %arg) #0 {
; HAWAII-LABEL: local_store_i55:
; HAWAII: ; %bb.0:
+; HAWAII-NEXT: s_add_i32 s12, s12, s17
; HAWAII-NEXT: s_or_b32 s0, s8, 14
+; HAWAII-NEXT: s_mov_b32 flat_scratch_lo, s13
+; HAWAII-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; HAWAII-NEXT: v_mov_b32_e32 v0, s0
; HAWAII-NEXT: v_mov_b32_e32 v1, s9
; HAWAII-NEXT: flat_load_ubyte v0, v[0:1]
@@ -70,7 +73,10 @@ define amdgpu_kernel void @local_store_i55(ptr addrspace(3) %ptr, i55 %arg) #0 {
;
; FIJI-LABEL: local_store_i55:
; FIJI: ; %bb.0:
+; FIJI-NEXT: s_add_i32 s12, s12, s17
; FIJI-NEXT: s_or_b32 s0, s8, 14
+; FIJI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; FIJI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; FIJI-NEXT: v_mov_b32_e32 v0, s0
; FIJI-NEXT: v_mov_b32_e32 v1, s9
; FIJI-NEXT: flat_load_ubyte v0, v[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-any.ll b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-any.ll
index 19d633651fdd0..30accc846d2b6 100644
--- a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-any.ll
+++ b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-any.ll
@@ -14,7 +14,7 @@ define amdgpu_kernel void @kern() #0 {
; OBJ-NEXT: 0000 00000000 00000000 00000000 00000000 ................
; OBJ-NEXT: 0010 00000000 00000000 00000000 00000000 ................
; OBJ-NEXT: 0020 00000000 00000000 00000000 00000000 ................
-; OBJ-NEXT: 0030 4000af00 88000000 01000000 00000000 @...............
+; OBJ-NEXT: 0030 4000af00 8c000000 21000000 00000000 @.......!.......
; ELF: AMDGPU Metadata
; ELF: .sgpr_count: 9
diff --git a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-off.ll b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-off.ll
index 2097579e0c995..4f84b31f1877b 100644
--- a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-off.ll
+++ b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-off.ll
@@ -14,7 +14,7 @@ define amdgpu_kernel void @kern() #0 {
; OBJ-NEXT: 0000 00000000 00000000 00000000 00000000 ................
; OBJ-NEXT: 0010 00000000 00000000 00000000 00000000 ................
; OBJ-NEXT: 0020 00000000 00000000 00000000 00000000 ................
-; OBJ-NEXT: 0030 0000af00 88000000 01000000 00000000 ................
+; OBJ-NEXT: 0030 0000af00 8c000000 21000000 00000000 ........!.......
; ELF: AMDGPU Metadata
; ELF: .sgpr_count: 5
diff --git a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-on.ll b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-on.ll
index 775c62e73261a..644f434923368 100644
--- a/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-on.ll
+++ b/llvm/test/CodeGen/AMDGPU/tid-kd-xnack-on.ll
@@ -14,7 +14,7 @@ define amdgpu_kernel void @kern() #0 {
; OBJ-NEXT: 0000 00000000 00000000 00000000 00000000 ................
; OBJ-NEXT: 0010 00000000 00000000 00000000 00000000 ................
; OBJ-NEXT: 0020 00000000 00000000 00000000 00000000 ................
-; OBJ-NEXT: 0030 4000af00 88000000 01000000 00000000 @...............
+; OBJ-NEXT: 0030 4000af00 8c000000 21000000 00000000 @.......!.......
; ELF: AMDGPU Metadata
; ELF: .sgpr_count: 9
diff --git a/llvm/test/CodeGen/AMDGPU/trap-abis.ll b/llvm/test/CodeGen/AMDGPU/trap-abis.ll
index b8f0d7617167e..69cc63eba6243 100644
--- a/llvm/test/CodeGen/AMDGPU/trap-abis.ll
+++ b/llvm/test/CodeGen/AMDGPU/trap-abis.ll
@@ -23,11 +23,14 @@ define amdgpu_kernel void @trap(ptr addrspace(1) nocapture readonly %arg0) {
; HSA-TRAP-GFX803-LABEL: trap:
; HSA-TRAP-GFX803: ; %bb.0:
; HSA-TRAP-GFX803-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0
+; HSA-TRAP-GFX803-NEXT: s_add_i32 s12, s12, s17
+; HSA-TRAP-GFX803-NEXT: s_mov_b32 flat_scratch_lo, s13
+; HSA-TRAP-GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v2, 1
-; HSA-TRAP-GFX803-NEXT: s_mov_b64 s[0:1], s[6:7]
; HSA-TRAP-GFX803-NEXT: s_waitcnt lgkmcnt(0)
; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v0, s2
; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v1, s3
+; HSA-TRAP-GFX803-NEXT: s_mov_b64 s[0:1], s[6:7]
; HSA-TRAP-GFX803-NEXT: flat_store_dword v[0:1], v2
; HSA-TRAP-GFX803-NEXT: s_waitcnt vmcnt(0)
; HSA-TRAP-GFX803-NEXT: s_trap 2
@@ -121,6 +124,9 @@ define amdgpu_kernel void @non_entry_trap(ptr addrspace(1) nocapture readonly %a
; HSA-TRAP-GFX803-LABEL: non_entry_trap:
; HSA-TRAP-GFX803: ; %bb.0: ; %entry
; HSA-TRAP-GFX803-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; HSA-TRAP-GFX803-NEXT: s_add_i32 s12, s12, s17
+; HSA-TRAP-GFX803-NEXT: s_mov_b32 flat_scratch_lo, s13
+; HSA-TRAP-GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; HSA-TRAP-GFX803-NEXT: s_waitcnt lgkmcnt(0)
; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v0, s0
; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v1, s1
@@ -280,6 +286,9 @@ define amdgpu_kernel void @trap_with_use_after(ptr addrspace(1) %arg0, ptr addrs
; HSA-TRAP-GFX803: ; %bb.0:
; HSA-TRAP-GFX803-NEXT: s_mov_b64 s[0:1], s[6:7]
; HSA-TRAP-GFX803-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x0
+; HSA-TRAP-GFX803-NEXT: s_add_i32 s12, s12, s17
+; HSA-TRAP-GFX803-NEXT: s_mov_b32 flat_scratch_lo, s13
+; HSA-TRAP-GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; HSA-TRAP-GFX803-NEXT: s_waitcnt lgkmcnt(0)
; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v0, s4
; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v1, s5
@@ -411,10 +420,13 @@ define amdgpu_kernel void @debugtrap(ptr addrspace(1) nocapture readonly %arg0)
; HSA-TRAP-GFX803-LABEL: debugtrap:
; HSA-TRAP-GFX803: ; %bb.0:
; HSA-TRAP-GFX803-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; HSA-TRAP-GFX803-NEXT: s_add_i32 s12, s12, s17
+; HSA-TRAP-GFX803-NEXT: s_mov_b32 flat_scratch_lo, s13
+; HSA-TRAP-GFX803-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v2, 1
-; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v3, 2
; HSA-TRAP-GFX803-NEXT: s_waitcnt lgkmcnt(0)
; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v0, s0
+; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v3, 2
; HSA-TRAP-GFX803-NEXT: v_mov_b32_e32 v1, s1
; HSA-TRAP-GFX803-NEXT: flat_store_dword v[0:1], v2
; HSA-TRAP-GFX803-NEXT: s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/udiv.ll b/llvm/test/CodeGen/AMDGPU/udiv.ll
index 95fcc45fe2458..a9a77abf1b0d1 100644
--- a/llvm/test/CodeGen/AMDGPU/udiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/udiv.ll
@@ -81,6 +81,9 @@ define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, ptr addrspace(1) %in)
; GCN-LABEL: udiv_i32:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-NEXT: s_add_i32 s12, s12, s17
+; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: v_mov_b32_e32 v1, s3
@@ -252,6 +255,9 @@ define amdgpu_kernel void @s_udiv_i32(ptr addrspace(1) %out, i32 %a, i32 %b) {
; GCN-LABEL: s_udiv_i32:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-NEXT: s_add_i32 s12, s12, s17
+; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_cvt_f32_u32_e32 v0, s3
; GCN-NEXT: s_sub_i32 s4, 0, s3
@@ -457,6 +463,9 @@ define amdgpu_kernel void @udiv_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-LABEL: udiv_v2i32:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-NEXT: s_add_i32 s12, s12, s17
+; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: v_mov_b32_e32 v1, s3
@@ -810,6 +819,9 @@ define amdgpu_kernel void @udiv_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-LABEL: udiv_v4i32:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-NEXT: s_add_i32 s12, s12, s17
+; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_add_u32 s4, s2, 16
; GCN-NEXT: s_addc_u32 s5, s3, 0
@@ -1135,6 +1147,9 @@ define amdgpu_kernel void @udiv_i32_div_pow2(ptr addrspace(1) %out, ptr addrspac
; GCN-LABEL: udiv_i32_div_pow2:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-NEXT: s_add_i32 s12, s12, s17
+; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: v_mov_b32_e32 v1, s3
@@ -1224,6 +1239,9 @@ define amdgpu_kernel void @udiv_i32_div_k_even(ptr addrspace(1) %out, ptr addrsp
; GCN-LABEL: udiv_i32_div_k_even:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-NEXT: s_add_i32 s12, s12, s17
+; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: v_mov_b32_e32 v1, s3
@@ -1318,6 +1336,9 @@ define amdgpu_kernel void @udiv_i32_div_k_odd(ptr addrspace(1) %out, ptr addrspa
; GCN-LABEL: udiv_i32_div_k_odd:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-NEXT: s_add_i32 s12, s12, s17
+; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: v_mov_b32_e32 v1, s3
@@ -1430,6 +1451,9 @@ define amdgpu_kernel void @v_udiv_i8(ptr addrspace(1) %out, ptr addrspace(1) %in
; GCN-LABEL: v_udiv_i8:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-NEXT: s_add_i32 s12, s12, s17
+; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: v_mov_b32_e32 v1, s3
@@ -1570,6 +1594,9 @@ define amdgpu_kernel void @v_udiv_i16(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-LABEL: v_udiv_i16:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-NEXT: s_add_i32 s12, s12, s17
+; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s2
; GCN-NEXT: v_mov_b32_e32 v1, s3
@@ -1726,6 +1753,9 @@ define amdgpu_kernel void @v_udiv_i23(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-LABEL: v_udiv_i23:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-NEXT: s_add_i32 s12, s12, s17
+; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_add_u32 s4, s2, 4
; GCN-NEXT: s_addc_u32 s5, s3, 0
@@ -1923,6 +1953,9 @@ define amdgpu_kernel void @v_udiv_i24(ptr addrspace(1) %out, ptr addrspace(1) %i
; GCN-LABEL: v_udiv_i24:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-NEXT: s_add_i32 s12, s12, s17
+; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_add_u32 s4, s2, 4
; GCN-NEXT: s_addc_u32 s5, s3, 0
@@ -2105,6 +2138,9 @@ define amdgpu_kernel void @scalarize_mulhu_4xi32(ptr addrspace(1) nocapture read
; GCN-LABEL: scalarize_mulhu_4xi32:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-NEXT: s_add_i32 s12, s12, s17
+; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s0
; GCN-NEXT: v_mov_b32_e32 v1, s1
@@ -2218,6 +2254,9 @@ define amdgpu_kernel void @test_udiv2(i32 %p) {
; GCN-LABEL: test_udiv2:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dword s0, s[8:9], 0x0
+; GCN-NEXT: s_add_i32 s12, s12, s17
+; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_lshr_b32 s0, s0, 1
; GCN-NEXT: v_mov_b32_e32 v0, s0
@@ -2281,6 +2320,9 @@ define amdgpu_kernel void @test_udiv_3_mulhu(i32 %p) {
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dword s0, s[8:9], 0x0
; GCN-NEXT: v_mov_b32_e32 v0, 0xaaaaaaab
+; GCN-NEXT: s_add_i32 s12, s12, s17
+; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mul_hi_u32 v0, s0, v0
; GCN-NEXT: v_lshrrev_b32_e32 v0, 1, v0
@@ -2369,6 +2411,9 @@ define amdgpu_kernel void @fdiv_test_denormals(ptr addrspace(1) nocapture readon
;
; GCN-LABEL: fdiv_test_denormals:
; GCN: ; %bb.0: ; %bb
+; GCN-NEXT: s_add_i32 s12, s12, s17
+; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-NEXT: flat_load_sbyte v2, v[0:1]
; GCN-NEXT: v_mov_b32_e32 v0, 0
; GCN-NEXT: v_mov_b32_e32 v1, 0
diff --git a/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll b/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll
index 55cbc14a46706..97738a7944741 100644
--- a/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll
@@ -9,6 +9,9 @@ define amdgpu_kernel void @v_uint_to_fp_i64_to_f64(ptr addrspace(1) %out, ptr ad
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; SI-NEXT: s_add_i32 s12, s12, s17
+; SI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_mov_b32_e32 v1, s3
; SI-NEXT: v_add_i32_e32 v0, vcc, s2, v0
@@ -28,6 +31,9 @@ define amdgpu_kernel void @v_uint_to_fp_i64_to_f64(ptr addrspace(1) %out, ptr ad
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
; VI-NEXT: v_lshlrev_b32_e32 v0, 3, v0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_mov_b32_e32 v1, s3
; VI-NEXT: v_add_u32_e32 v0, vcc, s2, v0
@@ -54,6 +60,9 @@ define amdgpu_kernel void @s_uint_to_fp_i64_to_f64(ptr addrspace(1) %out, i64 %i
; SI-LABEL: s_uint_to_fp_i64_to_f64:
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; SI-NEXT: s_add_i32 s12, s12, s17
+; SI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_cvt_f64_u32_e32 v[0:1], s3
; SI-NEXT: v_cvt_f64_u32_e32 v[2:3], s2
@@ -67,6 +76,9 @@ define amdgpu_kernel void @s_uint_to_fp_i64_to_f64(ptr addrspace(1) %out, i64 %i
; VI-LABEL: s_uint_to_fp_i64_to_f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_cvt_f64_u32_e32 v[0:1], s3
; VI-NEXT: v_cvt_f64_u32_e32 v[2:3], s2
@@ -86,6 +98,9 @@ define amdgpu_kernel void @s_uint_to_fp_v2i64_to_v2f64(ptr addrspace(1) %out, <2
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x4
; SI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
+; SI-NEXT: s_add_i32 s12, s12, s17
+; SI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_cvt_f64_u32_e32 v[0:1], s3
; SI-NEXT: v_cvt_f64_u32_e32 v[2:3], s1
@@ -103,6 +118,9 @@ define amdgpu_kernel void @s_uint_to_fp_v2i64_to_v2f64(ptr addrspace(1) %out, <2
; VI-LABEL: s_uint_to_fp_v2i64_to_v2f64:
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x10
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_cvt_f64_u32_e32 v[0:1], s3
; VI-NEXT: v_cvt_f64_u32_e32 v[2:3], s1
@@ -128,6 +146,9 @@ define amdgpu_kernel void @s_uint_to_fp_v4i64_to_v4f64(ptr addrspace(1) %out, <4
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x8
; SI-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0
+; SI-NEXT: s_add_i32 s12, s12, s17
+; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; SI-NEXT: s_mov_b32 flat_scratch_lo, s13
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_cvt_f64_u32_e32 v[0:1], s3
; SI-NEXT: v_cvt_f64_u32_e32 v[4:5], s1
@@ -160,6 +181,9 @@ define amdgpu_kernel void @s_uint_to_fp_v4i64_to_v4f64(ptr addrspace(1) %out, <4
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx8 s[0:7], s[8:9], 0x20
; VI-NEXT: s_load_dwordx2 s[8:9], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_cvt_f64_u32_e32 v[2:3], s7
; VI-NEXT: v_cvt_f64_u32_e32 v[4:5], s5
@@ -196,6 +220,9 @@ define amdgpu_kernel void @s_uint_to_fp_i32_to_f64(ptr addrspace(1) %out, i32 %i
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s2, s[8:9], 0x2
; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; SI-NEXT: s_add_i32 s12, s12, s17
+; SI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
; SI-NEXT: v_mov_b32_e32 v3, s1
@@ -207,6 +234,9 @@ define amdgpu_kernel void @s_uint_to_fp_i32_to_f64(ptr addrspace(1) %out, i32 %i
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
; VI-NEXT: v_mov_b32_e32 v3, s1
@@ -222,6 +252,9 @@ define amdgpu_kernel void @s_uint_to_fp_v2i32_to_v2f64(ptr addrspace(1) %out, <2
; GCN-LABEL: s_uint_to_fp_v2i32_to_v2f64:
; GCN: ; %bb.0:
; GCN-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GCN-NEXT: s_add_i32 s12, s12, s17
+; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13
+; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_cvt_f64_u32_e32 v[2:3], s3
; GCN-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
@@ -239,6 +272,9 @@ define amdgpu_kernel void @s_uint_to_fp_v4i32_to_v4f64(ptr addrspace(1) %out, <4
; SI: ; %bb.0:
; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x4
; SI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
+; SI-NEXT: s_add_i32 s12, s12, s17
+; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; SI-NEXT: s_mov_b32 flat_scratch_lo, s13
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
; SI-NEXT: v_cvt_f64_u32_e32 v[6:7], s3
@@ -259,6 +295,9 @@ define amdgpu_kernel void @s_uint_to_fp_v4i32_to_v4f64(ptr addrspace(1) %out, <4
; VI: ; %bb.0:
; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x10
; VI-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
; VI-NEXT: v_cvt_f64_u32_e32 v[6:7], s3
@@ -286,11 +325,14 @@ define amdgpu_kernel void @uint_to_fp_i1_to_f64(ptr addrspace(1) %out, i32 %in)
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s2, s[8:9], 0x2
; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; SI-NEXT: v_mov_b32_e32 v0, 0
+; SI-NEXT: s_add_i32 s12, s12, s17
+; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; SI-NEXT: s_mov_b32 flat_scratch_lo, s13
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_cmp_eq_u32 s2, 0
; SI-NEXT: s_cselect_b32 s2, 0x3ff00000, 0
; SI-NEXT: v_mov_b32_e32 v3, s1
+; SI-NEXT: v_mov_b32_e32 v0, 0
; SI-NEXT: v_mov_b32_e32 v1, s2
; SI-NEXT: v_mov_b32_e32 v2, s0
; SI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
@@ -300,11 +342,14 @@ define amdgpu_kernel void @uint_to_fp_i1_to_f64(ptr addrspace(1) %out, i32 %in)
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; VI-NEXT: v_mov_b32_e32 v0, 0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_cmp_eq_u32 s2, 0
; VI-NEXT: s_cselect_b32 s2, 0x3ff00000, 0
; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: v_mov_b32_e32 v1, s2
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
@@ -320,6 +365,9 @@ define amdgpu_kernel void @uint_to_fp_i1_to_f64_load(ptr addrspace(1) %out, i1 %
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s2, s[8:9], 0x2
; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; SI-NEXT: s_add_i32 s12, s12, s17
+; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; SI-NEXT: s_mov_b32 flat_scratch_lo, s13
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_bitcmp1_b32 s2, 0
; SI-NEXT: s_cselect_b64 s[2:3], -1, 0
@@ -334,6 +382,9 @@ define amdgpu_kernel void @uint_to_fp_i1_to_f64_load(ptr addrspace(1) %out, i1 %
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_bitcmp1_b32 s2, 0
; VI-NEXT: s_cselect_b64 s[2:3], -1, 0
@@ -353,6 +404,9 @@ define amdgpu_kernel void @s_uint_to_fp_i8_to_f64(ptr addrspace(1) %out, i8 %in)
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s2, s[8:9], 0x2
; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; SI-NEXT: s_add_i32 s12, s12, s17
+; SI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_and_b32 s2, s2, 0xff
; SI-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
@@ -365,6 +419,9 @@ define amdgpu_kernel void @s_uint_to_fp_i8_to_f64(ptr addrspace(1) %out, i8 %in)
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_and_b32 s2, s2, 0xff
; VI-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
@@ -402,11 +459,14 @@ define amdgpu_kernel void @s_select_uint_to_fp_i1_vals_f64(ptr addrspace(1) %out
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s2, s[8:9], 0x2
; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; SI-NEXT: v_mov_b32_e32 v0, 0
+; SI-NEXT: s_add_i32 s12, s12, s17
+; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; SI-NEXT: s_mov_b32 flat_scratch_lo, s13
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_cmp_eq_u32 s2, 0
; SI-NEXT: s_cselect_b32 s2, 0x3ff00000, 0
; SI-NEXT: v_mov_b32_e32 v3, s1
+; SI-NEXT: v_mov_b32_e32 v0, 0
; SI-NEXT: v_mov_b32_e32 v1, s2
; SI-NEXT: v_mov_b32_e32 v2, s0
; SI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
@@ -416,11 +476,14 @@ define amdgpu_kernel void @s_select_uint_to_fp_i1_vals_f64(ptr addrspace(1) %out
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; VI-NEXT: v_mov_b32_e32 v0, 0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_cmp_eq_u32 s2, 0
; VI-NEXT: s_cselect_b32 s2, 0x3ff00000, 0
; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: v_mov_b32_e32 v1, s2
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
@@ -453,11 +516,14 @@ define amdgpu_kernel void @s_select_uint_to_fp_i1_vals_i64(ptr addrspace(1) %out
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s2, s[8:9], 0x2
; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; SI-NEXT: v_mov_b32_e32 v0, 0
+; SI-NEXT: s_add_i32 s12, s12, s17
+; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; SI-NEXT: s_mov_b32 flat_scratch_lo, s13
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_cmp_eq_u32 s2, 0
; SI-NEXT: s_cselect_b32 s2, 0x3ff00000, 0
; SI-NEXT: v_mov_b32_e32 v3, s1
+; SI-NEXT: v_mov_b32_e32 v0, 0
; SI-NEXT: v_mov_b32_e32 v1, s2
; SI-NEXT: v_mov_b32_e32 v2, s0
; SI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
@@ -467,11 +533,14 @@ define amdgpu_kernel void @s_select_uint_to_fp_i1_vals_i64(ptr addrspace(1) %out
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; VI-NEXT: v_mov_b32_e32 v0, 0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_cmp_eq_u32 s2, 0
; VI-NEXT: s_cselect_b32 s2, 0x3ff00000, 0
; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: v_mov_b32_e32 v1, s2
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
@@ -505,11 +574,14 @@ define amdgpu_kernel void @s_swap_select_uint_to_fp_i1_vals_f64(ptr addrspace(1)
; SI: ; %bb.0:
; SI-NEXT: s_load_dword s2, s[8:9], 0x2
; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; SI-NEXT: v_mov_b32_e32 v0, 0
+; SI-NEXT: s_add_i32 s12, s12, s17
+; SI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; SI-NEXT: s_mov_b32 flat_scratch_lo, s13
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_cmp_eq_u32 s2, 0
; SI-NEXT: s_cselect_b32 s2, 0, 0x3ff00000
; SI-NEXT: v_mov_b32_e32 v3, s1
+; SI-NEXT: v_mov_b32_e32 v0, 0
; SI-NEXT: v_mov_b32_e32 v1, s2
; SI-NEXT: v_mov_b32_e32 v2, s0
; SI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
@@ -519,11 +591,14 @@ define amdgpu_kernel void @s_swap_select_uint_to_fp_i1_vals_f64(ptr addrspace(1)
; VI: ; %bb.0:
; VI-NEXT: s_load_dword s2, s[8:9], 0x8
; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0
-; VI-NEXT: v_mov_b32_e32 v0, 0
+; VI-NEXT: s_add_i32 s12, s12, s17
+; VI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; VI-NEXT: s_mov_b32 flat_scratch_lo, s13
; VI-NEXT: s_waitcnt lgkmcnt(0)
; VI-NEXT: s_cmp_eq_u32 s2, 0
; VI-NEXT: s_cselect_b32 s2, 0, 0x3ff00000
; VI-NEXT: v_mov_b32_e32 v3, s1
+; VI-NEXT: v_mov_b32_e32 v0, 0
; VI-NEXT: v_mov_b32_e32 v1, s2
; VI-NEXT: v_mov_b32_e32 v2, s0
; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-spill-placement-issue61083.ll b/llvm/test/CodeGen/AMDGPU/vgpr-spill-placement-issue61083.ll
index 45ea6b62761cc..ab7e85fdff516 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-spill-placement-issue61083.ll
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-spill-placement-issue61083.ll
@@ -11,7 +11,7 @@
define amdgpu_kernel void @__omp_offloading_16_dd2df_main_l9() {
; CHECK-LABEL: __omp_offloading_16_dd2df_main_l9:
; CHECK: ; %bb.0: ; %bb
-; CHECK-NEXT: s_add_u32 s0, s0, s15
+; CHECK-NEXT: s_add_u32 s0, s0, s17
; CHECK-NEXT: s_addc_u32 s1, s1, 0
; CHECK-NEXT: v_mov_b32_e32 v1, v0
; CHECK-NEXT: v_mov_b32_e32 v0, 0
diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll
index 883657547519b..7838852d8576c 100644
--- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll
+++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg-debug.ll
@@ -25,8 +25,9 @@
; CHECK-NEXT: argumentInfo:
; CHECK-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
; CHECK-NEXT: kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
-; CHECK-NEXT: workGroupIDX: { reg: '$sgpr6' }
-; CHECK-NEXT: privateSegmentWaveByteOffset: { reg: '$sgpr7' }
+; CHECK-NEXT: flatScratchInit: { reg: '$sgpr6_sgpr7' }
+; CHECK-NEXT: workGroupIDX: { reg: '$sgpr8' }
+; CHECK-NEXT: privateSegmentWaveByteOffset: { reg: '$sgpr9' }
; CHECK-NEXT: workItemIDX: { reg: '$vgpr0' }
; CHECK-NEXT: psInputAddr: 0
; CHECK-NEXT: psInputEnable: 0
@@ -41,7 +42,7 @@
; CHECK-NEXT: BitsOf32BitAddress: 0
; CHECK-NEXT: occupancy: 10
; CHECK-NEXT: vgprForAGPRCopy: ''
-; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101'
+; CHECK-NEXT: sgprForEXECCopy: '$sgpr98_sgpr99'
; CHECK-NEXT: longBranchReservedReg: '$sgpr2_sgpr3'
; CHECK-NEXT: hasInitWholeWave: false
; CHECK-NEXT: body:
diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll
index 278bf086d6088..63b2b6b2fa01b 100644
--- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll
+++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-long-branch-reg.ll
@@ -25,8 +25,9 @@
; CHECK-NEXT: argumentInfo:
; CHECK-NEXT: privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' }
; CHECK-NEXT: kernargSegmentPtr: { reg: '$sgpr4_sgpr5' }
-; CHECK-NEXT: workGroupIDX: { reg: '$sgpr6' }
-; CHECK-NEXT: privateSegmentWaveByteOffset: { reg: '$sgpr7' }
+; CHECK-NEXT: flatScratchInit: { reg: '$sgpr6_sgpr7' }
+; CHECK-NEXT: workGroupIDX: { reg: '$sgpr8' }
+; CHECK-NEXT: privateSegmentWaveByteOffset: { reg: '$sgpr9' }
; CHECK-NEXT: workItemIDX: { reg: '$vgpr0' }
; CHECK-NEXT: psInputAddr: 0
; CHECK-NEXT: psInputEnable: 0
@@ -41,7 +42,7 @@
; CHECK-NEXT: BitsOf32BitAddress: 0
; CHECK-NEXT: occupancy: 10
; CHECK-NEXT: vgprForAGPRCopy: ''
-; CHECK-NEXT: sgprForEXECCopy: '$sgpr100_sgpr101'
+; CHECK-NEXT: sgprForEXECCopy: '$sgpr98_sgpr99'
; CHECK-NEXT: longBranchReservedReg: '$sgpr2_sgpr3'
; CHECK-NEXT: hasInitWholeWave: false
; CHECK-NEXT: body:
More information about the llvm-commits mailing list